From 3a021340195a3e77c4855afb1b0ccef23c3c503b Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sun, 9 Jul 2023 16:09:11 +0300 Subject: [PATCH 01/17] working cuda vulkan interop example --- 63.CUDAInterop/CMakeLists.txt | 17 ++++ 63.CUDAInterop/main.cpp | 131 +++++++++++++++++++++++++++++ 63.CUDAInterop/pipeline.groovy | 50 +++++++++++ 63.CUDAInterop/vectorAdd_kernel.cu | 42 +++++++++ CMakeLists.txt | 1 + 5 files changed, 241 insertions(+) create mode 100644 63.CUDAInterop/CMakeLists.txt create mode 100644 63.CUDAInterop/main.cpp create mode 100644 63.CUDAInterop/pipeline.groovy create mode 100644 63.CUDAInterop/vectorAdd_kernel.cu diff --git a/63.CUDAInterop/CMakeLists.txt b/63.CUDAInterop/CMakeLists.txt new file mode 100644 index 000000000..2f8f8439f --- /dev/null +++ b/63.CUDAInterop/CMakeLists.txt @@ -0,0 +1,17 @@ + +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") +endif() + +set(CUDA_INTEROP_EXAMPLE_INCLUDE_DIRS + ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} +) + +nbl_create_executable_project( + "" + "" + "${CUDA_INTEROP_EXAMPLE_INCLUDE_DIRS}" + "" + "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}" +) \ No newline at end of file diff --git a/63.CUDAInterop/main.cpp b/63.CUDAInterop/main.cpp new file mode 100644 index 000000000..a1c38629b --- /dev/null +++ b/63.CUDAInterop/main.cpp @@ -0,0 +1,131 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#define _NBL_STATIC_LIB_ +#include + +#include "nbl/video/CCUDAHandler.h" + +#include "../common/CommonAPI.h" + +/** +This example just shows a screen which clears to red, +nothing fancy, just to show that Irrlicht links fine +**/ +using namespace nbl; + + +/* +The start of the main function starts like in most other example. We ask the +user for the desired renderer and start it up. +*/ + +#define ASSERT_SUCCESS(expr) \ +if (auto re = expr; CUDA_SUCCESS != re) { \ + const char* name = 0, *str = 0; \ + cu.pcuGetErrorName(re, &name); \ + cu.pcuGetErrorString(re, &str); \ + printf("%s:%d %s:\n\t%s\n", __FILE__, __LINE__, name, str); \ + abort(); \ +} + +#define ASSERT_SUCCESS_NV(expr) \ +if (auto re = expr; NVRTC_SUCCESS != re) { \ + const char* str = cudaHandler->getNVRTCFunctionTable().pnvrtcGetErrorString(re); \ + printf("%s:%d %s\n", __FILE__, __LINE__, str); \ + abort(); \ +} + + +int main(int argc, char** argv) +{ + auto initOutput = CommonAPI::InitWithDefaultExt(CommonAPI::InitParams{ + .appName = { "63.CUDAInterop" }, + .apiType = video::EAT_VULKAN, + .swapchainImageUsage = nbl::asset::IImage::EUF_NONE, + }); + + auto& system = initOutput.system; + auto& apiConnection = initOutput.apiConnection; + auto& physicalDevice = initOutput.physicalDevice; + auto& logicalDevice = initOutput.logicalDevice; + auto& utilities = initOutput.utilities; + auto& queues = initOutput.queues; + auto& logger = initOutput.logger; + + assert(physicalDevice->getLimits().externalMemory); + auto cudaHandler = video::CCUDAHandler::create(system.get(), core::smart_refctd_ptr(logger)); + assert(cudaHandler); + auto cudaDevice = cudaHandler->createDevice(core::smart_refctd_ptr_dynamic_cast(apiConnection), physicalDevice); + + auto& cu = cudaHandler->getCUDAFunctionTable(); + core::smart_refctd_ptr ptx; + CUmodule module; + CUfunction kernel; + CUstream stream; + + { + system::ISystem::future_t> fut; + system->createFile(fut, "../vectorAdd_kernel.cu", system::IFileBase::ECF_READ); + auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(fut.copy().get(), cudaDevice->geDefaultCompileOptions()); + ASSERT_SUCCESS_NV(res); + ptx = std::move(ptx_); + } + + ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr)); + ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd")); + ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); + + constexpr uint32_t gridDim[3] = { 4096,1,1 }; + constexpr uint32_t blockDim[3] = { 1024,1,1 }; + int numElements = gridDim[0] * blockDim[0]; + auto _size = sizeof(float) * numElements; + + core::smart_refctd_ptr cpubuffers[3] = { core::make_smart_refctd_ptr(_size), + core::make_smart_refctd_ptr(_size), + core::make_smart_refctd_ptr(_size) }; + for (auto j = 0; j < 2; j++) + for (auto i = 0; i < numElements; i++) + reinterpret_cast(cpubuffers[j]->getPointer())[i] = rand(); + + auto createBuffer = [&](int idx) { + + CUexternalMemory mem = 0; + CUdeviceptr ptr = 0; + + auto buf = utilities->createFilledDeviceLocalBufferOnDedMem(queues[CommonAPI::InitOutput::EQT_COMPUTE], + {{.size = _size, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT}, + {{.externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32}}}, + cpubuffers[idx]->getPointer()); + + assert(buf); + CUDA_EXTERNAL_MEMORY_HANDLE_DESC handleDesc = { + .type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32, + .handle = {.win32 = {.handle = buf->getExternalHandle()}}, + .size = buf->getMemoryReqs().size, + }; + CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = { .size = buf->getMemoryReqs().size }; + ASSERT_SUCCESS(cu.pcuImportExternalMemory(&mem, &handleDesc)); + ASSERT_SUCCESS(cu.pcuExternalMemoryGetMappedBuffer(&ptr, mem, &bufferDesc)); + return std::tuple{buf, mem, ptr}; + }; + + auto [buf0, mem0, ptr0] = createBuffer(0); + auto [buf1, mem1, ptr1] = createBuffer(1); + auto [buf2, mem2, ptr2] = createBuffer(2); + + void* parameters[] = { &ptr0, &ptr1, &ptr2, &numElements }; + + ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr)); + ASSERT_SUCCESS(cu.pcuMemcpyDtoHAsync_v2(cpubuffers[2]->getPointer(), ptr2, _size, stream)); + ASSERT_SUCCESS(cu.pcuCtxSynchronize()); + + float* A = reinterpret_cast(cpubuffers[0]->getPointer()); + float* B = reinterpret_cast(cpubuffers[1]->getPointer()); + float* C = reinterpret_cast(cpubuffers[2]->getPointer()); + for (auto i = 0; i < numElements; i++) + assert(abs(C[i] - A[i] - B[i]) < 0.01f); + + return 0; +} diff --git a/63.CUDAInterop/pipeline.groovy b/63.CUDAInterop/pipeline.groovy new file mode 100644 index 000000000..29909e3d1 --- /dev/null +++ b/63.CUDAInterop/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CCUDAInteropBuilder extends IBuilder +{ + public CCUDAInteropBuilder(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new CCUDAInteropBuilder(_agent, _info) +} + +return this \ No newline at end of file diff --git a/63.CUDAInterop/vectorAdd_kernel.cu b/63.CUDAInterop/vectorAdd_kernel.cu new file mode 100644 index 000000000..3baef0123 --- /dev/null +++ b/63.CUDAInterop/vectorAdd_kernel.cu @@ -0,0 +1,42 @@ +/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * CUDA Kernel Device code + * + * Computes the vector addition of A and B into C. The 3 vectors have the same + * number of elements numElements. + */ + +extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C, + int numElements) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) { + C[i] = A[i] + B[i]; + } +} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 832f1eebd..dee7c30aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,6 +79,7 @@ add_subdirectory(61.BlitFilterTest EXCLUDE_FROM_ALL) add_subdirectory(61.UI EXCLUDE_FROM_ALL) add_subdirectory(62.CAD EXCLUDE_FROM_ALL) add_subdirectory(62.SchusslerTest EXCLUDE_FROM_ALL) +add_subdirectory(63.CUDAInterop EXCLUDE_FROM_ALL) add_subdirectory(0.ImportanceSamplingEnvMaps EXCLUDE_FROM_ALL) #TODO: integrate back into 42 unset(NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET CACHE) From c90032d151f148322992a8b65aac9b39cbeda106 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sun, 9 Jul 2023 17:16:14 +0300 Subject: [PATCH 02/17] proper cleanup --- 63.CUDAInterop/main.cpp | 92 +++++++++++++++++++++++++---------------- 1 file changed, 57 insertions(+), 35 deletions(-) diff --git a/63.CUDAInterop/main.cpp b/63.CUDAInterop/main.cpp index a1c38629b..c5297830d 100644 --- a/63.CUDAInterop/main.cpp +++ b/63.CUDAInterop/main.cpp @@ -58,8 +58,8 @@ int main(int argc, char** argv) auto cudaHandler = video::CCUDAHandler::create(system.get(), core::smart_refctd_ptr(logger)); assert(cudaHandler); auto cudaDevice = cudaHandler->createDevice(core::smart_refctd_ptr_dynamic_cast(apiConnection), physicalDevice); - auto& cu = cudaHandler->getCUDAFunctionTable(); + core::smart_refctd_ptr ptx; CUmodule module; CUfunction kernel; @@ -72,7 +72,7 @@ int main(int argc, char** argv) ASSERT_SUCCESS_NV(res); ptx = std::move(ptx_); } - + ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr)); ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd")); ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); @@ -89,43 +89,65 @@ int main(int argc, char** argv) for (auto i = 0; i < numElements; i++) reinterpret_cast(cpubuffers[j]->getPointer())[i] = rand(); - auto createBuffer = [&](int idx) { + { + auto createBuffer = [&](core::smart_refctd_ptrconst& cpuBuf) { + struct CUCleaner : video::ICleanup + { + CUexternalMemory mem = nullptr; + CUdeviceptr ptr = {}; + core::smart_refctd_ptr cudaHandler = nullptr; + core::smart_refctd_ptr cudaDevice = nullptr; + + ~CUCleaner() + { + auto& cu = cudaHandler->getCUDAFunctionTable(); + ASSERT_SUCCESS(cu.pcuMemFree_v2(ptr)); + ASSERT_SUCCESS(cu.pcuDestroyExternalMemory(mem)); + } + }; + + auto cleaner = std::make_unique(); + cleaner->cudaHandler = cudaHandler; + cleaner->cudaDevice = cudaDevice; + CUexternalMemory* mem = &cleaner->mem; + CUdeviceptr* ptr = &cleaner->ptr; + auto buf = utilities->createFilledDeviceLocalBufferOnDedMem(queues[CommonAPI::InitOutput::EQT_COMPUTE], + {{.size = _size, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT}, + {{.preDestroyCleanup = std::move(cleaner), .externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32}}}, + cpuBuf->getPointer()); + assert(buf.get()); + CUDA_EXTERNAL_MEMORY_HANDLE_DESC handleDesc = { + .type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32, + .handle = {.win32 = {.handle = buf->getExternalHandle()}}, + .size = buf->getMemoryReqs().size, + }; + CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = { .size = buf->getMemoryReqs().size }; + ASSERT_SUCCESS(cu.pcuImportExternalMemory(mem, &handleDesc)); + ASSERT_SUCCESS(cu.pcuExternalMemoryGetMappedBuffer(ptr, *mem, &bufferDesc)); + return std::tuple< core::smart_refctd_ptr, CUexternalMemory, CUdeviceptr>{std::move(buf), *mem, *ptr}; + }; - CUexternalMemory mem = 0; - CUdeviceptr ptr = 0; + auto [buf0, mem0, ptr0] = createBuffer(cpubuffers[0]); + auto [buf1, mem1, ptr1] = createBuffer(cpubuffers[1]); + auto [buf2, mem2, ptr2] = createBuffer(cpubuffers[2]); + + void* parameters[] = { &ptr0, &ptr1, &ptr2, &numElements }; + + ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr)); + ASSERT_SUCCESS(cu.pcuMemcpyDtoHAsync_v2(cpubuffers[2]->getPointer(), ptr2, _size, stream)); + ASSERT_SUCCESS(cu.pcuCtxSynchronize()); - auto buf = utilities->createFilledDeviceLocalBufferOnDedMem(queues[CommonAPI::InitOutput::EQT_COMPUTE], - {{.size = _size, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT}, - {{.externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32}}}, - cpubuffers[idx]->getPointer()); + float* A = reinterpret_cast(cpubuffers[0]->getPointer()); + float* B = reinterpret_cast(cpubuffers[1]->getPointer()); + float* C = reinterpret_cast(cpubuffers[2]->getPointer()); - assert(buf); - CUDA_EXTERNAL_MEMORY_HANDLE_DESC handleDesc = { - .type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32, - .handle = {.win32 = {.handle = buf->getExternalHandle()}}, - .size = buf->getMemoryReqs().size, - }; - CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = { .size = buf->getMemoryReqs().size }; - ASSERT_SUCCESS(cu.pcuImportExternalMemory(&mem, &handleDesc)); - ASSERT_SUCCESS(cu.pcuExternalMemoryGetMappedBuffer(&ptr, mem, &bufferDesc)); - return std::tuple{buf, mem, ptr}; - }; + for (auto i = 0; i < numElements; i++) + assert(abs(C[i] - A[i] - B[i]) < 0.01f); + } + + ASSERT_SUCCESS(cu.pcuModuleUnload(module)); + ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream)); - auto [buf0, mem0, ptr0] = createBuffer(0); - auto [buf1, mem1, ptr1] = createBuffer(1); - auto [buf2, mem2, ptr2] = createBuffer(2); - - void* parameters[] = { &ptr0, &ptr1, &ptr2, &numElements }; - - ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr)); - ASSERT_SUCCESS(cu.pcuMemcpyDtoHAsync_v2(cpubuffers[2]->getPointer(), ptr2, _size, stream)); - ASSERT_SUCCESS(cu.pcuCtxSynchronize()); - - float* A = reinterpret_cast(cpubuffers[0]->getPointer()); - float* B = reinterpret_cast(cpubuffers[1]->getPointer()); - float* C = reinterpret_cast(cpubuffers[2]->getPointer()); - for (auto i = 0; i < numElements; i++) - assert(abs(C[i] - A[i] - B[i]) < 0.01f); return 0; } From 6d0a5776f63c2c9dda2c40b9262c4dc962d49421 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sun, 9 Jul 2023 20:55:14 +0300 Subject: [PATCH 03/17] add cuda to vulkan example --- 63.CUDAInterop/main.cpp | 224 ++++++++++++++++++++++++++++++---------- 1 file changed, 170 insertions(+), 54 deletions(-) diff --git a/63.CUDAInterop/main.cpp b/63.CUDAInterop/main.cpp index c5297830d..a939bac09 100644 --- a/63.CUDAInterop/main.cpp +++ b/63.CUDAInterop/main.cpp @@ -37,58 +37,29 @@ if (auto re = expr; NVRTC_SUCCESS != re) { \ abort(); \ } +constexpr uint32_t gridDim[3] = { 4096,1,1 }; +constexpr uint32_t blockDim[3] = { 1024,1,1 }; +size_t numElements = gridDim[0] * blockDim[0]; +size_t size = sizeof(float) * numElements; -int main(int argc, char** argv) +void vk2cuda( + core::smart_refctd_ptr cudaHandler, + core::smart_refctd_ptr cudaDevice, + video::IUtilities * util, + video::ILogicalDevice* logicalDevice, + nbl::video::IGPUQueue ** queues, + CUfunction kernel, + CUstream stream, + int=0) { - auto initOutput = CommonAPI::InitWithDefaultExt(CommonAPI::InitParams{ - .appName = { "63.CUDAInterop" }, - .apiType = video::EAT_VULKAN, - .swapchainImageUsage = nbl::asset::IImage::EUF_NONE, - }); - - auto& system = initOutput.system; - auto& apiConnection = initOutput.apiConnection; - auto& physicalDevice = initOutput.physicalDevice; - auto& logicalDevice = initOutput.logicalDevice; - auto& utilities = initOutput.utilities; - auto& queues = initOutput.queues; - auto& logger = initOutput.logger; - - assert(physicalDevice->getLimits().externalMemory); - auto cudaHandler = video::CCUDAHandler::create(system.get(), core::smart_refctd_ptr(logger)); - assert(cudaHandler); - auto cudaDevice = cudaHandler->createDevice(core::smart_refctd_ptr_dynamic_cast(apiConnection), physicalDevice); auto& cu = cudaHandler->getCUDAFunctionTable(); - - core::smart_refctd_ptr ptx; - CUmodule module; - CUfunction kernel; - CUstream stream; - - { - system::ISystem::future_t> fut; - system->createFile(fut, "../vectorAdd_kernel.cu", system::IFileBase::ECF_READ); - auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(fut.copy().get(), cudaDevice->geDefaultCompileOptions()); - ASSERT_SUCCESS_NV(res); - ptx = std::move(ptx_); - } - - ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr)); - ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd")); - ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); - - constexpr uint32_t gridDim[3] = { 4096,1,1 }; - constexpr uint32_t blockDim[3] = { 1024,1,1 }; - int numElements = gridDim[0] * blockDim[0]; - auto _size = sizeof(float) * numElements; - - core::smart_refctd_ptr cpubuffers[3] = { core::make_smart_refctd_ptr(_size), - core::make_smart_refctd_ptr(_size), - core::make_smart_refctd_ptr(_size) }; + core::smart_refctd_ptr cpubuffers[3] = { core::make_smart_refctd_ptr(size), + core::make_smart_refctd_ptr(size), + core::make_smart_refctd_ptr(size) }; for (auto j = 0; j < 2; j++) for (auto i = 0; i < numElements; i++) reinterpret_cast(cpubuffers[j]->getPointer())[i] = rand(); - + { auto createBuffer = [&](core::smart_refctd_ptrconst& cpuBuf) { struct CUCleaner : video::ICleanup @@ -111,9 +82,9 @@ int main(int argc, char** argv) cleaner->cudaDevice = cudaDevice; CUexternalMemory* mem = &cleaner->mem; CUdeviceptr* ptr = &cleaner->ptr; - auto buf = utilities->createFilledDeviceLocalBufferOnDedMem(queues[CommonAPI::InitOutput::EQT_COMPUTE], - {{.size = _size, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT}, - {{.preDestroyCleanup = std::move(cleaner), .externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32}}}, + auto buf = util->createFilledDeviceLocalBufferOnDedMem(queues[CommonAPI::InitOutput::EQT_COMPUTE], + { {.size = size, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT}, + {{.preDestroyCleanup = std::move(cleaner), .externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32}} }, cpuBuf->getPointer()); assert(buf.get()); CUDA_EXTERNAL_MEMORY_HANDLE_DESC handleDesc = { @@ -124,17 +95,17 @@ int main(int argc, char** argv) CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = { .size = buf->getMemoryReqs().size }; ASSERT_SUCCESS(cu.pcuImportExternalMemory(mem, &handleDesc)); ASSERT_SUCCESS(cu.pcuExternalMemoryGetMappedBuffer(ptr, *mem, &bufferDesc)); - return std::tuple< core::smart_refctd_ptr, CUexternalMemory, CUdeviceptr>{std::move(buf), *mem, *ptr}; + return std::tuple< core::smart_refctd_ptr, CUexternalMemory, CUdeviceptr>{std::move(buf), * mem, * ptr}; }; auto [buf0, mem0, ptr0] = createBuffer(cpubuffers[0]); auto [buf1, mem1, ptr1] = createBuffer(cpubuffers[1]); auto [buf2, mem2, ptr2] = createBuffer(cpubuffers[2]); - + void* parameters[] = { &ptr0, &ptr1, &ptr2, &numElements }; - + ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr)); - ASSERT_SUCCESS(cu.pcuMemcpyDtoHAsync_v2(cpubuffers[2]->getPointer(), ptr2, _size, stream)); + ASSERT_SUCCESS(cu.pcuMemcpyDtoHAsync_v2(cpubuffers[2]->getPointer(), ptr2, size, stream)); ASSERT_SUCCESS(cu.pcuCtxSynchronize()); float* A = reinterpret_cast(cpubuffers[0]->getPointer()); @@ -144,10 +115,155 @@ int main(int argc, char** argv) for (auto i = 0; i < numElements; i++) assert(abs(C[i] - A[i] - B[i]) < 0.01f); } +} + +void cuda2vk( + core::smart_refctd_ptr cudaHandler, + core::smart_refctd_ptr cudaDevice, + video::IUtilities* util, + video::ILogicalDevice* logicalDevice, + nbl::video::IGPUQueue** queues, + CUfunction kernel, + CUstream stream, + int = 0) +{ + auto& cu = cudaHandler->getCUDAFunctionTable(); + + core::smart_refctd_ptr cpubuffers[3] = { core::make_smart_refctd_ptr(size), + core::make_smart_refctd_ptr(size), + core::make_smart_refctd_ptr(size) }; + for (auto j = 0; j < 2; j++) + for (auto i = 0; i < numElements; i++) + reinterpret_cast(cpubuffers[j]->getPointer())[i] = rand(); + + { + auto createBuffer = [&]{ + struct CUCleaner : video::ICleanup + { + CUdeviceptr ptr = {}; + size_t sz = 0; + CUmemGenericAllocationHandle mem = 0; + core::smart_refctd_ptr cudaHandler = nullptr; + core::smart_refctd_ptr cudaDevice = nullptr; + + ~CUCleaner() + { + auto& cu = cudaHandler->getCUDAFunctionTable(); + ASSERT_SUCCESS(cu.pcuMemUnmap(ptr, sz)); + ASSERT_SUCCESS(cu.pcuMemAddressFree(ptr, sz)); + ASSERT_SUCCESS(cu.pcuMemRelease(mem)); + } + }; + auto& cu = cudaHandler->getCUDAFunctionTable(); + auto cleaner = std::make_unique(); + cleaner->cudaHandler = cudaHandler; + cleaner->cudaDevice = cudaDevice; + + CUdeviceptr* ptr = &cleaner->ptr; + CUmemGenericAllocationHandle* mem = &cleaner->mem; + void* handle = 0; + size_t* sz = &cleaner->sz; + size_t granularity = 0; + uint32_t metaData[16] = { 48 }; + CUmemAllocationProp prop = { + .type = CU_MEM_ALLOCATION_TYPE_PINNED, + .requestedHandleTypes = CU_MEM_HANDLE_TYPE_WIN32, + .location = {.type = CU_MEM_LOCATION_TYPE_DEVICE, .id = cudaDevice->getInternalObject() }, + .win32HandleMetaData = metaData, + }; + + ASSERT_SUCCESS(cu.pcuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + *sz = ((size - 1) / granularity + 1) * granularity; + ASSERT_SUCCESS(cu.pcuMemCreate(mem, *sz, &prop, 0)); + ASSERT_SUCCESS(cu.pcuMemExportToShareableHandle(&handle, *mem, CU_MEM_HANDLE_TYPE_WIN32, 0)); + ASSERT_SUCCESS(cu.pcuMemAddressReserve(ptr, *sz, 0, 0, 0)); + ASSERT_SUCCESS(cu.pcuMemMap(*ptr, *sz, 0, *mem, 0)); + CUmemAccessDesc accessDesc = { + .location = prop.location, + .flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE, + }; + ASSERT_SUCCESS(cu.pcuMemSetAccess(*ptr, *sz, &accessDesc, 1)); + + auto buf = logicalDevice->createBuffer( + { {.size = *sz, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT }, + { {.preDestroyCleanup = std::move(cleaner), .externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32, .externalHandle = handle}} }); + + assert(buf.get()); + + auto reqs = buf->getMemoryReqs(); + reqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); + auto alloc = logicalDevice->allocate(reqs, buf.get()); + assert(alloc.memory && alloc.offset != video::IDeviceMemoryAllocator::InvalidMemoryOffset); + return std::tuple< core::smart_refctd_ptr, CUdeviceptr>{std::move(buf), * ptr}; + }; + + auto [buf0, ptr0] = createBuffer(); + auto [buf1, ptr1] = createBuffer(); + auto [buf2, ptr2] = createBuffer(); + + void* parameters[] = { &ptr0, &ptr1, &ptr2, &numElements }; + + ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptr0, cpubuffers[0]->getPointer(), size, stream)); + ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptr1, cpubuffers[1]->getPointer(), size, stream)); + ASSERT_SUCCESS(cu.pcuCtxSynchronize()); + ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr)); + ASSERT_SUCCESS(cu.pcuCtxSynchronize()); + util->downloadBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange{.offset = 0, .size = size, .buffer = buf2}, cpubuffers[2]->getPointer(), queues[CommonAPI::InitOutput::EQT_COMPUTE]); + + float* A = reinterpret_cast(cpubuffers[0]->getPointer()); + float* B = reinterpret_cast(cpubuffers[1]->getPointer()); + float* C = reinterpret_cast(cpubuffers[2]->getPointer()); + + for (auto i = 0; i < numElements; i++) + assert(abs(C[i] - A[i] - B[i]) < 0.01f); + } +} + +int main(int argc, char** argv) +{ + auto initOutput = CommonAPI::InitWithDefaultExt(CommonAPI::InitParams{ + .appName = { "63.CUDAInterop" }, + .apiType = video::EAT_VULKAN, + .swapchainImageUsage = nbl::asset::IImage::EUF_NONE, + }); + + auto& system = initOutput.system; + auto& apiConnection = initOutput.apiConnection; + auto& physicalDevice = initOutput.physicalDevice; + auto& logicalDevice = initOutput.logicalDevice; + auto& utilities = initOutput.utilities; + auto& queues = initOutput.queues; + auto& logger = initOutput.logger; + + assert(physicalDevice->getLimits().externalMemory); + auto cudaHandler = video::CCUDAHandler::create(system.get(), core::smart_refctd_ptr(logger)); + assert(cudaHandler); + auto cudaDevice = cudaHandler->createDevice(core::smart_refctd_ptr_dynamic_cast(apiConnection), physicalDevice); + auto& cu = cudaHandler->getCUDAFunctionTable(); + + core::smart_refctd_ptr ptx; + CUmodule module; + CUfunction kernel; + CUstream stream; + + { + system::ISystem::future_t> fut; + system->createFile(fut, "../vectorAdd_kernel.cu", system::IFileBase::ECF_READ); + auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(fut.copy().get(), cudaDevice->geDefaultCompileOptions()); + ASSERT_SUCCESS_NV(res); + ptx = std::move(ptx_); + } + + ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr)); + ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd")); + ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); + + vk2cuda(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data(), kernel, stream); + cuda2vk(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data(), kernel, stream); + ASSERT_SUCCESS(cu.pcuModuleUnload(module)); ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream)); - return 0; } From 867b4352512a56496964a17e321aaf68c5f9afb2 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sun, 9 Jul 2023 23:34:56 +0300 Subject: [PATCH 04/17] refactor --- 63.CUDAInterop/main.cpp | 130 ++++++++-------------------------------- 1 file changed, 25 insertions(+), 105 deletions(-) diff --git a/63.CUDAInterop/main.cpp b/63.CUDAInterop/main.cpp index a939bac09..3f156e80d 100644 --- a/63.CUDAInterop/main.cpp +++ b/63.CUDAInterop/main.cpp @@ -62,50 +62,27 @@ void vk2cuda( { auto createBuffer = [&](core::smart_refctd_ptrconst& cpuBuf) { - struct CUCleaner : video::ICleanup - { - CUexternalMemory mem = nullptr; - CUdeviceptr ptr = {}; - core::smart_refctd_ptr cudaHandler = nullptr; - core::smart_refctd_ptr cudaDevice = nullptr; - - ~CUCleaner() - { - auto& cu = cudaHandler->getCUDAFunctionTable(); - ASSERT_SUCCESS(cu.pcuMemFree_v2(ptr)); - ASSERT_SUCCESS(cu.pcuDestroyExternalMemory(mem)); - } - }; - - auto cleaner = std::make_unique(); - cleaner->cudaHandler = cudaHandler; - cleaner->cudaDevice = cudaDevice; - CUexternalMemory* mem = &cleaner->mem; - CUdeviceptr* ptr = &cleaner->ptr; auto buf = util->createFilledDeviceLocalBufferOnDedMem(queues[CommonAPI::InitOutput::EQT_COMPUTE], { {.size = size, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT}, - {{.preDestroyCleanup = std::move(cleaner), .externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32}} }, + {{.externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32}} }, cpuBuf->getPointer()); assert(buf.get()); - CUDA_EXTERNAL_MEMORY_HANDLE_DESC handleDesc = { - .type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32, - .handle = {.win32 = {.handle = buf->getExternalHandle()}}, - .size = buf->getMemoryReqs().size, - }; - CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = { .size = buf->getMemoryReqs().size }; - ASSERT_SUCCESS(cu.pcuImportExternalMemory(mem, &handleDesc)); - ASSERT_SUCCESS(cu.pcuExternalMemoryGetMappedBuffer(ptr, *mem, &bufferDesc)); - return std::tuple< core::smart_refctd_ptr, CUexternalMemory, CUdeviceptr>{std::move(buf), * mem, * ptr}; + return buf; }; - auto [buf0, mem0, ptr0] = createBuffer(cpubuffers[0]); - auto [buf1, mem1, ptr1] = createBuffer(cpubuffers[1]); - auto [buf2, mem2, ptr2] = createBuffer(cpubuffers[2]); + auto buf0 = createBuffer(cpubuffers[0]); + auto buf1 = createBuffer(cpubuffers[1]); + auto buf2 = createBuffer(cpubuffers[2]); + + video::CCUDADevice::SSharedCUDAMemory mem0, mem1, mem2; - void* parameters[] = { &ptr0, &ptr1, &ptr2, &numElements }; + ASSERT_SUCCESS(cudaDevice->importGPUBuffer(buf0.get(), &mem0)); + ASSERT_SUCCESS(cudaDevice->importGPUBuffer(buf1.get(), &mem1)); + ASSERT_SUCCESS(cudaDevice->importGPUBuffer(buf2.get(), &mem2)); + void* parameters[] = { &mem0.ptr, &mem1.ptr, &mem2.ptr, &numElements }; ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr)); - ASSERT_SUCCESS(cu.pcuMemcpyDtoHAsync_v2(cpubuffers[2]->getPointer(), ptr2, size, stream)); + ASSERT_SUCCESS(cu.pcuMemcpyDtoHAsync_v2(cpubuffers[2]->getPointer(), mem2.ptr, size, stream)); ASSERT_SUCCESS(cu.pcuCtxSynchronize()); float* A = reinterpret_cast(cpubuffers[0]->getPointer()); @@ -117,6 +94,7 @@ void vk2cuda( } } + void cuda2vk( core::smart_refctd_ptr cudaHandler, core::smart_refctd_ptr cudaDevice, @@ -137,79 +115,20 @@ void cuda2vk( reinterpret_cast(cpubuffers[j]->getPointer())[i] = rand(); { - auto createBuffer = [&]{ - struct CUCleaner : video::ICleanup - { - CUdeviceptr ptr = {}; - size_t sz = 0; - CUmemGenericAllocationHandle mem = 0; - core::smart_refctd_ptr cudaHandler = nullptr; - core::smart_refctd_ptr cudaDevice = nullptr; - - ~CUCleaner() - { - auto& cu = cudaHandler->getCUDAFunctionTable(); - ASSERT_SUCCESS(cu.pcuMemUnmap(ptr, sz)); - ASSERT_SUCCESS(cu.pcuMemAddressFree(ptr, sz)); - ASSERT_SUCCESS(cu.pcuMemRelease(mem)); - } - }; - auto& cu = cudaHandler->getCUDAFunctionTable(); - auto cleaner = std::make_unique(); - cleaner->cudaHandler = cudaHandler; - cleaner->cudaDevice = cudaDevice; - - CUdeviceptr* ptr = &cleaner->ptr; - CUmemGenericAllocationHandle* mem = &cleaner->mem; - void* handle = 0; - size_t* sz = &cleaner->sz; - size_t granularity = 0; - - uint32_t metaData[16] = { 48 }; - CUmemAllocationProp prop = { - .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = CU_MEM_HANDLE_TYPE_WIN32, - .location = {.type = CU_MEM_LOCATION_TYPE_DEVICE, .id = cudaDevice->getInternalObject() }, - .win32HandleMetaData = metaData, - }; - - ASSERT_SUCCESS(cu.pcuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); - *sz = ((size - 1) / granularity + 1) * granularity; - ASSERT_SUCCESS(cu.pcuMemCreate(mem, *sz, &prop, 0)); - ASSERT_SUCCESS(cu.pcuMemExportToShareableHandle(&handle, *mem, CU_MEM_HANDLE_TYPE_WIN32, 0)); - ASSERT_SUCCESS(cu.pcuMemAddressReserve(ptr, *sz, 0, 0, 0)); - ASSERT_SUCCESS(cu.pcuMemMap(*ptr, *sz, 0, *mem, 0)); - CUmemAccessDesc accessDesc = { - .location = prop.location, - .flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE, - }; - ASSERT_SUCCESS(cu.pcuMemSetAccess(*ptr, *sz, &accessDesc, 1)); - - auto buf = logicalDevice->createBuffer( - { {.size = *sz, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT }, - { {.preDestroyCleanup = std::move(cleaner), .externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32, .externalHandle = handle}} }); - - assert(buf.get()); - - auto reqs = buf->getMemoryReqs(); - reqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - auto alloc = logicalDevice->allocate(reqs, buf.get()); - assert(alloc.memory && alloc.offset != video::IDeviceMemoryAllocator::InvalidMemoryOffset); - return std::tuple< core::smart_refctd_ptr, CUdeviceptr>{std::move(buf), * ptr}; - }; - - auto [buf0, ptr0] = createBuffer(); - auto [buf1, ptr1] = createBuffer(); - auto [buf2, ptr2] = createBuffer(); - - void* parameters[] = { &ptr0, &ptr1, &ptr2, &numElements }; - - ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptr0, cpubuffers[0]->getPointer(), size, stream)); - ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptr1, cpubuffers[1]->getPointer(), size, stream)); + video::CCUDADevice::SSharedCUDAMemory mem0, mem1, mem2; + ASSERT_SUCCESS(cudaDevice->createExportableMemory(size, sizeof(float), &mem0)); + ASSERT_SUCCESS(cudaDevice->createExportableMemory(size, sizeof(float), &mem1)); + ASSERT_SUCCESS(cudaDevice->createExportableMemory(size, sizeof(float), &mem2)); + + void* parameters[] = { &mem0.ptr, &mem1.ptr, &mem2.ptr, &numElements }; + ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(mem0.ptr, cpubuffers[0]->getPointer(), size, stream)); + ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(mem1.ptr, cpubuffers[1]->getPointer(), size, stream)); ASSERT_SUCCESS(cu.pcuCtxSynchronize()); ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr)); ASSERT_SUCCESS(cu.pcuCtxSynchronize()); - util->downloadBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange{.offset = 0, .size = size, .buffer = buf2}, cpubuffers[2]->getPointer(), queues[CommonAPI::InitOutput::EQT_COMPUTE]); + + auto buf = cudaDevice->exportGPUBuffer(mem2, logicalDevice); + util->downloadBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange{.offset = 0, .size = size, .buffer = buf}, cpubuffers[2]->getPointer(), queues[CommonAPI::InitOutput::EQT_COMPUTE]); float* A = reinterpret_cast(cpubuffers[0]->getPointer()); float* B = reinterpret_cast(cpubuffers[1]->getPointer()); @@ -260,6 +179,7 @@ int main(int argc, char** argv) ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); vk2cuda(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data(), kernel, stream); + cuda2vk(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data(), kernel, stream); ASSERT_SUCCESS(cu.pcuModuleUnload(module)); From 831e61e21f1ebe70458c3b3f77bc40791a5363d8 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sat, 15 Jul 2023 23:21:14 +0300 Subject: [PATCH 05/17] semaphore interop --- 63.CUDAInterop/main.cpp | 194 ++++++++++++++++++++++++++++++---------- 1 file changed, 145 insertions(+), 49 deletions(-) diff --git a/63.CUDAInterop/main.cpp b/63.CUDAInterop/main.cpp index 3f156e80d..2aef06a09 100644 --- a/63.CUDAInterop/main.cpp +++ b/63.CUDAInterop/main.cpp @@ -64,26 +64,27 @@ void vk2cuda( auto createBuffer = [&](core::smart_refctd_ptrconst& cpuBuf) { auto buf = util->createFilledDeviceLocalBufferOnDedMem(queues[CommonAPI::InitOutput::EQT_COMPUTE], { {.size = size, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT}, - {{.externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32}} }, + {{.externalHandleType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32}} }, cpuBuf->getPointer()); assert(buf.get()); return buf; }; - auto buf0 = createBuffer(cpubuffers[0]); - auto buf1 = createBuffer(cpubuffers[1]); - auto buf2 = createBuffer(cpubuffers[2]); - - video::CCUDADevice::SSharedCUDAMemory mem0, mem1, mem2; + core::smart_refctd_ptr buf[3] = { + createBuffer(cpubuffers[0]), + createBuffer(cpubuffers[1]), + createBuffer(cpubuffers[2]), + }; - ASSERT_SUCCESS(cudaDevice->importGPUBuffer(buf0.get(), &mem0)); - ASSERT_SUCCESS(cudaDevice->importGPUBuffer(buf1.get(), &mem1)); - ASSERT_SUCCESS(cudaDevice->importGPUBuffer(buf2.get(), &mem2)); + std::array, 3> mem = {}; + ASSERT_SUCCESS(cudaDevice->importGPUBuffer(&mem[0], buf[0].get())); + ASSERT_SUCCESS(cudaDevice->importGPUBuffer(&mem[1], buf[1].get())); + ASSERT_SUCCESS(cudaDevice->importGPUBuffer(&mem[2], buf[2].get())); - void* parameters[] = { &mem0.ptr, &mem1.ptr, &mem2.ptr, &numElements }; + void* parameters[] = { &mem[0]->ptr, &mem[1]->ptr, &mem[2]->ptr, &numElements }; ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr)); - ASSERT_SUCCESS(cu.pcuMemcpyDtoHAsync_v2(cpubuffers[2]->getPointer(), mem2.ptr, size, stream)); - ASSERT_SUCCESS(cu.pcuCtxSynchronize()); + ASSERT_SUCCESS(cu.pcuMemcpyDtoHAsync_v2(cpubuffers[2]->getPointer(), mem[2]->ptr, size, stream)); + ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); float* A = reinterpret_cast(cpubuffers[0]->getPointer()); float* B = reinterpret_cast(cpubuffers[1]->getPointer()); @@ -94,50 +95,146 @@ void vk2cuda( } } - -void cuda2vk( - core::smart_refctd_ptr cudaHandler, - core::smart_refctd_ptr cudaDevice, - video::IUtilities* util, - video::ILogicalDevice* logicalDevice, - nbl::video::IGPUQueue** queues, - CUfunction kernel, - CUstream stream, - int = 0) +struct CUDA2VK { - auto& cu = cudaHandler->getCUDAFunctionTable(); + core::smart_refctd_ptr cudaHandler; + core::smart_refctd_ptr cudaDevice; + video::IUtilities* util; + video::ILogicalDevice* logicalDevice; + nbl::video::IGPUQueue** queues; - core::smart_refctd_ptr cpubuffers[3] = { core::make_smart_refctd_ptr(size), - core::make_smart_refctd_ptr(size), - core::make_smart_refctd_ptr(size) }; - for (auto j = 0; j < 2; j++) - for (auto i = 0; i < numElements; i++) - reinterpret_cast(cpubuffers[j]->getPointer())[i] = rand(); + std::array, 2> cpubuffers; + std::array, 3> mem = {}; + core::smart_refctd_ptr cusema; + core::smart_refctd_ptr importedbuf, stagingbuf; + core::smart_refctd_ptr sema; + core::smart_refctd_ptr commandPool; + core::smart_refctd_ptr cmd; + core::smart_refctd_ptr fence; + CUDA2VK( + core::smart_refctd_ptr _cudaHandler, + core::smart_refctd_ptr _cudaDevice, + video::IUtilities* _util, + video::ILogicalDevice* _logicalDevice, + video::IGPUQueue** _queues) + : cudaHandler(std::move(_cudaHandler)) + , cudaDevice(std::move(_cudaDevice)) + , util(_util) + , logicalDevice(_logicalDevice) + , queues(_queues) { - video::CCUDADevice::SSharedCUDAMemory mem0, mem1, mem2; - ASSERT_SUCCESS(cudaDevice->createExportableMemory(size, sizeof(float), &mem0)); - ASSERT_SUCCESS(cudaDevice->createExportableMemory(size, sizeof(float), &mem1)); - ASSERT_SUCCESS(cudaDevice->createExportableMemory(size, sizeof(float), &mem2)); - - void* parameters[] = { &mem0.ptr, &mem1.ptr, &mem2.ptr, &numElements }; - ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(mem0.ptr, cpubuffers[0]->getPointer(), size, stream)); - ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(mem1.ptr, cpubuffers[1]->getPointer(), size, stream)); - ASSERT_SUCCESS(cu.pcuCtxSynchronize()); - ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr)); - ASSERT_SUCCESS(cu.pcuCtxSynchronize()); + createResources(); + } + + void createResources() + { + auto& cu = cudaHandler->getCUDAFunctionTable(); - auto buf = cudaDevice->exportGPUBuffer(mem2, logicalDevice); - util->downloadBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange{.offset = 0, .size = size, .buffer = buf}, cpubuffers[2]->getPointer(), queues[CommonAPI::InitOutput::EQT_COMPUTE]); + for (auto& buf : cpubuffers) + buf = core::make_smart_refctd_ptr(size); + + for (auto j = 0; j < 2; j++) + for (auto i = 0; i < numElements; i++) + reinterpret_cast(cpubuffers[j]->getPointer())[i] = rand(); + + sema = logicalDevice->createSemaphore({ .externalHandleType = video::IGPUSemaphore::EHT_OPAQUE_WIN32 }); + ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cusema, sema.get())); + + ASSERT_SUCCESS(cudaDevice->createExportableMemory(&mem[0], size, sizeof(float))); + ASSERT_SUCCESS(cudaDevice->createExportableMemory(&mem[1], size, sizeof(float))); + ASSERT_SUCCESS(cudaDevice->createExportableMemory(&mem[2], size, sizeof(float))); + importedbuf = cudaDevice->exportGPUBuffer(mem[2].get(), logicalDevice); + + fence = logicalDevice->createFence(video::IGPUFence::ECF_UNSIGNALED); + commandPool = logicalDevice->createCommandPool(queues[CommonAPI::InitOutput::EQT_COMPUTE]->getFamilyIndex(), {}); + bool re = logicalDevice->createCommandBuffers(commandPool.get(), video::IGPUCommandBuffer::EL_PRIMARY, 1, &cmd); + assert(re); + + stagingbuf = logicalDevice->createBuffer({ {.size = importedbuf->getSize(), .usage = asset::IBuffer::EUF_TRANSFER_DST_BIT} }); + auto req = stagingbuf->getMemoryReqs(); + req.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); + auto allocation = logicalDevice->allocate(req, stagingbuf.get()); + assert(allocation.memory && allocation.offset != video::ILogicalDevice::InvalidMemoryOffset); + assert(stagingbuf->getBoundMemory()->isMappable()); + logicalDevice->mapMemory(video::IDeviceMemoryAllocation::MappedMemoryRange(stagingbuf->getBoundMemory(), stagingbuf->getBoundMemoryOffset(), stagingbuf->getSize()), video::IDeviceMemoryAllocation::EMCAF_READ); + assert(stagingbuf->getBoundMemory()->getMappedPointer()); + memset(stagingbuf->getBoundMemory()->getMappedPointer(), 0, stagingbuf->getSize()); + } + + void launchKernel(CUfunction kernel, CUstream stream) + { + auto queue = queues[CommonAPI::InitOutput::EQT_COMPUTE]; + + auto& cu = cudaHandler->getCUDAFunctionTable(); + // Launch kernel + { + void* parameters[] = { &mem[0]->ptr, &mem[1]->ptr, &mem[2]->ptr, &numElements }; + ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(mem[0]->ptr, cpubuffers[0]->getPointer(), size, stream)); + ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(mem[1]->ptr, cpubuffers[1]->getPointer(), size, stream)); + ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr)); + CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = {}; + ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&cusema->semaphore, &signalParams, 1, stream)); // Signal the imported semaphore + } + + // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA + { + video::IGPUSemaphore* waitSemaphores[] = { sema.get() }; + asset::E_PIPELINE_STAGE_FLAGS waitStages[] = { asset::EPSF_ALL_COMMANDS_BIT }; + video::IGPUCommandBuffer* cmdBuffers[] = { cmd.get() }; + + video::IGPUCommandBuffer::SBufferMemoryBarrier barrier = { + .barrier = { + .dstAccessMask = asset::E_ACCESS_FLAGS::EAF_ALL_ACCESSES_BIT_DEVSH , + }, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_EXTERNAL_KHR, + .dstQueueFamilyIndex = queue->getFamilyIndex(), + .buffer = importedbuf, + .offset = 0, + .size = VK_WHOLE_SIZE, + }; + bool re = true; + re &= cmd->begin(video::IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT); + re &= cmd->pipelineBarrier(asset::EPSF_ALL_COMMANDS_BIT, asset::EPSF_ALL_COMMANDS_BIT, asset::EDF_NONE, 0u, nullptr, 1u, &barrier, 0u, nullptr); // Ownership transfer? + asset::SBufferCopy region = { .size = importedbuf->getSize() }; + re &= cmd->copyBuffer(importedbuf.get(), stagingbuf.get(), 1, ®ion); + re &= cmd->end(); + + video::IGPUQueue::SSubmitInfo submitInfo = { + .waitSemaphoreCount = 1, + .pWaitSemaphores = waitSemaphores, + .pWaitDstStageMask = waitStages, + .commandBufferCount = 1, + .commandBuffers = cmdBuffers + }; + + re &= queue->submit(1, &submitInfo, fence.get()); + assert(re); + } + + ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, [](void* userData) { decltype(this)(userData)->kernelCallback(); }, this)); + } + + void kernelCallback() + { + // Make sure we are also done with the readback + { + video::IGPUFence* fences[] = { fence.get() }; + auto status = logicalDevice->waitForFences(1, fences, true, -1); + assert(video::IGPUFence::ES_SUCCESS == status); + } float* A = reinterpret_cast(cpubuffers[0]->getPointer()); float* B = reinterpret_cast(cpubuffers[1]->getPointer()); - float* C = reinterpret_cast(cpubuffers[2]->getPointer()); - + float* C = reinterpret_cast(stagingbuf->getBoundMemory()->getMappedPointer()); for (auto i = 0; i < numElements; i++) assert(abs(C[i] - A[i] - B[i]) < 0.01f); + + std::cout << "Success\n"; + + delete this; } -} +}; int main(int argc, char** argv) { @@ -159,7 +256,7 @@ int main(int argc, char** argv) auto cudaHandler = video::CCUDAHandler::create(system.get(), core::smart_refctd_ptr(logger)); assert(cudaHandler); auto cudaDevice = cudaHandler->createDevice(core::smart_refctd_ptr_dynamic_cast(apiConnection), physicalDevice); - auto& cu = cudaHandler->getCUDAFunctionTable(); + auto& cu = cudaHandler->getCUDAFunctionTable(); core::smart_refctd_ptr ptx; CUmodule module; @@ -180,10 +277,9 @@ int main(int argc, char** argv) vk2cuda(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data(), kernel, stream); - cuda2vk(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data(), kernel, stream); - + (new CUDA2VK(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data()))->launchKernel(kernel, stream); + ASSERT_SUCCESS(cu.pcuModuleUnload(module)); ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream)); - return 0; } From 1396829b405f2a6068069f786cfcf5eec4cd3db7 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Mon, 17 Jul 2023 01:21:12 +0300 Subject: [PATCH 06/17] update --- 63.CUDAInterop/main.cpp | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/63.CUDAInterop/main.cpp b/63.CUDAInterop/main.cpp index 2aef06a09..89b7be2c3 100644 --- a/63.CUDAInterop/main.cpp +++ b/63.CUDAInterop/main.cpp @@ -6,6 +6,8 @@ #include #include "nbl/video/CCUDAHandler.h" +#include "nbl/video/CCUDASharedMemory.h" +#include "nbl/video/CCUDASharedSemaphore.h" #include "../common/CommonAPI.h" @@ -64,7 +66,7 @@ void vk2cuda( auto createBuffer = [&](core::smart_refctd_ptrconst& cpuBuf) { auto buf = util->createFilledDeviceLocalBufferOnDedMem(queues[CommonAPI::InitOutput::EQT_COMPUTE], { {.size = size, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT}, - {{.externalHandleType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32}} }, + {{.externalHandleTypes = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32}} }, cpuBuf->getPointer()); assert(buf.get()); return buf; @@ -76,14 +78,15 @@ void vk2cuda( createBuffer(cpubuffers[2]), }; - std::array, 3> mem = {}; + std::array, 3> mem = {}; ASSERT_SUCCESS(cudaDevice->importGPUBuffer(&mem[0], buf[0].get())); ASSERT_SUCCESS(cudaDevice->importGPUBuffer(&mem[1], buf[1].get())); ASSERT_SUCCESS(cudaDevice->importGPUBuffer(&mem[2], buf[2].get())); - void* parameters[] = { &mem[0]->ptr, &mem[1]->ptr, &mem[2]->ptr, &numElements }; + CUdeviceptr ptrs[] = { mem[0]->getDevicePtr(), mem[1]->getDevicePtr(), mem[2]->getDevicePtr() }; + void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numElements}; ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr)); - ASSERT_SUCCESS(cu.pcuMemcpyDtoHAsync_v2(cpubuffers[2]->getPointer(), mem[2]->ptr, size, stream)); + ASSERT_SUCCESS(cu.pcuMemcpyDtoHAsync_v2(cpubuffers[2]->getPointer(), ptrs[2], size, stream)); ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); float* A = reinterpret_cast(cpubuffers[0]->getPointer()); @@ -95,6 +98,7 @@ void vk2cuda( } } + struct CUDA2VK { core::smart_refctd_ptr cudaHandler; @@ -104,8 +108,8 @@ struct CUDA2VK nbl::video::IGPUQueue** queues; std::array, 2> cpubuffers; - std::array, 3> mem = {}; - core::smart_refctd_ptr cusema; + std::array, 3> mem = {}; + core::smart_refctd_ptr cusema; core::smart_refctd_ptr importedbuf, stagingbuf; core::smart_refctd_ptr sema; core::smart_refctd_ptr commandPool; @@ -138,14 +142,14 @@ struct CUDA2VK for (auto i = 0; i < numElements; i++) reinterpret_cast(cpubuffers[j]->getPointer())[i] = rand(); - sema = logicalDevice->createSemaphore({ .externalHandleType = video::IGPUSemaphore::EHT_OPAQUE_WIN32 }); + sema = logicalDevice->createSemaphore({ .externalHandleTypes = video::IGPUSemaphore::EHT_OPAQUE_WIN32 }); ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cusema, sema.get())); ASSERT_SUCCESS(cudaDevice->createExportableMemory(&mem[0], size, sizeof(float))); ASSERT_SUCCESS(cudaDevice->createExportableMemory(&mem[1], size, sizeof(float))); ASSERT_SUCCESS(cudaDevice->createExportableMemory(&mem[2], size, sizeof(float))); importedbuf = cudaDevice->exportGPUBuffer(mem[2].get(), logicalDevice); - + assert(importedbuf); fence = logicalDevice->createFence(video::IGPUFence::ECF_UNSIGNALED); commandPool = logicalDevice->createCommandPool(queues[CommonAPI::InitOutput::EQT_COMPUTE]->getFamilyIndex(), {}); bool re = logicalDevice->createCommandBuffers(commandPool.get(), video::IGPUCommandBuffer::EL_PRIMARY, 1, &cmd); @@ -169,12 +173,14 @@ struct CUDA2VK auto& cu = cudaHandler->getCUDAFunctionTable(); // Launch kernel { - void* parameters[] = { &mem[0]->ptr, &mem[1]->ptr, &mem[2]->ptr, &numElements }; - ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(mem[0]->ptr, cpubuffers[0]->getPointer(), size, stream)); - ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(mem[1]->ptr, cpubuffers[1]->getPointer(), size, stream)); + CUdeviceptr ptrs[] = { mem[0]->getDevicePtr(), mem[1]->getDevicePtr(), mem[2]->getDevicePtr() }; + void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numElements }; + ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpubuffers[0]->getPointer(), size, stream)); + ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpubuffers[1]->getPointer(), size, stream)); ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr)); CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = {}; - ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&cusema->semaphore, &signalParams, 1, stream)); // Signal the imported semaphore + auto semaphore = cusema->getInternalObject(); + ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore } // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA @@ -195,7 +201,7 @@ struct CUDA2VK }; bool re = true; re &= cmd->begin(video::IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT); - re &= cmd->pipelineBarrier(asset::EPSF_ALL_COMMANDS_BIT, asset::EPSF_ALL_COMMANDS_BIT, asset::EDF_NONE, 0u, nullptr, 1u, &barrier, 0u, nullptr); // Ownership transfer? + // re &= cmd->pipelineBarrier(asset::EPSF_ALL_COMMANDS_BIT, asset::EPSF_ALL_COMMANDS_BIT, asset::EDF_NONE, 0u, nullptr, 1u, &barrier, 0u, nullptr); // Ownership transfer? asset::SBufferCopy region = { .size = importedbuf->getSize() }; re &= cmd->copyBuffer(importedbuf.get(), stagingbuf.get(), 1, ®ion); re &= cmd->end(); @@ -275,10 +281,12 @@ int main(int argc, char** argv) ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd")); ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); - vk2cuda(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data(), kernel, stream); + vk2cuda(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data(), kernel, stream); (new CUDA2VK(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data()))->launchKernel(kernel, stream); + ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); + ASSERT_SUCCESS(cu.pcuModuleUnload(module)); ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream)); return 0; From d7fb50ec91995dc5769d2900e5c5d20f5204124c Mon Sep 17 00:00:00 2001 From: atkurtul Date: Thu, 27 Jul 2023 18:25:08 +0300 Subject: [PATCH 07/17] texture sharing --- 63.CUDAInterop/main.cpp | 205 ++++++++++++++++++++++++---------------- 1 file changed, 122 insertions(+), 83 deletions(-) diff --git a/63.CUDAInterop/main.cpp b/63.CUDAInterop/main.cpp index 89b7be2c3..b249ded89 100644 --- a/63.CUDAInterop/main.cpp +++ b/63.CUDAInterop/main.cpp @@ -39,66 +39,11 @@ if (auto re = expr; NVRTC_SUCCESS != re) { \ abort(); \ } -constexpr uint32_t gridDim[3] = { 4096,1,1 }; -constexpr uint32_t blockDim[3] = { 1024,1,1 }; +constexpr uint32_t gridDim[3] = { 256,1,1 }; +constexpr uint32_t blockDim[3] = { 256,1,1 }; size_t numElements = gridDim[0] * blockDim[0]; size_t size = sizeof(float) * numElements; -void vk2cuda( - core::smart_refctd_ptr cudaHandler, - core::smart_refctd_ptr cudaDevice, - video::IUtilities * util, - video::ILogicalDevice* logicalDevice, - nbl::video::IGPUQueue ** queues, - CUfunction kernel, - CUstream stream, - int=0) -{ - auto& cu = cudaHandler->getCUDAFunctionTable(); - core::smart_refctd_ptr cpubuffers[3] = { core::make_smart_refctd_ptr(size), - core::make_smart_refctd_ptr(size), - core::make_smart_refctd_ptr(size) }; - for (auto j = 0; j < 2; j++) - for (auto i = 0; i < numElements; i++) - reinterpret_cast(cpubuffers[j]->getPointer())[i] = rand(); - - { - auto createBuffer = [&](core::smart_refctd_ptrconst& cpuBuf) { - auto buf = util->createFilledDeviceLocalBufferOnDedMem(queues[CommonAPI::InitOutput::EQT_COMPUTE], - { {.size = size, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT}, - {{.externalHandleTypes = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32}} }, - cpuBuf->getPointer()); - assert(buf.get()); - return buf; - }; - - core::smart_refctd_ptr buf[3] = { - createBuffer(cpubuffers[0]), - createBuffer(cpubuffers[1]), - createBuffer(cpubuffers[2]), - }; - - std::array, 3> mem = {}; - ASSERT_SUCCESS(cudaDevice->importGPUBuffer(&mem[0], buf[0].get())); - ASSERT_SUCCESS(cudaDevice->importGPUBuffer(&mem[1], buf[1].get())); - ASSERT_SUCCESS(cudaDevice->importGPUBuffer(&mem[2], buf[2].get())); - - CUdeviceptr ptrs[] = { mem[0]->getDevicePtr(), mem[1]->getDevicePtr(), mem[2]->getDevicePtr() }; - void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numElements}; - ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr)); - ASSERT_SUCCESS(cu.pcuMemcpyDtoHAsync_v2(cpubuffers[2]->getPointer(), ptrs[2], size, stream)); - ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); - - float* A = reinterpret_cast(cpubuffers[0]->getPointer()); - float* B = reinterpret_cast(cpubuffers[1]->getPointer()); - float* C = reinterpret_cast(cpubuffers[2]->getPointer()); - - for (auto i = 0; i < numElements; i++) - assert(abs(C[i] - A[i] - B[i]) < 0.01f); - } -} - - struct CUDA2VK { core::smart_refctd_ptr cudaHandler; @@ -111,6 +56,7 @@ struct CUDA2VK std::array, 3> mem = {}; core::smart_refctd_ptr cusema; core::smart_refctd_ptr importedbuf, stagingbuf; + core::smart_refctd_ptr importedimg, stagingimg; core::smart_refctd_ptr sema; core::smart_refctd_ptr commandPool; core::smart_refctd_ptr cmd; @@ -138,32 +84,86 @@ struct CUDA2VK for (auto& buf : cpubuffers) buf = core::make_smart_refctd_ptr(size); - for (auto j = 0; j < 2; j++) - for (auto i = 0; i < numElements; i++) - reinterpret_cast(cpubuffers[j]->getPointer())[i] = rand(); + //for (auto j = 0; j < 2; j++) + // for (auto i = 0; i < numElements; i++) + // reinterpret_cast(cpubuffers[j]->getPointer())[i] = rand() / float(RAND_MAX); + + memset(cpubuffers[1]->getPointer(), 0, size); + uint16_t (*ptr)[2] = reinterpret_cast(cpubuffers[0]->getPointer()); + for (auto i = 0; i < gridDim[0]; i++) + for (auto j = 0; j < blockDim[0]; j++) + { + ptr[i * blockDim[0] + j][0] = i; + ptr[i * blockDim[0] + j][1] = j; + } sema = logicalDevice->createSemaphore({ .externalHandleTypes = video::IGPUSemaphore::EHT_OPAQUE_WIN32 }); ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cusema, sema.get())); - ASSERT_SUCCESS(cudaDevice->createExportableMemory(&mem[0], size, sizeof(float))); - ASSERT_SUCCESS(cudaDevice->createExportableMemory(&mem[1], size, sizeof(float))); - ASSERT_SUCCESS(cudaDevice->createExportableMemory(&mem[2], size, sizeof(float))); - importedbuf = cudaDevice->exportGPUBuffer(mem[2].get(), logicalDevice); + ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[0], { size, sizeof(float), CU_MEM_LOCATION_TYPE_DEVICE })); + ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[1], { size, sizeof(float), CU_MEM_LOCATION_TYPE_DEVICE })); + ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[2], { size, sizeof(float), CU_MEM_LOCATION_TYPE_DEVICE })); + + importedbuf = mem[2]->exportAsBuffer(logicalDevice, asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT); + importedimg = mem[2]->exportAsImage(logicalDevice, + asset::IImage::SCreationParams{ + .type = asset::IImage::ET_2D, + .samples = asset::IImage::ESCF_1_BIT, + .format = asset::EF_R32_SFLOAT, + .extent = { gridDim[0], blockDim[0], 1 }, + .mipLevels = 1, + .arrayLayers = 1, + .usage = asset::IImage::EUF_STORAGE_BIT | asset::IImage::EUF_TRANSFER_SRC_BIT, + }); + assert(importedbuf); fence = logicalDevice->createFence(video::IGPUFence::ECF_UNSIGNALED); commandPool = logicalDevice->createCommandPool(queues[CommonAPI::InitOutput::EQT_COMPUTE]->getFamilyIndex(), {}); bool re = logicalDevice->createCommandBuffers(commandPool.get(), video::IGPUCommandBuffer::EL_PRIMARY, 1, &cmd); assert(re); - stagingbuf = logicalDevice->createBuffer({ {.size = importedbuf->getSize(), .usage = asset::IBuffer::EUF_TRANSFER_DST_BIT} }); - auto req = stagingbuf->getMemoryReqs(); - req.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); - auto allocation = logicalDevice->allocate(req, stagingbuf.get()); - assert(allocation.memory && allocation.offset != video::ILogicalDevice::InvalidMemoryOffset); - assert(stagingbuf->getBoundMemory()->isMappable()); - logicalDevice->mapMemory(video::IDeviceMemoryAllocation::MappedMemoryRange(stagingbuf->getBoundMemory(), stagingbuf->getBoundMemoryOffset(), stagingbuf->getSize()), video::IDeviceMemoryAllocation::EMCAF_READ); - assert(stagingbuf->getBoundMemory()->getMappedPointer()); - memset(stagingbuf->getBoundMemory()->getMappedPointer(), 0, stagingbuf->getSize()); + auto createStaging = [logicalDevice=logicalDevice]() + { + auto buf = logicalDevice->createBuffer({ {.size = size, .usage = asset::IBuffer::EUF_TRANSFER_DST_BIT} }); + auto req = buf->getMemoryReqs(); + req.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); + auto allocation = logicalDevice->allocate(req, buf.get()); + assert(allocation.memory && allocation.offset != video::ILogicalDevice::InvalidMemoryOffset); + assert(buf->getBoundMemory()->isMappable()); + logicalDevice->mapMemory(video::IDeviceMemoryAllocation::MappedMemoryRange(buf->getBoundMemory(), buf->getBoundMemoryOffset(), req.size), video::IDeviceMemoryAllocation::EMCAF_READ); + assert(buf->getBoundMemory()->getMappedPointer()); + memset(buf->getBoundMemory()->getMappedPointer(), 0, req.size); + return buf; + }; + + auto createStagingImg = [logicalDevice = logicalDevice]() + { + video::IGPUImage::SCreationParams params = {{ + .type = asset::IImage::ET_2D, + .samples = asset::IImage::ESCF_1_BIT, + .format = asset::EF_R32_SFLOAT, + .extent = { gridDim[0], blockDim[0], 1 }, + .mipLevels = 1, + .arrayLayers = 1, + .usage = asset::IImage::EUF_TRANSFER_DST_BIT + }}; + params.tiling = video::IGPUImage::ET_LINEAR; + + auto img = logicalDevice->createImage(std::move(params)); + auto req = img->getMemoryReqs(); + req.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); + auto allocation = logicalDevice->allocate(req, img.get()); + assert(allocation.memory && allocation.offset != video::ILogicalDevice::InvalidMemoryOffset); + assert(img->getBoundMemory()->isMappable()); + size_t sz = img->getImageDataSizeInBytes(); + logicalDevice->mapMemory(video::IDeviceMemoryAllocation::MappedMemoryRange(img->getBoundMemory(), img->getBoundMemoryOffset(), req.size), video::IDeviceMemoryAllocation::EMCAF_READ); + assert(img->getBoundMemory()->getMappedPointer()); + memset(img->getBoundMemory()->getMappedPointer(), 0, req.size); + return img; + }; + + stagingbuf = createStaging(); + stagingimg = createStagingImg(); } void launchKernel(CUfunction kernel, CUstream stream) @@ -189,21 +189,54 @@ struct CUDA2VK asset::E_PIPELINE_STAGE_FLAGS waitStages[] = { asset::EPSF_ALL_COMMANDS_BIT }; video::IGPUCommandBuffer* cmdBuffers[] = { cmd.get() }; - video::IGPUCommandBuffer::SBufferMemoryBarrier barrier = { - .barrier = { - .dstAccessMask = asset::E_ACCESS_FLAGS::EAF_ALL_ACCESSES_BIT_DEVSH , - }, + video::IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { + .barrier = { .dstAccessMask = asset::E_ACCESS_FLAGS::EAF_ALL_ACCESSES_BIT_DEVSH }, .srcQueueFamilyIndex = VK_QUEUE_FAMILY_EXTERNAL_KHR, .dstQueueFamilyIndex = queue->getFamilyIndex(), .buffer = importedbuf, .offset = 0, .size = VK_WHOLE_SIZE, }; + + video::IGPUCommandBuffer::SImageMemoryBarrier imgBarriers[2] = {{ + .barrier = { .dstAccessMask = asset::E_ACCESS_FLAGS::EAF_ALL_ACCESSES_BIT_DEVSH }, + .newLayout = asset::IImage::EL_TRANSFER_SRC_OPTIMAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_EXTERNAL_KHR, + .dstQueueFamilyIndex = queue->getFamilyIndex(), + .image = importedimg, + .subresourceRange = { + .aspectMask = asset::IImage::EAF_COLOR_BIT, + .levelCount = 1u, + .layerCount = 1u, + }}, { + .barrier = {.dstAccessMask = asset::E_ACCESS_FLAGS::EAF_ALL_ACCESSES_BIT_DEVSH }, + .newLayout = asset::IImage::EL_TRANSFER_DST_OPTIMAL, + .image = stagingimg, + .subresourceRange = { + .aspectMask = asset::IImage::EAF_COLOR_BIT, + .levelCount = 1u, + .layerCount = 1u, + }} + }; + bool re = true; re &= cmd->begin(video::IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT); - // re &= cmd->pipelineBarrier(asset::EPSF_ALL_COMMANDS_BIT, asset::EPSF_ALL_COMMANDS_BIT, asset::EDF_NONE, 0u, nullptr, 1u, &barrier, 0u, nullptr); // Ownership transfer? + /*Acquire?*/ + re &= cmd->pipelineBarrier(asset::EPSF_ALL_COMMANDS_BIT, asset::EPSF_ALL_COMMANDS_BIT, asset::EDF_NONE, 0u, nullptr, 1u, &bufBarrier, 2u, imgBarriers); // Ownership transfer? asset::SBufferCopy region = { .size = importedbuf->getSize() }; re &= cmd->copyBuffer(importedbuf.get(), stagingbuf.get(), 1, ®ion); + asset::IImage::SImageCopy imgRegion = { + .srcSubresource = { + .aspectMask = imgBarriers[0].subresourceRange.aspectMask, + .layerCount = imgBarriers[0].subresourceRange.layerCount, + }, + .dstSubresource = { + .aspectMask = imgBarriers[1].subresourceRange.aspectMask, + .layerCount = imgBarriers[1].subresourceRange.layerCount, + }, + .extent = importedimg->getCreationParameters().extent + }; + re &= cmd->copyImage(importedimg.get(), imgBarriers[0].newLayout, stagingimg.get(), imgBarriers[1].newLayout, 1, &imgRegion); re &= cmd->end(); video::IGPUQueue::SSubmitInfo submitInfo = { @@ -217,7 +250,7 @@ struct CUDA2VK re &= queue->submit(1, &submitInfo, fence.get()); assert(re); } - + ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, [](void* userData) { decltype(this)(userData)->kernelCallback(); }, this)); } @@ -232,9 +265,16 @@ struct CUDA2VK float* A = reinterpret_cast(cpubuffers[0]->getPointer()); float* B = reinterpret_cast(cpubuffers[1]->getPointer()); - float* C = reinterpret_cast(stagingbuf->getBoundMemory()->getMappedPointer()); + float* CBuf = reinterpret_cast(stagingbuf->getBoundMemory()->getMappedPointer()); + float* CImg = reinterpret_cast(stagingimg->getBoundMemory()->getMappedPointer()); + + assert(!memcmp(CBuf, CImg, size)); + for (auto i = 0; i < numElements; i++) - assert(abs(C[i] - A[i] - B[i]) < 0.01f); + { + assert(abs(CBuf[i] - A[i] - B[i]) < 0.01f); + assert(abs(CImg[i] - A[i] - B[i]) < 0.01f); + } std::cout << "Success\n"; @@ -258,6 +298,7 @@ int main(int argc, char** argv) auto& queues = initOutput.queues; auto& logger = initOutput.logger; + assert(physicalDevice->getLimits().externalMemory); auto cudaHandler = video::CCUDAHandler::create(system.get(), core::smart_refctd_ptr(logger)); assert(cudaHandler); @@ -281,8 +322,6 @@ int main(int argc, char** argv) ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd")); ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); - - vk2cuda(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data(), kernel, stream); (new CUDA2VK(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data()))->launchKernel(kernel, stream); ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); From 682a8c1e685afea9d72c2984ddea615e12d250ce Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sun, 6 Aug 2023 23:38:12 +0300 Subject: [PATCH 08/17] update to latest changes --- 63.CUDAInterop/main.cpp | 173 ++++++++++++++++++---------------------- 1 file changed, 76 insertions(+), 97 deletions(-) diff --git a/63.CUDAInterop/main.cpp b/63.CUDAInterop/main.cpp index b249ded89..58a6c0c62 100644 --- a/63.CUDAInterop/main.cpp +++ b/63.CUDAInterop/main.cpp @@ -39,8 +39,8 @@ if (auto re = expr; NVRTC_SUCCESS != re) { \ abort(); \ } -constexpr uint32_t gridDim[3] = { 256,1,1 }; -constexpr uint32_t blockDim[3] = { 256,1,1 }; +constexpr uint32_t gridDim[3] = { 4096,1,1 }; +constexpr uint32_t blockDim[3] = { 1024,1,1 }; size_t numElements = gridDim[0] * blockDim[0]; size_t size = sizeof(float) * numElements; @@ -55,8 +55,9 @@ struct CUDA2VK std::array, 2> cpubuffers; std::array, 3> mem = {}; core::smart_refctd_ptr cusema; - core::smart_refctd_ptr importedbuf, stagingbuf; - core::smart_refctd_ptr importedimg, stagingimg; + + core::smart_refctd_ptr importedbuf, stagingbuf, stagingbuf2; + core::smart_refctd_ptr importedimg; core::smart_refctd_ptr sema; core::smart_refctd_ptr commandPool; core::smart_refctd_ptr cmd; @@ -84,39 +85,46 @@ struct CUDA2VK for (auto& buf : cpubuffers) buf = core::make_smart_refctd_ptr(size); - //for (auto j = 0; j < 2; j++) - // for (auto i = 0; i < numElements; i++) - // reinterpret_cast(cpubuffers[j]->getPointer())[i] = rand() / float(RAND_MAX); - - memset(cpubuffers[1]->getPointer(), 0, size); - uint16_t (*ptr)[2] = reinterpret_cast(cpubuffers[0]->getPointer()); - for (auto i = 0; i < gridDim[0]; i++) - for (auto j = 0; j < blockDim[0]; j++) - { - ptr[i * blockDim[0] + j][0] = i; - ptr[i * blockDim[0] + j][1] = j; - } + for (auto j = 0; j < 2; j++) + for (auto i = 0; i < numElements; i++) + reinterpret_cast(cpubuffers[j]->getPointer())[i] = rand() / float(RAND_MAX); sema = logicalDevice->createSemaphore({ .externalHandleTypes = video::IGPUSemaphore::EHT_OPAQUE_WIN32 }); ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cusema, sema.get())); - ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[0], { size, sizeof(float), CU_MEM_LOCATION_TYPE_DEVICE })); - ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[1], { size, sizeof(float), CU_MEM_LOCATION_TYPE_DEVICE })); - ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[2], { size, sizeof(float), CU_MEM_LOCATION_TYPE_DEVICE })); - - importedbuf = mem[2]->exportAsBuffer(logicalDevice, asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT); - importedimg = mem[2]->exportAsImage(logicalDevice, - asset::IImage::SCreationParams{ - .type = asset::IImage::ET_2D, - .samples = asset::IImage::ESCF_1_BIT, - .format = asset::EF_R32_SFLOAT, - .extent = { gridDim[0], blockDim[0], 1 }, - .mipLevels = 1, - .arrayLayers = 1, - .usage = asset::IImage::EUF_STORAGE_BIT | asset::IImage::EUF_TRANSFER_SRC_BIT, - }); - - assert(importedbuf); + ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + + { + auto devmemory = mem[2]->exportAsMemory(logicalDevice); + assert(devmemory); + video::IGPUBuffer::SCreationParams params = {}; + params.size = devmemory->getAllocationSize(); + params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT; + params.externalHandleTypes = video::CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; + importedbuf = logicalDevice->createBuffer(std::move(params)); + assert(importedbuf); + bool re = logicalDevice->bindBufferMemory(video::ILogicalDevice::SBindBufferMemoryInfo{.buffer = importedbuf.get(), .memory = devmemory.get() }); + assert(re); + } + + { + + video::IGPUImage::SCreationParams params = {}; + params.type = video::IGPUImage::ET_2D; + params.samples = video::IGPUImage::ESCF_1_BIT; + params.format = asset::EF_R32_SFLOAT; + params.extent = { gridDim[0], blockDim[0], 1 }; + params.mipLevels = 1; + params.arrayLayers = 1; + params.usage = video::IGPUImage::EUF_STORAGE_BIT | video::IGPUImage::EUF_TRANSFER_SRC_BIT; + params.externalHandleTypes = video::CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; + params.tiling = video::IGPUImage::ET_LINEAR; + importedimg = mem[2]->exportAsImage(logicalDevice, std::move(params)); + assert(importedimg); + } + fence = logicalDevice->createFence(video::IGPUFence::ECF_UNSIGNALED); commandPool = logicalDevice->createCommandPool(queues[CommonAPI::InitOutput::EQT_COMPUTE]->getFamilyIndex(), {}); bool re = logicalDevice->createCommandBuffers(commandPool.get(), video::IGPUCommandBuffer::EL_PRIMARY, 1, &cmd); @@ -136,34 +144,8 @@ struct CUDA2VK return buf; }; - auto createStagingImg = [logicalDevice = logicalDevice]() - { - video::IGPUImage::SCreationParams params = {{ - .type = asset::IImage::ET_2D, - .samples = asset::IImage::ESCF_1_BIT, - .format = asset::EF_R32_SFLOAT, - .extent = { gridDim[0], blockDim[0], 1 }, - .mipLevels = 1, - .arrayLayers = 1, - .usage = asset::IImage::EUF_TRANSFER_DST_BIT - }}; - params.tiling = video::IGPUImage::ET_LINEAR; - - auto img = logicalDevice->createImage(std::move(params)); - auto req = img->getMemoryReqs(); - req.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); - auto allocation = logicalDevice->allocate(req, img.get()); - assert(allocation.memory && allocation.offset != video::ILogicalDevice::InvalidMemoryOffset); - assert(img->getBoundMemory()->isMappable()); - size_t sz = img->getImageDataSizeInBytes(); - logicalDevice->mapMemory(video::IDeviceMemoryAllocation::MappedMemoryRange(img->getBoundMemory(), img->getBoundMemoryOffset(), req.size), video::IDeviceMemoryAllocation::EMCAF_READ); - assert(img->getBoundMemory()->getMappedPointer()); - memset(img->getBoundMemory()->getMappedPointer(), 0, req.size); - return img; - }; - stagingbuf = createStaging(); - stagingimg = createStagingImg(); + stagingbuf2 = createStaging(); } void launchKernel(CUfunction kernel, CUstream stream) @@ -173,7 +155,11 @@ struct CUDA2VK auto& cu = cudaHandler->getCUDAFunctionTable(); // Launch kernel { - CUdeviceptr ptrs[] = { mem[0]->getDevicePtr(), mem[1]->getDevicePtr(), mem[2]->getDevicePtr() }; + CUdeviceptr ptrs[] = { + mem[0]->getDeviceptr(), + mem[1]->getDeviceptr(), + mem[2]->getDeviceptr(), + }; void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numElements }; ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpubuffers[0]->getPointer(), size, stream)); ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpubuffers[1]->getPointer(), size, stream)); @@ -182,7 +168,7 @@ struct CUDA2VK auto semaphore = cusema->getInternalObject(); ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore } - + // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA { video::IGPUSemaphore* waitSemaphores[] = { sema.get() }; @@ -195,11 +181,19 @@ struct CUDA2VK .dstQueueFamilyIndex = queue->getFamilyIndex(), .buffer = importedbuf, .offset = 0, - .size = VK_WHOLE_SIZE, + .size = size, }; - video::IGPUCommandBuffer::SImageMemoryBarrier imgBarriers[2] = {{ + bool re = true; + re &= cmd->begin(video::IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT); + /*Acquire?*/ + re &= cmd->pipelineBarrier(asset::EPSF_ALL_COMMANDS_BIT, asset::EPSF_ALL_COMMANDS_BIT, asset::EDF_NONE, 0u, nullptr, 1u, &bufBarrier, 0u, nullptr); // Ownership transfer? + asset::SBufferCopy region = { .size = size }; + re &= cmd->copyBuffer(importedbuf.get(), stagingbuf.get(), 1, ®ion); + + video::IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { .barrier = { .dstAccessMask = asset::E_ACCESS_FLAGS::EAF_ALL_ACCESSES_BIT_DEVSH }, + .oldLayout = asset::IImage::EL_PREINITIALIZED, .newLayout = asset::IImage::EL_TRANSFER_SRC_OPTIMAL, .srcQueueFamilyIndex = VK_QUEUE_FAMILY_EXTERNAL_KHR, .dstQueueFamilyIndex = queue->getFamilyIndex(), @@ -208,35 +202,20 @@ struct CUDA2VK .aspectMask = asset::IImage::EAF_COLOR_BIT, .levelCount = 1u, .layerCount = 1u, - }}, { - .barrier = {.dstAccessMask = asset::E_ACCESS_FLAGS::EAF_ALL_ACCESSES_BIT_DEVSH }, - .newLayout = asset::IImage::EL_TRANSFER_DST_OPTIMAL, - .image = stagingimg, - .subresourceRange = { - .aspectMask = asset::IImage::EAF_COLOR_BIT, - .levelCount = 1u, - .layerCount = 1u, - }} + } }; - bool re = true; - re &= cmd->begin(video::IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT); - /*Acquire?*/ - re &= cmd->pipelineBarrier(asset::EPSF_ALL_COMMANDS_BIT, asset::EPSF_ALL_COMMANDS_BIT, asset::EDF_NONE, 0u, nullptr, 1u, &bufBarrier, 2u, imgBarriers); // Ownership transfer? - asset::SBufferCopy region = { .size = importedbuf->getSize() }; - re &= cmd->copyBuffer(importedbuf.get(), stagingbuf.get(), 1, ®ion); - asset::IImage::SImageCopy imgRegion = { - .srcSubresource = { - .aspectMask = imgBarriers[0].subresourceRange.aspectMask, - .layerCount = imgBarriers[0].subresourceRange.layerCount, - }, - .dstSubresource = { - .aspectMask = imgBarriers[1].subresourceRange.aspectMask, - .layerCount = imgBarriers[1].subresourceRange.layerCount, + re &= cmd->pipelineBarrier(asset::EPSF_ALL_COMMANDS_BIT, asset::EPSF_ALL_COMMANDS_BIT, asset::EDF_NONE, 0u, nullptr, 0u, nullptr, 1u, &imgBarrier); + + asset::IImage::SBufferCopy imgRegion = { + .imageSubresource = { + .aspectMask = imgBarrier.subresourceRange.aspectMask, + .layerCount = imgBarrier.subresourceRange.layerCount, }, - .extent = importedimg->getCreationParameters().extent + .imageExtent = importedimg->getCreationParameters().extent, }; - re &= cmd->copyImage(importedimg.get(), imgBarriers[0].newLayout, stagingimg.get(), imgBarriers[1].newLayout, 1, &imgRegion); + + re &= cmd->copyImageToBuffer(importedimg.get(), imgBarrier.newLayout, stagingbuf2.get(), 1, &imgRegion); re &= cmd->end(); video::IGPUQueue::SSubmitInfo submitInfo = { @@ -266,9 +245,9 @@ struct CUDA2VK float* A = reinterpret_cast(cpubuffers[0]->getPointer()); float* B = reinterpret_cast(cpubuffers[1]->getPointer()); float* CBuf = reinterpret_cast(stagingbuf->getBoundMemory()->getMappedPointer()); - float* CImg = reinterpret_cast(stagingimg->getBoundMemory()->getMappedPointer()); + float* CImg = reinterpret_cast(stagingbuf2->getBoundMemory()->getMappedPointer()); - assert(!memcmp(CBuf, CImg, size)); + assert(!memcmp(CBuf, CImg, size)); for (auto i = 0; i < numElements; i++) { @@ -277,8 +256,6 @@ struct CUDA2VK } std::cout << "Success\n"; - - delete this; } }; @@ -297,7 +274,6 @@ int main(int argc, char** argv) auto& utilities = initOutput.utilities; auto& queues = initOutput.queues; auto& logger = initOutput.logger; - assert(physicalDevice->getLimits().externalMemory); auto cudaHandler = video::CCUDAHandler::create(system.get(), core::smart_refctd_ptr(logger)); @@ -322,11 +298,14 @@ int main(int argc, char** argv) ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd")); ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); - (new CUDA2VK(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data()))->launchKernel(kernel, stream); - - ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); + { + auto cuda2vk = CUDA2VK(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data()); + cuda2vk.launchKernel(kernel, stream); + ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); + } ASSERT_SUCCESS(cu.pcuModuleUnload(module)); ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream)); + return 0; } From 6ce21d5c5c8026b6772f3e60e21096ee54353a81 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Thu, 4 Jan 2024 18:41:31 +0300 Subject: [PATCH 09/17] cuda-interop-vk13 --- 63.CUDAInterop/main.cpp | 400 +++++++++++++++++++------------ common/MonoDeviceApplication.hpp | 6 +- 2 files changed, 253 insertions(+), 153 deletions(-) diff --git a/63.CUDAInterop/main.cpp b/63.CUDAInterop/main.cpp index 58a6c0c62..0b30b9227 100644 --- a/63.CUDAInterop/main.cpp +++ b/63.CUDAInterop/main.cpp @@ -9,14 +9,13 @@ #include "nbl/video/CCUDASharedMemory.h" #include "nbl/video/CCUDASharedSemaphore.h" -#include "../common/CommonAPI.h" +#include "../common./MonoSystemMonoLoggerApplication.hpp" -/** -This example just shows a screen which clears to red, -nothing fancy, just to show that Irrlicht links fine -**/ using namespace nbl; - +using namespace core; +using namespace system; +using namespace asset; +using namespace video; /* The start of the main function starts like in most other example. We ask the @@ -44,38 +43,124 @@ constexpr uint32_t blockDim[3] = { 1024,1,1 }; size_t numElements = gridDim[0] * blockDim[0]; size_t size = sizeof(float) * numElements; -struct CUDA2VK +#ifndef _NBL_COMPILE_WITH_CUDA_ +static_assert(false); +#endif + +class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication { - core::smart_refctd_ptr cudaHandler; - core::smart_refctd_ptr cudaDevice; - video::IUtilities* util; - video::ILogicalDevice* logicalDevice; - nbl::video::IGPUQueue** queues; - - std::array, 2> cpubuffers; - std::array, 3> mem = {}; - core::smart_refctd_ptr cusema; - - core::smart_refctd_ptr importedbuf, stagingbuf, stagingbuf2; - core::smart_refctd_ptr importedimg; - core::smart_refctd_ptr sema; - core::smart_refctd_ptr commandPool; - core::smart_refctd_ptr cmd; - core::smart_refctd_ptr fence; - - CUDA2VK( - core::smart_refctd_ptr _cudaHandler, - core::smart_refctd_ptr _cudaDevice, - video::IUtilities* _util, - video::ILogicalDevice* _logicalDevice, - video::IGPUQueue** _queues) - : cudaHandler(std::move(_cudaHandler)) - , cudaDevice(std::move(_cudaDevice)) - , util(_util) - , logicalDevice(_logicalDevice) - , queues(_queues) + using base_t = examples::MonoSystemMonoLoggerApplication; +public: + // Generally speaking because certain platforms delay initialization from main object construction you should just forward and not do anything in the ctor + using base_t::base_t; + + smart_refctd_ptr cudaHandler; + smart_refctd_ptr cudaDevice; + // IUtilities* util; + smart_refctd_ptr logicalDevice; + IQueue* queue; + + std::array, 2> cpubuffers; + std::array, 3> mem = {}; + smart_refctd_ptr cusema; + + smart_refctd_ptr importedbuf, stagingbuf, stagingbuf2; + smart_refctd_ptr importedimg; + smart_refctd_ptr sema; + smart_refctd_ptr commandPool; + smart_refctd_ptr cmd; + + + bool onAppInitialized(smart_refctd_ptr&& system) override { + // Remember to call the base class initialization! + if (!base_t::onAppInitialized(std::move(system))) + return false; + // `system` could have been null (see the comments in `MonoSystemMonoLoggerApplication::onAppInitialized` as for why) + // use `MonoSystemMonoLoggerApplication::m_system` throughout the example instead! + + // You should already know Vulkan and come here to save on the boilerplate, if you don't know what instances and instance extensions are, then find out. + smart_refctd_ptr api; + { + // You generally want to default initialize any parameter structs + IAPIConnection::SFeatures apiFeaturesToEnable = {}; + // generally you want to make your life easier during development + apiFeaturesToEnable.validations = true; + apiFeaturesToEnable.synchronizationValidation = true; + // want to make sure we have this so we can name resources for vieweing in RenderDoc captures + apiFeaturesToEnable.debugUtils = true; + // create our Vulkan instance + if (!(api = CVulkanConnection::create(smart_refctd_ptr(m_system), 0, _NBL_APP_NAME_, smart_refctd_ptr(base_t::m_logger), apiFeaturesToEnable))) + return logFail("Failed to crate an IAPIConnection!"); + } + + // We won't go deep into performing physical device selection in this example, we'll take any device with a compute queue. + // Nabla has its own set of required baseline Vulkan features anyway, it won't report any device that doesn't meet them. + IPhysicalDevice* physDev = nullptr; + ILogicalDevice::SCreationParams params = {}; + // we will only deal with a single queue in this example + params.queueParamsCount = 1; + params.queueParams[0].count = 1; + params.featuresToEnable; + for (auto physDevIt = api->getPhysicalDevices().begin(); physDevIt != api->getPhysicalDevices().end(); physDevIt++) + { + const auto familyProps = (*physDevIt)->getQueueFamilyProperties(); + // this is the only "complicated" part, we want to create a queue that supports compute pipelines + for (auto i = 0; i < familyProps.size(); i++) + if (familyProps[i].queueFlags.hasFlags(IQueue::FAMILY_FLAGS::COMPUTE_BIT)) + { + physDev = *physDevIt; + params.queueParams[0].familyIndex = i; + break; + } + } + if (!physDev) + return logFail("Failed to find any Physical Devices with Compute capable Queue Families!"); + + { + auto& limits = physDev->getLimits(); + if (!limits.externalMemoryWin32 || !limits.externalFenceWin32 || !limits.externalSemaphoreWin32) + return logFail("Physical device does not support the required extensions"); + + cudaHandler = CCUDAHandler::create(system.get(), smart_refctd_ptr(m_logger)); + assert(cudaHandler); + cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast(api), physDev); + } + + // logical devices need to be created form physical devices which will actually let us create vulkan objects and use the physical device + logicalDevice = physDev->createLogicalDevice(std::move(params)); + if (!logicalDevice) + return logFail("Failed to create a Logical Device!"); + + queue = logicalDevice->getQueue(params.queueParams[0].familyIndex, 0); + createResources(); + + smart_refctd_ptr ptx; + { + ISystem::future_t> fut; + m_system->createFile(fut, "../vectorAdd_kernel.cu", IFileBase::ECF_READ); + auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(fut.copy().get(), cudaDevice->geDefaultCompileOptions()); + ASSERT_SUCCESS_NV(res); + ptx = std::move(ptx_); + } + CUmodule module; + CUfunction kernel; + CUstream stream; + + auto& cu = cudaHandler->getCUDAFunctionTable(); + + ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr)); + ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd")); + ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); + + launchKernel(kernel, stream); + + ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); + ASSERT_SUCCESS(cu.pcuModuleUnload(module)); + ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream)); + + return true; } void createResources() @@ -83,51 +168,52 @@ struct CUDA2VK auto& cu = cudaHandler->getCUDAFunctionTable(); for (auto& buf : cpubuffers) - buf = core::make_smart_refctd_ptr(size); + buf = make_smart_refctd_ptr(size); for (auto j = 0; j < 2; j++) for (auto i = 0; i < numElements; i++) reinterpret_cast(cpubuffers[j]->getPointer())[i] = rand() / float(RAND_MAX); - sema = logicalDevice->createSemaphore({ .externalHandleTypes = video::IGPUSemaphore::EHT_OPAQUE_WIN32 }); - ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cusema, sema.get())); + ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + sema = logicalDevice->createSemaphore({ .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 }); + ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cusema, sema.get())); { - auto devmemory = mem[2]->exportAsMemory(logicalDevice); + auto devmemory = mem[2]->exportAsMemory(logicalDevice.get()); assert(devmemory); - video::IGPUBuffer::SCreationParams params = {}; + IGPUBuffer::SCreationParams params = {}; params.size = devmemory->getAllocationSize(); params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT; - params.externalHandleTypes = video::CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; + params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; importedbuf = logicalDevice->createBuffer(std::move(params)); assert(importedbuf); - bool re = logicalDevice->bindBufferMemory(video::ILogicalDevice::SBindBufferMemoryInfo{.buffer = importedbuf.get(), .memory = devmemory.get() }); + ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedbuf.get(), .binding = {.memory = devmemory.get() } }; + bool re = logicalDevice->bindBufferMemory(1, &bindInfo); assert(re); } { - video::IGPUImage::SCreationParams params = {}; - params.type = video::IGPUImage::ET_2D; - params.samples = video::IGPUImage::ESCF_1_BIT; - params.format = asset::EF_R32_SFLOAT; + IGPUImage::SCreationParams params = {}; + params.type = IGPUImage::ET_2D; + params.samples = IGPUImage::ESCF_1_BIT; + params.format = EF_R32_SFLOAT; params.extent = { gridDim[0], blockDim[0], 1 }; params.mipLevels = 1; params.arrayLayers = 1; - params.usage = video::IGPUImage::EUF_STORAGE_BIT | video::IGPUImage::EUF_TRANSFER_SRC_BIT; - params.externalHandleTypes = video::CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; - params.tiling = video::IGPUImage::ET_LINEAR; - importedimg = mem[2]->exportAsImage(logicalDevice, std::move(params)); + params.usage = IGPUImage::EUF_STORAGE_BIT | IGPUImage::EUF_TRANSFER_SRC_BIT; + params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; + params.tiling = IGPUImage::TILING::LINEAR; + importedimg = mem[2]->exportAsImage(logicalDevice.get(), std::move(params)); assert(importedimg); } - fence = logicalDevice->createFence(video::IGPUFence::ECF_UNSIGNALED); - commandPool = logicalDevice->createCommandPool(queues[CommonAPI::InitOutput::EQT_COMPUTE]->getFamilyIndex(), {}); - bool re = logicalDevice->createCommandBuffers(commandPool.get(), video::IGPUCommandBuffer::EL_PRIMARY, 1, &cmd); + commandPool = logicalDevice->createCommandPool(queue->getFamilyIndex(), {}); + bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd, smart_refctd_ptr(m_logger)); assert(re); auto createStaging = [logicalDevice=logicalDevice]() @@ -136,11 +222,11 @@ struct CUDA2VK auto req = buf->getMemoryReqs(); req.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); auto allocation = logicalDevice->allocate(req, buf.get()); - assert(allocation.memory && allocation.offset != video::ILogicalDevice::InvalidMemoryOffset); - assert(buf->getBoundMemory()->isMappable()); - logicalDevice->mapMemory(video::IDeviceMemoryAllocation::MappedMemoryRange(buf->getBoundMemory(), buf->getBoundMemoryOffset(), req.size), video::IDeviceMemoryAllocation::EMCAF_READ); - assert(buf->getBoundMemory()->getMappedPointer()); - memset(buf->getBoundMemory()->getMappedPointer(), 0, req.size); + assert(allocation.isValid() && buf->getBoundMemory().memory->isMappable()); + + bool re = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ); + assert(re && allocation.memory->getMappedPointer()); + memset(allocation.memory->getMappedPointer(), 0, req.size); return buf; }; @@ -150,8 +236,6 @@ struct CUDA2VK void launchKernel(CUfunction kernel, CUstream stream) { - auto queue = queues[CommonAPI::InitOutput::EQT_COMPUTE]; - auto& cu = cudaHandler->getCUDAFunctionTable(); // Launch kernel { @@ -164,50 +248,60 @@ struct CUDA2VK ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpubuffers[0]->getPointer(), size, stream)); ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpubuffers[1]->getPointer(), size, stream)); ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr)); - CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = {}; + CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 1 } } }; auto semaphore = cusema->getInternalObject(); ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore + + std::string abc = "123"; } // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA { - video::IGPUSemaphore* waitSemaphores[] = { sema.get() }; - asset::E_PIPELINE_STAGE_FLAGS waitStages[] = { asset::EPSF_ALL_COMMANDS_BIT }; - video::IGPUCommandBuffer* cmdBuffers[] = { cmd.get() }; - - video::IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { - .barrier = { .dstAccessMask = asset::E_ACCESS_FLAGS::EAF_ALL_ACCESSES_BIT_DEVSH }, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_EXTERNAL_KHR, - .dstQueueFamilyIndex = queue->getFamilyIndex(), - .buffer = importedbuf, - .offset = 0, - .size = size, + IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { + .barrier = { + .dep = { + // .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, + // .srcAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, + .dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS, + }, + .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, + .otherQueueFamilyIndex = queue->getFamilyIndex(), + }, + .range = { .buffer = importedbuf, }, }; bool re = true; - re &= cmd->begin(video::IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT); - /*Acquire?*/ - re &= cmd->pipelineBarrier(asset::EPSF_ALL_COMMANDS_BIT, asset::EPSF_ALL_COMMANDS_BIT, asset::EDF_NONE, 0u, nullptr, 1u, &bufBarrier, 0u, nullptr); // Ownership transfer? - asset::SBufferCopy region = { .size = size }; + re &= cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + re &= cmd->pipelineBarrier(EDF_NONE, { .bufBarrierCount = 1, .bufBarriers = &bufBarrier}); + + IGPUCommandBuffer::SBufferCopy region = { .size = size }; re &= cmd->copyBuffer(importedbuf.get(), stagingbuf.get(), 1, ®ion); - video::IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { - .barrier = { .dstAccessMask = asset::E_ACCESS_FLAGS::EAF_ALL_ACCESSES_BIT_DEVSH }, - .oldLayout = asset::IImage::EL_PREINITIALIZED, - .newLayout = asset::IImage::EL_TRANSFER_SRC_OPTIMAL, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_EXTERNAL_KHR, - .dstQueueFamilyIndex = queue->getFamilyIndex(), - .image = importedimg, + IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { + .barrier = { + .dep = { + // .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, + // .srcAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, + .dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS, + }, + .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, + .otherQueueFamilyIndex = queue->getFamilyIndex(), + }, + .image = importedimg.get(), .subresourceRange = { - .aspectMask = asset::IImage::EAF_COLOR_BIT, + .aspectMask = IImage::EAF_COLOR_BIT, .levelCount = 1u, .layerCount = 1u, - } + }, + .oldLayout = IImage::LAYOUT::PREINITIALIZED, + .newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL, }; - re &= cmd->pipelineBarrier(asset::EPSF_ALL_COMMANDS_BIT, asset::EPSF_ALL_COMMANDS_BIT, asset::EDF_NONE, 0u, nullptr, 0u, nullptr, 1u, &imgBarrier); + re &= cmd->pipelineBarrier(EDF_NONE, {.imgBarrierCount = 1, .imgBarriers = &imgBarrier }); - asset::IImage::SBufferCopy imgRegion = { + IImage::SBufferCopy imgRegion = { .imageSubresource = { .aspectMask = imgBarrier.subresourceRange.aspectMask, .layerCount = imgBarrier.subresourceRange.layerCount, @@ -217,16 +311,14 @@ struct CUDA2VK re &= cmd->copyImageToBuffer(importedimg.get(), imgBarrier.newLayout, stagingbuf2.get(), 1, &imgRegion); re &= cmd->end(); - - video::IGPUQueue::SSubmitInfo submitInfo = { - .waitSemaphoreCount = 1, - .pWaitSemaphores = waitSemaphores, - .pWaitDstStageMask = waitStages, - .commandBufferCount = 1, - .commandBuffers = cmdBuffers + + IQueue::SSubmitInfo submitInfo = { + .waitSemaphores = std::array{IQueue::SSubmitInfo::SSemaphoreInfo{ .semaphore = sema.get(), .value = 0, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, }}, + .commandBuffers = std::array{IQueue::SSubmitInfo::SCommandBufferInfo{cmd.get()}}, }; - re &= queue->submit(1, &submitInfo, fence.get()); + auto submitRe = queue->submit(std::array{submitInfo});; + re &= IQueue::RESULT::SUCCESS == submitRe; assert(re); } @@ -237,15 +329,15 @@ struct CUDA2VK { // Make sure we are also done with the readback { - video::IGPUFence* fences[] = { fence.get() }; - auto status = logicalDevice->waitForFences(1, fences, true, -1); - assert(video::IGPUFence::ES_SUCCESS == status); + //IGPUFence* fences[] = { fence.get() }; + //auto status = logicalDevice->waitForFences(1, fences, true, -1); + //assert(IGPUFence::ES_SUCCESS == status); } float* A = reinterpret_cast(cpubuffers[0]->getPointer()); float* B = reinterpret_cast(cpubuffers[1]->getPointer()); - float* CBuf = reinterpret_cast(stagingbuf->getBoundMemory()->getMappedPointer()); - float* CImg = reinterpret_cast(stagingbuf2->getBoundMemory()->getMappedPointer()); + float* CBuf = reinterpret_cast(stagingbuf->getBoundMemory().memory->getMappedPointer()); + float* CImg = reinterpret_cast(stagingbuf2->getBoundMemory().memory->getMappedPointer()); assert(!memcmp(CBuf, CImg, size)); @@ -257,55 +349,63 @@ struct CUDA2VK std::cout << "Success\n"; } -}; - -int main(int argc, char** argv) -{ - auto initOutput = CommonAPI::InitWithDefaultExt(CommonAPI::InitParams{ - .appName = { "63.CUDAInterop" }, - .apiType = video::EAT_VULKAN, - .swapchainImageUsage = nbl::asset::IImage::EUF_NONE, - }); - - auto& system = initOutput.system; - auto& apiConnection = initOutput.apiConnection; - auto& physicalDevice = initOutput.physicalDevice; - auto& logicalDevice = initOutput.logicalDevice; - auto& utilities = initOutput.utilities; - auto& queues = initOutput.queues; - auto& logger = initOutput.logger; - - assert(physicalDevice->getLimits().externalMemory); - auto cudaHandler = video::CCUDAHandler::create(system.get(), core::smart_refctd_ptr(logger)); - assert(cudaHandler); - auto cudaDevice = cudaHandler->createDevice(core::smart_refctd_ptr_dynamic_cast(apiConnection), physicalDevice); - auto& cu = cudaHandler->getCUDAFunctionTable(); - - core::smart_refctd_ptr ptx; - CUmodule module; - CUfunction kernel; - CUstream stream; - - { - system::ISystem::future_t> fut; - system->createFile(fut, "../vectorAdd_kernel.cu", system::IFileBase::ECF_READ); - auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(fut.copy().get(), cudaDevice->geDefaultCompileOptions()); - ASSERT_SUCCESS_NV(res); - ptx = std::move(ptx_); - } - ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr)); - ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd")); - ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); + // Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization. + bool keepRunning() override { return false; } - { - auto cuda2vk = CUDA2VK(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data()); - cuda2vk.launchKernel(kernel, stream); - ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); - } - - ASSERT_SUCCESS(cu.pcuModuleUnload(module)); - ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream)); - - return 0; -} + // Platforms like WASM expect the main entry point to periodically return control, hence if you want a crossplatform app, you have to let the framework deal with your "game loop" + void workLoopBody() override {} +}; +// +//int main(int argc, char** argv) +//{ +// auto initOutput = CommonAPI::InitWithDefaultExt(CommonAPI::InitParams{ +// .appName = { "63.CUDAInterop" }, +// .apiType = EAT_VULKAN, +// .swapchainImageUsage = IImage::EUF_NONE, +// }); +// +// auto& system = initOutput.system; +// auto& apiConnection = initOutput.apiConnection; +// auto& physicalDevice = initOutput.physicalDevice; +// auto& logicalDevice = initOutput.logicalDevice; +// auto& utilities = initOutput.utilities; +// auto& queues = initOutput.queues; +// auto& logger = initOutput.logger; +// +// assert(physicalDevice->getLimits().externalMemory); +// auto cudaHandler = CCUDAHandler::create(system.get(), smart_refctd_ptr(logger)); +// assert(cudaHandler); +// auto cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast(apiConnection), physicalDevice); +// auto& cu = cudaHandler->getCUDAFunctionTable(); +// +// smart_refctd_ptr ptx; +// CUmodule module; +// CUfunction kernel; +// CUstream stream; +// +// { +// ISystem::future_t> fut; +// system->createFile(fut, "../vectorAdd_kernel.cu", IFileBase::ECF_READ); +// /* auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(fut.copy().get(), cudaDevice->geDefaultCompileOptions()); +// ASSERT_SUCCESS_NV(res); +// ptx = std::move(ptx_);*/ +// } +// +// //ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr)); +// //ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd")); +// //ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); +// +// //{ +// // auto cuda2vk = CUDA2VK(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data()); +// // cuda2vk.launchKernel(kernel, stream); +// // ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); +// //} +// +// //ASSERT_SUCCESS(cu.pcuModuleUnload(module)); +// //ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream)); +// +// return 0; +//} + +NBL_MAIN_FUNC(CUDA2VKApp) \ No newline at end of file diff --git a/common/MonoDeviceApplication.hpp b/common/MonoDeviceApplication.hpp index 65fadad4b..9526bb101 100644 --- a/common/MonoDeviceApplication.hpp +++ b/common/MonoDeviceApplication.hpp @@ -266,7 +266,7 @@ class MonoDeviceApplication : public virtual MonoSystemMonoLoggerApplication } // This will most certainly be overriden - using queue_family_range_t = core::SRange; + using queue_family_range_t = std::span; virtual core::vector getQueueCreationParameters(const queue_family_range_t& familyProperties) { using namespace video; @@ -282,12 +282,12 @@ class MonoDeviceApplication : public virtual MonoSystemMonoLoggerApplication } // virtual to allow aliasing and total flexibility - virtual video::IGPUQueue* getComputeQueue() const + virtual video::IQueue* getComputeQueue() const { // In the default implementation of everything I asked only for one queue from first compute family const auto familyProperties = m_device->getPhysicalDevice()->getQueueFamilyProperties(); for (auto i=0u; igetQueue(i,0); return nullptr; From 415902575143a28cba08d677c73f1e917f3367cc Mon Sep 17 00:00:00 2001 From: atkurtul Date: Fri, 5 Jan 2024 01:01:21 +0300 Subject: [PATCH 10/17] fix sync --- 63.CUDAInterop/main.cpp | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/63.CUDAInterop/main.cpp b/63.CUDAInterop/main.cpp index 0b30b9227..11fae0d70 100644 --- a/63.CUDAInterop/main.cpp +++ b/63.CUDAInterop/main.cpp @@ -251,8 +251,6 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 1 } } }; auto semaphore = cusema->getInternalObject(); ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore - - std::string abc = "123"; } // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA @@ -311,13 +309,16 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication re &= cmd->copyImageToBuffer(importedimg.get(), imgBarrier.newLayout, stagingbuf2.get(), 1, &imgRegion); re &= cmd->end(); - - IQueue::SSubmitInfo submitInfo = { - .waitSemaphores = std::array{IQueue::SSubmitInfo::SSemaphoreInfo{ .semaphore = sema.get(), .value = 0, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, }}, - .commandBuffers = std::array{IQueue::SSubmitInfo::SCommandBufferInfo{cmd.get()}}, - }; - - auto submitRe = queue->submit(std::array{submitInfo});; + + auto waitSemaphores = std::array{IQueue::SSubmitInfo::SSemaphoreInfo{.semaphore = sema.get(), .value = 1, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, }}; + auto signalSemaphores = std::array{IQueue::SSubmitInfo::SSemaphoreInfo{.semaphore = sema.get(), .value = 2, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, }}; + auto commandBuffers = std::array{IQueue::SSubmitInfo::SCommandBufferInfo{cmd.get()}}; + auto submitInfo = std::array{IQueue::SSubmitInfo { + .waitSemaphores = waitSemaphores, + .commandBuffers = commandBuffers, + .signalSemaphores = signalSemaphores, + }}; + auto submitRe = queue->submit(submitInfo); re &= IQueue::RESULT::SUCCESS == submitRe; assert(re); } @@ -328,11 +329,8 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication void kernelCallback() { // Make sure we are also done with the readback - { - //IGPUFence* fences[] = { fence.get() }; - //auto status = logicalDevice->waitForFences(1, fences, true, -1); - //assert(IGPUFence::ES_SUCCESS == status); - } + auto wait = std::array{ILogicalDevice::SSemaphoreWaitInfo{.semaphore = sema.get(), .value = 2}}; + logicalDevice->waitForSemaphores(wait, true, -1); float* A = reinterpret_cast(cpubuffers[0]->getPointer()); float* B = reinterpret_cast(cpubuffers[1]->getPointer()); @@ -343,8 +341,8 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication for (auto i = 0; i < numElements; i++) { - assert(abs(CBuf[i] - A[i] - B[i]) < 0.01f); - assert(abs(CImg[i] - A[i] - B[i]) < 0.01f); + bool re = (abs(CBuf[i] - A[i] - B[i]) < 0.01f) && (abs(CImg[i] - A[i] - B[i]) < 0.01f); + assert(re); } std::cout << "Success\n"; From 386608b0afc0010d4be9e66b00b684fd7ce57dfa Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sat, 13 Jan 2024 22:21:50 +0300 Subject: [PATCH 11/17] rename 63 example folder --- {63.CUDAInterop => 63_CUDAInterop}/CMakeLists.txt | 0 {63.CUDAInterop => 63_CUDAInterop}/main.cpp | 2 +- {63.CUDAInterop => 63_CUDAInterop}/pipeline.groovy | 0 {63.CUDAInterop => 63_CUDAInterop}/vectorAdd_kernel.cu | 0 4 files changed, 1 insertion(+), 1 deletion(-) rename {63.CUDAInterop => 63_CUDAInterop}/CMakeLists.txt (100%) rename {63.CUDAInterop => 63_CUDAInterop}/main.cpp (99%) rename {63.CUDAInterop => 63_CUDAInterop}/pipeline.groovy (100%) rename {63.CUDAInterop => 63_CUDAInterop}/vectorAdd_kernel.cu (100%) diff --git a/63.CUDAInterop/CMakeLists.txt b/63_CUDAInterop/CMakeLists.txt similarity index 100% rename from 63.CUDAInterop/CMakeLists.txt rename to 63_CUDAInterop/CMakeLists.txt diff --git a/63.CUDAInterop/main.cpp b/63_CUDAInterop/main.cpp similarity index 99% rename from 63.CUDAInterop/main.cpp rename to 63_CUDAInterop/main.cpp index 11fae0d70..8fa618e02 100644 --- a/63.CUDAInterop/main.cpp +++ b/63_CUDAInterop/main.cpp @@ -329,7 +329,7 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication void kernelCallback() { // Make sure we are also done with the readback - auto wait = std::array{ILogicalDevice::SSemaphoreWaitInfo{.semaphore = sema.get(), .value = 2}}; + auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = sema.get(), .value = 2}}; logicalDevice->waitForSemaphores(wait, true, -1); float* A = reinterpret_cast(cpubuffers[0]->getPointer()); diff --git a/63.CUDAInterop/pipeline.groovy b/63_CUDAInterop/pipeline.groovy similarity index 100% rename from 63.CUDAInterop/pipeline.groovy rename to 63_CUDAInterop/pipeline.groovy diff --git a/63.CUDAInterop/vectorAdd_kernel.cu b/63_CUDAInterop/vectorAdd_kernel.cu similarity index 100% rename from 63.CUDAInterop/vectorAdd_kernel.cu rename to 63_CUDAInterop/vectorAdd_kernel.cu From f9a2d1be9fef023171aabbfc35776ce294c36bbf Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sun, 14 Jan 2024 03:25:25 +0300 Subject: [PATCH 12/17] derive from monodevice --- 63_CUDAInterop/main.cpp | 75 +++++++++-------------------------------- 1 file changed, 15 insertions(+), 60 deletions(-) diff --git a/63_CUDAInterop/main.cpp b/63_CUDAInterop/main.cpp index 8fa618e02..9060907d9 100644 --- a/63_CUDAInterop/main.cpp +++ b/63_CUDAInterop/main.cpp @@ -9,7 +9,7 @@ #include "nbl/video/CCUDASharedMemory.h" #include "nbl/video/CCUDASharedSemaphore.h" -#include "../common./MonoSystemMonoLoggerApplication.hpp" +#include "../common/MonoDeviceApplication.hpp" using namespace nbl; using namespace core; @@ -47,17 +47,16 @@ size_t size = sizeof(float) * numElements; static_assert(false); #endif -class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication +class CUDA2VKApp : public examples::MonoDeviceApplication { - using base_t = examples::MonoSystemMonoLoggerApplication; + using base_t = examples::MonoDeviceApplication; public: // Generally speaking because certain platforms delay initialization from main object construction you should just forward and not do anything in the ctor using base_t::base_t; smart_refctd_ptr cudaHandler; smart_refctd_ptr cudaDevice; - // IUtilities* util; - smart_refctd_ptr logicalDevice; + IQueue* queue; std::array, 2> cpubuffers; @@ -76,63 +75,19 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication // Remember to call the base class initialization! if (!base_t::onAppInitialized(std::move(system))) return false; - // `system` could have been null (see the comments in `MonoSystemMonoLoggerApplication::onAppInitialized` as for why) - // use `MonoSystemMonoLoggerApplication::m_system` throughout the example instead! - - // You should already know Vulkan and come here to save on the boilerplate, if you don't know what instances and instance extensions are, then find out. - smart_refctd_ptr api; - { - // You generally want to default initialize any parameter structs - IAPIConnection::SFeatures apiFeaturesToEnable = {}; - // generally you want to make your life easier during development - apiFeaturesToEnable.validations = true; - apiFeaturesToEnable.synchronizationValidation = true; - // want to make sure we have this so we can name resources for vieweing in RenderDoc captures - apiFeaturesToEnable.debugUtils = true; - // create our Vulkan instance - if (!(api = CVulkanConnection::create(smart_refctd_ptr(m_system), 0, _NBL_APP_NAME_, smart_refctd_ptr(base_t::m_logger), apiFeaturesToEnable))) - return logFail("Failed to crate an IAPIConnection!"); - } - - // We won't go deep into performing physical device selection in this example, we'll take any device with a compute queue. - // Nabla has its own set of required baseline Vulkan features anyway, it won't report any device that doesn't meet them. - IPhysicalDevice* physDev = nullptr; - ILogicalDevice::SCreationParams params = {}; - // we will only deal with a single queue in this example - params.queueParamsCount = 1; - params.queueParams[0].count = 1; - params.featuresToEnable; - for (auto physDevIt = api->getPhysicalDevices().begin(); physDevIt != api->getPhysicalDevices().end(); physDevIt++) - { - const auto familyProps = (*physDevIt)->getQueueFamilyProperties(); - // this is the only "complicated" part, we want to create a queue that supports compute pipelines - for (auto i = 0; i < familyProps.size(); i++) - if (familyProps[i].queueFlags.hasFlags(IQueue::FAMILY_FLAGS::COMPUTE_BIT)) - { - physDev = *physDevIt; - params.queueParams[0].familyIndex = i; - break; - } - } - if (!physDev) - return logFail("Failed to find any Physical Devices with Compute capable Queue Families!"); { - auto& limits = physDev->getLimits(); + auto& limits = m_physicalDevice->getLimits(); if (!limits.externalMemoryWin32 || !limits.externalFenceWin32 || !limits.externalSemaphoreWin32) return logFail("Physical device does not support the required extensions"); cudaHandler = CCUDAHandler::create(system.get(), smart_refctd_ptr(m_logger)); assert(cudaHandler); - cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast(api), physDev); + cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast(m_api), m_physicalDevice); } - // logical devices need to be created form physical devices which will actually let us create vulkan objects and use the physical device - logicalDevice = physDev->createLogicalDevice(std::move(params)); - if (!logicalDevice) - return logFail("Failed to create a Logical Device!"); - queue = logicalDevice->getQueue(params.queueParams[0].familyIndex, 0); + queue = base_t::getComputeQueue(); createResources(); @@ -180,19 +135,19 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); - sema = logicalDevice->createSemaphore({ .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 }); + sema = m_device->createSemaphore({ .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 }); ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cusema, sema.get())); { - auto devmemory = mem[2]->exportAsMemory(logicalDevice.get()); + auto devmemory = mem[2]->exportAsMemory(m_device.get()); assert(devmemory); IGPUBuffer::SCreationParams params = {}; params.size = devmemory->getAllocationSize(); params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT; params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; - importedbuf = logicalDevice->createBuffer(std::move(params)); + importedbuf = m_device->createBuffer(std::move(params)); assert(importedbuf); ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedbuf.get(), .binding = {.memory = devmemory.get() } }; - bool re = logicalDevice->bindBufferMemory(1, &bindInfo); + bool re = m_device->bindBufferMemory(1, &bindInfo); assert(re); } @@ -208,15 +163,15 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication params.usage = IGPUImage::EUF_STORAGE_BIT | IGPUImage::EUF_TRANSFER_SRC_BIT; params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; params.tiling = IGPUImage::TILING::LINEAR; - importedimg = mem[2]->exportAsImage(logicalDevice.get(), std::move(params)); + importedimg = mem[2]->exportAsImage(m_device.get(), std::move(params)); assert(importedimg); } - commandPool = logicalDevice->createCommandPool(queue->getFamilyIndex(), {}); + commandPool = m_device->createCommandPool(queue->getFamilyIndex(), {}); bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd, smart_refctd_ptr(m_logger)); assert(re); - auto createStaging = [logicalDevice=logicalDevice]() + auto createStaging = [logicalDevice= m_device]() { auto buf = logicalDevice->createBuffer({ {.size = size, .usage = asset::IBuffer::EUF_TRANSFER_DST_BIT} }); auto req = buf->getMemoryReqs(); @@ -330,7 +285,7 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication { // Make sure we are also done with the readback auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = sema.get(), .value = 2}}; - logicalDevice->waitForSemaphores(wait, true, -1); + m_device->waitForSemaphores(wait, true, -1); float* A = reinterpret_cast(cpubuffers[0]->getPointer()); float* B = reinterpret_cast(cpubuffers[1]->getPointer()); From 138356a4a5e277859c9d156967c0187e45ca8e49 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sun, 14 Jan 2024 04:39:31 +0300 Subject: [PATCH 13/17] rewrite stuff --- 63_CUDAInterop/main.cpp | 68 +++++------------------------------------ 1 file changed, 7 insertions(+), 61 deletions(-) diff --git a/63_CUDAInterop/main.cpp b/63_CUDAInterop/main.cpp index 9060907d9..a14aac0af 100644 --- a/63_CUDAInterop/main.cpp +++ b/63_CUDAInterop/main.cpp @@ -115,6 +115,7 @@ class CUDA2VKApp : public examples::MonoDeviceApplication ASSERT_SUCCESS(cu.pcuModuleUnload(module)); ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream)); + m_device->waitIdle(); return true; } @@ -160,10 +161,10 @@ class CUDA2VKApp : public examples::MonoDeviceApplication params.extent = { gridDim[0], blockDim[0], 1 }; params.mipLevels = 1; params.arrayLayers = 1; - params.usage = IGPUImage::EUF_STORAGE_BIT | IGPUImage::EUF_TRANSFER_SRC_BIT; + params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT; params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; params.tiling = IGPUImage::TILING::LINEAR; - importedimg = mem[2]->exportAsImage(m_device.get(), std::move(params)); + importedimg = mem[2]->createAndBindImage(m_device.get(), std::move(params)); assert(importedimg); } @@ -213,8 +214,6 @@ class CUDA2VKApp : public examples::MonoDeviceApplication IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { .barrier = { .dep = { - // .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, - // .srcAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS, .dstStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, .dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS, }, @@ -226,7 +225,7 @@ class CUDA2VKApp : public examples::MonoDeviceApplication bool re = true; re &= cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - re &= cmd->pipelineBarrier(EDF_NONE, { .bufBarrierCount = 1, .bufBarriers = &bufBarrier}); + re &= cmd->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1} }); IGPUCommandBuffer::SBufferCopy region = { .size = size }; re &= cmd->copyBuffer(importedbuf.get(), stagingbuf.get(), 1, ®ion); @@ -234,8 +233,6 @@ class CUDA2VKApp : public examples::MonoDeviceApplication IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { .barrier = { .dep = { - // .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, - // .srcAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS, .dstStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, .dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS, }, @@ -248,11 +245,11 @@ class CUDA2VKApp : public examples::MonoDeviceApplication .levelCount = 1u, .layerCount = 1u, }, - .oldLayout = IImage::LAYOUT::PREINITIALIZED, + .oldLayout = IImage::LAYOUT::UNDEFINED, .newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL, }; - re &= cmd->pipelineBarrier(EDF_NONE, {.imgBarrierCount = 1, .imgBarriers = &imgBarrier }); + re &= cmd->pipelineBarrier(EDF_NONE, { .imgBarriers = {&imgBarrier,&imgBarrier + 1} }); IImage::SBufferCopy imgRegion = { .imageSubresource = { @@ -292,7 +289,7 @@ class CUDA2VKApp : public examples::MonoDeviceApplication float* CBuf = reinterpret_cast(stagingbuf->getBoundMemory().memory->getMappedPointer()); float* CImg = reinterpret_cast(stagingbuf2->getBoundMemory().memory->getMappedPointer()); - assert(!memcmp(CBuf, CImg, size)); + assert(!memcmp(CBuf, CImg, size)); for (auto i = 0; i < numElements; i++) { @@ -309,56 +306,5 @@ class CUDA2VKApp : public examples::MonoDeviceApplication // Platforms like WASM expect the main entry point to periodically return control, hence if you want a crossplatform app, you have to let the framework deal with your "game loop" void workLoopBody() override {} }; -// -//int main(int argc, char** argv) -//{ -// auto initOutput = CommonAPI::InitWithDefaultExt(CommonAPI::InitParams{ -// .appName = { "63.CUDAInterop" }, -// .apiType = EAT_VULKAN, -// .swapchainImageUsage = IImage::EUF_NONE, -// }); -// -// auto& system = initOutput.system; -// auto& apiConnection = initOutput.apiConnection; -// auto& physicalDevice = initOutput.physicalDevice; -// auto& logicalDevice = initOutput.logicalDevice; -// auto& utilities = initOutput.utilities; -// auto& queues = initOutput.queues; -// auto& logger = initOutput.logger; -// -// assert(physicalDevice->getLimits().externalMemory); -// auto cudaHandler = CCUDAHandler::create(system.get(), smart_refctd_ptr(logger)); -// assert(cudaHandler); -// auto cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast(apiConnection), physicalDevice); -// auto& cu = cudaHandler->getCUDAFunctionTable(); -// -// smart_refctd_ptr ptx; -// CUmodule module; -// CUfunction kernel; -// CUstream stream; -// -// { -// ISystem::future_t> fut; -// system->createFile(fut, "../vectorAdd_kernel.cu", IFileBase::ECF_READ); -// /* auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(fut.copy().get(), cudaDevice->geDefaultCompileOptions()); -// ASSERT_SUCCESS_NV(res); -// ptx = std::move(ptx_);*/ -// } -// -// //ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr)); -// //ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd")); -// //ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); -// -// //{ -// // auto cuda2vk = CUDA2VK(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data()); -// // cuda2vk.launchKernel(kernel, stream); -// // ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); -// //} -// -// //ASSERT_SUCCESS(cu.pcuModuleUnload(module)); -// //ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream)); -// -// return 0; -//} NBL_MAIN_FUNC(CUDA2VKApp) \ No newline at end of file From db2ac71cd2e1a8f63cd3f99ba46bb77616153798 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Thu, 18 Jan 2024 19:19:27 +0300 Subject: [PATCH 14/17] reflect nabla changes --- 63_CUDAInterop/main.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/63_CUDAInterop/main.cpp b/63_CUDAInterop/main.cpp index a14aac0af..28c6d6f2f 100644 --- a/63_CUDAInterop/main.cpp +++ b/63_CUDAInterop/main.cpp @@ -77,10 +77,6 @@ class CUDA2VKApp : public examples::MonoDeviceApplication return false; { - auto& limits = m_physicalDevice->getLimits(); - if (!limits.externalMemoryWin32 || !limits.externalFenceWin32 || !limits.externalSemaphoreWin32) - return logFail("Physical device does not support the required extensions"); - cudaHandler = CCUDAHandler::create(system.get(), smart_refctd_ptr(m_logger)); assert(cudaHandler); cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast(m_api), m_physicalDevice); @@ -136,7 +132,7 @@ class CUDA2VKApp : public examples::MonoDeviceApplication ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); - sema = m_device->createSemaphore({ .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 }); + sema = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 }); ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cusema, sema.get())); { auto devmemory = mem[2]->exportAsMemory(m_device.get()); @@ -154,7 +150,7 @@ class CUDA2VKApp : public examples::MonoDeviceApplication { - IGPUImage::SCreationParams params = {}; + IImage::SCreationParams params = {}; params.type = IGPUImage::ET_2D; params.samples = IGPUImage::ESCF_1_BIT; params.format = EF_R32_SFLOAT; @@ -162,8 +158,6 @@ class CUDA2VKApp : public examples::MonoDeviceApplication params.mipLevels = 1; params.arrayLayers = 1; params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT; - params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; - params.tiling = IGPUImage::TILING::LINEAR; importedimg = mem[2]->createAndBindImage(m_device.get(), std::move(params)); assert(importedimg); } From 744dd44c3bd6d5bb5734402b85f49fd0e27a46cc Mon Sep 17 00:00:00 2001 From: atkurtul Date: Thu, 18 Jan 2024 23:27:02 +0300 Subject: [PATCH 15/17] address pr comments --- 03_DeviceSelectionAndSharedSources/main.cpp | 2 +- 22_CppCompat/test.hlsl.orig | 264 --------------- 63_CUDAInterop/CMakeLists.txt | 29 +- .../{ => app_resources}/vectorAdd_kernel.cu | 2 +- 63_CUDAInterop/main.cpp | 312 +++++++++++------- 5 files changed, 221 insertions(+), 388 deletions(-) delete mode 100644 22_CppCompat/test.hlsl.orig rename 63_CUDAInterop/{ => app_resources}/vectorAdd_kernel.cu (99%) diff --git a/03_DeviceSelectionAndSharedSources/main.cpp b/03_DeviceSelectionAndSharedSources/main.cpp index d9a8d9b1a..28872a5f5 100644 --- a/03_DeviceSelectionAndSharedSources/main.cpp +++ b/03_DeviceSelectionAndSharedSources/main.cpp @@ -32,7 +32,7 @@ class DeviceSelectionAndSharedSourcesApp final : public examples::MonoDeviceAppl bool onAppInitialized(smart_refctd_ptr&& system) override { // Remember to call the base class initialization! - if (!device_base_t::onAppInitialized(std::move(system))) + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) return false; if (!asset_base_t::onAppInitialized(std::move(system))) return false; diff --git a/22_CppCompat/test.hlsl.orig b/22_CppCompat/test.hlsl.orig deleted file mode 100644 index b67b08062..000000000 --- a/22_CppCompat/test.hlsl.orig +++ /dev/null @@ -1,264 +0,0 @@ -//// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -//// This file is part of the "Nabla Engine". -//// For conditions of distribution and use, see copyright notice in nabla.h - -#pragma shader_stage(compute) - -#define STATIC_ASSERT(C) { nbl::hlsl::conditional::type a = 0; } - -#define IS_SAME(L,R) nbl::hlsl::is_same::value -#define SHADER_CRASHING_ASSERT(expr) \ -{ \ - bool con = (expr); \ - do { \ - [branch] if (!con) \ - vk::RawBufferStore(0xdeadbeefBADC0FFbull,0x45u,4u); \ - } while(!con); \ -} - - -#include -#include - -#include -#include - -#include -#include -#include -#include - -#include - -#include -#include - -#include - -struct PushConstants -{ - uint2 imgSize; -}; - -[[vk::push_constant]] -PushConstants u_pushConstants; - -[[vk::binding(0, 0)]] Texture2D inImage; -[[vk::binding(1, 0)]] RWTexture2D outImage; -[[vk::binding(2, 0)]] Buffer inBuffer; -[[vk::binding(3, 0)]] RWStructuredBuffer outBuffer; - - -template -struct Spec -{ - static const int value = Spec::value + 1; -}; - -template<> -struct Spec<0> -{ - static const int value = 0; -}; - -Buffer unbounded[]; - -template -bool val(T) { return nbl::hlsl::is_unbounded_array::value; } - - -template -struct array -{ - T data[N]; -}; - -void fill(uint3 invocationID, float val) -{ - outImage[invocationID.xy] = float4(val,val,val,val); - outBuffer[invocationID.x * invocationID.y] = float4(val,val,val,val); -} - -void fill(uint3 invocationID, float4 val) -{ - outImage[invocationID.xy] = val; - outBuffer[invocationID.x * invocationID.y] = val; -} - - -[numthreads(8, 8, 1)] -void main(uint3 invocationID : SV_DispatchThreadID) -{ - fill(invocationID, 1); - const float32_t3 TEST_VEC = float32_t3(1.0f, 2.0f, 3.0f); - - fill(invocationID, 2); - // test functions from EOTF.hlsl - nbl::hlsl::colorspace::eotf::identity(TEST_VEC); - nbl::hlsl::colorspace::eotf::impl_shared_2_4(TEST_VEC, 0.5f); - nbl::hlsl::colorspace::eotf::sRGB(TEST_VEC); - nbl::hlsl::colorspace::eotf::Display_P3(TEST_VEC); - nbl::hlsl::colorspace::eotf::DCI_P3_XYZ(TEST_VEC); - nbl::hlsl::colorspace::eotf::SMPTE_170M(TEST_VEC); - nbl::hlsl::colorspace::eotf::SMPTE_ST2084(TEST_VEC); - nbl::hlsl::colorspace::eotf::HDR10_HLG(TEST_VEC); - nbl::hlsl::colorspace::eotf::AdobeRGB(TEST_VEC); - nbl::hlsl::colorspace::eotf::Gamma_2_2(TEST_VEC); - nbl::hlsl::colorspace::eotf::ACEScc(TEST_VEC); - nbl::hlsl::colorspace::eotf::ACEScct(TEST_VEC); - - fill(invocationID, 3); - - // test functions from OETF.hlsl - nbl::hlsl::colorspace::oetf::identity(TEST_VEC); - nbl::hlsl::colorspace::oetf::impl_shared_2_4(TEST_VEC, 0.5f); - nbl::hlsl::colorspace::oetf::sRGB(TEST_VEC); - nbl::hlsl::colorspace::oetf::Display_P3(TEST_VEC); - nbl::hlsl::colorspace::oetf::DCI_P3_XYZ(TEST_VEC); - nbl::hlsl::colorspace::oetf::SMPTE_170M(TEST_VEC); - nbl::hlsl::colorspace::oetf::SMPTE_ST2084(TEST_VEC); - nbl::hlsl::colorspace::oetf::HDR10_HLG(TEST_VEC); - nbl::hlsl::colorspace::oetf::AdobeRGB(TEST_VEC); - nbl::hlsl::colorspace::oetf::Gamma_2_2(TEST_VEC); - nbl::hlsl::colorspace::oetf::ACEScc(TEST_VEC); - nbl::hlsl::colorspace::oetf::ACEScct(TEST_VEC); - - fill(invocationID, 4); - // xoroshiro tests - const uint32_t2 state = uint32_t2(12u, 34u); - nbl::hlsl::Xoroshiro64Star xoroshiro64Star = nbl::hlsl::Xoroshiro64Star::construct(state); - xoroshiro64Star(); - nbl::hlsl::Xoroshiro64StarStar xoroshiro64StarStar = nbl::hlsl::Xoroshiro64StarStar::construct(state); - xoroshiro64StarStar(); - - //nbl::hlsl::mpl::countl_zero<2ull>::value; - - // TODO: test if std::rotl/r == nbl::hlsl::rotr/l == nbl::hlsl::mpl::rotr/l - - // uint32_t mplRotlResult0 = nbl::hlsl::mpl::rotl::value; - // uint32_t mplRotlResult1 = nbl::hlsl::mpl::rotl::value; - // uint32_t mplRotrResult0 = nbl::hlsl::mpl::rotr::value; - // uint32_t mplRotrResult1 = nbl::hlsl::mpl::rotr::value; - - // uint32_t rotlResult0 = nbl::hlsl::mpl::rotl::value; - // uint32_t rotlResult1 = nbl::hlsl::mpl::rotl::value; - // uint32_t rotrResult0 = nbl::hlsl::mpl::rotr::value; - // uint32_t rotrResult1 = nbl::hlsl::mpl::rotr::value; - - // SHADER_CRASHING_ASSERT(rotlResult0 == mplRotlResult0); - // SHADER_CRASHING_ASSERT(rotlResult1 == mplRotlResult1); - // SHADER_CRASHING_ASSERT(rotrResult0 == mplRotrResult0); - // SHADER_CRASHING_ASSERT(rotrResult1 == mplRotrResult1); - - // TODO: more tests and compare with cpp version as well - fill(invocationID, 5); - // countl_zero test - { - static const uint16_t TEST_VALUE_0 = 5; - static const uint32_t TEST_VALUE_1 = 0x80000000u; - static const uint32_t TEST_VALUE_2 = 0x8000000000000000u; - static const uint32_t TEST_VALUE_3 = 0x00000001u; - static const uint32_t TEST_VALUE_4 = 0x0000000000000001u; - - - fill(invocationID, 5.01); - uint16_t compileTimeCountLZero = nbl::hlsl::mpl::countl_zero::value; - uint16_t runTimeCountLZero = nbl::hlsl::countl_zero(TEST_VALUE_0); - fill(invocationID, float4(5.1, compileTimeCountLZero, runTimeCountLZero, 0)); - SHADER_CRASHING_ASSERT(compileTimeCountLZero == runTimeCountLZero); - - compileTimeCountLZero = nbl::hlsl::mpl::countl_zero::value; - runTimeCountLZero = nbl::hlsl::countl_zero(TEST_VALUE_1); - fill(invocationID, float4(5.2, compileTimeCountLZero, runTimeCountLZero, 0)); - SHADER_CRASHING_ASSERT(compileTimeCountLZero == runTimeCountLZero); - - compileTimeCountLZero = nbl::hlsl::mpl::countl_zero::value; - runTimeCountLZero = nbl::hlsl::countl_zero(TEST_VALUE_2); - fill(invocationID, float4(5.3, compileTimeCountLZero, runTimeCountLZero, 0)); - SHADER_CRASHING_ASSERT(compileTimeCountLZero == runTimeCountLZero); - - compileTimeCountLZero = nbl::hlsl::mpl::countl_zero::value; - runTimeCountLZero = nbl::hlsl::countl_zero(TEST_VALUE_3); - fill(invocationID, float4(5.4, compileTimeCountLZero, runTimeCountLZero, 0)); - SHADER_CRASHING_ASSERT(compileTimeCountLZero == runTimeCountLZero); - - compileTimeCountLZero = nbl::hlsl::mpl::countl_zero::value; - runTimeCountLZero = nbl::hlsl::countl_zero(TEST_VALUE_4); - fill(invocationID, float4(5.5, compileTimeCountLZero, runTimeCountLZero, 0)); - SHADER_CRASHING_ASSERT(compileTimeCountLZero == runTimeCountLZero); - } - - { - bool A = Spec<3>::value == 3; - } - { - bool A = nbl::hlsl::is_integral::value; - } - { - bool A = val(unbounded); - } - fill(invocationID, 6); - { - float4 v; - fill(invocationID, float4(alignof(v.x), alignof(v), 0, 0)); - SHADER_CRASHING_ASSERT(alignof(v.x) == alignof(v)); - } - - { - float4 v; - const volatile float4 u; - - STATIC_ASSERT(IS_SAME(decltype(v.x), nbl::hlsl::impl::add_lvalue_reference::type)); - STATIC_ASSERT(nbl::hlsl::impl::is_reference::value); - STATIC_ASSERT(IS_SAME(float,nbl::hlsl::impl::remove_reference::type)); - STATIC_ASSERT(IS_SAME(decltype(v.x),nbl::hlsl::impl::add_lvalue_reference::type)); - STATIC_ASSERT(IS_SAME(decltype(v.x),nbl::hlsl::impl::add_lvalue_reference::type>::type)); - - STATIC_ASSERT(IS_SAME(float,nbl::hlsl::remove_cvref::type)); - STATIC_ASSERT(IS_SAME(nbl::hlsl::remove_cv::type,nbl::hlsl::impl::add_lvalue_reference::type)); - STATIC_ASSERT(IS_SAME(nbl::hlsl::remove_cv::type,nbl::hlsl::impl::add_lvalue_reference::type>::type)); - } - fill(invocationID, 7); - { - float x[4][4]; - STATIC_ASSERT(IS_SAME(nbl::hlsl::remove_extent::type, float[4])); - STATIC_ASSERT(IS_SAME(nbl::hlsl::remove_all_extents::type, float)); - } - fill(invocationID, 8); - { - STATIC_ASSERT(IS_SAME(nbl::hlsl::make_signed::type, nbl::hlsl::make_signed::type)); - STATIC_ASSERT(IS_SAME(nbl::hlsl::make_unsigned::type, nbl::hlsl::make_unsigned::type)); - - STATIC_ASSERT(IS_SAME(nbl::hlsl::make_signed::type, nbl::hlsl::make_signed::type)); - STATIC_ASSERT(IS_SAME(nbl::hlsl::make_unsigned::type, nbl::hlsl::make_unsigned::type)); - - STATIC_ASSERT(IS_SAME(nbl::hlsl::make_signed::type, nbl::hlsl::make_signed::type)); - STATIC_ASSERT(IS_SAME(nbl::hlsl::make_unsigned::type, nbl::hlsl::make_unsigned::type)); - } - - { - int Q[3][4][5]; - STATIC_ASSERT(3 == (nbl::hlsl::extent::value)); - STATIC_ASSERT(4 == (nbl::hlsl::extent::value)); - STATIC_ASSERT(5 == (nbl::hlsl::extent::value)); - STATIC_ASSERT(0 == (nbl::hlsl::extent::value)); - } - fill(invocationID, 9); - { - float32_t a; - - a = nbl::hlsl::numeric_limits::min; - a = nbl::hlsl::numeric_limits::max; - a = nbl::hlsl::numeric_limits::lowest; - a = nbl::hlsl::numeric_limits::epsilon; - a = nbl::hlsl::numeric_limits::round_error; - a = nbl::hlsl::numeric_limits::infinity; - a = nbl::hlsl::numeric_limits::quiet_NaN; - a = nbl::hlsl::numeric_limits::signaling_NaN; - a = nbl::hlsl::numeric_limits::denorm_min; - } - - fill(invocationID, 10); - - fill(invocationID, -1); -} diff --git a/63_CUDAInterop/CMakeLists.txt b/63_CUDAInterop/CMakeLists.txt index 2f8f8439f..bc1624875 100644 --- a/63_CUDAInterop/CMakeLists.txt +++ b/63_CUDAInterop/CMakeLists.txt @@ -1,17 +1,24 @@ - include(common RESULT_VARIABLE RES) if(NOT RES) message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") endif() -set(CUDA_INTEROP_EXAMPLE_INCLUDE_DIRS - ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} -) +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") -nbl_create_executable_project( - "" - "" - "${CUDA_INTEROP_EXAMPLE_INCLUDE_DIRS}" - "" - "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}" -) \ No newline at end of file + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() \ No newline at end of file diff --git a/63_CUDAInterop/vectorAdd_kernel.cu b/63_CUDAInterop/app_resources/vectorAdd_kernel.cu similarity index 99% rename from 63_CUDAInterop/vectorAdd_kernel.cu rename to 63_CUDAInterop/app_resources/vectorAdd_kernel.cu index 3baef0123..99c831121 100644 --- a/63_CUDAInterop/vectorAdd_kernel.cu +++ b/63_CUDAInterop/app_resources/vectorAdd_kernel.cu @@ -39,4 +39,4 @@ extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C, if (i < numElements) { C[i] = A[i] + B[i]; } -} \ No newline at end of file +} diff --git a/63_CUDAInterop/main.cpp b/63_CUDAInterop/main.cpp index 28c6d6f2f..af1ec82cf 100644 --- a/63_CUDAInterop/main.cpp +++ b/63_CUDAInterop/main.cpp @@ -2,14 +2,12 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#define _NBL_STATIC_LIB_ -#include - #include "nbl/video/CCUDAHandler.h" #include "nbl/video/CCUDASharedMemory.h" #include "nbl/video/CCUDASharedSemaphore.h" #include "../common/MonoDeviceApplication.hpp" +#include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp" using namespace nbl; using namespace core; @@ -22,77 +20,115 @@ The start of the main function starts like in most other example. We ask the user for the desired renderer and start it up. */ -#define ASSERT_SUCCESS(expr) \ -if (auto re = expr; CUDA_SUCCESS != re) { \ - const char* name = 0, *str = 0; \ - cu.pcuGetErrorName(re, &name); \ - cu.pcuGetErrorString(re, &str); \ - printf("%s:%d %s:\n\t%s\n", __FILE__, __LINE__, name, str); \ - abort(); \ +bool check_cuda_err(cudaError_enum err, auto& cu, auto& logger, auto file, auto line) +{ + if (auto re = err; CUDA_SUCCESS != re) + { + const char* name = 0, * str = 0; + cu.pcuGetErrorName(re, &name); + cu.pcuGetErrorString(re, &str); + logger->log("%s:%d %s:\n\t%s\n", system::ILogger::ELL_ERROR, file, line, name, str); + return false; + } + return true; } -#define ASSERT_SUCCESS_NV(expr) \ -if (auto re = expr; NVRTC_SUCCESS != re) { \ - const char* str = cudaHandler->getNVRTCFunctionTable().pnvrtcGetErrorString(re); \ - printf("%s:%d %s\n", __FILE__, __LINE__, str); \ - abort(); \ +bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto line, std::string const& log) +{ + if (auto re = err; NVRTC_SUCCESS != re) + { + const char* str = cudaHandler->getNVRTCFunctionTable().pnvrtcGetErrorString(re); + logger->log("%s:%d %s\n%s\n", system::ILogger::ELL_ERROR, file, line, str, log.c_str()); + return false; + } + return true; } -constexpr uint32_t gridDim[3] = { 4096,1,1 }; -constexpr uint32_t blockDim[3] = { 1024,1,1 }; -size_t numElements = gridDim[0] * blockDim[0]; -size_t size = sizeof(float) * numElements; +#define ASSERT_SUCCESS(expr) { auto re = check_cuda_err((expr), cu, m_logger, __FILE__, __LINE__); assert(re); } +#define ASSERT_SUCCESS_NV(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); } #ifndef _NBL_COMPILE_WITH_CUDA_ static_assert(false); #endif -class CUDA2VKApp : public examples::MonoDeviceApplication +class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication { - using base_t = examples::MonoDeviceApplication; + using device_base_t = examples::MonoDeviceApplication; + using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication; + + static constexpr uint32_t gridDim[3] = { 4096,1,1 }; + static constexpr uint32_t blockDim[3] = { 1024,1,1 }; + static constexpr size_t numElements = gridDim[0] * blockDim[0]; + static constexpr size_t size = sizeof(float) * numElements; public: - // Generally speaking because certain platforms delay initialization from main object construction you should just forward and not do anything in the ctor - using base_t::base_t; + // Yay thanks to multiple inheritance we cannot forward ctors anymore + CUDA2VKApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} smart_refctd_ptr cudaHandler; smart_refctd_ptr cudaDevice; IQueue* queue; - std::array, 2> cpubuffers; - std::array, 3> mem = {}; - smart_refctd_ptr cusema; + // CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory + std::array, 2> cpuBufs; + // CUDA resources that we input to the kernel 'vectorAdd_kernel.cu' + // Kernel writes to cudaMemories[2] which we later use to export and read on nabla side + std::array, 3> cudaMemories = {}; + // A semaphore created in CUDA which will be imported into Nabla to help sync between the CUDA kernel and and Nabla device to host transfer + smart_refctd_ptr cudaSemaphore; - smart_refctd_ptr importedbuf, stagingbuf, stagingbuf2; - smart_refctd_ptr importedimg; - smart_refctd_ptr sema; - smart_refctd_ptr commandPool; - smart_refctd_ptr cmd; + // our Buffer that is bound to cudaMemories[2] + smart_refctd_ptr importedBuf; + // our Image that is also bound to cudaMemories[2] + smart_refctd_ptr importedImg; + // host visible buffers that we use to copy from the resources above after CUDA kernel is done writing + smart_refctd_ptr stagingBufs[2]; + + // Nabla semaphore that aliases the cudaSemaphore above + smart_refctd_ptr importedSemaphore; + + smart_refctd_ptr commandPool; + smart_refctd_ptr cmd[2]; bool onAppInitialized(smart_refctd_ptr&& system) override { - // Remember to call the base class initialization! - if (!base_t::onAppInitialized(std::move(system))) - return false; - + // Remember to call the base class initialization! + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + { - cudaHandler = CCUDAHandler::create(system.get(), smart_refctd_ptr(m_logger)); - assert(cudaHandler); + cudaHandler = CCUDAHandler::create(m_system.get(), smart_refctd_ptr(m_logger)); + if (!cudaHandler) return logFail("Could not create a CUDA handler!"); cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast(m_api), m_physicalDevice); + if (!cudaDevice) return logFail("Could not create a CUDA Device!"); } - queue = base_t::getComputeQueue(); + queue = device_base_t::getComputeQueue(); createResources(); smart_refctd_ptr ptx; { - ISystem::future_t> fut; - m_system->createFile(fut, "../vectorAdd_kernel.cu", IFileBase::ECF_READ); - auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(fut.copy().get(), cudaDevice->geDefaultCompileOptions()); - ASSERT_SUCCESS_NV(res); + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = ""; // virtual root + // this time we load a shader directly from a file + auto assetBundle = m_assetMgr->getAsset("app_resources/vectorAdd_kernel.cu", lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + return logFail("Could not load kernel!"); + + smart_refctd_ptr source = IAsset::castDown(assets[0]); + std::string log; + auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()), + "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log); + ASSERT_SUCCESS_NV(res, log); + ptx = std::move(ptx_); } CUmodule module; @@ -119,37 +155,46 @@ class CUDA2VKApp : public examples::MonoDeviceApplication { auto& cu = cudaHandler->getCUDAFunctionTable(); - for (auto& buf : cpubuffers) + for (auto& buf : cpuBufs) buf = make_smart_refctd_ptr(size); for (auto j = 0; j < 2; j++) for (auto i = 0; i < numElements; i++) - reinterpret_cast(cpubuffers[j]->getPointer())[i] = rand() / float(RAND_MAX); - + reinterpret_cast(cpuBufs[j]->getPointer())[i] = rand() / float(RAND_MAX); - ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); - ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); - ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper + ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); - sema = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 }); - ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cusema, sema.get())); + importedSemaphore = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 }); + ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cudaSemaphore, importedSemaphore.get())); { - auto devmemory = mem[2]->exportAsMemory(m_device.get()); - assert(devmemory); + // export the CUmem we have just created into a refctd IDeviceMemoryAllocation + auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get()); + if (!devmemory) + logFail("Failed to export CUDA memory!"); + + + // create an importing external buffer on Nabla side IGPUBuffer::SCreationParams params = {}; params.size = devmemory->getAllocationSize(); params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT; params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; - importedbuf = m_device->createBuffer(std::move(params)); - assert(importedbuf); - ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedbuf.get(), .binding = {.memory = devmemory.get() } }; + importedBuf = m_device->createBuffer(std::move(params)); + if (!importedBuf) logFail("Failed to create an external buffer"); + + // bind that imported IDeviceMemoryAllocation to the external buffer we've just created + ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedBuf.get(), .binding = {.memory = devmemory.get() } }; bool re = m_device->bindBufferMemory(1, &bindInfo); - assert(re); + if (!re) logFail("Failed to bind CUDA memory to buffer"); } { - + // same thing as above + // we create an external image and bind the imported external memory to it + // now we have 2 different resources that are bound to the same memory IImage::SCreationParams params = {}; params.type = IGPUImage::ET_2D; params.samples = IGPUImage::ESCF_1_BIT; @@ -158,48 +203,89 @@ class CUDA2VKApp : public examples::MonoDeviceApplication params.mipLevels = 1; params.arrayLayers = 1; params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT; - importedimg = mem[2]->createAndBindImage(m_device.get(), std::move(params)); - assert(importedimg); + importedImg = cudaMemories[2]->createAndBindImage(m_device.get(), std::move(params)); + if (!importedImg) logFail("Failed to create an external image"); } - commandPool = m_device->createCommandPool(queue->getFamilyIndex(), {}); - bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd, smart_refctd_ptr(m_logger)); - assert(re); + commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 2, cmd, smart_refctd_ptr(m_logger)); - auto createStaging = [logicalDevice= m_device]() + auto createStaging = [this,logicalDevice= m_device]() { auto buf = logicalDevice->createBuffer({ {.size = size, .usage = asset::IBuffer::EUF_TRANSFER_DST_BIT} }); auto req = buf->getMemoryReqs(); req.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); auto allocation = logicalDevice->allocate(req, buf.get()); - assert(allocation.isValid() && buf->getBoundMemory().memory->isMappable()); - bool re = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ); - assert(re && allocation.memory->getMappedPointer()); - memset(allocation.memory->getMappedPointer(), 0, req.size); + void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ); + if (!mapping) + logFail("Failed to map an staging buffer"); + memset(mapping, 0, req.size); return buf; }; - stagingbuf = createStaging(); - stagingbuf2 = createStaging(); + stagingBufs[0] = createStaging(); + stagingBufs[1] = createStaging(); } void launchKernel(CUfunction kernel, CUstream stream) { + + // First we record a release ownership transfer to let vulkan know that resources are going to be used in an external API + { + IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { + .barrier = { + .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, + .otherQueueFamilyIndex = IQueue::FamilyExternal, + }, + .range = {.buffer = importedBuf, }, + }; + + IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { + .barrier = { + .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, + .otherQueueFamilyIndex = IQueue::FamilyExternal, + }, + .image = importedImg.get(), + .subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .levelCount = 1u, + .layerCount = 1u, + } + }; + // start recording + bool re = true; + re &= cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + re &= cmd[0]->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}, .imgBarriers = {&imgBarrier,&imgBarrier + 1} }); + re &= cmd[0]->end(); + + IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = importedSemaphore.get(), .value = 1 }; + IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get()}; + IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1}, .signalSemaphores = {&signalInfo,&signalInfo + 1} }; + auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 }); + re &= IQueue::RESULT::SUCCESS == submitRe; + if (!re) + logFail("Something went wrong readying resources for CUDA"); + } + auto& cu = cudaHandler->getCUDAFunctionTable(); // Launch kernel { CUdeviceptr ptrs[] = { - mem[0]->getDeviceptr(), - mem[1]->getDeviceptr(), - mem[2]->getDeviceptr(), + cudaMemories[0]->getDeviceptr(), + cudaMemories[1]->getDeviceptr(), + cudaMemories[2]->getDeviceptr(), }; - void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numElements }; - ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpubuffers[0]->getPointer(), size, stream)); - ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpubuffers[1]->getPointer(), size, stream)); + auto numEles = numElements; + void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numEles }; + ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), size, stream)); + ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), size, stream)); + + auto semaphore = cudaSemaphore->getInternalObject(); + CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; + ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream)); // Wait for release op from vulkan ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr)); - CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 1 } } }; - auto semaphore = cusema->getInternalObject(); + CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore } @@ -208,65 +294,66 @@ class CUDA2VKApp : public examples::MonoDeviceApplication IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { .barrier = { .dep = { - .dstStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, - .dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, + .dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT, }, .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, - .otherQueueFamilyIndex = queue->getFamilyIndex(), + .otherQueueFamilyIndex = IQueue::FamilyExternal, }, - .range = { .buffer = importedbuf, }, + .range = { .buffer = importedBuf, }, }; - bool re = true; - re &= cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - re &= cmd->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1} }); + re &= cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + re &= cmd[1]->pipelineBarrier(EDF_NONE, {.bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}}); IGPUCommandBuffer::SBufferCopy region = { .size = size }; - re &= cmd->copyBuffer(importedbuf.get(), stagingbuf.get(), 1, ®ion); + re &= cmd[1]->copyBuffer(importedBuf.get(), stagingBufs[0].get(), 1, ®ion); IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { .barrier = { .dep = { - .dstStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, - .dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, + .dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS, }, .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, - .otherQueueFamilyIndex = queue->getFamilyIndex(), + .otherQueueFamilyIndex = IQueue::FamilyExternal, }, - .image = importedimg.get(), + .image = importedImg.get(), .subresourceRange = { .aspectMask = IImage::EAF_COLOR_BIT, .levelCount = 1u, .layerCount = 1u, }, - .oldLayout = IImage::LAYOUT::UNDEFINED, + .oldLayout = IImage::LAYOUT::PREINITIALIZED, .newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL, }; - re &= cmd->pipelineBarrier(EDF_NONE, { .imgBarriers = {&imgBarrier,&imgBarrier + 1} }); + re &= cmd[1]->pipelineBarrier(EDF_NONE, {.imgBarriers = {&imgBarrier,&imgBarrier + 1}}); IImage::SBufferCopy imgRegion = { .imageSubresource = { .aspectMask = imgBarrier.subresourceRange.aspectMask, .layerCount = imgBarrier.subresourceRange.layerCount, }, - .imageExtent = importedimg->getCreationParameters().extent, + .imageExtent = importedImg->getCreationParameters().extent, }; - re &= cmd->copyImageToBuffer(importedimg.get(), imgBarrier.newLayout, stagingbuf2.get(), 1, &imgRegion); - re &= cmd->end(); + re &= cmd[1]->copyImageToBuffer(importedImg.get(), imgBarrier.newLayout, stagingBufs[1].get(), 1, &imgRegion); + re &= cmd[1]->end(); - auto waitSemaphores = std::array{IQueue::SSubmitInfo::SSemaphoreInfo{.semaphore = sema.get(), .value = 1, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, }}; - auto signalSemaphores = std::array{IQueue::SSubmitInfo::SSemaphoreInfo{.semaphore = sema.get(), .value = 2, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, }}; - auto commandBuffers = std::array{IQueue::SSubmitInfo::SCommandBufferInfo{cmd.get()}}; - auto submitInfo = std::array{IQueue::SSubmitInfo { - .waitSemaphores = waitSemaphores, - .commandBuffers = commandBuffers, - .signalSemaphores = signalSemaphores, - }}; - auto submitRe = queue->submit(submitInfo); + IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { .semaphore = importedSemaphore.get(), .value = 2 }; + IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = importedSemaphore.get(), .value = 3 }; + IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() }; + IQueue::SSubmitInfo submitInfo = { + .waitSemaphores = {&waitInfo,&waitInfo + 1}, + .commandBuffers = {&cmdInfo, &cmdInfo + 1}, + .signalSemaphores = {&signalInfo,&signalInfo + 1} + }; + auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 }); re &= IQueue::RESULT::SUCCESS == submitRe; - assert(re); + if (!re) + logFail("Something went wrong copying results from CUDA"); } ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, [](void* userData) { decltype(this)(userData)->kernelCallback(); }, this)); @@ -275,20 +362,23 @@ class CUDA2VKApp : public examples::MonoDeviceApplication void kernelCallback() { // Make sure we are also done with the readback - auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = sema.get(), .value = 2}}; + auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = importedSemaphore.get(), .value = 3}}; m_device->waitForSemaphores(wait, true, -1); - float* A = reinterpret_cast(cpubuffers[0]->getPointer()); - float* B = reinterpret_cast(cpubuffers[1]->getPointer()); - float* CBuf = reinterpret_cast(stagingbuf->getBoundMemory().memory->getMappedPointer()); - float* CImg = reinterpret_cast(stagingbuf2->getBoundMemory().memory->getMappedPointer()); + float* A = reinterpret_cast(cpuBufs[0]->getPointer()); + float* B = reinterpret_cast(cpuBufs[1]->getPointer()); + + float* CBuf = reinterpret_cast(stagingBufs[0]->getBoundMemory().memory->getMappedPointer()); + float* CImg = reinterpret_cast(stagingBufs[1]->getBoundMemory().memory->getMappedPointer()); - assert(!memcmp(CBuf, CImg, size)); + if(memcmp(CBuf, CImg, size)) + logFail("Buffer and Image memories do not match!"); for (auto i = 0; i < numElements; i++) { bool re = (abs(CBuf[i] - A[i] - B[i]) < 0.01f) && (abs(CImg[i] - A[i] - B[i]) < 0.01f); - assert(re); + if(!re) + logFail("Element at index %d is incorrect!", i); } std::cout << "Success\n"; From 59ebd29b4eaf8ce530fd75b0e17478e59b7310d2 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Fri, 19 Jan 2024 01:11:33 +0300 Subject: [PATCH 16/17] filter cuda compatible devices --- 63_CUDAInterop/main.cpp | 701 ++++++++++++++++++++-------------------- 1 file changed, 357 insertions(+), 344 deletions(-) diff --git a/63_CUDAInterop/main.cpp b/63_CUDAInterop/main.cpp index af1ec82cf..1d67dbee3 100644 --- a/63_CUDAInterop/main.cpp +++ b/63_CUDAInterop/main.cpp @@ -22,26 +22,26 @@ user for the desired renderer and start it up. bool check_cuda_err(cudaError_enum err, auto& cu, auto& logger, auto file, auto line) { - if (auto re = err; CUDA_SUCCESS != re) - { - const char* name = 0, * str = 0; - cu.pcuGetErrorName(re, &name); - cu.pcuGetErrorString(re, &str); - logger->log("%s:%d %s:\n\t%s\n", system::ILogger::ELL_ERROR, file, line, name, str); - return false; - } - return true; + if (auto re = err; CUDA_SUCCESS != re) + { + const char* name = 0, * str = 0; + cu.pcuGetErrorName(re, &name); + cu.pcuGetErrorString(re, &str); + logger->log("%s:%d %s:\n\t%s\n", system::ILogger::ELL_ERROR, file, line, name, str); + return false; + } + return true; } bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto line, std::string const& log) { - if (auto re = err; NVRTC_SUCCESS != re) - { - const char* str = cudaHandler->getNVRTCFunctionTable().pnvrtcGetErrorString(re); - logger->log("%s:%d %s\n%s\n", system::ILogger::ELL_ERROR, file, line, str, log.c_str()); - return false; - } - return true; + if (auto re = err; NVRTC_SUCCESS != re) + { + const char* str = cudaHandler->getNVRTCFunctionTable().pnvrtcGetErrorString(re); + logger->log("%s:%d %s\n%s\n", system::ILogger::ELL_ERROR, file, line, str, log.c_str()); + return false; + } + return true; } #define ASSERT_SUCCESS(expr) { auto re = check_cuda_err((expr), cu, m_logger, __FILE__, __LINE__); assert(re); } @@ -53,342 +53,355 @@ static_assert(false); class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication { - using device_base_t = examples::MonoDeviceApplication; - using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication; + using device_base_t = examples::MonoDeviceApplication; + using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication; - static constexpr uint32_t gridDim[3] = { 4096,1,1 }; - static constexpr uint32_t blockDim[3] = { 1024,1,1 }; - static constexpr size_t numElements = gridDim[0] * blockDim[0]; - static constexpr size_t size = sizeof(float) * numElements; + static constexpr uint32_t gridDim[3] = { 4096,1,1 }; + static constexpr uint32_t blockDim[3] = { 1024,1,1 }; + static constexpr size_t numElements = gridDim[0] * blockDim[0]; + static constexpr size_t size = sizeof(float) * numElements; public: - // Yay thanks to multiple inheritance we cannot forward ctors anymore - CUDA2VKApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : - system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} - - smart_refctd_ptr cudaHandler; - smart_refctd_ptr cudaDevice; - - IQueue* queue; - - // CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory - std::array, 2> cpuBufs; - // CUDA resources that we input to the kernel 'vectorAdd_kernel.cu' - // Kernel writes to cudaMemories[2] which we later use to export and read on nabla side - std::array, 3> cudaMemories = {}; - // A semaphore created in CUDA which will be imported into Nabla to help sync between the CUDA kernel and and Nabla device to host transfer - smart_refctd_ptr cudaSemaphore; - - // our Buffer that is bound to cudaMemories[2] - smart_refctd_ptr importedBuf; - // our Image that is also bound to cudaMemories[2] - smart_refctd_ptr importedImg; + // Yay thanks to multiple inheritance we cannot forward ctors anymore + CUDA2VKApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + smart_refctd_ptr cudaHandler; + smart_refctd_ptr cudaDevice; + + IQueue* queue; + + // CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory + std::array, 2> cpuBufs; + // CUDA resources that we input to the kernel 'vectorAdd_kernel.cu' + // Kernel writes to cudaMemories[2] which we later use to export and read on nabla side + std::array, 3> cudaMemories = {}; + // A semaphore created in CUDA which will be imported into Nabla to help sync between the CUDA kernel and and Nabla device to host transfer + smart_refctd_ptr cudaSemaphore; + + // our Buffer that is bound to cudaMemories[2] + smart_refctd_ptr importedBuf; + // our Image that is also bound to cudaMemories[2] + smart_refctd_ptr importedImg; + + // host visible buffers that we use to copy from the resources above after CUDA kernel is done writing + smart_refctd_ptr stagingBufs[2]; + + // Nabla semaphore that aliases the cudaSemaphore above + smart_refctd_ptr importedSemaphore; + + smart_refctd_ptr commandPool; + smart_refctd_ptr cmd[2]; + + // a device filter helps you create a set of physical devices that satisfy your requirements in terms of features, limits etc. + core::set filterDevices(const core::SRange& physicalDevices) const override + { + auto devices = device_base_t::filterDevices(physicalDevices); + auto& cuDevices = cudaHandler->getAvailableDevices(); + std::erase_if(devices, [&cuDevices](auto pdev) { + return cuDevices.end() == std::find_if(cuDevices.begin(), cuDevices.end(), [pdev](auto& cuDev) { return !memcmp(pdev->getProperties().deviceUUID, &cuDev.uuid, 16); }); + }); + return devices; + } + + bool onAppInitialized(smart_refctd_ptr&& system) override + { + // Remember to call the base class initialization! + if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; - // host visible buffers that we use to copy from the resources above after CUDA kernel is done writing - smart_refctd_ptr stagingBufs[2]; + cudaHandler = CCUDAHandler::create(m_system.get(), smart_refctd_ptr(m_logger)); + if (!cudaHandler) + return logFail("Could not create a CUDA handler!"); - // Nabla semaphore that aliases the cudaSemaphore above - smart_refctd_ptr importedSemaphore; + if (!device_base_t::onAppInitialized(std::move(system))) + return false; - smart_refctd_ptr commandPool; - smart_refctd_ptr cmd[2]; + cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast(m_api), m_physicalDevice); + if (!cudaDevice) + return logFail("Could not create a CUDA Device!"); - bool onAppInitialized(smart_refctd_ptr&& system) override - { - // Remember to call the base class initialization! - if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) - return false; - if (!asset_base_t::onAppInitialized(std::move(system))) - return false; - { - cudaHandler = CCUDAHandler::create(m_system.get(), smart_refctd_ptr(m_logger)); - if (!cudaHandler) return logFail("Could not create a CUDA handler!"); - cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast(m_api), m_physicalDevice); - if (!cudaDevice) return logFail("Could not create a CUDA Device!"); - } - - - queue = device_base_t::getComputeQueue(); - - createResources(); - - smart_refctd_ptr ptx; - { - IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = ""; // virtual root - // this time we load a shader directly from a file - auto assetBundle = m_assetMgr->getAsset("app_resources/vectorAdd_kernel.cu", lp); - const auto assets = assetBundle.getContents(); - if (assets.empty()) - return logFail("Could not load kernel!"); - - smart_refctd_ptr source = IAsset::castDown(assets[0]); - std::string log; - auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()), - "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log); - ASSERT_SUCCESS_NV(res, log); - - ptx = std::move(ptx_); - } - CUmodule module; - CUfunction kernel; - CUstream stream; - - auto& cu = cudaHandler->getCUDAFunctionTable(); - - ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr)); - ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd")); - ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); - - launchKernel(kernel, stream); - - ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); - ASSERT_SUCCESS(cu.pcuModuleUnload(module)); - ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream)); - - m_device->waitIdle(); - return true; - } - - void createResources() - { - auto& cu = cudaHandler->getCUDAFunctionTable(); - - for (auto& buf : cpuBufs) - buf = make_smart_refctd_ptr(size); - - for (auto j = 0; j < 2; j++) - for (auto i = 0; i < numElements; i++) - reinterpret_cast(cpuBufs[j]->getPointer())[i] = rand() / float(RAND_MAX); - - - // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper - ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); - ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); - ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); - - importedSemaphore = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 }); - ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cudaSemaphore, importedSemaphore.get())); - { - // export the CUmem we have just created into a refctd IDeviceMemoryAllocation - auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get()); - if (!devmemory) - logFail("Failed to export CUDA memory!"); - - - // create an importing external buffer on Nabla side - IGPUBuffer::SCreationParams params = {}; - params.size = devmemory->getAllocationSize(); - params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT; - params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; - importedBuf = m_device->createBuffer(std::move(params)); - if (!importedBuf) logFail("Failed to create an external buffer"); - - // bind that imported IDeviceMemoryAllocation to the external buffer we've just created - ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedBuf.get(), .binding = {.memory = devmemory.get() } }; - bool re = m_device->bindBufferMemory(1, &bindInfo); - if (!re) logFail("Failed to bind CUDA memory to buffer"); - } - - { - // same thing as above - // we create an external image and bind the imported external memory to it - // now we have 2 different resources that are bound to the same memory - IImage::SCreationParams params = {}; - params.type = IGPUImage::ET_2D; - params.samples = IGPUImage::ESCF_1_BIT; - params.format = EF_R32_SFLOAT; - params.extent = { gridDim[0], blockDim[0], 1 }; - params.mipLevels = 1; - params.arrayLayers = 1; - params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT; - importedImg = cudaMemories[2]->createAndBindImage(m_device.get(), std::move(params)); - if (!importedImg) logFail("Failed to create an external image"); - } - - commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 2, cmd, smart_refctd_ptr(m_logger)); - - auto createStaging = [this,logicalDevice= m_device]() - { - auto buf = logicalDevice->createBuffer({ {.size = size, .usage = asset::IBuffer::EUF_TRANSFER_DST_BIT} }); - auto req = buf->getMemoryReqs(); - req.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); - auto allocation = logicalDevice->allocate(req, buf.get()); - - void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ); - if (!mapping) - logFail("Failed to map an staging buffer"); - memset(mapping, 0, req.size); - return buf; - }; - - stagingBufs[0] = createStaging(); - stagingBufs[1] = createStaging(); - } - - void launchKernel(CUfunction kernel, CUstream stream) - { - - // First we record a release ownership transfer to let vulkan know that resources are going to be used in an external API - { - IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { - .barrier = { - .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, - .otherQueueFamilyIndex = IQueue::FamilyExternal, - }, - .range = {.buffer = importedBuf, }, - }; - - IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { - .barrier = { - .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, - .otherQueueFamilyIndex = IQueue::FamilyExternal, - }, - .image = importedImg.get(), - .subresourceRange = { - .aspectMask = IImage::EAF_COLOR_BIT, - .levelCount = 1u, - .layerCount = 1u, - } - }; - // start recording - bool re = true; - re &= cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - re &= cmd[0]->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}, .imgBarriers = {&imgBarrier,&imgBarrier + 1} }); - re &= cmd[0]->end(); - - IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = importedSemaphore.get(), .value = 1 }; - IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get()}; - IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1}, .signalSemaphores = {&signalInfo,&signalInfo + 1} }; - auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 }); - re &= IQueue::RESULT::SUCCESS == submitRe; - if (!re) - logFail("Something went wrong readying resources for CUDA"); - } - - auto& cu = cudaHandler->getCUDAFunctionTable(); - // Launch kernel - { - CUdeviceptr ptrs[] = { - cudaMemories[0]->getDeviceptr(), - cudaMemories[1]->getDeviceptr(), - cudaMemories[2]->getDeviceptr(), - }; - auto numEles = numElements; - void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numEles }; - ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), size, stream)); - ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), size, stream)); - - auto semaphore = cudaSemaphore->getInternalObject(); - CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; - ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream)); // Wait for release op from vulkan - ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr)); - CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; - ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore - } - - // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA - { - IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { - .barrier = { - .dep = { - .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, - .dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT, - }, - .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, - .otherQueueFamilyIndex = IQueue::FamilyExternal, - }, - .range = { .buffer = importedBuf, }, - }; - bool re = true; - re &= cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - - re &= cmd[1]->pipelineBarrier(EDF_NONE, {.bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}}); - - IGPUCommandBuffer::SBufferCopy region = { .size = size }; - re &= cmd[1]->copyBuffer(importedBuf.get(), stagingBufs[0].get(), 1, ®ion); - - IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { - .barrier = { - .dep = { - .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, - .dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS, - }, - .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, - .otherQueueFamilyIndex = IQueue::FamilyExternal, - }, - .image = importedImg.get(), - .subresourceRange = { - .aspectMask = IImage::EAF_COLOR_BIT, - .levelCount = 1u, - .layerCount = 1u, - }, - .oldLayout = IImage::LAYOUT::PREINITIALIZED, - .newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL, - }; - - re &= cmd[1]->pipelineBarrier(EDF_NONE, {.imgBarriers = {&imgBarrier,&imgBarrier + 1}}); - - IImage::SBufferCopy imgRegion = { - .imageSubresource = { - .aspectMask = imgBarrier.subresourceRange.aspectMask, - .layerCount = imgBarrier.subresourceRange.layerCount, - }, - .imageExtent = importedImg->getCreationParameters().extent, - }; - - re &= cmd[1]->copyImageToBuffer(importedImg.get(), imgBarrier.newLayout, stagingBufs[1].get(), 1, &imgRegion); - re &= cmd[1]->end(); - - IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { .semaphore = importedSemaphore.get(), .value = 2 }; - IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = importedSemaphore.get(), .value = 3 }; - IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() }; - IQueue::SSubmitInfo submitInfo = { - .waitSemaphores = {&waitInfo,&waitInfo + 1}, - .commandBuffers = {&cmdInfo, &cmdInfo + 1}, - .signalSemaphores = {&signalInfo,&signalInfo + 1} - }; - auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 }); - re &= IQueue::RESULT::SUCCESS == submitRe; - if (!re) - logFail("Something went wrong copying results from CUDA"); - } + queue = device_base_t::getComputeQueue(); + + createResources(); + + smart_refctd_ptr ptx; + { + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = ""; // virtual root + // this time we load a shader directly from a file + auto assetBundle = m_assetMgr->getAsset("app_resources/vectorAdd_kernel.cu", lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + return logFail("Could not load kernel!"); + + smart_refctd_ptr source = IAsset::castDown(assets[0]); + std::string log; + auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()), + "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log); + ASSERT_SUCCESS_NV(res, log); + + ptx = std::move(ptx_); + } + CUmodule module; + CUfunction kernel; + CUstream stream; + + auto& cu = cudaHandler->getCUDAFunctionTable(); + + ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr)); + ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd")); + ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); + + launchKernel(kernel, stream); + + ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); + ASSERT_SUCCESS(cu.pcuModuleUnload(module)); + ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream)); + + m_device->waitIdle(); + return true; + } + + void createResources() + { + auto& cu = cudaHandler->getCUDAFunctionTable(); + + for (auto& buf : cpuBufs) + buf = make_smart_refctd_ptr(size); + + for (auto j = 0; j < 2; j++) + for (auto i = 0; i < numElements; i++) + reinterpret_cast(cpuBufs[j]->getPointer())[i] = rand() / float(RAND_MAX); + + + // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper + ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + + importedSemaphore = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 }); + ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cudaSemaphore, importedSemaphore.get())); + { + // export the CUmem we have just created into a refctd IDeviceMemoryAllocation + auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get()); + if (!devmemory) + logFail("Failed to export CUDA memory!"); + + + // create an importing external buffer on Nabla side + IGPUBuffer::SCreationParams params = {}; + params.size = devmemory->getAllocationSize(); + params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT; + params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; + importedBuf = m_device->createBuffer(std::move(params)); + if (!importedBuf) logFail("Failed to create an external buffer"); + + // bind that imported IDeviceMemoryAllocation to the external buffer we've just created + ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedBuf.get(), .binding = {.memory = devmemory.get() } }; + bool re = m_device->bindBufferMemory(1, &bindInfo); + if (!re) logFail("Failed to bind CUDA memory to buffer"); + } + + { + // same thing as above + // we create an external image and bind the imported external memory to it + // now we have 2 different resources that are bound to the same memory + IImage::SCreationParams params = {}; + params.type = IGPUImage::ET_2D; + params.samples = IGPUImage::ESCF_1_BIT; + params.format = EF_R32_SFLOAT; + params.extent = { gridDim[0], blockDim[0], 1 }; + params.mipLevels = 1; + params.arrayLayers = 1; + params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT; + importedImg = cudaMemories[2]->createAndBindImage(m_device.get(), std::move(params)); + if (!importedImg) logFail("Failed to create an external image"); + } + + commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 2, cmd, smart_refctd_ptr(m_logger)); + + auto createStaging = [this,logicalDevice= m_device]() + { + auto buf = logicalDevice->createBuffer({ {.size = size, .usage = asset::IBuffer::EUF_TRANSFER_DST_BIT} }); + auto req = buf->getMemoryReqs(); + req.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); + auto allocation = logicalDevice->allocate(req, buf.get()); + + void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ); + if (!mapping) + logFail("Failed to map an staging buffer"); + memset(mapping, 0, req.size); + return buf; + }; + + stagingBufs[0] = createStaging(); + stagingBufs[1] = createStaging(); + } + + void launchKernel(CUfunction kernel, CUstream stream) + { + + // First we record a release ownership transfer to let vulkan know that resources are going to be used in an external API + { + IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { + .barrier = { + .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, + .otherQueueFamilyIndex = IQueue::FamilyExternal, + }, + .range = {.buffer = importedBuf, }, + }; + + IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { + .barrier = { + .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, + .otherQueueFamilyIndex = IQueue::FamilyExternal, + }, + .image = importedImg.get(), + .subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .levelCount = 1u, + .layerCount = 1u, + } + }; + // start recording + bool re = true; + re &= cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + re &= cmd[0]->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}, .imgBarriers = {&imgBarrier,&imgBarrier + 1} }); + re &= cmd[0]->end(); + + IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = importedSemaphore.get(), .value = 1 }; + IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get()}; + IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1}, .signalSemaphores = {&signalInfo,&signalInfo + 1} }; + auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 }); + re &= IQueue::RESULT::SUCCESS == submitRe; + if (!re) + logFail("Something went wrong readying resources for CUDA"); + } + + auto& cu = cudaHandler->getCUDAFunctionTable(); + // Launch kernel + { + CUdeviceptr ptrs[] = { + cudaMemories[0]->getDeviceptr(), + cudaMemories[1]->getDeviceptr(), + cudaMemories[2]->getDeviceptr(), + }; + auto numEles = numElements; + void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numEles }; + ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), size, stream)); + ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), size, stream)); + + auto semaphore = cudaSemaphore->getInternalObject(); + CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; + ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream)); // Wait for release op from vulkan + ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr)); + CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; + ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore + } - ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, [](void* userData) { decltype(this)(userData)->kernelCallback(); }, this)); - } - - void kernelCallback() - { - // Make sure we are also done with the readback - auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = importedSemaphore.get(), .value = 3}}; - m_device->waitForSemaphores(wait, true, -1); - - float* A = reinterpret_cast(cpuBufs[0]->getPointer()); - float* B = reinterpret_cast(cpuBufs[1]->getPointer()); - - float* CBuf = reinterpret_cast(stagingBufs[0]->getBoundMemory().memory->getMappedPointer()); - float* CImg = reinterpret_cast(stagingBufs[1]->getBoundMemory().memory->getMappedPointer()); - - if(memcmp(CBuf, CImg, size)) - logFail("Buffer and Image memories do not match!"); - - for (auto i = 0; i < numElements; i++) - { - bool re = (abs(CBuf[i] - A[i] - B[i]) < 0.01f) && (abs(CImg[i] - A[i] - B[i]) < 0.01f); - if(!re) - logFail("Element at index %d is incorrect!", i); - } - - std::cout << "Success\n"; - } - - // Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization. - bool keepRunning() override { return false; } - - // Platforms like WASM expect the main entry point to periodically return control, hence if you want a crossplatform app, you have to let the framework deal with your "game loop" - void workLoopBody() override {} + // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA + { + IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { + .barrier = { + .dep = { + .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, + .dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT, + }, + .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, + .otherQueueFamilyIndex = IQueue::FamilyExternal, + }, + .range = { .buffer = importedBuf, }, + }; + bool re = true; + re &= cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + re &= cmd[1]->pipelineBarrier(EDF_NONE, {.bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}}); + + IGPUCommandBuffer::SBufferCopy region = { .size = size }; + re &= cmd[1]->copyBuffer(importedBuf.get(), stagingBufs[0].get(), 1, ®ion); + + IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { + .barrier = { + .dep = { + .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, + .dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS, + }, + .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, + .otherQueueFamilyIndex = IQueue::FamilyExternal, + }, + .image = importedImg.get(), + .subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .levelCount = 1u, + .layerCount = 1u, + }, + .oldLayout = IImage::LAYOUT::PREINITIALIZED, + .newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL, + }; + + re &= cmd[1]->pipelineBarrier(EDF_NONE, {.imgBarriers = {&imgBarrier,&imgBarrier + 1}}); + + IImage::SBufferCopy imgRegion = { + .imageSubresource = { + .aspectMask = imgBarrier.subresourceRange.aspectMask, + .layerCount = imgBarrier.subresourceRange.layerCount, + }, + .imageExtent = importedImg->getCreationParameters().extent, + }; + + re &= cmd[1]->copyImageToBuffer(importedImg.get(), imgBarrier.newLayout, stagingBufs[1].get(), 1, &imgRegion); + re &= cmd[1]->end(); + + IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { .semaphore = importedSemaphore.get(), .value = 2 }; + IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = importedSemaphore.get(), .value = 3 }; + IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() }; + IQueue::SSubmitInfo submitInfo = { + .waitSemaphores = {&waitInfo,&waitInfo + 1}, + .commandBuffers = {&cmdInfo, &cmdInfo + 1}, + .signalSemaphores = {&signalInfo,&signalInfo + 1} + }; + auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 }); + re &= IQueue::RESULT::SUCCESS == submitRe; + if (!re) + logFail("Something went wrong copying results from CUDA"); + } + + ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, [](void* userData) { decltype(this)(userData)->kernelCallback(); }, this)); + } + + void kernelCallback() + { + // Make sure we are also done with the readback + auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = importedSemaphore.get(), .value = 3}}; + m_device->waitForSemaphores(wait, true, -1); + + float* A = reinterpret_cast(cpuBufs[0]->getPointer()); + float* B = reinterpret_cast(cpuBufs[1]->getPointer()); + + float* CBuf = reinterpret_cast(stagingBufs[0]->getBoundMemory().memory->getMappedPointer()); + float* CImg = reinterpret_cast(stagingBufs[1]->getBoundMemory().memory->getMappedPointer()); + + if(memcmp(CBuf, CImg, size)) + logFail("Buffer and Image memories do not match!"); + + for (auto i = 0; i < numElements; i++) + { + bool re = (abs(CBuf[i] - A[i] - B[i]) < 0.01f) && (abs(CImg[i] - A[i] - B[i]) < 0.01f); + if(!re) + logFail("Element at index %d is incorrect!", i); + } + + std::cout << "Success\n"; + } + + // Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization. + bool keepRunning() override { return false; } + + // Platforms like WASM expect the main entry point to periodically return control, hence if you want a crossplatform app, you have to let the framework deal with your "game loop" + void workLoopBody() override {} }; NBL_MAIN_FUNC(CUDA2VKApp) \ No newline at end of file From 73f147941ef5362d0adee47ae72b4088b8c49aa5 Mon Sep 17 00:00:00 2001 From: atkurtul Date: Sat, 20 Jan 2024 01:24:13 +0300 Subject: [PATCH 17/17] fix some comments & add test code --- 63_CUDAInterop/main.cpp | 179 ++++++++++++++++++++++++++++++++++------ 1 file changed, 155 insertions(+), 24 deletions(-) diff --git a/63_CUDAInterop/main.cpp b/63_CUDAInterop/main.cpp index 1d67dbee3..9508fa22a 100644 --- a/63_CUDAInterop/main.cpp +++ b/63_CUDAInterop/main.cpp @@ -75,7 +75,7 @@ class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::Mono // CUDA resources that we input to the kernel 'vectorAdd_kernel.cu' // Kernel writes to cudaMemories[2] which we later use to export and read on nabla side std::array, 3> cudaMemories = {}; - // A semaphore created in CUDA which will be imported into Nabla to help sync between the CUDA kernel and and Nabla device to host transfer + // A semaphore created in CUDA which will alias a Nabla semaphore to help sync between the CUDA kernel and Nabla device to host transfer smart_refctd_ptr cudaSemaphore; // our Buffer that is bound to cudaMemories[2] @@ -86,8 +86,8 @@ class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::Mono // host visible buffers that we use to copy from the resources above after CUDA kernel is done writing smart_refctd_ptr stagingBufs[2]; - // Nabla semaphore that aliases the cudaSemaphore above - smart_refctd_ptr importedSemaphore; + // Nabla semaphore for sync + smart_refctd_ptr semaphore; smart_refctd_ptr commandPool; smart_refctd_ptr cmd[2]; @@ -161,6 +161,9 @@ class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::Mono ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream)); m_device->waitIdle(); + + testInterop(); + return true; } @@ -181,8 +184,8 @@ class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::Mono ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); - importedSemaphore = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 }); - ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cudaSemaphore, importedSemaphore.get())); + semaphore = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 }); + ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cudaSemaphore, semaphore.get())); { // export the CUmem we have just created into a refctd IDeviceMemoryAllocation auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get()); @@ -196,7 +199,8 @@ class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::Mono params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT; params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; importedBuf = m_device->createBuffer(std::move(params)); - if (!importedBuf) logFail("Failed to create an external buffer"); + if (!importedBuf) + logFail("Failed to create an external buffer"); // bind that imported IDeviceMemoryAllocation to the external buffer we've just created ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedBuf.get(), .binding = {.memory = devmemory.get() } }; @@ -223,24 +227,36 @@ class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::Mono commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 2, cmd, smart_refctd_ptr(m_logger)); - auto createStaging = [this,logicalDevice= m_device]() - { - auto buf = logicalDevice->createBuffer({ {.size = size, .usage = asset::IBuffer::EUF_TRANSFER_DST_BIT} }); - auto req = buf->getMemoryReqs(); - req.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); - auto allocation = logicalDevice->allocate(req, buf.get()); - - void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ); - if (!mapping) - logFail("Failed to map an staging buffer"); - memset(mapping, 0, req.size); - return buf; - }; - stagingBufs[0] = createStaging(); stagingBufs[1] = createStaging(); } + smart_refctd_ptr createExternalBuffer(IDeviceMemoryAllocation* mem) + { + IGPUBuffer::SCreationParams params = {}; + params.size = mem->getAllocationSize(); + params.usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT; + params.externalHandleTypes = mem->getCreationParams().externalHandleType; + auto buf = m_device->createBuffer(std::move(params)); + ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = buf.get(), .binding = {.memory = mem } }; + m_device->bindBufferMemory(1, &bindInfo); + return buf; + } + + smart_refctd_ptr createStaging(size_t sz = size) + { + auto buf = m_device->createBuffer({ {.size = sz, .usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT} }); + auto req = buf->getMemoryReqs(); + req.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); + auto allocation = m_device->allocate(req, buf.get()); + + void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ); + if (!mapping) + logFail("Failed to map an staging buffer"); + memset(mapping, 0, req.size); + return buf; + }; + void launchKernel(CUfunction kernel, CUstream stream) { @@ -272,7 +288,7 @@ class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::Mono re &= cmd[0]->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}, .imgBarriers = {&imgBarrier,&imgBarrier + 1} }); re &= cmd[0]->end(); - IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = importedSemaphore.get(), .value = 1 }; + IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 1 }; IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get()}; IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1}, .signalSemaphores = {&signalInfo,&signalInfo + 1} }; auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 }); @@ -355,8 +371,8 @@ class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::Mono re &= cmd[1]->copyImageToBuffer(importedImg.get(), imgBarrier.newLayout, stagingBufs[1].get(), 1, &imgRegion); re &= cmd[1]->end(); - IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { .semaphore = importedSemaphore.get(), .value = 2 }; - IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = importedSemaphore.get(), .value = 3 }; + IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { .semaphore = semaphore.get(), .value = 2 }; + IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 3 }; IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() }; IQueue::SSubmitInfo submitInfo = { .waitSemaphores = {&waitInfo,&waitInfo + 1}, @@ -375,7 +391,7 @@ class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::Mono void kernelCallback() { // Make sure we are also done with the readback - auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = importedSemaphore.get(), .value = 3}}; + auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = semaphore.get(), .value = 3}}; m_device->waitForSemaphores(wait, true, -1); float* A = reinterpret_cast(cpuBufs[0]->getPointer()); @@ -397,6 +413,121 @@ class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::Mono std::cout << "Success\n"; } + + void testInterop() + { + { + IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { + .size = size, + .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), + .alignmentLog2 = 10, + }; + + for (size_t i = 0; i < (1 << 8); ++i) + { + auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; + assert(memory); + auto tmpBuf = createExternalBuffer(memory.get()); + } + } + + smart_refctd_ptr escaped; + { + IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { + .size = size, + .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), + .alignmentLog2 = 10, + }; + + auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; + + auto tmpBuf = createExternalBuffer(memory.get()); + auto staging = createStaging(); + + auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer(); + for (uint32_t i = 0; i < size / 4; ++i) + ptr[i] = i; + + smart_refctd_ptr cmd; + commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd); + cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + IGPUCommandBuffer::SBufferCopy region = { .size = size }; + assert(cmd->copyBuffer(staging.get(), tmpBuf.get(), 1, ®ion)); + cmd->end(); + IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() }; + IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; + queue->submit({ &submitInfo,&submitInfo + 1 }); + m_device->waitIdle(); + escaped = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, memory->getCreationParams().externalHandle).memory; + } + + //{ + // constexpr size_t M = 32; + // auto staging = createStaging(size * M); + + // auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer(); + // for (uint32_t i = 0; i < (M * size) / 4; ++i) + // ptr[i] = rand(); + + // std::vector> cmd(1 << 10); + // commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1 << 10, cmd.data()); + + // for (size_t i = 0; i < 1 << 10; ++i) + // { + // IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { + // .size = size * M, + // .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), + // .alignmentLog2 = 10, + // }; + // RE: + // auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; + + // if (!memory) + // { + // m_device->waitIdle(); + // for (size_t j = 0; j < i; ++j) + // cmd[j] = 0; + // goto END; + // } + // assert(memory); + // auto tmpBuf = createExternalBuffer(memory.get()); + + // cmd[i]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + // IGPUCommandBuffer::SBufferCopy region = { .size = size * M }; + // assert(cmd[i]->copyBuffer(staging.get(), tmpBuf.get(), 1, ®ion)); + // cmd[i]->end(); + // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[i].get() }; + // IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; + // assert(IQueue::RESULT::SUCCESS == queue->submit({ &submitInfo,&submitInfo + 1 })); + // } + //END: + // m_device->waitIdle(); + //} + + { + auto tmpBuf = createExternalBuffer(escaped.get()); + auto staging = createStaging(); + + smart_refctd_ptr cmd; + commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd); + cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + IGPUCommandBuffer::SBufferCopy region = { .size = size }; + assert(cmd->copyBuffer(tmpBuf.get(), staging.get(), 1, ®ion)); + cmd->end(); + IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() }; + IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; + auto qre = queue->submit({ &submitInfo,&submitInfo + 1 }); + assert(IQueue::RESULT::SUCCESS == qre); + m_device->waitIdle(); + + auto& ptr = *(std::array*)staging->getBoundMemory().memory->getMappedPointer(); + for (uint32_t i = 0; i < size / 4; ++i) + assert(ptr[i] == i); + } + + } + + // Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization. bool keepRunning() override { return false; }