From 3a021340195a3e77c4855afb1b0ccef23c3c503b Mon Sep 17 00:00:00 2001
From: atkurtul <atilkurtulmus@gmail.com>
Date: Sun, 9 Jul 2023 16:09:11 +0300
Subject: [PATCH 01/17] working cuda vulkan interop example

---
 63.CUDAInterop/CMakeLists.txt      |  17 ++++
 63.CUDAInterop/main.cpp            | 131 +++++++++++++++++++++++++++++
 63.CUDAInterop/pipeline.groovy     |  50 +++++++++++
 63.CUDAInterop/vectorAdd_kernel.cu |  42 +++++++++
 CMakeLists.txt                     |   1 +
 5 files changed, 241 insertions(+)
 create mode 100644 63.CUDAInterop/CMakeLists.txt
 create mode 100644 63.CUDAInterop/main.cpp
 create mode 100644 63.CUDAInterop/pipeline.groovy
 create mode 100644 63.CUDAInterop/vectorAdd_kernel.cu

diff --git a/63.CUDAInterop/CMakeLists.txt b/63.CUDAInterop/CMakeLists.txt
new file mode 100644
index 000000000..2f8f8439f
--- /dev/null
+++ b/63.CUDAInterop/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+include(common RESULT_VARIABLE RES)
+if(NOT RES)
+	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
+endif()
+
+set(CUDA_INTEROP_EXAMPLE_INCLUDE_DIRS
+	${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
+)
+
+nbl_create_executable_project(
+	""
+	""
+	"${CUDA_INTEROP_EXAMPLE_INCLUDE_DIRS}"
+	""
+	"${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}"
+)
\ No newline at end of file
diff --git a/63.CUDAInterop/main.cpp b/63.CUDAInterop/main.cpp
new file mode 100644
index 000000000..a1c38629b
--- /dev/null
+++ b/63.CUDAInterop/main.cpp
@@ -0,0 +1,131 @@
+// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#define _NBL_STATIC_LIB_
+#include <nabla.h>
+
+#include "nbl/video/CCUDAHandler.h"
+
+#include "../common/CommonAPI.h"
+
+/**
+This example just shows a screen which clears to red,
+nothing fancy, just to show that Irrlicht links fine
+**/
+using namespace nbl;
+
+
+/*
+The start of the main function starts like in most other example. We ask the
+user for the desired renderer and start it up.
+*/
+
+#define ASSERT_SUCCESS(expr) \
+if (auto re = expr; CUDA_SUCCESS != re) { \
+	const char* name = 0, *str = 0; \
+	cu.pcuGetErrorName(re, &name); \
+	cu.pcuGetErrorString(re, &str); \
+	printf("%s:%d %s:\n\t%s\n", __FILE__, __LINE__, name, str); \
+	abort(); \
+}
+
+#define ASSERT_SUCCESS_NV(expr) \
+if (auto re = expr; NVRTC_SUCCESS != re) { \
+	const char* str = cudaHandler->getNVRTCFunctionTable().pnvrtcGetErrorString(re); \
+	printf("%s:%d %s\n", __FILE__, __LINE__, str); \
+	abort(); \
+}
+
+
+int main(int argc, char** argv)
+{
+	auto initOutput = CommonAPI::InitWithDefaultExt(CommonAPI::InitParams{
+		.appName = { "63.CUDAInterop" },
+		.apiType = video::EAT_VULKAN, 
+		.swapchainImageUsage = nbl::asset::IImage::EUF_NONE,
+	});
+
+	auto& system = initOutput.system;
+	auto& apiConnection = initOutput.apiConnection;
+	auto& physicalDevice = initOutput.physicalDevice;
+	auto& logicalDevice = initOutput.logicalDevice;
+	auto& utilities = initOutput.utilities;
+	auto& queues = initOutput.queues;
+	auto& logger = initOutput.logger;
+		
+	assert(physicalDevice->getLimits().externalMemory);
+	auto cudaHandler = video::CCUDAHandler::create(system.get(), core::smart_refctd_ptr<system::ILogger>(logger));
+	assert(cudaHandler);
+	auto cudaDevice = cudaHandler->createDevice(core::smart_refctd_ptr_dynamic_cast<video::CVulkanConnection>(apiConnection), physicalDevice);
+	
+	auto& cu = cudaHandler->getCUDAFunctionTable();
+	core::smart_refctd_ptr<asset::ICPUBuffer> ptx;
+	CUmodule module;
+	CUfunction kernel;
+	CUstream stream;
+
+	{
+		system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> fut;
+		system->createFile(fut, "../vectorAdd_kernel.cu", system::IFileBase::ECF_READ);
+		auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(fut.copy().get(), cudaDevice->geDefaultCompileOptions());
+		ASSERT_SUCCESS_NV(res);
+		ptx = std::move(ptx_);
+	}
+	
+	ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr));
+	ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"));
+	ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
+	
+	constexpr uint32_t gridDim[3]  = { 4096,1,1 };
+	constexpr uint32_t blockDim[3] = { 1024,1,1 };
+	int numElements = gridDim[0] * blockDim[0];
+	auto _size = sizeof(float) * numElements;
+
+	core::smart_refctd_ptr<asset::ICPUBuffer> cpubuffers[3] = { core::make_smart_refctd_ptr<asset::ICPUBuffer>(_size),
+																core::make_smart_refctd_ptr<asset::ICPUBuffer>(_size),
+																core::make_smart_refctd_ptr<asset::ICPUBuffer>(_size) };
+	for (auto j = 0; j < 2; j++)
+		for (auto i = 0; i < numElements; i++)
+			reinterpret_cast<float*>(cpubuffers[j]->getPointer())[i] = rand();
+	
+	auto createBuffer = [&](int idx) {
+
+		CUexternalMemory mem = 0;
+		CUdeviceptr ptr = 0;
+
+		auto buf = utilities->createFilledDeviceLocalBufferOnDedMem(queues[CommonAPI::InitOutput::EQT_COMPUTE], 
+			{{.size = _size, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT},
+			{{.externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32}}}, 
+			cpubuffers[idx]->getPointer());
+
+		assert(buf);
+		CUDA_EXTERNAL_MEMORY_HANDLE_DESC handleDesc = {
+			.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32,
+			.handle = {.win32 = {.handle = buf->getExternalHandle()}},
+			.size = buf->getMemoryReqs().size,
+		};
+		CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = { .size = buf->getMemoryReqs().size };
+		ASSERT_SUCCESS(cu.pcuImportExternalMemory(&mem, &handleDesc));
+		ASSERT_SUCCESS(cu.pcuExternalMemoryGetMappedBuffer(&ptr, mem, &bufferDesc));
+		return std::tuple<decltype(buf), CUexternalMemory, CUdeviceptr>{buf, mem, ptr};
+	};
+	
+	auto [buf0, mem0, ptr0] = createBuffer(0);
+	auto [buf1, mem1, ptr1] = createBuffer(1);
+	auto [buf2, mem2, ptr2] = createBuffer(2);
+
+	void* parameters[] = { &ptr0, &ptr1, &ptr2, &numElements };
+    
+	ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr));
+	ASSERT_SUCCESS(cu.pcuMemcpyDtoHAsync_v2(cpubuffers[2]->getPointer(), ptr2, _size, stream));
+	ASSERT_SUCCESS(cu.pcuCtxSynchronize());
+
+	float* A = reinterpret_cast<float*>(cpubuffers[0]->getPointer());
+	float* B = reinterpret_cast<float*>(cpubuffers[1]->getPointer());
+	float* C = reinterpret_cast<float*>(cpubuffers[2]->getPointer());
+	for (auto i = 0; i < numElements; i++)
+		assert(abs(C[i] - A[i] - B[i]) < 0.01f);
+
+	return 0;
+}
diff --git a/63.CUDAInterop/pipeline.groovy b/63.CUDAInterop/pipeline.groovy
new file mode 100644
index 000000000..29909e3d1
--- /dev/null
+++ b/63.CUDAInterop/pipeline.groovy
@@ -0,0 +1,50 @@
+import org.DevshGraphicsProgramming.Agent
+import org.DevshGraphicsProgramming.BuilderInfo
+import org.DevshGraphicsProgramming.IBuilder
+
+class CCUDAInteropBuilder extends IBuilder
+{
+	public CCUDAInteropBuilder(Agent _agent, _info)
+	{
+		super(_agent, _info)
+	}
+	
+	@Override
+	public boolean prepare(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+  	public boolean build(Map axisMapping)
+	{
+		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
+		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
+		
+		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
+		def nameOfConfig = getNameOfConfig(config)
+		
+		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
+		
+		return true
+	}
+	
+	@Override
+  	public boolean test(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+	public boolean install(Map axisMapping)
+	{
+		return true
+	}
+}
+
+def create(Agent _agent, _info)
+{
+	return new CCUDAInteropBuilder(_agent, _info)
+}
+
+return this
\ No newline at end of file
diff --git a/63.CUDAInterop/vectorAdd_kernel.cu b/63.CUDAInterop/vectorAdd_kernel.cu
new file mode 100644
index 000000000..3baef0123
--- /dev/null
+++ b/63.CUDAInterop/vectorAdd_kernel.cu
@@ -0,0 +1,42 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * CUDA Kernel Device code
+ *
+ * Computes the vector addition of A and B into C. The 3 vectors have the same
+ * number of elements numElements.
+ */
+
+extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C,
+                                     int numElements) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+  if (i < numElements) {
+    C[i] = A[i] + B[i];
+  }
+}
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 832f1eebd..dee7c30aa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -79,6 +79,7 @@ add_subdirectory(61.BlitFilterTest EXCLUDE_FROM_ALL)
 add_subdirectory(61.UI EXCLUDE_FROM_ALL)
 add_subdirectory(62.CAD EXCLUDE_FROM_ALL)
 add_subdirectory(62.SchusslerTest EXCLUDE_FROM_ALL)
+add_subdirectory(63.CUDAInterop EXCLUDE_FROM_ALL)
 add_subdirectory(0.ImportanceSamplingEnvMaps EXCLUDE_FROM_ALL) #TODO: integrate back into 42
 
 unset(NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET CACHE)

From c90032d151f148322992a8b65aac9b39cbeda106 Mon Sep 17 00:00:00 2001
From: atkurtul <atilkurtulmus@gmail.com>
Date: Sun, 9 Jul 2023 17:16:14 +0300
Subject: [PATCH 02/17] proper cleanup

---
 63.CUDAInterop/main.cpp | 92 +++++++++++++++++++++++++----------------
 1 file changed, 57 insertions(+), 35 deletions(-)

diff --git a/63.CUDAInterop/main.cpp b/63.CUDAInterop/main.cpp
index a1c38629b..c5297830d 100644
--- a/63.CUDAInterop/main.cpp
+++ b/63.CUDAInterop/main.cpp
@@ -58,8 +58,8 @@ int main(int argc, char** argv)
 	auto cudaHandler = video::CCUDAHandler::create(system.get(), core::smart_refctd_ptr<system::ILogger>(logger));
 	assert(cudaHandler);
 	auto cudaDevice = cudaHandler->createDevice(core::smart_refctd_ptr_dynamic_cast<video::CVulkanConnection>(apiConnection), physicalDevice);
-	
 	auto& cu = cudaHandler->getCUDAFunctionTable();
+
 	core::smart_refctd_ptr<asset::ICPUBuffer> ptx;
 	CUmodule module;
 	CUfunction kernel;
@@ -72,7 +72,7 @@ int main(int argc, char** argv)
 		ASSERT_SUCCESS_NV(res);
 		ptx = std::move(ptx_);
 	}
-	
+
 	ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr));
 	ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"));
 	ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
@@ -89,43 +89,65 @@ int main(int argc, char** argv)
 		for (auto i = 0; i < numElements; i++)
 			reinterpret_cast<float*>(cpubuffers[j]->getPointer())[i] = rand();
 	
-	auto createBuffer = [&](int idx) {
+	{
+		auto createBuffer = [&](core::smart_refctd_ptr<asset::ICPUBuffer>const& cpuBuf) {
+			struct CUCleaner : video::ICleanup
+			{
+				CUexternalMemory mem = nullptr;
+				CUdeviceptr ptr = {};
+				core::smart_refctd_ptr <video::CCUDAHandler> cudaHandler = nullptr;
+				core::smart_refctd_ptr <video::CCUDADevice> cudaDevice = nullptr;
+
+				~CUCleaner()
+				{
+					auto& cu = cudaHandler->getCUDAFunctionTable();
+					ASSERT_SUCCESS(cu.pcuMemFree_v2(ptr));
+					ASSERT_SUCCESS(cu.pcuDestroyExternalMemory(mem));
+				}
+			};
+
+			auto cleaner = std::make_unique<CUCleaner>();
+			cleaner->cudaHandler = cudaHandler;
+			cleaner->cudaDevice = cudaDevice;
+			CUexternalMemory* mem = &cleaner->mem;
+			CUdeviceptr* ptr = &cleaner->ptr;
+			auto buf = utilities->createFilledDeviceLocalBufferOnDedMem(queues[CommonAPI::InitOutput::EQT_COMPUTE],
+				{{.size = _size, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT},
+				{{.preDestroyCleanup = std::move(cleaner), .externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32}}},
+				cpuBuf->getPointer());
+			assert(buf.get());
+			CUDA_EXTERNAL_MEMORY_HANDLE_DESC handleDesc = {
+				.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32,
+				.handle = {.win32 = {.handle = buf->getExternalHandle()}},
+				.size = buf->getMemoryReqs().size,
+			};
+			CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = { .size = buf->getMemoryReqs().size };
+			ASSERT_SUCCESS(cu.pcuImportExternalMemory(mem, &handleDesc));
+			ASSERT_SUCCESS(cu.pcuExternalMemoryGetMappedBuffer(ptr, *mem, &bufferDesc));
+			return std::tuple< core::smart_refctd_ptr<video::IGPUBuffer>, CUexternalMemory, CUdeviceptr>{std::move(buf), *mem, *ptr};
+		};
 
-		CUexternalMemory mem = 0;
-		CUdeviceptr ptr = 0;
+		auto [buf0, mem0, ptr0] = createBuffer(cpubuffers[0]);
+		auto [buf1, mem1, ptr1] = createBuffer(cpubuffers[1]);
+		auto [buf2, mem2, ptr2] = createBuffer(cpubuffers[2]);
+		
+		void* parameters[] = { &ptr0, &ptr1, &ptr2, &numElements };
+		
+		ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr));
+		ASSERT_SUCCESS(cu.pcuMemcpyDtoHAsync_v2(cpubuffers[2]->getPointer(), ptr2, _size, stream));
+		ASSERT_SUCCESS(cu.pcuCtxSynchronize());
 
-		auto buf = utilities->createFilledDeviceLocalBufferOnDedMem(queues[CommonAPI::InitOutput::EQT_COMPUTE], 
-			{{.size = _size, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT},
-			{{.externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32}}}, 
-			cpubuffers[idx]->getPointer());
+		float* A = reinterpret_cast<float*>(cpubuffers[0]->getPointer());
+		float* B = reinterpret_cast<float*>(cpubuffers[1]->getPointer());
+		float* C = reinterpret_cast<float*>(cpubuffers[2]->getPointer());
 
-		assert(buf);
-		CUDA_EXTERNAL_MEMORY_HANDLE_DESC handleDesc = {
-			.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32,
-			.handle = {.win32 = {.handle = buf->getExternalHandle()}},
-			.size = buf->getMemoryReqs().size,
-		};
-		CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = { .size = buf->getMemoryReqs().size };
-		ASSERT_SUCCESS(cu.pcuImportExternalMemory(&mem, &handleDesc));
-		ASSERT_SUCCESS(cu.pcuExternalMemoryGetMappedBuffer(&ptr, mem, &bufferDesc));
-		return std::tuple<decltype(buf), CUexternalMemory, CUdeviceptr>{buf, mem, ptr};
-	};
+		for (auto i = 0; i < numElements; i++)
+			assert(abs(C[i] - A[i] - B[i]) < 0.01f);
+	}
+
+	ASSERT_SUCCESS(cu.pcuModuleUnload(module));
+	ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream));
 	
-	auto [buf0, mem0, ptr0] = createBuffer(0);
-	auto [buf1, mem1, ptr1] = createBuffer(1);
-	auto [buf2, mem2, ptr2] = createBuffer(2);
-
-	void* parameters[] = { &ptr0, &ptr1, &ptr2, &numElements };
-    
-	ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr));
-	ASSERT_SUCCESS(cu.pcuMemcpyDtoHAsync_v2(cpubuffers[2]->getPointer(), ptr2, _size, stream));
-	ASSERT_SUCCESS(cu.pcuCtxSynchronize());
-
-	float* A = reinterpret_cast<float*>(cpubuffers[0]->getPointer());
-	float* B = reinterpret_cast<float*>(cpubuffers[1]->getPointer());
-	float* C = reinterpret_cast<float*>(cpubuffers[2]->getPointer());
-	for (auto i = 0; i < numElements; i++)
-		assert(abs(C[i] - A[i] - B[i]) < 0.01f);
 
 	return 0;
 }

From 6d0a5776f63c2c9dda2c40b9262c4dc962d49421 Mon Sep 17 00:00:00 2001
From: atkurtul <atilkurtulmus@gmail.com>
Date: Sun, 9 Jul 2023 20:55:14 +0300
Subject: [PATCH 03/17] add cuda to vulkan example

---
 63.CUDAInterop/main.cpp | 224 ++++++++++++++++++++++++++++++----------
 1 file changed, 170 insertions(+), 54 deletions(-)

diff --git a/63.CUDAInterop/main.cpp b/63.CUDAInterop/main.cpp
index c5297830d..a939bac09 100644
--- a/63.CUDAInterop/main.cpp
+++ b/63.CUDAInterop/main.cpp
@@ -37,58 +37,29 @@ if (auto re = expr; NVRTC_SUCCESS != re) { \
 	abort(); \
 }
 
+constexpr uint32_t gridDim[3] = { 4096,1,1 };
+constexpr uint32_t blockDim[3] = { 1024,1,1 };
+size_t numElements = gridDim[0] * blockDim[0];
+size_t size = sizeof(float) * numElements;
 
-int main(int argc, char** argv)
+void vk2cuda(
+	core::smart_refctd_ptr<video::CCUDAHandler> cudaHandler, 
+	core::smart_refctd_ptr<video::CCUDADevice> cudaDevice, 
+	video::IUtilities * util,
+	video::ILogicalDevice* logicalDevice,
+	nbl::video::IGPUQueue ** queues,
+	CUfunction kernel,
+	CUstream stream,
+	int=0)
 {
-	auto initOutput = CommonAPI::InitWithDefaultExt(CommonAPI::InitParams{
-		.appName = { "63.CUDAInterop" },
-		.apiType = video::EAT_VULKAN, 
-		.swapchainImageUsage = nbl::asset::IImage::EUF_NONE,
-	});
-
-	auto& system = initOutput.system;
-	auto& apiConnection = initOutput.apiConnection;
-	auto& physicalDevice = initOutput.physicalDevice;
-	auto& logicalDevice = initOutput.logicalDevice;
-	auto& utilities = initOutput.utilities;
-	auto& queues = initOutput.queues;
-	auto& logger = initOutput.logger;
-		
-	assert(physicalDevice->getLimits().externalMemory);
-	auto cudaHandler = video::CCUDAHandler::create(system.get(), core::smart_refctd_ptr<system::ILogger>(logger));
-	assert(cudaHandler);
-	auto cudaDevice = cudaHandler->createDevice(core::smart_refctd_ptr_dynamic_cast<video::CVulkanConnection>(apiConnection), physicalDevice);
 	auto& cu = cudaHandler->getCUDAFunctionTable();
-
-	core::smart_refctd_ptr<asset::ICPUBuffer> ptx;
-	CUmodule module;
-	CUfunction kernel;
-	CUstream stream;
-
-	{
-		system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> fut;
-		system->createFile(fut, "../vectorAdd_kernel.cu", system::IFileBase::ECF_READ);
-		auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(fut.copy().get(), cudaDevice->geDefaultCompileOptions());
-		ASSERT_SUCCESS_NV(res);
-		ptx = std::move(ptx_);
-	}
-
-	ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr));
-	ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"));
-	ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
-	
-	constexpr uint32_t gridDim[3]  = { 4096,1,1 };
-	constexpr uint32_t blockDim[3] = { 1024,1,1 };
-	int numElements = gridDim[0] * blockDim[0];
-	auto _size = sizeof(float) * numElements;
-
-	core::smart_refctd_ptr<asset::ICPUBuffer> cpubuffers[3] = { core::make_smart_refctd_ptr<asset::ICPUBuffer>(_size),
-																core::make_smart_refctd_ptr<asset::ICPUBuffer>(_size),
-																core::make_smart_refctd_ptr<asset::ICPUBuffer>(_size) };
+	core::smart_refctd_ptr<asset::ICPUBuffer> cpubuffers[3] = { core::make_smart_refctd_ptr<asset::ICPUBuffer>(size),
+																core::make_smart_refctd_ptr<asset::ICPUBuffer>(size),
+																core::make_smart_refctd_ptr<asset::ICPUBuffer>(size) };
 	for (auto j = 0; j < 2; j++)
 		for (auto i = 0; i < numElements; i++)
 			reinterpret_cast<float*>(cpubuffers[j]->getPointer())[i] = rand();
-	
+
 	{
 		auto createBuffer = [&](core::smart_refctd_ptr<asset::ICPUBuffer>const& cpuBuf) {
 			struct CUCleaner : video::ICleanup
@@ -111,9 +82,9 @@ int main(int argc, char** argv)
 			cleaner->cudaDevice = cudaDevice;
 			CUexternalMemory* mem = &cleaner->mem;
 			CUdeviceptr* ptr = &cleaner->ptr;
-			auto buf = utilities->createFilledDeviceLocalBufferOnDedMem(queues[CommonAPI::InitOutput::EQT_COMPUTE],
-				{{.size = _size, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT},
-				{{.preDestroyCleanup = std::move(cleaner), .externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32}}},
+			auto buf = util->createFilledDeviceLocalBufferOnDedMem(queues[CommonAPI::InitOutput::EQT_COMPUTE],
+				{ {.size = size, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT},
+				{{.preDestroyCleanup = std::move(cleaner), .externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32}} },
 				cpuBuf->getPointer());
 			assert(buf.get());
 			CUDA_EXTERNAL_MEMORY_HANDLE_DESC handleDesc = {
@@ -124,17 +95,17 @@ int main(int argc, char** argv)
 			CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = { .size = buf->getMemoryReqs().size };
 			ASSERT_SUCCESS(cu.pcuImportExternalMemory(mem, &handleDesc));
 			ASSERT_SUCCESS(cu.pcuExternalMemoryGetMappedBuffer(ptr, *mem, &bufferDesc));
-			return std::tuple< core::smart_refctd_ptr<video::IGPUBuffer>, CUexternalMemory, CUdeviceptr>{std::move(buf), *mem, *ptr};
+			return std::tuple< core::smart_refctd_ptr<video::IGPUBuffer>, CUexternalMemory, CUdeviceptr>{std::move(buf), * mem, * ptr};
 		};
 
 		auto [buf0, mem0, ptr0] = createBuffer(cpubuffers[0]);
 		auto [buf1, mem1, ptr1] = createBuffer(cpubuffers[1]);
 		auto [buf2, mem2, ptr2] = createBuffer(cpubuffers[2]);
-		
+
 		void* parameters[] = { &ptr0, &ptr1, &ptr2, &numElements };
-		
+
 		ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr));
-		ASSERT_SUCCESS(cu.pcuMemcpyDtoHAsync_v2(cpubuffers[2]->getPointer(), ptr2, _size, stream));
+		ASSERT_SUCCESS(cu.pcuMemcpyDtoHAsync_v2(cpubuffers[2]->getPointer(), ptr2, size, stream));
 		ASSERT_SUCCESS(cu.pcuCtxSynchronize());
 
 		float* A = reinterpret_cast<float*>(cpubuffers[0]->getPointer());
@@ -144,10 +115,155 @@ int main(int argc, char** argv)
 		for (auto i = 0; i < numElements; i++)
 			assert(abs(C[i] - A[i] - B[i]) < 0.01f);
 	}
+}
+
+void cuda2vk(
+	core::smart_refctd_ptr<video::CCUDAHandler> cudaHandler,
+	core::smart_refctd_ptr<video::CCUDADevice> cudaDevice,
+	video::IUtilities* util,
+	video::ILogicalDevice* logicalDevice,
+	nbl::video::IGPUQueue** queues,
+	CUfunction kernel,
+	CUstream stream,
+	int = 0)
+{
+	auto& cu = cudaHandler->getCUDAFunctionTable();
+
+	core::smart_refctd_ptr<asset::ICPUBuffer> cpubuffers[3] = { core::make_smart_refctd_ptr<asset::ICPUBuffer>(size),
+																core::make_smart_refctd_ptr<asset::ICPUBuffer>(size),
+																core::make_smart_refctd_ptr<asset::ICPUBuffer>(size) };
+	for (auto j = 0; j < 2; j++)
+		for (auto i = 0; i < numElements; i++)
+			reinterpret_cast<float*>(cpubuffers[j]->getPointer())[i] = rand();
+
+	{
+		auto createBuffer = [&]{
+			struct CUCleaner : video::ICleanup
+			{
+				CUdeviceptr ptr = {};
+				size_t sz = 0;
+				CUmemGenericAllocationHandle mem = 0;
+				core::smart_refctd_ptr<video::CCUDAHandler> cudaHandler = nullptr;
+				core::smart_refctd_ptr<video::CCUDADevice> cudaDevice = nullptr;
+
+				~CUCleaner()
+				{
+					auto& cu = cudaHandler->getCUDAFunctionTable();
+					ASSERT_SUCCESS(cu.pcuMemUnmap(ptr, sz));
+					ASSERT_SUCCESS(cu.pcuMemAddressFree(ptr, sz));
+					ASSERT_SUCCESS(cu.pcuMemRelease(mem));
+				}
+			};
+			auto& cu = cudaHandler->getCUDAFunctionTable();
+			auto cleaner = std::make_unique<CUCleaner>();
+			cleaner->cudaHandler = cudaHandler;
+			cleaner->cudaDevice = cudaDevice;
+
+			CUdeviceptr* ptr = &cleaner->ptr;
+			CUmemGenericAllocationHandle* mem = &cleaner->mem;
+			void* handle = 0;
+			size_t* sz = &cleaner->sz;
+			size_t granularity = 0;
 
+			uint32_t metaData[16] = { 48 };
+			CUmemAllocationProp prop = {
+				.type = CU_MEM_ALLOCATION_TYPE_PINNED,
+				.requestedHandleTypes = CU_MEM_HANDLE_TYPE_WIN32,
+				.location = {.type = CU_MEM_LOCATION_TYPE_DEVICE, .id = cudaDevice->getInternalObject() },
+				.win32HandleMetaData = metaData,
+			};
+
+			ASSERT_SUCCESS(cu.pcuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+			*sz = ((size - 1) / granularity + 1) * granularity;
+			ASSERT_SUCCESS(cu.pcuMemCreate(mem, *sz, &prop, 0));
+			ASSERT_SUCCESS(cu.pcuMemExportToShareableHandle(&handle, *mem, CU_MEM_HANDLE_TYPE_WIN32, 0));
+			ASSERT_SUCCESS(cu.pcuMemAddressReserve(ptr, *sz, 0, 0, 0));
+			ASSERT_SUCCESS(cu.pcuMemMap(*ptr, *sz, 0, *mem, 0));
+			CUmemAccessDesc accessDesc = {
+				.location = prop.location,
+				.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE,
+			};
+			ASSERT_SUCCESS(cu.pcuMemSetAccess(*ptr, *sz, &accessDesc, 1));
+
+			auto buf = logicalDevice->createBuffer(
+				{ {.size = *sz, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT },
+				{ {.preDestroyCleanup = std::move(cleaner), .externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32, .externalHandle = handle}} });
+
+			assert(buf.get());
+
+			auto reqs = buf->getMemoryReqs();
+			reqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
+			auto alloc = logicalDevice->allocate(reqs, buf.get());
+			assert(alloc.memory && alloc.offset != video::IDeviceMemoryAllocator::InvalidMemoryOffset);
+			return std::tuple< core::smart_refctd_ptr<video::IGPUBuffer>, CUdeviceptr>{std::move(buf), * ptr};
+		};
+
+		auto [buf0, ptr0] = createBuffer();
+		auto [buf1, ptr1] = createBuffer();
+		auto [buf2, ptr2] = createBuffer();
+
+		void* parameters[] = { &ptr0, &ptr1, &ptr2, &numElements };
+		
+		ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptr0, cpubuffers[0]->getPointer(), size, stream));
+		ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptr1, cpubuffers[1]->getPointer(), size, stream));
+		ASSERT_SUCCESS(cu.pcuCtxSynchronize());
+		ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr));
+		ASSERT_SUCCESS(cu.pcuCtxSynchronize());
+		util->downloadBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange<video::IGPUBuffer>{.offset = 0, .size = size, .buffer = buf2}, cpubuffers[2]->getPointer(), queues[CommonAPI::InitOutput::EQT_COMPUTE]);
+
+		float* A = reinterpret_cast<float*>(cpubuffers[0]->getPointer());
+		float* B = reinterpret_cast<float*>(cpubuffers[1]->getPointer());
+		float* C = reinterpret_cast<float*>(cpubuffers[2]->getPointer());
+		
+		for (auto i = 0; i < numElements; i++)
+			assert(abs(C[i] - A[i] - B[i]) < 0.01f);
+	}
+}
+
+int main(int argc, char** argv)
+{
+	auto initOutput = CommonAPI::InitWithDefaultExt(CommonAPI::InitParams{
+		.appName = { "63.CUDAInterop" },
+		.apiType = video::EAT_VULKAN, 
+		.swapchainImageUsage = nbl::asset::IImage::EUF_NONE,
+	});
+
+	auto& system = initOutput.system;
+	auto& apiConnection = initOutput.apiConnection;
+	auto& physicalDevice = initOutput.physicalDevice;
+	auto& logicalDevice = initOutput.logicalDevice;
+	auto& utilities = initOutput.utilities;
+	auto& queues = initOutput.queues;
+	auto& logger = initOutput.logger;
+	
+	assert(physicalDevice->getLimits().externalMemory);
+	auto cudaHandler = video::CCUDAHandler::create(system.get(), core::smart_refctd_ptr<system::ILogger>(logger));
+	assert(cudaHandler);
+	auto cudaDevice = cudaHandler->createDevice(core::smart_refctd_ptr_dynamic_cast<video::CVulkanConnection>(apiConnection), physicalDevice);
+	auto& cu = cudaHandler->getCUDAFunctionTable();
+
+	core::smart_refctd_ptr<asset::ICPUBuffer> ptx;
+	CUmodule   module;
+	CUfunction kernel;
+	CUstream   stream;
+
+	{
+		system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> fut;
+		system->createFile(fut, "../vectorAdd_kernel.cu", system::IFileBase::ECF_READ);
+		auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(fut.copy().get(), cudaDevice->geDefaultCompileOptions());
+		ASSERT_SUCCESS_NV(res);
+		ptx = std::move(ptx_);
+	}
+
+	ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr));
+	ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"));
+	ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
+
+	vk2cuda(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data(), kernel, stream);
+	cuda2vk(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data(), kernel, stream);
+	
 	ASSERT_SUCCESS(cu.pcuModuleUnload(module));
 	ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream));
-	
 
 	return 0;
 }

From 867b4352512a56496964a17e321aaf68c5f9afb2 Mon Sep 17 00:00:00 2001
From: atkurtul <atilkurtulmus@gmail.com>
Date: Sun, 9 Jul 2023 23:34:56 +0300
Subject: [PATCH 04/17] refactor

---
 63.CUDAInterop/main.cpp | 130 ++++++++--------------------------------
 1 file changed, 25 insertions(+), 105 deletions(-)

diff --git a/63.CUDAInterop/main.cpp b/63.CUDAInterop/main.cpp
index a939bac09..3f156e80d 100644
--- a/63.CUDAInterop/main.cpp
+++ b/63.CUDAInterop/main.cpp
@@ -62,50 +62,27 @@ void vk2cuda(
 
 	{
 		auto createBuffer = [&](core::smart_refctd_ptr<asset::ICPUBuffer>const& cpuBuf) {
-			struct CUCleaner : video::ICleanup
-			{
-				CUexternalMemory mem = nullptr;
-				CUdeviceptr ptr = {};
-				core::smart_refctd_ptr <video::CCUDAHandler> cudaHandler = nullptr;
-				core::smart_refctd_ptr <video::CCUDADevice> cudaDevice = nullptr;
-
-				~CUCleaner()
-				{
-					auto& cu = cudaHandler->getCUDAFunctionTable();
-					ASSERT_SUCCESS(cu.pcuMemFree_v2(ptr));
-					ASSERT_SUCCESS(cu.pcuDestroyExternalMemory(mem));
-				}
-			};
-
-			auto cleaner = std::make_unique<CUCleaner>();
-			cleaner->cudaHandler = cudaHandler;
-			cleaner->cudaDevice = cudaDevice;
-			CUexternalMemory* mem = &cleaner->mem;
-			CUdeviceptr* ptr = &cleaner->ptr;
 			auto buf = util->createFilledDeviceLocalBufferOnDedMem(queues[CommonAPI::InitOutput::EQT_COMPUTE],
 				{ {.size = size, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT},
-				{{.preDestroyCleanup = std::move(cleaner), .externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32}} },
+				{{.externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32}} },
 				cpuBuf->getPointer());
 			assert(buf.get());
-			CUDA_EXTERNAL_MEMORY_HANDLE_DESC handleDesc = {
-				.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32,
-				.handle = {.win32 = {.handle = buf->getExternalHandle()}},
-				.size = buf->getMemoryReqs().size,
-			};
-			CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = { .size = buf->getMemoryReqs().size };
-			ASSERT_SUCCESS(cu.pcuImportExternalMemory(mem, &handleDesc));
-			ASSERT_SUCCESS(cu.pcuExternalMemoryGetMappedBuffer(ptr, *mem, &bufferDesc));
-			return std::tuple< core::smart_refctd_ptr<video::IGPUBuffer>, CUexternalMemory, CUdeviceptr>{std::move(buf), * mem, * ptr};
+			return buf;
 		};
 
-		auto [buf0, mem0, ptr0] = createBuffer(cpubuffers[0]);
-		auto [buf1, mem1, ptr1] = createBuffer(cpubuffers[1]);
-		auto [buf2, mem2, ptr2] = createBuffer(cpubuffers[2]);
+		auto buf0 = createBuffer(cpubuffers[0]);
+		auto buf1 = createBuffer(cpubuffers[1]);
+		auto buf2 = createBuffer(cpubuffers[2]);
+		
+		video::CCUDADevice::SSharedCUDAMemory mem0, mem1, mem2;
 
-		void* parameters[] = { &ptr0, &ptr1, &ptr2, &numElements };
+		ASSERT_SUCCESS(cudaDevice->importGPUBuffer(buf0.get(), &mem0));
+		ASSERT_SUCCESS(cudaDevice->importGPUBuffer(buf1.get(), &mem1));
+		ASSERT_SUCCESS(cudaDevice->importGPUBuffer(buf2.get(), &mem2));
 
+		void* parameters[] = { &mem0.ptr, &mem1.ptr, &mem2.ptr, &numElements };
 		ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr));
-		ASSERT_SUCCESS(cu.pcuMemcpyDtoHAsync_v2(cpubuffers[2]->getPointer(), ptr2, size, stream));
+		ASSERT_SUCCESS(cu.pcuMemcpyDtoHAsync_v2(cpubuffers[2]->getPointer(), mem2.ptr, size, stream));
 		ASSERT_SUCCESS(cu.pcuCtxSynchronize());
 
 		float* A = reinterpret_cast<float*>(cpubuffers[0]->getPointer());
@@ -117,6 +94,7 @@ void vk2cuda(
 	}
 }
 
+
 void cuda2vk(
 	core::smart_refctd_ptr<video::CCUDAHandler> cudaHandler,
 	core::smart_refctd_ptr<video::CCUDADevice> cudaDevice,
@@ -137,79 +115,20 @@ void cuda2vk(
 			reinterpret_cast<float*>(cpubuffers[j]->getPointer())[i] = rand();
 
 	{
-		auto createBuffer = [&]{
-			struct CUCleaner : video::ICleanup
-			{
-				CUdeviceptr ptr = {};
-				size_t sz = 0;
-				CUmemGenericAllocationHandle mem = 0;
-				core::smart_refctd_ptr<video::CCUDAHandler> cudaHandler = nullptr;
-				core::smart_refctd_ptr<video::CCUDADevice> cudaDevice = nullptr;
-
-				~CUCleaner()
-				{
-					auto& cu = cudaHandler->getCUDAFunctionTable();
-					ASSERT_SUCCESS(cu.pcuMemUnmap(ptr, sz));
-					ASSERT_SUCCESS(cu.pcuMemAddressFree(ptr, sz));
-					ASSERT_SUCCESS(cu.pcuMemRelease(mem));
-				}
-			};
-			auto& cu = cudaHandler->getCUDAFunctionTable();
-			auto cleaner = std::make_unique<CUCleaner>();
-			cleaner->cudaHandler = cudaHandler;
-			cleaner->cudaDevice = cudaDevice;
-
-			CUdeviceptr* ptr = &cleaner->ptr;
-			CUmemGenericAllocationHandle* mem = &cleaner->mem;
-			void* handle = 0;
-			size_t* sz = &cleaner->sz;
-			size_t granularity = 0;
-
-			uint32_t metaData[16] = { 48 };
-			CUmemAllocationProp prop = {
-				.type = CU_MEM_ALLOCATION_TYPE_PINNED,
-				.requestedHandleTypes = CU_MEM_HANDLE_TYPE_WIN32,
-				.location = {.type = CU_MEM_LOCATION_TYPE_DEVICE, .id = cudaDevice->getInternalObject() },
-				.win32HandleMetaData = metaData,
-			};
-
-			ASSERT_SUCCESS(cu.pcuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
-			*sz = ((size - 1) / granularity + 1) * granularity;
-			ASSERT_SUCCESS(cu.pcuMemCreate(mem, *sz, &prop, 0));
-			ASSERT_SUCCESS(cu.pcuMemExportToShareableHandle(&handle, *mem, CU_MEM_HANDLE_TYPE_WIN32, 0));
-			ASSERT_SUCCESS(cu.pcuMemAddressReserve(ptr, *sz, 0, 0, 0));
-			ASSERT_SUCCESS(cu.pcuMemMap(*ptr, *sz, 0, *mem, 0));
-			CUmemAccessDesc accessDesc = {
-				.location = prop.location,
-				.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE,
-			};
-			ASSERT_SUCCESS(cu.pcuMemSetAccess(*ptr, *sz, &accessDesc, 1));
-
-			auto buf = logicalDevice->createBuffer(
-				{ {.size = *sz, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT },
-				{ {.preDestroyCleanup = std::move(cleaner), .externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32, .externalHandle = handle}} });
-
-			assert(buf.get());
-
-			auto reqs = buf->getMemoryReqs();
-			reqs.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
-			auto alloc = logicalDevice->allocate(reqs, buf.get());
-			assert(alloc.memory && alloc.offset != video::IDeviceMemoryAllocator::InvalidMemoryOffset);
-			return std::tuple< core::smart_refctd_ptr<video::IGPUBuffer>, CUdeviceptr>{std::move(buf), * ptr};
-		};
-
-		auto [buf0, ptr0] = createBuffer();
-		auto [buf1, ptr1] = createBuffer();
-		auto [buf2, ptr2] = createBuffer();
-
-		void* parameters[] = { &ptr0, &ptr1, &ptr2, &numElements };
-		
-		ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptr0, cpubuffers[0]->getPointer(), size, stream));
-		ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptr1, cpubuffers[1]->getPointer(), size, stream));
+		video::CCUDADevice::SSharedCUDAMemory mem0, mem1, mem2;
+		ASSERT_SUCCESS(cudaDevice->createExportableMemory(size, sizeof(float), &mem0));
+		ASSERT_SUCCESS(cudaDevice->createExportableMemory(size, sizeof(float), &mem1));
+		ASSERT_SUCCESS(cudaDevice->createExportableMemory(size, sizeof(float), &mem2));
+
+		void* parameters[] = { &mem0.ptr, &mem1.ptr, &mem2.ptr, &numElements };
+		ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(mem0.ptr, cpubuffers[0]->getPointer(), size, stream));
+		ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(mem1.ptr, cpubuffers[1]->getPointer(), size, stream));
 		ASSERT_SUCCESS(cu.pcuCtxSynchronize());
 		ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr));
 		ASSERT_SUCCESS(cu.pcuCtxSynchronize());
-		util->downloadBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange<video::IGPUBuffer>{.offset = 0, .size = size, .buffer = buf2}, cpubuffers[2]->getPointer(), queues[CommonAPI::InitOutput::EQT_COMPUTE]);
+
+		auto buf = cudaDevice->exportGPUBuffer(mem2, logicalDevice);
+		util->downloadBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange<video::IGPUBuffer>{.offset = 0, .size = size, .buffer = buf}, cpubuffers[2]->getPointer(), queues[CommonAPI::InitOutput::EQT_COMPUTE]);
 
 		float* A = reinterpret_cast<float*>(cpubuffers[0]->getPointer());
 		float* B = reinterpret_cast<float*>(cpubuffers[1]->getPointer());
@@ -260,6 +179,7 @@ int main(int argc, char** argv)
 	ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
 
 	vk2cuda(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data(), kernel, stream);
+
 	cuda2vk(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data(), kernel, stream);
 	
 	ASSERT_SUCCESS(cu.pcuModuleUnload(module));

From 831e61e21f1ebe70458c3b3f77bc40791a5363d8 Mon Sep 17 00:00:00 2001
From: atkurtul <atilkurtulmus@gmail.com>
Date: Sat, 15 Jul 2023 23:21:14 +0300
Subject: [PATCH 05/17] semaphore interop

---
 63.CUDAInterop/main.cpp | 194 ++++++++++++++++++++++++++++++----------
 1 file changed, 145 insertions(+), 49 deletions(-)

diff --git a/63.CUDAInterop/main.cpp b/63.CUDAInterop/main.cpp
index 3f156e80d..2aef06a09 100644
--- a/63.CUDAInterop/main.cpp
+++ b/63.CUDAInterop/main.cpp
@@ -64,26 +64,27 @@ void vk2cuda(
 		auto createBuffer = [&](core::smart_refctd_ptr<asset::ICPUBuffer>const& cpuBuf) {
 			auto buf = util->createFilledDeviceLocalBufferOnDedMem(queues[CommonAPI::InitOutput::EQT_COMPUTE],
 				{ {.size = size, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT},
-				{{.externalMemoryHandType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32}} },
+				{{.externalHandleType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32}} },
 				cpuBuf->getPointer());
 			assert(buf.get());
 			return buf;
 		};
 
-		auto buf0 = createBuffer(cpubuffers[0]);
-		auto buf1 = createBuffer(cpubuffers[1]);
-		auto buf2 = createBuffer(cpubuffers[2]);
-		
-		video::CCUDADevice::SSharedCUDAMemory mem0, mem1, mem2;
+		core::smart_refctd_ptr<video::IGPUBuffer> buf[3] = {
+			createBuffer(cpubuffers[0]),
+			createBuffer(cpubuffers[1]),
+			createBuffer(cpubuffers[2]),
+		};
 
-		ASSERT_SUCCESS(cudaDevice->importGPUBuffer(buf0.get(), &mem0));
-		ASSERT_SUCCESS(cudaDevice->importGPUBuffer(buf1.get(), &mem1));
-		ASSERT_SUCCESS(cudaDevice->importGPUBuffer(buf2.get(), &mem2));
+		std::array<core::smart_refctd_ptr<video::CCUDADevice::SSharedCUDAMemory>, 3> mem = {};
+		ASSERT_SUCCESS(cudaDevice->importGPUBuffer(&mem[0], buf[0].get()));
+		ASSERT_SUCCESS(cudaDevice->importGPUBuffer(&mem[1], buf[1].get()));
+		ASSERT_SUCCESS(cudaDevice->importGPUBuffer(&mem[2], buf[2].get()));
 
-		void* parameters[] = { &mem0.ptr, &mem1.ptr, &mem2.ptr, &numElements };
+		void* parameters[] = { &mem[0]->ptr, &mem[1]->ptr, &mem[2]->ptr, &numElements };
 		ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr));
-		ASSERT_SUCCESS(cu.pcuMemcpyDtoHAsync_v2(cpubuffers[2]->getPointer(), mem2.ptr, size, stream));
-		ASSERT_SUCCESS(cu.pcuCtxSynchronize());
+		ASSERT_SUCCESS(cu.pcuMemcpyDtoHAsync_v2(cpubuffers[2]->getPointer(), mem[2]->ptr, size, stream));
+		ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));
 
 		float* A = reinterpret_cast<float*>(cpubuffers[0]->getPointer());
 		float* B = reinterpret_cast<float*>(cpubuffers[1]->getPointer());
@@ -94,50 +95,146 @@ void vk2cuda(
 	}
 }
 
-
-void cuda2vk(
-	core::smart_refctd_ptr<video::CCUDAHandler> cudaHandler,
-	core::smart_refctd_ptr<video::CCUDADevice> cudaDevice,
-	video::IUtilities* util,
-	video::ILogicalDevice* logicalDevice,
-	nbl::video::IGPUQueue** queues,
-	CUfunction kernel,
-	CUstream stream,
-	int = 0)
+struct CUDA2VK
 {
-	auto& cu = cudaHandler->getCUDAFunctionTable();
+	core::smart_refctd_ptr<video::CCUDAHandler> cudaHandler;
+	core::smart_refctd_ptr<video::CCUDADevice> cudaDevice;
+	video::IUtilities* util;
+	video::ILogicalDevice* logicalDevice;
+	nbl::video::IGPUQueue** queues;
 
-	core::smart_refctd_ptr<asset::ICPUBuffer> cpubuffers[3] = { core::make_smart_refctd_ptr<asset::ICPUBuffer>(size),
-																core::make_smart_refctd_ptr<asset::ICPUBuffer>(size),
-																core::make_smart_refctd_ptr<asset::ICPUBuffer>(size) };
-	for (auto j = 0; j < 2; j++)
-		for (auto i = 0; i < numElements; i++)
-			reinterpret_cast<float*>(cpubuffers[j]->getPointer())[i] = rand();
+	std::array<core::smart_refctd_ptr<asset::ICPUBuffer>, 2> cpubuffers;
+	std::array<core::smart_refctd_ptr<video::CCUDADevice::SSharedCUDAMemory>, 3> mem = {};
+	core::smart_refctd_ptr<video::CCUDADevice::SExternalCUDASemaphore> cusema;
+	core::smart_refctd_ptr<video::IGPUBuffer> importedbuf, stagingbuf;
+	core::smart_refctd_ptr<video::IGPUSemaphore> sema;
+	core::smart_refctd_ptr<video::IGPUCommandPool> commandPool;
+	core::smart_refctd_ptr<video::IGPUCommandBuffer> cmd;
+	core::smart_refctd_ptr<video::IGPUFence> fence;
 
+	CUDA2VK(
+		core::smart_refctd_ptr<video::CCUDAHandler> _cudaHandler,
+		core::smart_refctd_ptr<video::CCUDADevice> _cudaDevice,
+		video::IUtilities* _util,
+		video::ILogicalDevice* _logicalDevice,
+		video::IGPUQueue** _queues)
+		: cudaHandler(std::move(_cudaHandler))
+		, cudaDevice(std::move(_cudaDevice))
+		, util(_util)
+		, logicalDevice(_logicalDevice)
+		, queues(_queues)
 	{
-		video::CCUDADevice::SSharedCUDAMemory mem0, mem1, mem2;
-		ASSERT_SUCCESS(cudaDevice->createExportableMemory(size, sizeof(float), &mem0));
-		ASSERT_SUCCESS(cudaDevice->createExportableMemory(size, sizeof(float), &mem1));
-		ASSERT_SUCCESS(cudaDevice->createExportableMemory(size, sizeof(float), &mem2));
-
-		void* parameters[] = { &mem0.ptr, &mem1.ptr, &mem2.ptr, &numElements };
-		ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(mem0.ptr, cpubuffers[0]->getPointer(), size, stream));
-		ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(mem1.ptr, cpubuffers[1]->getPointer(), size, stream));
-		ASSERT_SUCCESS(cu.pcuCtxSynchronize());
-		ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr));
-		ASSERT_SUCCESS(cu.pcuCtxSynchronize());
+		createResources();
+	}
+
+	void createResources()
+	{
+		auto& cu = cudaHandler->getCUDAFunctionTable();
 
-		auto buf = cudaDevice->exportGPUBuffer(mem2, logicalDevice);
-		util->downloadBufferRangeViaStagingBufferAutoSubmit(asset::SBufferRange<video::IGPUBuffer>{.offset = 0, .size = size, .buffer = buf}, cpubuffers[2]->getPointer(), queues[CommonAPI::InitOutput::EQT_COMPUTE]);
+		for (auto& buf : cpubuffers)
+			buf = core::make_smart_refctd_ptr<asset::ICPUBuffer>(size);
+
+		for (auto j = 0; j < 2; j++)
+			for (auto i = 0; i < numElements; i++)
+				reinterpret_cast<float*>(cpubuffers[j]->getPointer())[i] = rand();
+
+		sema = logicalDevice->createSemaphore({ .externalHandleType = video::IGPUSemaphore::EHT_OPAQUE_WIN32 });
+		ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cusema, sema.get()));
+
+		ASSERT_SUCCESS(cudaDevice->createExportableMemory(&mem[0], size, sizeof(float)));
+		ASSERT_SUCCESS(cudaDevice->createExportableMemory(&mem[1], size, sizeof(float)));
+		ASSERT_SUCCESS(cudaDevice->createExportableMemory(&mem[2], size, sizeof(float)));
+		importedbuf = cudaDevice->exportGPUBuffer(mem[2].get(), logicalDevice);
+
+		fence = logicalDevice->createFence(video::IGPUFence::ECF_UNSIGNALED);
+		commandPool = logicalDevice->createCommandPool(queues[CommonAPI::InitOutput::EQT_COMPUTE]->getFamilyIndex(), {});
+		bool re = logicalDevice->createCommandBuffers(commandPool.get(), video::IGPUCommandBuffer::EL_PRIMARY, 1, &cmd);
+		assert(re);
+
+		stagingbuf = logicalDevice->createBuffer({ {.size = importedbuf->getSize(), .usage = asset::IBuffer::EUF_TRANSFER_DST_BIT} });
+		auto req = stagingbuf->getMemoryReqs();
+		req.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDownStreamingMemoryTypeBits();
+		auto allocation = logicalDevice->allocate(req, stagingbuf.get());
+		assert(allocation.memory && allocation.offset != video::ILogicalDevice::InvalidMemoryOffset);
+		assert(stagingbuf->getBoundMemory()->isMappable());
+		logicalDevice->mapMemory(video::IDeviceMemoryAllocation::MappedMemoryRange(stagingbuf->getBoundMemory(), stagingbuf->getBoundMemoryOffset(), stagingbuf->getSize()), video::IDeviceMemoryAllocation::EMCAF_READ);
+		assert(stagingbuf->getBoundMemory()->getMappedPointer());
+		memset(stagingbuf->getBoundMemory()->getMappedPointer(), 0, stagingbuf->getSize());
+	}
+
+	void launchKernel(CUfunction kernel, CUstream stream)
+	{
+		auto queue = queues[CommonAPI::InitOutput::EQT_COMPUTE];
+
+		auto& cu = cudaHandler->getCUDAFunctionTable();
+		// Launch kernel
+		{
+			void* parameters[] = { &mem[0]->ptr, &mem[1]->ptr, &mem[2]->ptr, &numElements };
+			ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(mem[0]->ptr, cpubuffers[0]->getPointer(), size, stream));
+			ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(mem[1]->ptr, cpubuffers[1]->getPointer(), size, stream));
+			ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr));
+			CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = {};
+			ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&cusema->semaphore, &signalParams, 1, stream)); // Signal the imported semaphore
+		}
+
+		// After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA
+		{
+			video::IGPUSemaphore* waitSemaphores[] = { sema.get() };
+			asset::E_PIPELINE_STAGE_FLAGS waitStages[] = { asset::EPSF_ALL_COMMANDS_BIT };
+			video::IGPUCommandBuffer* cmdBuffers[] = { cmd.get() };
+
+			video::IGPUCommandBuffer::SBufferMemoryBarrier barrier = {
+				.barrier = {
+					.dstAccessMask = asset::E_ACCESS_FLAGS::EAF_ALL_ACCESSES_BIT_DEVSH ,
+				},
+				.srcQueueFamilyIndex = VK_QUEUE_FAMILY_EXTERNAL_KHR,
+				.dstQueueFamilyIndex = queue->getFamilyIndex(),
+				.buffer = importedbuf,
+				.offset = 0,
+				.size = VK_WHOLE_SIZE,
+			};
+			bool re = true;
+			re &= cmd->begin(video::IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT);
+			re &= cmd->pipelineBarrier(asset::EPSF_ALL_COMMANDS_BIT, asset::EPSF_ALL_COMMANDS_BIT, asset::EDF_NONE, 0u, nullptr, 1u, &barrier, 0u, nullptr); 	// Ownership transfer?
+			asset::SBufferCopy region = { .size = importedbuf->getSize() };
+			re &= cmd->copyBuffer(importedbuf.get(), stagingbuf.get(), 1, &region);
+			re &= cmd->end();
+
+			video::IGPUQueue::SSubmitInfo submitInfo = {
+				.waitSemaphoreCount = 1,
+				.pWaitSemaphores = waitSemaphores,
+				.pWaitDstStageMask = waitStages,
+				.commandBufferCount = 1,
+				.commandBuffers = cmdBuffers
+			};
+
+			re &= queue->submit(1, &submitInfo, fence.get());
+			assert(re);
+		}
+
+		ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, [](void* userData) { decltype(this)(userData)->kernelCallback(); }, this));
+	}
+
+	void kernelCallback()
+	{
+		// Make sure we are also done with the readback
+		{
+			video::IGPUFence* fences[] = { fence.get() };
+			auto status = logicalDevice->waitForFences(1, fences, true, -1);
+			assert(video::IGPUFence::ES_SUCCESS == status);
+		}
 
 		float* A = reinterpret_cast<float*>(cpubuffers[0]->getPointer());
 		float* B = reinterpret_cast<float*>(cpubuffers[1]->getPointer());
-		float* C = reinterpret_cast<float*>(cpubuffers[2]->getPointer());
-		
+		float* C = reinterpret_cast<float*>(stagingbuf->getBoundMemory()->getMappedPointer());
 		for (auto i = 0; i < numElements; i++)
 			assert(abs(C[i] - A[i] - B[i]) < 0.01f);
+		
+		std::cout << "Success\n";
+
+		delete this;
 	}
-}
+};
 
 int main(int argc, char** argv)
 {
@@ -159,7 +256,7 @@ int main(int argc, char** argv)
 	auto cudaHandler = video::CCUDAHandler::create(system.get(), core::smart_refctd_ptr<system::ILogger>(logger));
 	assert(cudaHandler);
 	auto cudaDevice = cudaHandler->createDevice(core::smart_refctd_ptr_dynamic_cast<video::CVulkanConnection>(apiConnection), physicalDevice);
-	auto& cu = cudaHandler->getCUDAFunctionTable();
+	auto& cu = cudaHandler->getCUDAFunctionTable();	
 
 	core::smart_refctd_ptr<asset::ICPUBuffer> ptx;
 	CUmodule   module;
@@ -180,10 +277,9 @@ int main(int argc, char** argv)
 
 	vk2cuda(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data(), kernel, stream);
 
-	cuda2vk(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data(), kernel, stream);
-	
+	(new CUDA2VK(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data()))->launchKernel(kernel, stream);
+
 	ASSERT_SUCCESS(cu.pcuModuleUnload(module));
 	ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream));
-
 	return 0;
 }

From 1396829b405f2a6068069f786cfcf5eec4cd3db7 Mon Sep 17 00:00:00 2001
From: atkurtul <atilkurtulmus@gmail.com>
Date: Mon, 17 Jul 2023 01:21:12 +0300
Subject: [PATCH 06/17] update

---
 63.CUDAInterop/main.cpp | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/63.CUDAInterop/main.cpp b/63.CUDAInterop/main.cpp
index 2aef06a09..89b7be2c3 100644
--- a/63.CUDAInterop/main.cpp
+++ b/63.CUDAInterop/main.cpp
@@ -6,6 +6,8 @@
 #include <nabla.h>
 
 #include "nbl/video/CCUDAHandler.h"
+#include "nbl/video/CCUDASharedMemory.h"
+#include "nbl/video/CCUDASharedSemaphore.h"
 
 #include "../common/CommonAPI.h"
 
@@ -64,7 +66,7 @@ void vk2cuda(
 		auto createBuffer = [&](core::smart_refctd_ptr<asset::ICPUBuffer>const& cpuBuf) {
 			auto buf = util->createFilledDeviceLocalBufferOnDedMem(queues[CommonAPI::InitOutput::EQT_COMPUTE],
 				{ {.size = size, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT},
-				{{.externalHandleType = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32}} },
+				{{.externalHandleTypes = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32}} },
 				cpuBuf->getPointer());
 			assert(buf.get());
 			return buf;
@@ -76,14 +78,15 @@ void vk2cuda(
 			createBuffer(cpubuffers[2]),
 		};
 
-		std::array<core::smart_refctd_ptr<video::CCUDADevice::SSharedCUDAMemory>, 3> mem = {};
+		std::array<core::smart_refctd_ptr<video::CCUDASharedMemory>, 3> mem = {};
 		ASSERT_SUCCESS(cudaDevice->importGPUBuffer(&mem[0], buf[0].get()));
 		ASSERT_SUCCESS(cudaDevice->importGPUBuffer(&mem[1], buf[1].get()));
 		ASSERT_SUCCESS(cudaDevice->importGPUBuffer(&mem[2], buf[2].get()));
 
-		void* parameters[] = { &mem[0]->ptr, &mem[1]->ptr, &mem[2]->ptr, &numElements };
+		CUdeviceptr ptrs[] = { mem[0]->getDevicePtr(), mem[1]->getDevicePtr(), mem[2]->getDevicePtr() };
+		void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numElements};
 		ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr));
-		ASSERT_SUCCESS(cu.pcuMemcpyDtoHAsync_v2(cpubuffers[2]->getPointer(), mem[2]->ptr, size, stream));
+		ASSERT_SUCCESS(cu.pcuMemcpyDtoHAsync_v2(cpubuffers[2]->getPointer(), ptrs[2], size, stream));
 		ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));
 
 		float* A = reinterpret_cast<float*>(cpubuffers[0]->getPointer());
@@ -95,6 +98,7 @@ void vk2cuda(
 	}
 }
 
+
 struct CUDA2VK
 {
 	core::smart_refctd_ptr<video::CCUDAHandler> cudaHandler;
@@ -104,8 +108,8 @@ struct CUDA2VK
 	nbl::video::IGPUQueue** queues;
 
 	std::array<core::smart_refctd_ptr<asset::ICPUBuffer>, 2> cpubuffers;
-	std::array<core::smart_refctd_ptr<video::CCUDADevice::SSharedCUDAMemory>, 3> mem = {};
-	core::smart_refctd_ptr<video::CCUDADevice::SExternalCUDASemaphore> cusema;
+	std::array<core::smart_refctd_ptr<video::CCUDASharedMemory>, 3> mem = {};
+	core::smart_refctd_ptr<video::CCUDASharedSemaphore> cusema;
 	core::smart_refctd_ptr<video::IGPUBuffer> importedbuf, stagingbuf;
 	core::smart_refctd_ptr<video::IGPUSemaphore> sema;
 	core::smart_refctd_ptr<video::IGPUCommandPool> commandPool;
@@ -138,14 +142,14 @@ struct CUDA2VK
 			for (auto i = 0; i < numElements; i++)
 				reinterpret_cast<float*>(cpubuffers[j]->getPointer())[i] = rand();
 
-		sema = logicalDevice->createSemaphore({ .externalHandleType = video::IGPUSemaphore::EHT_OPAQUE_WIN32 });
+		sema = logicalDevice->createSemaphore({ .externalHandleTypes = video::IGPUSemaphore::EHT_OPAQUE_WIN32 });
 		ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cusema, sema.get()));
 
 		ASSERT_SUCCESS(cudaDevice->createExportableMemory(&mem[0], size, sizeof(float)));
 		ASSERT_SUCCESS(cudaDevice->createExportableMemory(&mem[1], size, sizeof(float)));
 		ASSERT_SUCCESS(cudaDevice->createExportableMemory(&mem[2], size, sizeof(float)));
 		importedbuf = cudaDevice->exportGPUBuffer(mem[2].get(), logicalDevice);
-
+		assert(importedbuf);
 		fence = logicalDevice->createFence(video::IGPUFence::ECF_UNSIGNALED);
 		commandPool = logicalDevice->createCommandPool(queues[CommonAPI::InitOutput::EQT_COMPUTE]->getFamilyIndex(), {});
 		bool re = logicalDevice->createCommandBuffers(commandPool.get(), video::IGPUCommandBuffer::EL_PRIMARY, 1, &cmd);
@@ -169,12 +173,14 @@ struct CUDA2VK
 		auto& cu = cudaHandler->getCUDAFunctionTable();
 		// Launch kernel
 		{
-			void* parameters[] = { &mem[0]->ptr, &mem[1]->ptr, &mem[2]->ptr, &numElements };
-			ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(mem[0]->ptr, cpubuffers[0]->getPointer(), size, stream));
-			ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(mem[1]->ptr, cpubuffers[1]->getPointer(), size, stream));
+			CUdeviceptr ptrs[] = { mem[0]->getDevicePtr(), mem[1]->getDevicePtr(), mem[2]->getDevicePtr() };
+			void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numElements };
+			ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpubuffers[0]->getPointer(), size, stream));
+			ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpubuffers[1]->getPointer(), size, stream));
 			ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr));
 			CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = {};
-			ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&cusema->semaphore, &signalParams, 1, stream)); // Signal the imported semaphore
+			auto semaphore = cusema->getInternalObject();
+			ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore
 		}
 
 		// After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA
@@ -195,7 +201,7 @@ struct CUDA2VK
 			};
 			bool re = true;
 			re &= cmd->begin(video::IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT);
-			re &= cmd->pipelineBarrier(asset::EPSF_ALL_COMMANDS_BIT, asset::EPSF_ALL_COMMANDS_BIT, asset::EDF_NONE, 0u, nullptr, 1u, &barrier, 0u, nullptr); 	// Ownership transfer?
+			// re &= cmd->pipelineBarrier(asset::EPSF_ALL_COMMANDS_BIT, asset::EPSF_ALL_COMMANDS_BIT, asset::EDF_NONE, 0u, nullptr, 1u, &barrier, 0u, nullptr); 	// Ownership transfer?
 			asset::SBufferCopy region = { .size = importedbuf->getSize() };
 			re &= cmd->copyBuffer(importedbuf.get(), stagingbuf.get(), 1, &region);
 			re &= cmd->end();
@@ -275,10 +281,12 @@ int main(int argc, char** argv)
 	ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"));
 	ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
 
-	vk2cuda(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data(), kernel, stream);
 
+	vk2cuda(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data(), kernel, stream);
 	(new CUDA2VK(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data()))->launchKernel(kernel, stream);
 
+	ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));
+
 	ASSERT_SUCCESS(cu.pcuModuleUnload(module));
 	ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream));
 	return 0;

From d7fb50ec91995dc5769d2900e5c5d20f5204124c Mon Sep 17 00:00:00 2001
From: atkurtul <atilkurtulmus@gmail.com>
Date: Thu, 27 Jul 2023 18:25:08 +0300
Subject: [PATCH 07/17] texture sharing

---
 63.CUDAInterop/main.cpp | 205 ++++++++++++++++++++++++----------------
 1 file changed, 122 insertions(+), 83 deletions(-)

diff --git a/63.CUDAInterop/main.cpp b/63.CUDAInterop/main.cpp
index 89b7be2c3..b249ded89 100644
--- a/63.CUDAInterop/main.cpp
+++ b/63.CUDAInterop/main.cpp
@@ -39,66 +39,11 @@ if (auto re = expr; NVRTC_SUCCESS != re) { \
 	abort(); \
 }
 
-constexpr uint32_t gridDim[3] = { 4096,1,1 };
-constexpr uint32_t blockDim[3] = { 1024,1,1 };
+constexpr uint32_t gridDim[3] = { 256,1,1 };
+constexpr uint32_t blockDim[3] = { 256,1,1 };
 size_t numElements = gridDim[0] * blockDim[0];
 size_t size = sizeof(float) * numElements;
 
-void vk2cuda(
-	core::smart_refctd_ptr<video::CCUDAHandler> cudaHandler, 
-	core::smart_refctd_ptr<video::CCUDADevice> cudaDevice, 
-	video::IUtilities * util,
-	video::ILogicalDevice* logicalDevice,
-	nbl::video::IGPUQueue ** queues,
-	CUfunction kernel,
-	CUstream stream,
-	int=0)
-{
-	auto& cu = cudaHandler->getCUDAFunctionTable();
-	core::smart_refctd_ptr<asset::ICPUBuffer> cpubuffers[3] = { core::make_smart_refctd_ptr<asset::ICPUBuffer>(size),
-																core::make_smart_refctd_ptr<asset::ICPUBuffer>(size),
-																core::make_smart_refctd_ptr<asset::ICPUBuffer>(size) };
-	for (auto j = 0; j < 2; j++)
-		for (auto i = 0; i < numElements; i++)
-			reinterpret_cast<float*>(cpubuffers[j]->getPointer())[i] = rand();
-
-	{
-		auto createBuffer = [&](core::smart_refctd_ptr<asset::ICPUBuffer>const& cpuBuf) {
-			auto buf = util->createFilledDeviceLocalBufferOnDedMem(queues[CommonAPI::InitOutput::EQT_COMPUTE],
-				{ {.size = size, .usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT},
-				{{.externalHandleTypes = video::IDeviceMemoryBacked::EHT_OPAQUE_WIN32}} },
-				cpuBuf->getPointer());
-			assert(buf.get());
-			return buf;
-		};
-
-		core::smart_refctd_ptr<video::IGPUBuffer> buf[3] = {
-			createBuffer(cpubuffers[0]),
-			createBuffer(cpubuffers[1]),
-			createBuffer(cpubuffers[2]),
-		};
-
-		std::array<core::smart_refctd_ptr<video::CCUDASharedMemory>, 3> mem = {};
-		ASSERT_SUCCESS(cudaDevice->importGPUBuffer(&mem[0], buf[0].get()));
-		ASSERT_SUCCESS(cudaDevice->importGPUBuffer(&mem[1], buf[1].get()));
-		ASSERT_SUCCESS(cudaDevice->importGPUBuffer(&mem[2], buf[2].get()));
-
-		CUdeviceptr ptrs[] = { mem[0]->getDevicePtr(), mem[1]->getDevicePtr(), mem[2]->getDevicePtr() };
-		void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numElements};
-		ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr));
-		ASSERT_SUCCESS(cu.pcuMemcpyDtoHAsync_v2(cpubuffers[2]->getPointer(), ptrs[2], size, stream));
-		ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));
-
-		float* A = reinterpret_cast<float*>(cpubuffers[0]->getPointer());
-		float* B = reinterpret_cast<float*>(cpubuffers[1]->getPointer());
-		float* C = reinterpret_cast<float*>(cpubuffers[2]->getPointer());
-
-		for (auto i = 0; i < numElements; i++)
-			assert(abs(C[i] - A[i] - B[i]) < 0.01f);
-	}
-}
-
-
 struct CUDA2VK
 {
 	core::smart_refctd_ptr<video::CCUDAHandler> cudaHandler;
@@ -111,6 +56,7 @@ struct CUDA2VK
 	std::array<core::smart_refctd_ptr<video::CCUDASharedMemory>, 3> mem = {};
 	core::smart_refctd_ptr<video::CCUDASharedSemaphore> cusema;
 	core::smart_refctd_ptr<video::IGPUBuffer> importedbuf, stagingbuf;
+	core::smart_refctd_ptr<video::IGPUImage> importedimg, stagingimg;
 	core::smart_refctd_ptr<video::IGPUSemaphore> sema;
 	core::smart_refctd_ptr<video::IGPUCommandPool> commandPool;
 	core::smart_refctd_ptr<video::IGPUCommandBuffer> cmd;
@@ -138,32 +84,86 @@ struct CUDA2VK
 		for (auto& buf : cpubuffers)
 			buf = core::make_smart_refctd_ptr<asset::ICPUBuffer>(size);
 
-		for (auto j = 0; j < 2; j++)
-			for (auto i = 0; i < numElements; i++)
-				reinterpret_cast<float*>(cpubuffers[j]->getPointer())[i] = rand();
+		//for (auto j = 0; j < 2; j++)
+		//	for (auto i = 0; i < numElements; i++)
+		//		reinterpret_cast<float*>(cpubuffers[j]->getPointer())[i] = rand() / float(RAND_MAX);
+
+		memset(cpubuffers[1]->getPointer(), 0, size);
+		uint16_t (*ptr)[2] = reinterpret_cast<uint16_t(*)[2]>(cpubuffers[0]->getPointer());
+		for (auto i = 0; i < gridDim[0]; i++)
+			for (auto j = 0; j < blockDim[0]; j++)
+			{
+				ptr[i * blockDim[0] + j][0] = i;
+				ptr[i * blockDim[0] + j][1] = j;
+			}
 
 		sema = logicalDevice->createSemaphore({ .externalHandleTypes = video::IGPUSemaphore::EHT_OPAQUE_WIN32 });
 		ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cusema, sema.get()));
 
-		ASSERT_SUCCESS(cudaDevice->createExportableMemory(&mem[0], size, sizeof(float)));
-		ASSERT_SUCCESS(cudaDevice->createExportableMemory(&mem[1], size, sizeof(float)));
-		ASSERT_SUCCESS(cudaDevice->createExportableMemory(&mem[2], size, sizeof(float)));
-		importedbuf = cudaDevice->exportGPUBuffer(mem[2].get(), logicalDevice);
+		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[0], { size, sizeof(float), CU_MEM_LOCATION_TYPE_DEVICE }));
+		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[1], { size, sizeof(float), CU_MEM_LOCATION_TYPE_DEVICE }));
+		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[2], { size, sizeof(float), CU_MEM_LOCATION_TYPE_DEVICE }));
+
+		importedbuf = mem[2]->exportAsBuffer(logicalDevice, asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT);
+		importedimg = mem[2]->exportAsImage(logicalDevice,
+			asset::IImage::SCreationParams{
+				.type = asset::IImage::ET_2D,
+				.samples = asset::IImage::ESCF_1_BIT,
+				.format  = asset::EF_R32_SFLOAT,
+				.extent = { gridDim[0], blockDim[0], 1 },
+				.mipLevels = 1,
+				.arrayLayers = 1,
+				.usage = asset::IImage::EUF_STORAGE_BIT | asset::IImage::EUF_TRANSFER_SRC_BIT,
+			});
+
 		assert(importedbuf);
 		fence = logicalDevice->createFence(video::IGPUFence::ECF_UNSIGNALED);
 		commandPool = logicalDevice->createCommandPool(queues[CommonAPI::InitOutput::EQT_COMPUTE]->getFamilyIndex(), {});
 		bool re = logicalDevice->createCommandBuffers(commandPool.get(), video::IGPUCommandBuffer::EL_PRIMARY, 1, &cmd);
 		assert(re);
 
-		stagingbuf = logicalDevice->createBuffer({ {.size = importedbuf->getSize(), .usage = asset::IBuffer::EUF_TRANSFER_DST_BIT} });
-		auto req = stagingbuf->getMemoryReqs();
-		req.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDownStreamingMemoryTypeBits();
-		auto allocation = logicalDevice->allocate(req, stagingbuf.get());
-		assert(allocation.memory && allocation.offset != video::ILogicalDevice::InvalidMemoryOffset);
-		assert(stagingbuf->getBoundMemory()->isMappable());
-		logicalDevice->mapMemory(video::IDeviceMemoryAllocation::MappedMemoryRange(stagingbuf->getBoundMemory(), stagingbuf->getBoundMemoryOffset(), stagingbuf->getSize()), video::IDeviceMemoryAllocation::EMCAF_READ);
-		assert(stagingbuf->getBoundMemory()->getMappedPointer());
-		memset(stagingbuf->getBoundMemory()->getMappedPointer(), 0, stagingbuf->getSize());
+		auto createStaging = [logicalDevice=logicalDevice]()
+		{
+			auto buf = logicalDevice->createBuffer({ {.size = size, .usage = asset::IBuffer::EUF_TRANSFER_DST_BIT} });
+			auto req = buf->getMemoryReqs();
+			req.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDownStreamingMemoryTypeBits();
+			auto allocation = logicalDevice->allocate(req, buf.get());
+			assert(allocation.memory && allocation.offset != video::ILogicalDevice::InvalidMemoryOffset);
+			assert(buf->getBoundMemory()->isMappable());
+			logicalDevice->mapMemory(video::IDeviceMemoryAllocation::MappedMemoryRange(buf->getBoundMemory(), buf->getBoundMemoryOffset(), req.size), video::IDeviceMemoryAllocation::EMCAF_READ);
+			assert(buf->getBoundMemory()->getMappedPointer());
+			memset(buf->getBoundMemory()->getMappedPointer(), 0, req.size);
+			return buf;
+		};
+
+		auto createStagingImg = [logicalDevice = logicalDevice]()
+		{
+			video::IGPUImage::SCreationParams params = {{
+				.type = asset::IImage::ET_2D,
+				.samples = asset::IImage::ESCF_1_BIT,
+				.format = asset::EF_R32_SFLOAT,
+				.extent = { gridDim[0], blockDim[0], 1 },
+				.mipLevels = 1,
+				.arrayLayers = 1,
+				.usage = asset::IImage::EUF_TRANSFER_DST_BIT 
+			}};
+			params.tiling = video::IGPUImage::ET_LINEAR;
+
+			auto img = logicalDevice->createImage(std::move(params));
+			auto req = img->getMemoryReqs();
+			req.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDownStreamingMemoryTypeBits();
+			auto allocation = logicalDevice->allocate(req, img.get());
+			assert(allocation.memory && allocation.offset != video::ILogicalDevice::InvalidMemoryOffset);
+			assert(img->getBoundMemory()->isMappable());
+			size_t sz = img->getImageDataSizeInBytes();
+			logicalDevice->mapMemory(video::IDeviceMemoryAllocation::MappedMemoryRange(img->getBoundMemory(), img->getBoundMemoryOffset(), req.size), video::IDeviceMemoryAllocation::EMCAF_READ);
+			assert(img->getBoundMemory()->getMappedPointer());
+			memset(img->getBoundMemory()->getMappedPointer(), 0, req.size);
+			return img;
+		};
+
+		stagingbuf = createStaging();
+		stagingimg = createStagingImg();
 	}
 
 	void launchKernel(CUfunction kernel, CUstream stream)
@@ -189,21 +189,54 @@ struct CUDA2VK
 			asset::E_PIPELINE_STAGE_FLAGS waitStages[] = { asset::EPSF_ALL_COMMANDS_BIT };
 			video::IGPUCommandBuffer* cmdBuffers[] = { cmd.get() };
 
-			video::IGPUCommandBuffer::SBufferMemoryBarrier barrier = {
-				.barrier = {
-					.dstAccessMask = asset::E_ACCESS_FLAGS::EAF_ALL_ACCESSES_BIT_DEVSH ,
-				},
+			video::IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = {
+				.barrier = { .dstAccessMask = asset::E_ACCESS_FLAGS::EAF_ALL_ACCESSES_BIT_DEVSH },
 				.srcQueueFamilyIndex = VK_QUEUE_FAMILY_EXTERNAL_KHR,
 				.dstQueueFamilyIndex = queue->getFamilyIndex(),
 				.buffer = importedbuf,
 				.offset = 0,
 				.size = VK_WHOLE_SIZE,
 			};
+
+			video::IGPUCommandBuffer::SImageMemoryBarrier imgBarriers[2] = {{
+				.barrier = { .dstAccessMask = asset::E_ACCESS_FLAGS::EAF_ALL_ACCESSES_BIT_DEVSH },
+				.newLayout = asset::IImage::EL_TRANSFER_SRC_OPTIMAL,
+				.srcQueueFamilyIndex = VK_QUEUE_FAMILY_EXTERNAL_KHR,
+				.dstQueueFamilyIndex = queue->getFamilyIndex(),
+				.image = importedimg,
+				.subresourceRange = {
+					.aspectMask = asset::IImage::EAF_COLOR_BIT,
+					.levelCount = 1u,
+					.layerCount = 1u,
+				}}, {
+				.barrier = {.dstAccessMask = asset::E_ACCESS_FLAGS::EAF_ALL_ACCESSES_BIT_DEVSH },
+				.newLayout = asset::IImage::EL_TRANSFER_DST_OPTIMAL,
+				.image = stagingimg,
+				.subresourceRange = {
+					.aspectMask = asset::IImage::EAF_COLOR_BIT,
+					.levelCount = 1u,
+					.layerCount = 1u,
+				}}
+			};
+
 			bool re = true;
 			re &= cmd->begin(video::IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT);
-			// re &= cmd->pipelineBarrier(asset::EPSF_ALL_COMMANDS_BIT, asset::EPSF_ALL_COMMANDS_BIT, asset::EDF_NONE, 0u, nullptr, 1u, &barrier, 0u, nullptr); 	// Ownership transfer?
+			/*Acquire?*/
+			re &= cmd->pipelineBarrier(asset::EPSF_ALL_COMMANDS_BIT, asset::EPSF_ALL_COMMANDS_BIT, asset::EDF_NONE, 0u, nullptr, 1u, &bufBarrier, 2u, imgBarriers); 	// Ownership transfer?
 			asset::SBufferCopy region = { .size = importedbuf->getSize() };
 			re &= cmd->copyBuffer(importedbuf.get(), stagingbuf.get(), 1, &region);
+			asset::IImage::SImageCopy imgRegion = { 
+				.srcSubresource = {
+					.aspectMask = imgBarriers[0].subresourceRange.aspectMask,
+					.layerCount = imgBarriers[0].subresourceRange.layerCount,
+				},
+				.dstSubresource = {
+					.aspectMask = imgBarriers[1].subresourceRange.aspectMask,
+					.layerCount = imgBarriers[1].subresourceRange.layerCount,
+				},
+				.extent = importedimg->getCreationParameters().extent
+			};
+			re &= cmd->copyImage(importedimg.get(), imgBarriers[0].newLayout, stagingimg.get(), imgBarriers[1].newLayout, 1, &imgRegion);
 			re &= cmd->end();
 
 			video::IGPUQueue::SSubmitInfo submitInfo = {
@@ -217,7 +250,7 @@ struct CUDA2VK
 			re &= queue->submit(1, &submitInfo, fence.get());
 			assert(re);
 		}
-
+        
 		ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, [](void* userData) { decltype(this)(userData)->kernelCallback(); }, this));
 	}
 
@@ -232,9 +265,16 @@ struct CUDA2VK
 
 		float* A = reinterpret_cast<float*>(cpubuffers[0]->getPointer());
 		float* B = reinterpret_cast<float*>(cpubuffers[1]->getPointer());
-		float* C = reinterpret_cast<float*>(stagingbuf->getBoundMemory()->getMappedPointer());
+		float* CBuf = reinterpret_cast<float*>(stagingbuf->getBoundMemory()->getMappedPointer());
+		float* CImg = reinterpret_cast<float*>(stagingimg->getBoundMemory()->getMappedPointer());
+
+		assert(!memcmp(CBuf, CImg, size));
+
 		for (auto i = 0; i < numElements; i++)
-			assert(abs(C[i] - A[i] - B[i]) < 0.01f);
+		{
+			assert(abs(CBuf[i] - A[i] - B[i]) < 0.01f);
+			assert(abs(CImg[i] - A[i] - B[i]) < 0.01f);
+		}
 		
 		std::cout << "Success\n";
 
@@ -258,6 +298,7 @@ int main(int argc, char** argv)
 	auto& queues = initOutput.queues;
 	auto& logger = initOutput.logger;
 	
+
 	assert(physicalDevice->getLimits().externalMemory);
 	auto cudaHandler = video::CCUDAHandler::create(system.get(), core::smart_refctd_ptr<system::ILogger>(logger));
 	assert(cudaHandler);
@@ -281,8 +322,6 @@ int main(int argc, char** argv)
 	ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"));
 	ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
 
-
-	vk2cuda(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data(), kernel, stream);
 	(new CUDA2VK(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data()))->launchKernel(kernel, stream);
 
 	ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));

From 682a8c1e685afea9d72c2984ddea615e12d250ce Mon Sep 17 00:00:00 2001
From: atkurtul <atilkurtulmus@gmail.com>
Date: Sun, 6 Aug 2023 23:38:12 +0300
Subject: [PATCH 08/17] update to latest changes

---
 63.CUDAInterop/main.cpp | 173 ++++++++++++++++++----------------------
 1 file changed, 76 insertions(+), 97 deletions(-)

diff --git a/63.CUDAInterop/main.cpp b/63.CUDAInterop/main.cpp
index b249ded89..58a6c0c62 100644
--- a/63.CUDAInterop/main.cpp
+++ b/63.CUDAInterop/main.cpp
@@ -39,8 +39,8 @@ if (auto re = expr; NVRTC_SUCCESS != re) { \
 	abort(); \
 }
 
-constexpr uint32_t gridDim[3] = { 256,1,1 };
-constexpr uint32_t blockDim[3] = { 256,1,1 };
+constexpr uint32_t gridDim[3] = { 4096,1,1 };
+constexpr uint32_t blockDim[3] = { 1024,1,1 };
 size_t numElements = gridDim[0] * blockDim[0];
 size_t size = sizeof(float) * numElements;
 
@@ -55,8 +55,9 @@ struct CUDA2VK
 	std::array<core::smart_refctd_ptr<asset::ICPUBuffer>, 2> cpubuffers;
 	std::array<core::smart_refctd_ptr<video::CCUDASharedMemory>, 3> mem = {};
 	core::smart_refctd_ptr<video::CCUDASharedSemaphore> cusema;
-	core::smart_refctd_ptr<video::IGPUBuffer> importedbuf, stagingbuf;
-	core::smart_refctd_ptr<video::IGPUImage> importedimg, stagingimg;
+
+	core::smart_refctd_ptr<video::IGPUBuffer> importedbuf, stagingbuf, stagingbuf2;
+	core::smart_refctd_ptr<video::IGPUImage> importedimg;
 	core::smart_refctd_ptr<video::IGPUSemaphore> sema;
 	core::smart_refctd_ptr<video::IGPUCommandPool> commandPool;
 	core::smart_refctd_ptr<video::IGPUCommandBuffer> cmd;
@@ -84,39 +85,46 @@ struct CUDA2VK
 		for (auto& buf : cpubuffers)
 			buf = core::make_smart_refctd_ptr<asset::ICPUBuffer>(size);
 
-		//for (auto j = 0; j < 2; j++)
-		//	for (auto i = 0; i < numElements; i++)
-		//		reinterpret_cast<float*>(cpubuffers[j]->getPointer())[i] = rand() / float(RAND_MAX);
-
-		memset(cpubuffers[1]->getPointer(), 0, size);
-		uint16_t (*ptr)[2] = reinterpret_cast<uint16_t(*)[2]>(cpubuffers[0]->getPointer());
-		for (auto i = 0; i < gridDim[0]; i++)
-			for (auto j = 0; j < blockDim[0]; j++)
-			{
-				ptr[i * blockDim[0] + j][0] = i;
-				ptr[i * blockDim[0] + j][1] = j;
-			}
+		for (auto j = 0; j < 2; j++)
+			for (auto i = 0; i < numElements; i++)
+				reinterpret_cast<float*>(cpubuffers[j]->getPointer())[i] = rand() / float(RAND_MAX);
 
 		sema = logicalDevice->createSemaphore({ .externalHandleTypes = video::IGPUSemaphore::EHT_OPAQUE_WIN32 });
 		ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cusema, sema.get()));
 
-		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[0], { size, sizeof(float), CU_MEM_LOCATION_TYPE_DEVICE }));
-		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[1], { size, sizeof(float), CU_MEM_LOCATION_TYPE_DEVICE }));
-		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[2], { size, sizeof(float), CU_MEM_LOCATION_TYPE_DEVICE }));
-
-		importedbuf = mem[2]->exportAsBuffer(logicalDevice, asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT);
-		importedimg = mem[2]->exportAsImage(logicalDevice,
-			asset::IImage::SCreationParams{
-				.type = asset::IImage::ET_2D,
-				.samples = asset::IImage::ESCF_1_BIT,
-				.format  = asset::EF_R32_SFLOAT,
-				.extent = { gridDim[0], blockDim[0], 1 },
-				.mipLevels = 1,
-				.arrayLayers = 1,
-				.usage = asset::IImage::EUF_STORAGE_BIT | asset::IImage::EUF_TRANSFER_SRC_BIT,
-			});
-
-		assert(importedbuf);
+		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
+		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
+		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
+		
+		{
+			auto devmemory = mem[2]->exportAsMemory(logicalDevice);
+			assert(devmemory);
+			video::IGPUBuffer::SCreationParams params = {};
+			params.size = devmemory->getAllocationSize();
+			params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT;
+			params.externalHandleTypes = video::CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
+			importedbuf = logicalDevice->createBuffer(std::move(params));
+			assert(importedbuf);
+			bool re = logicalDevice->bindBufferMemory(video::ILogicalDevice::SBindBufferMemoryInfo{.buffer = importedbuf.get(), .memory = devmemory.get() });
+			assert(re);
+		}
+
+		{
+			
+			video::IGPUImage::SCreationParams params = {};
+			params.type = video::IGPUImage::ET_2D;
+			params.samples = video::IGPUImage::ESCF_1_BIT;
+			params.format = asset::EF_R32_SFLOAT;
+			params.extent = { gridDim[0], blockDim[0], 1 };
+			params.mipLevels = 1;
+			params.arrayLayers = 1;
+			params.usage = video::IGPUImage::EUF_STORAGE_BIT | video::IGPUImage::EUF_TRANSFER_SRC_BIT;
+			params.externalHandleTypes = video::CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
+			params.tiling = video::IGPUImage::ET_LINEAR;
+			importedimg = mem[2]->exportAsImage(logicalDevice, std::move(params));
+			assert(importedimg);
+		}
+		
 		fence = logicalDevice->createFence(video::IGPUFence::ECF_UNSIGNALED);
 		commandPool = logicalDevice->createCommandPool(queues[CommonAPI::InitOutput::EQT_COMPUTE]->getFamilyIndex(), {});
 		bool re = logicalDevice->createCommandBuffers(commandPool.get(), video::IGPUCommandBuffer::EL_PRIMARY, 1, &cmd);
@@ -136,34 +144,8 @@ struct CUDA2VK
 			return buf;
 		};
 
-		auto createStagingImg = [logicalDevice = logicalDevice]()
-		{
-			video::IGPUImage::SCreationParams params = {{
-				.type = asset::IImage::ET_2D,
-				.samples = asset::IImage::ESCF_1_BIT,
-				.format = asset::EF_R32_SFLOAT,
-				.extent = { gridDim[0], blockDim[0], 1 },
-				.mipLevels = 1,
-				.arrayLayers = 1,
-				.usage = asset::IImage::EUF_TRANSFER_DST_BIT 
-			}};
-			params.tiling = video::IGPUImage::ET_LINEAR;
-
-			auto img = logicalDevice->createImage(std::move(params));
-			auto req = img->getMemoryReqs();
-			req.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDownStreamingMemoryTypeBits();
-			auto allocation = logicalDevice->allocate(req, img.get());
-			assert(allocation.memory && allocation.offset != video::ILogicalDevice::InvalidMemoryOffset);
-			assert(img->getBoundMemory()->isMappable());
-			size_t sz = img->getImageDataSizeInBytes();
-			logicalDevice->mapMemory(video::IDeviceMemoryAllocation::MappedMemoryRange(img->getBoundMemory(), img->getBoundMemoryOffset(), req.size), video::IDeviceMemoryAllocation::EMCAF_READ);
-			assert(img->getBoundMemory()->getMappedPointer());
-			memset(img->getBoundMemory()->getMappedPointer(), 0, req.size);
-			return img;
-		};
-
 		stagingbuf = createStaging();
-		stagingimg = createStagingImg();
+		stagingbuf2 = createStaging();
 	}
 
 	void launchKernel(CUfunction kernel, CUstream stream)
@@ -173,7 +155,11 @@ struct CUDA2VK
 		auto& cu = cudaHandler->getCUDAFunctionTable();
 		// Launch kernel
 		{
-			CUdeviceptr ptrs[] = { mem[0]->getDevicePtr(), mem[1]->getDevicePtr(), mem[2]->getDevicePtr() };
+			CUdeviceptr ptrs[] = {
+				mem[0]->getDeviceptr(),
+				mem[1]->getDeviceptr(),
+				mem[2]->getDeviceptr(),
+			};
 			void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numElements };
 			ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpubuffers[0]->getPointer(), size, stream));
 			ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpubuffers[1]->getPointer(), size, stream));
@@ -182,7 +168,7 @@ struct CUDA2VK
 			auto semaphore = cusema->getInternalObject();
 			ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore
 		}
-
+		
 		// After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA
 		{
 			video::IGPUSemaphore* waitSemaphores[] = { sema.get() };
@@ -195,11 +181,19 @@ struct CUDA2VK
 				.dstQueueFamilyIndex = queue->getFamilyIndex(),
 				.buffer = importedbuf,
 				.offset = 0,
-				.size = VK_WHOLE_SIZE,
+				.size = size,
 			};
 
-			video::IGPUCommandBuffer::SImageMemoryBarrier imgBarriers[2] = {{
+			bool re = true;
+			re &= cmd->begin(video::IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT);
+			/*Acquire?*/
+			re &= cmd->pipelineBarrier(asset::EPSF_ALL_COMMANDS_BIT, asset::EPSF_ALL_COMMANDS_BIT, asset::EDF_NONE, 0u, nullptr, 1u, &bufBarrier, 0u, nullptr); 	// Ownership transfer?
+			asset::SBufferCopy region = { .size = size };
+			re &= cmd->copyBuffer(importedbuf.get(), stagingbuf.get(), 1, &region);
+
+			video::IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = {
 				.barrier = { .dstAccessMask = asset::E_ACCESS_FLAGS::EAF_ALL_ACCESSES_BIT_DEVSH },
+				.oldLayout = asset::IImage::EL_PREINITIALIZED,
 				.newLayout = asset::IImage::EL_TRANSFER_SRC_OPTIMAL,
 				.srcQueueFamilyIndex = VK_QUEUE_FAMILY_EXTERNAL_KHR,
 				.dstQueueFamilyIndex = queue->getFamilyIndex(),
@@ -208,35 +202,20 @@ struct CUDA2VK
 					.aspectMask = asset::IImage::EAF_COLOR_BIT,
 					.levelCount = 1u,
 					.layerCount = 1u,
-				}}, {
-				.barrier = {.dstAccessMask = asset::E_ACCESS_FLAGS::EAF_ALL_ACCESSES_BIT_DEVSH },
-				.newLayout = asset::IImage::EL_TRANSFER_DST_OPTIMAL,
-				.image = stagingimg,
-				.subresourceRange = {
-					.aspectMask = asset::IImage::EAF_COLOR_BIT,
-					.levelCount = 1u,
-					.layerCount = 1u,
-				}}
+				}
 			};
 
-			bool re = true;
-			re &= cmd->begin(video::IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT);
-			/*Acquire?*/
-			re &= cmd->pipelineBarrier(asset::EPSF_ALL_COMMANDS_BIT, asset::EPSF_ALL_COMMANDS_BIT, asset::EDF_NONE, 0u, nullptr, 1u, &bufBarrier, 2u, imgBarriers); 	// Ownership transfer?
-			asset::SBufferCopy region = { .size = importedbuf->getSize() };
-			re &= cmd->copyBuffer(importedbuf.get(), stagingbuf.get(), 1, &region);
-			asset::IImage::SImageCopy imgRegion = { 
-				.srcSubresource = {
-					.aspectMask = imgBarriers[0].subresourceRange.aspectMask,
-					.layerCount = imgBarriers[0].subresourceRange.layerCount,
-				},
-				.dstSubresource = {
-					.aspectMask = imgBarriers[1].subresourceRange.aspectMask,
-					.layerCount = imgBarriers[1].subresourceRange.layerCount,
+			re &= cmd->pipelineBarrier(asset::EPSF_ALL_COMMANDS_BIT, asset::EPSF_ALL_COMMANDS_BIT, asset::EDF_NONE, 0u, nullptr, 0u, nullptr, 1u, &imgBarrier); 
+
+			asset::IImage::SBufferCopy imgRegion = {
+				.imageSubresource = {
+					.aspectMask = imgBarrier.subresourceRange.aspectMask,
+					.layerCount = imgBarrier.subresourceRange.layerCount,
 				},
-				.extent = importedimg->getCreationParameters().extent
+				.imageExtent = importedimg->getCreationParameters().extent,
 			};
-			re &= cmd->copyImage(importedimg.get(), imgBarriers[0].newLayout, stagingimg.get(), imgBarriers[1].newLayout, 1, &imgRegion);
+
+			re &= cmd->copyImageToBuffer(importedimg.get(), imgBarrier.newLayout, stagingbuf2.get(), 1, &imgRegion);
 			re &= cmd->end();
 
 			video::IGPUQueue::SSubmitInfo submitInfo = {
@@ -266,9 +245,9 @@ struct CUDA2VK
 		float* A = reinterpret_cast<float*>(cpubuffers[0]->getPointer());
 		float* B = reinterpret_cast<float*>(cpubuffers[1]->getPointer());
 		float* CBuf = reinterpret_cast<float*>(stagingbuf->getBoundMemory()->getMappedPointer());
-		float* CImg = reinterpret_cast<float*>(stagingimg->getBoundMemory()->getMappedPointer());
+		float* CImg = reinterpret_cast<float*>(stagingbuf2->getBoundMemory()->getMappedPointer());
 
-		assert(!memcmp(CBuf, CImg, size));
+		 assert(!memcmp(CBuf, CImg, size));
 
 		for (auto i = 0; i < numElements; i++)
 		{
@@ -277,8 +256,6 @@ struct CUDA2VK
 		}
 		
 		std::cout << "Success\n";
-
-		delete this;
 	}
 };
 
@@ -297,7 +274,6 @@ int main(int argc, char** argv)
 	auto& utilities = initOutput.utilities;
 	auto& queues = initOutput.queues;
 	auto& logger = initOutput.logger;
-	
 
 	assert(physicalDevice->getLimits().externalMemory);
 	auto cudaHandler = video::CCUDAHandler::create(system.get(), core::smart_refctd_ptr<system::ILogger>(logger));
@@ -322,11 +298,14 @@ int main(int argc, char** argv)
 	ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"));
 	ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
 
-	(new CUDA2VK(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data()))->launchKernel(kernel, stream);
-
-	ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));
+	{
+		auto cuda2vk = CUDA2VK(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data());
+		cuda2vk.launchKernel(kernel, stream);
+		ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));
+	}
 
 	ASSERT_SUCCESS(cu.pcuModuleUnload(module));
 	ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream));
+
 	return 0;
 }

From 6ce21d5c5c8026b6772f3e60e21096ee54353a81 Mon Sep 17 00:00:00 2001
From: atkurtul <atilkurtulmus@gmail.com>
Date: Thu, 4 Jan 2024 18:41:31 +0300
Subject: [PATCH 09/17] cuda-interop-vk13

---
 63.CUDAInterop/main.cpp          | 400 +++++++++++++++++++------------
 common/MonoDeviceApplication.hpp |   6 +-
 2 files changed, 253 insertions(+), 153 deletions(-)

diff --git a/63.CUDAInterop/main.cpp b/63.CUDAInterop/main.cpp
index 58a6c0c62..0b30b9227 100644
--- a/63.CUDAInterop/main.cpp
+++ b/63.CUDAInterop/main.cpp
@@ -9,14 +9,13 @@
 #include "nbl/video/CCUDASharedMemory.h"
 #include "nbl/video/CCUDASharedSemaphore.h"
 
-#include "../common/CommonAPI.h"
+#include "../common./MonoSystemMonoLoggerApplication.hpp"
 
-/**
-This example just shows a screen which clears to red,
-nothing fancy, just to show that Irrlicht links fine
-**/
 using namespace nbl;
-
+using namespace core;
+using namespace system;
+using namespace asset;
+using namespace video;
 
 /*
 The start of the main function starts like in most other example. We ask the
@@ -44,38 +43,124 @@ constexpr uint32_t blockDim[3] = { 1024,1,1 };
 size_t numElements = gridDim[0] * blockDim[0];
 size_t size = sizeof(float) * numElements;
 
-struct CUDA2VK
+#ifndef _NBL_COMPILE_WITH_CUDA_
+static_assert(false);
+#endif
+
+class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
 {
-	core::smart_refctd_ptr<video::CCUDAHandler> cudaHandler;
-	core::smart_refctd_ptr<video::CCUDADevice> cudaDevice;
-	video::IUtilities* util;
-	video::ILogicalDevice* logicalDevice;
-	nbl::video::IGPUQueue** queues;
-
-	std::array<core::smart_refctd_ptr<asset::ICPUBuffer>, 2> cpubuffers;
-	std::array<core::smart_refctd_ptr<video::CCUDASharedMemory>, 3> mem = {};
-	core::smart_refctd_ptr<video::CCUDASharedSemaphore> cusema;
-
-	core::smart_refctd_ptr<video::IGPUBuffer> importedbuf, stagingbuf, stagingbuf2;
-	core::smart_refctd_ptr<video::IGPUImage> importedimg;
-	core::smart_refctd_ptr<video::IGPUSemaphore> sema;
-	core::smart_refctd_ptr<video::IGPUCommandPool> commandPool;
-	core::smart_refctd_ptr<video::IGPUCommandBuffer> cmd;
-	core::smart_refctd_ptr<video::IGPUFence> fence;
-
-	CUDA2VK(
-		core::smart_refctd_ptr<video::CCUDAHandler> _cudaHandler,
-		core::smart_refctd_ptr<video::CCUDADevice> _cudaDevice,
-		video::IUtilities* _util,
-		video::ILogicalDevice* _logicalDevice,
-		video::IGPUQueue** _queues)
-		: cudaHandler(std::move(_cudaHandler))
-		, cudaDevice(std::move(_cudaDevice))
-		, util(_util)
-		, logicalDevice(_logicalDevice)
-		, queues(_queues)
+	using base_t = examples::MonoSystemMonoLoggerApplication;
+public:
+	// Generally speaking because certain platforms delay initialization from main object construction you should just forward and not do anything in the ctor
+	using base_t::base_t;
+
+	smart_refctd_ptr<CCUDAHandler> cudaHandler;
+	smart_refctd_ptr<CCUDADevice> cudaDevice;
+	// IUtilities* util;
+	smart_refctd_ptr<ILogicalDevice> logicalDevice;
+	IQueue* queue;
+
+	std::array<smart_refctd_ptr<ICPUBuffer>, 2> cpubuffers;
+	std::array<smart_refctd_ptr<CCUDASharedMemory>, 3> mem = {};
+	smart_refctd_ptr<CCUDASharedSemaphore> cusema;
+
+	smart_refctd_ptr<IGPUBuffer> importedbuf, stagingbuf, stagingbuf2;
+	smart_refctd_ptr<IGPUImage> importedimg;
+	smart_refctd_ptr<ISemaphore> sema;
+	smart_refctd_ptr<IGPUCommandPool> commandPool;
+	smart_refctd_ptr<IGPUCommandBuffer> cmd;
+
+
+	bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
 	{
+		// Remember to call the base class initialization!
+		if (!base_t::onAppInitialized(std::move(system)))
+			return false;
+		// `system` could have been null (see the comments in `MonoSystemMonoLoggerApplication::onAppInitialized` as for why)
+		// use `MonoSystemMonoLoggerApplication::m_system` throughout the example instead!
+
+		// You should already know Vulkan and come here to save on the boilerplate, if you don't know what instances and instance extensions are, then find out.
+		smart_refctd_ptr<CVulkanConnection> api;
+		{
+			// You generally want to default initialize any parameter structs
+			IAPIConnection::SFeatures apiFeaturesToEnable = {};
+			// generally you want to make your life easier during development
+			apiFeaturesToEnable.validations = true;
+			apiFeaturesToEnable.synchronizationValidation = true;
+			// want to make sure we have this so we can name resources for vieweing in RenderDoc captures
+			apiFeaturesToEnable.debugUtils = true;
+			// create our Vulkan instance
+			if (!(api = CVulkanConnection::create(smart_refctd_ptr(m_system), 0, _NBL_APP_NAME_, smart_refctd_ptr(base_t::m_logger), apiFeaturesToEnable)))
+				return logFail("Failed to crate an IAPIConnection!");
+		}
+
+		// We won't go deep into performing physical device selection in this example, we'll take any device with a compute queue.
+		// Nabla has its own set of required baseline Vulkan features anyway, it won't report any device that doesn't meet them.
+		IPhysicalDevice* physDev = nullptr;
+		ILogicalDevice::SCreationParams params = {};
+		// we will only deal with a single queue in this example
+		params.queueParamsCount = 1;
+		params.queueParams[0].count = 1;
+		params.featuresToEnable;
+		for (auto physDevIt = api->getPhysicalDevices().begin(); physDevIt != api->getPhysicalDevices().end(); physDevIt++)
+		{
+			const auto familyProps = (*physDevIt)->getQueueFamilyProperties();
+			// this is the only "complicated" part, we want to create a queue that supports compute pipelines
+			for (auto i = 0; i < familyProps.size(); i++)
+				if (familyProps[i].queueFlags.hasFlags(IQueue::FAMILY_FLAGS::COMPUTE_BIT))
+				{
+					physDev = *physDevIt;
+					params.queueParams[0].familyIndex = i;
+					break;
+				}
+		}
+		if (!physDev)
+			return logFail("Failed to find any Physical Devices with Compute capable Queue Families!");
+
+		{
+			auto& limits = physDev->getLimits();
+			if (!limits.externalMemoryWin32 || !limits.externalFenceWin32 || !limits.externalSemaphoreWin32)
+				return logFail("Physical device does not support the required extensions");
+			
+			cudaHandler = CCUDAHandler::create(system.get(), smart_refctd_ptr<ILogger>(m_logger));
+			assert(cudaHandler);
+			cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast<CVulkanConnection>(api), physDev);
+		}
+
+		// logical devices need to be created form physical devices which will actually let us create vulkan objects and use the physical device
+		logicalDevice = physDev->createLogicalDevice(std::move(params));
+		if (!logicalDevice)
+			return logFail("Failed to create a Logical Device!");
+		
+		queue = logicalDevice->getQueue(params.queueParams[0].familyIndex, 0);
+		
 		createResources();
+
+		smart_refctd_ptr<ICPUBuffer> ptx;
+		{
+			ISystem::future_t<smart_refctd_ptr<IFile>> fut;
+			m_system->createFile(fut, "../vectorAdd_kernel.cu", IFileBase::ECF_READ);
+			auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(fut.copy().get(), cudaDevice->geDefaultCompileOptions());
+			ASSERT_SUCCESS_NV(res);
+			ptx = std::move(ptx_);
+		}
+		CUmodule   module;
+		CUfunction kernel;
+		CUstream   stream;
+
+		auto& cu = cudaHandler->getCUDAFunctionTable();
+
+		ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr));
+		ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"));
+		ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
+
+		launchKernel(kernel, stream);
+
+		ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));
+		ASSERT_SUCCESS(cu.pcuModuleUnload(module));
+		ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream));
+
+		return true;
 	}
 
 	void createResources()
@@ -83,51 +168,52 @@ struct CUDA2VK
 		auto& cu = cudaHandler->getCUDAFunctionTable();
 
 		for (auto& buf : cpubuffers)
-			buf = core::make_smart_refctd_ptr<asset::ICPUBuffer>(size);
+			buf = make_smart_refctd_ptr<ICPUBuffer>(size);
 
 		for (auto j = 0; j < 2; j++)
 			for (auto i = 0; i < numElements; i++)
 				reinterpret_cast<float*>(cpubuffers[j]->getPointer())[i] = rand() / float(RAND_MAX);
 
-		sema = logicalDevice->createSemaphore({ .externalHandleTypes = video::IGPUSemaphore::EHT_OPAQUE_WIN32 });
-		ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cusema, sema.get()));
+
 
 		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
 		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
 		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
 		
+		sema = logicalDevice->createSemaphore({ .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 });
+		ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cusema, sema.get()));
 		{
-			auto devmemory = mem[2]->exportAsMemory(logicalDevice);
+			auto devmemory = mem[2]->exportAsMemory(logicalDevice.get());
 			assert(devmemory);
-			video::IGPUBuffer::SCreationParams params = {};
+			IGPUBuffer::SCreationParams params = {};
 			params.size = devmemory->getAllocationSize();
 			params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT;
-			params.externalHandleTypes = video::CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
+			params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
 			importedbuf = logicalDevice->createBuffer(std::move(params));
 			assert(importedbuf);
-			bool re = logicalDevice->bindBufferMemory(video::ILogicalDevice::SBindBufferMemoryInfo{.buffer = importedbuf.get(), .memory = devmemory.get() });
+			ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedbuf.get(), .binding = {.memory = devmemory.get() } };
+			bool re = logicalDevice->bindBufferMemory(1, &bindInfo);
 			assert(re);
 		}
 
 		{
 			
-			video::IGPUImage::SCreationParams params = {};
-			params.type = video::IGPUImage::ET_2D;
-			params.samples = video::IGPUImage::ESCF_1_BIT;
-			params.format = asset::EF_R32_SFLOAT;
+			IGPUImage::SCreationParams params = {};
+			params.type = IGPUImage::ET_2D;
+			params.samples = IGPUImage::ESCF_1_BIT;
+			params.format = EF_R32_SFLOAT;
 			params.extent = { gridDim[0], blockDim[0], 1 };
 			params.mipLevels = 1;
 			params.arrayLayers = 1;
-			params.usage = video::IGPUImage::EUF_STORAGE_BIT | video::IGPUImage::EUF_TRANSFER_SRC_BIT;
-			params.externalHandleTypes = video::CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
-			params.tiling = video::IGPUImage::ET_LINEAR;
-			importedimg = mem[2]->exportAsImage(logicalDevice, std::move(params));
+			params.usage = IGPUImage::EUF_STORAGE_BIT | IGPUImage::EUF_TRANSFER_SRC_BIT;
+			params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
+			params.tiling = IGPUImage::TILING::LINEAR;
+			importedimg = mem[2]->exportAsImage(logicalDevice.get(), std::move(params));
 			assert(importedimg);
 		}
 		
-		fence = logicalDevice->createFence(video::IGPUFence::ECF_UNSIGNALED);
-		commandPool = logicalDevice->createCommandPool(queues[CommonAPI::InitOutput::EQT_COMPUTE]->getFamilyIndex(), {});
-		bool re = logicalDevice->createCommandBuffers(commandPool.get(), video::IGPUCommandBuffer::EL_PRIMARY, 1, &cmd);
+		commandPool = logicalDevice->createCommandPool(queue->getFamilyIndex(), {});
+		bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd, smart_refctd_ptr(m_logger));
 		assert(re);
 
 		auto createStaging = [logicalDevice=logicalDevice]()
@@ -136,11 +222,11 @@ struct CUDA2VK
 			auto req = buf->getMemoryReqs();
 			req.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDownStreamingMemoryTypeBits();
 			auto allocation = logicalDevice->allocate(req, buf.get());
-			assert(allocation.memory && allocation.offset != video::ILogicalDevice::InvalidMemoryOffset);
-			assert(buf->getBoundMemory()->isMappable());
-			logicalDevice->mapMemory(video::IDeviceMemoryAllocation::MappedMemoryRange(buf->getBoundMemory(), buf->getBoundMemoryOffset(), req.size), video::IDeviceMemoryAllocation::EMCAF_READ);
-			assert(buf->getBoundMemory()->getMappedPointer());
-			memset(buf->getBoundMemory()->getMappedPointer(), 0, req.size);
+			assert(allocation.isValid() && buf->getBoundMemory().memory->isMappable());
+			
+			bool re = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ);
+			assert(re && allocation.memory->getMappedPointer());
+			memset(allocation.memory->getMappedPointer(), 0, req.size);
 			return buf;
 		};
 
@@ -150,8 +236,6 @@ struct CUDA2VK
 
 	void launchKernel(CUfunction kernel, CUstream stream)
 	{
-		auto queue = queues[CommonAPI::InitOutput::EQT_COMPUTE];
-
 		auto& cu = cudaHandler->getCUDAFunctionTable();
 		// Launch kernel
 		{
@@ -164,50 +248,60 @@ struct CUDA2VK
 			ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpubuffers[0]->getPointer(), size, stream));
 			ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpubuffers[1]->getPointer(), size, stream));
 			ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr));
-			CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = {};
+			CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 1 } } };
 			auto semaphore = cusema->getInternalObject();
 			ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore
+
+			std::string abc = "123";
 		}
 		
 		// After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA
 		{
-			video::IGPUSemaphore* waitSemaphores[] = { sema.get() };
-			asset::E_PIPELINE_STAGE_FLAGS waitStages[] = { asset::EPSF_ALL_COMMANDS_BIT };
-			video::IGPUCommandBuffer* cmdBuffers[] = { cmd.get() };
-
-			video::IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = {
-				.barrier = { .dstAccessMask = asset::E_ACCESS_FLAGS::EAF_ALL_ACCESSES_BIT_DEVSH },
-				.srcQueueFamilyIndex = VK_QUEUE_FAMILY_EXTERNAL_KHR,
-				.dstQueueFamilyIndex = queue->getFamilyIndex(),
-				.buffer = importedbuf,
-				.offset = 0,
-				.size = size,
+			IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
+				.barrier = {
+					.dep = {
+						// .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
+						// .srcAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS,
+						.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
+						.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS,
+					},
+					.ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
+					.otherQueueFamilyIndex = queue->getFamilyIndex(),
+				},
+				.range = { .buffer = importedbuf, },
 			};
 
 			bool re = true;
-			re &= cmd->begin(video::IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT);
-			/*Acquire?*/
-			re &= cmd->pipelineBarrier(asset::EPSF_ALL_COMMANDS_BIT, asset::EPSF_ALL_COMMANDS_BIT, asset::EDF_NONE, 0u, nullptr, 1u, &bufBarrier, 0u, nullptr); 	// Ownership transfer?
-			asset::SBufferCopy region = { .size = size };
+			re &= cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			re &= cmd->pipelineBarrier(EDF_NONE, { .bufBarrierCount = 1, .bufBarriers = &bufBarrier}); 
+
+			IGPUCommandBuffer::SBufferCopy region = { .size = size };
 			re &= cmd->copyBuffer(importedbuf.get(), stagingbuf.get(), 1, &region);
 
-			video::IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = {
-				.barrier = { .dstAccessMask = asset::E_ACCESS_FLAGS::EAF_ALL_ACCESSES_BIT_DEVSH },
-				.oldLayout = asset::IImage::EL_PREINITIALIZED,
-				.newLayout = asset::IImage::EL_TRANSFER_SRC_OPTIMAL,
-				.srcQueueFamilyIndex = VK_QUEUE_FAMILY_EXTERNAL_KHR,
-				.dstQueueFamilyIndex = queue->getFamilyIndex(),
-				.image = importedimg,
+			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarrier = {
+				.barrier = { 
+					.dep = { 
+						// .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
+						// .srcAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS,
+						.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, 
+						.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS,
+					},
+					.ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
+					.otherQueueFamilyIndex = queue->getFamilyIndex(),
+				},
+				.image = importedimg.get(),
 				.subresourceRange = {
-					.aspectMask = asset::IImage::EAF_COLOR_BIT,
+					.aspectMask = IImage::EAF_COLOR_BIT,
 					.levelCount = 1u,
 					.layerCount = 1u,
-				}
+				},
+				.oldLayout = IImage::LAYOUT::PREINITIALIZED,
+				.newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL,
 			};
 
-			re &= cmd->pipelineBarrier(asset::EPSF_ALL_COMMANDS_BIT, asset::EPSF_ALL_COMMANDS_BIT, asset::EDF_NONE, 0u, nullptr, 0u, nullptr, 1u, &imgBarrier); 
+			re &= cmd->pipelineBarrier(EDF_NONE, {.imgBarrierCount = 1, .imgBarriers = &imgBarrier });
 
-			asset::IImage::SBufferCopy imgRegion = {
+			IImage::SBufferCopy imgRegion = {
 				.imageSubresource = {
 					.aspectMask = imgBarrier.subresourceRange.aspectMask,
 					.layerCount = imgBarrier.subresourceRange.layerCount,
@@ -217,16 +311,14 @@ struct CUDA2VK
 
 			re &= cmd->copyImageToBuffer(importedimg.get(), imgBarrier.newLayout, stagingbuf2.get(), 1, &imgRegion);
 			re &= cmd->end();
-
-			video::IGPUQueue::SSubmitInfo submitInfo = {
-				.waitSemaphoreCount = 1,
-				.pWaitSemaphores = waitSemaphores,
-				.pWaitDstStageMask = waitStages,
-				.commandBufferCount = 1,
-				.commandBuffers = cmdBuffers
+		
+			IQueue::SSubmitInfo submitInfo = {
+				.waitSemaphores = std::array{IQueue::SSubmitInfo::SSemaphoreInfo{ .semaphore = sema.get(), .value = 0, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, }},
+				.commandBuffers = std::array{IQueue::SSubmitInfo::SCommandBufferInfo{cmd.get()}},
 			};
 
-			re &= queue->submit(1, &submitInfo, fence.get());
+			auto submitRe = queue->submit(std::array{submitInfo});;
+			re &= IQueue::RESULT::SUCCESS == submitRe;
 			assert(re);
 		}
         
@@ -237,15 +329,15 @@ struct CUDA2VK
 	{
 		// Make sure we are also done with the readback
 		{
-			video::IGPUFence* fences[] = { fence.get() };
-			auto status = logicalDevice->waitForFences(1, fences, true, -1);
-			assert(video::IGPUFence::ES_SUCCESS == status);
+			//IGPUFence* fences[] = { fence.get() };
+			//auto status = logicalDevice->waitForFences(1, fences, true, -1);
+			//assert(IGPUFence::ES_SUCCESS == status);
 		}
 
 		float* A = reinterpret_cast<float*>(cpubuffers[0]->getPointer());
 		float* B = reinterpret_cast<float*>(cpubuffers[1]->getPointer());
-		float* CBuf = reinterpret_cast<float*>(stagingbuf->getBoundMemory()->getMappedPointer());
-		float* CImg = reinterpret_cast<float*>(stagingbuf2->getBoundMemory()->getMappedPointer());
+		float* CBuf = reinterpret_cast<float*>(stagingbuf->getBoundMemory().memory->getMappedPointer());
+		float* CImg = reinterpret_cast<float*>(stagingbuf2->getBoundMemory().memory->getMappedPointer());
 
 		 assert(!memcmp(CBuf, CImg, size));
 
@@ -257,55 +349,63 @@ struct CUDA2VK
 		
 		std::cout << "Success\n";
 	}
-};
-
-int main(int argc, char** argv)
-{
-	auto initOutput = CommonAPI::InitWithDefaultExt(CommonAPI::InitParams{
-		.appName = { "63.CUDAInterop" },
-		.apiType = video::EAT_VULKAN, 
-		.swapchainImageUsage = nbl::asset::IImage::EUF_NONE,
-	});
-
-	auto& system = initOutput.system;
-	auto& apiConnection = initOutput.apiConnection;
-	auto& physicalDevice = initOutput.physicalDevice;
-	auto& logicalDevice = initOutput.logicalDevice;
-	auto& utilities = initOutput.utilities;
-	auto& queues = initOutput.queues;
-	auto& logger = initOutput.logger;
-
-	assert(physicalDevice->getLimits().externalMemory);
-	auto cudaHandler = video::CCUDAHandler::create(system.get(), core::smart_refctd_ptr<system::ILogger>(logger));
-	assert(cudaHandler);
-	auto cudaDevice = cudaHandler->createDevice(core::smart_refctd_ptr_dynamic_cast<video::CVulkanConnection>(apiConnection), physicalDevice);
-	auto& cu = cudaHandler->getCUDAFunctionTable();	
-
-	core::smart_refctd_ptr<asset::ICPUBuffer> ptx;
-	CUmodule   module;
-	CUfunction kernel;
-	CUstream   stream;
-
-	{
-		system::ISystem::future_t<core::smart_refctd_ptr<system::IFile>> fut;
-		system->createFile(fut, "../vectorAdd_kernel.cu", system::IFileBase::ECF_READ);
-		auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(fut.copy().get(), cudaDevice->geDefaultCompileOptions());
-		ASSERT_SUCCESS_NV(res);
-		ptx = std::move(ptx_);
-	}
 
-	ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr));
-	ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"));
-	ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
+	// Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization.
+	bool keepRunning() override { return false; }
 
-	{
-		auto cuda2vk = CUDA2VK(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data());
-		cuda2vk.launchKernel(kernel, stream);
-		ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));
-	}
-
-	ASSERT_SUCCESS(cu.pcuModuleUnload(module));
-	ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream));
-
-	return 0;
-}
+	// Platforms like WASM expect the main entry point to periodically return control, hence if you want a crossplatform app, you have to let the framework deal with your "game loop"
+	void workLoopBody() override {}
+};
+//
+//int main(int argc, char** argv)
+//{
+//	auto initOutput = CommonAPI::InitWithDefaultExt(CommonAPI::InitParams{
+//		.appName = { "63.CUDAInterop" },
+//		.apiType = EAT_VULKAN, 
+//		.swapchainImageUsage = IImage::EUF_NONE,
+//	});
+//
+//	auto& system = initOutput.system;
+//	auto& apiConnection = initOutput.apiConnection;
+//	auto& physicalDevice = initOutput.physicalDevice;
+//	auto& logicalDevice = initOutput.logicalDevice;
+//	auto& utilities = initOutput.utilities;
+//	auto& queues = initOutput.queues;
+//	auto& logger = initOutput.logger;
+//
+//	assert(physicalDevice->getLimits().externalMemory);
+//	auto cudaHandler = CCUDAHandler::create(system.get(), smart_refctd_ptr<ILogger>(logger));
+//	assert(cudaHandler);
+//	auto cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast<CVulkanConnection>(apiConnection), physicalDevice);
+//	auto& cu = cudaHandler->getCUDAFunctionTable();	
+//
+//	smart_refctd_ptr<ICPUBuffer> ptx;
+//	CUmodule   module;
+//	CUfunction kernel;
+//	CUstream   stream;
+//
+//	{
+//		ISystem::future_t<smart_refctd_ptr<IFile>> fut;
+//		system->createFile(fut, "../vectorAdd_kernel.cu", IFileBase::ECF_READ);
+//	/*	auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(fut.copy().get(), cudaDevice->geDefaultCompileOptions());
+//		ASSERT_SUCCESS_NV(res);
+//		ptx = std::move(ptx_);*/
+//	}
+//
+//	//ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr));
+//	//ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"));
+//	//ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
+//
+//	//{
+//	//	auto cuda2vk = CUDA2VK(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data());
+//	//	cuda2vk.launchKernel(kernel, stream);
+//	//	ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));
+//	//}
+//
+//	//ASSERT_SUCCESS(cu.pcuModuleUnload(module));
+//	//ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream));
+//
+//	return 0;
+//}
+
+NBL_MAIN_FUNC(CUDA2VKApp)
\ No newline at end of file
diff --git a/common/MonoDeviceApplication.hpp b/common/MonoDeviceApplication.hpp
index 65fadad4b..9526bb101 100644
--- a/common/MonoDeviceApplication.hpp
+++ b/common/MonoDeviceApplication.hpp
@@ -266,7 +266,7 @@ class MonoDeviceApplication : public virtual MonoSystemMonoLoggerApplication
 		}
 
 		// This will most certainly be overriden
-		using queue_family_range_t = core::SRange<const video::IPhysicalDevice::SQueueFamilyProperties>;
+		using queue_family_range_t = std::span<const video::IPhysicalDevice::SQueueFamilyProperties>;
 		virtual core::vector<video::ILogicalDevice::SQueueCreationParams> getQueueCreationParameters(const queue_family_range_t& familyProperties)
 		{
 			using namespace video;
@@ -282,12 +282,12 @@ class MonoDeviceApplication : public virtual MonoSystemMonoLoggerApplication
 		}
 
 		// virtual to allow aliasing and total flexibility
-		virtual video::IGPUQueue* getComputeQueue() const
+		virtual video::IQueue* getComputeQueue() const
 		{
 			// In the default implementation of everything I asked only for one queue from first compute family
 			const auto familyProperties = m_device->getPhysicalDevice()->getQueueFamilyProperties();
 			for (auto i=0u; i<familyProperties.size(); i++)
-			if (familyProperties[i].queueFlags.hasFlags(video::IPhysicalDevice::E_QUEUE_FLAGS::EQF_COMPUTE_BIT))
+			if (familyProperties[i].queueFlags.hasFlags(video::IQueue::FAMILY_FLAGS::COMPUTE_BIT))
 				return m_device->getQueue(i,0);
 
 			return nullptr;

From 415902575143a28cba08d677c73f1e917f3367cc Mon Sep 17 00:00:00 2001
From: atkurtul <atilkurtulmus@gmail.com>
Date: Fri, 5 Jan 2024 01:01:21 +0300
Subject: [PATCH 10/17] fix sync

---
 63.CUDAInterop/main.cpp | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/63.CUDAInterop/main.cpp b/63.CUDAInterop/main.cpp
index 0b30b9227..11fae0d70 100644
--- a/63.CUDAInterop/main.cpp
+++ b/63.CUDAInterop/main.cpp
@@ -251,8 +251,6 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
 			CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 1 } } };
 			auto semaphore = cusema->getInternalObject();
 			ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore
-
-			std::string abc = "123";
 		}
 		
 		// After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA
@@ -311,13 +309,16 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
 
 			re &= cmd->copyImageToBuffer(importedimg.get(), imgBarrier.newLayout, stagingbuf2.get(), 1, &imgRegion);
 			re &= cmd->end();
-		
-			IQueue::SSubmitInfo submitInfo = {
-				.waitSemaphores = std::array{IQueue::SSubmitInfo::SSemaphoreInfo{ .semaphore = sema.get(), .value = 0, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, }},
-				.commandBuffers = std::array{IQueue::SSubmitInfo::SCommandBufferInfo{cmd.get()}},
-			};
-
-			auto submitRe = queue->submit(std::array{submitInfo});;
+			
+			auto waitSemaphores = std::array{IQueue::SSubmitInfo::SSemaphoreInfo{.semaphore = sema.get(), .value = 1, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, }};
+			auto signalSemaphores = std::array{IQueue::SSubmitInfo::SSemaphoreInfo{.semaphore = sema.get(), .value = 2, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, }};
+			auto commandBuffers = std::array{IQueue::SSubmitInfo::SCommandBufferInfo{cmd.get()}};
+			auto submitInfo = std::array{IQueue::SSubmitInfo {
+				.waitSemaphores = waitSemaphores,
+				.commandBuffers = commandBuffers,
+				.signalSemaphores = signalSemaphores,
+			}};
+			auto submitRe = queue->submit(submitInfo);
 			re &= IQueue::RESULT::SUCCESS == submitRe;
 			assert(re);
 		}
@@ -328,11 +329,8 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
 	void kernelCallback()
 	{
 		// Make sure we are also done with the readback
-		{
-			//IGPUFence* fences[] = { fence.get() };
-			//auto status = logicalDevice->waitForFences(1, fences, true, -1);
-			//assert(IGPUFence::ES_SUCCESS == status);
-		}
+		auto wait = std::array{ILogicalDevice::SSemaphoreWaitInfo{.semaphore = sema.get(), .value = 2}};
+		logicalDevice->waitForSemaphores(wait, true, -1);
 
 		float* A = reinterpret_cast<float*>(cpubuffers[0]->getPointer());
 		float* B = reinterpret_cast<float*>(cpubuffers[1]->getPointer());
@@ -343,8 +341,8 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
 
 		for (auto i = 0; i < numElements; i++)
 		{
-			assert(abs(CBuf[i] - A[i] - B[i]) < 0.01f);
-			assert(abs(CImg[i] - A[i] - B[i]) < 0.01f);
+			bool re = (abs(CBuf[i] - A[i] - B[i]) < 0.01f) && (abs(CImg[i] - A[i] - B[i]) < 0.01f);
+			assert(re);
 		}
 		
 		std::cout << "Success\n";

From 386608b0afc0010d4be9e66b00b684fd7ce57dfa Mon Sep 17 00:00:00 2001
From: atkurtul <atilkurtulmus@gmail.com>
Date: Sat, 13 Jan 2024 22:21:50 +0300
Subject: [PATCH 11/17] rename 63 example folder

---
 {63.CUDAInterop => 63_CUDAInterop}/CMakeLists.txt      | 0
 {63.CUDAInterop => 63_CUDAInterop}/main.cpp            | 2 +-
 {63.CUDAInterop => 63_CUDAInterop}/pipeline.groovy     | 0
 {63.CUDAInterop => 63_CUDAInterop}/vectorAdd_kernel.cu | 0
 4 files changed, 1 insertion(+), 1 deletion(-)
 rename {63.CUDAInterop => 63_CUDAInterop}/CMakeLists.txt (100%)
 rename {63.CUDAInterop => 63_CUDAInterop}/main.cpp (99%)
 rename {63.CUDAInterop => 63_CUDAInterop}/pipeline.groovy (100%)
 rename {63.CUDAInterop => 63_CUDAInterop}/vectorAdd_kernel.cu (100%)

diff --git a/63.CUDAInterop/CMakeLists.txt b/63_CUDAInterop/CMakeLists.txt
similarity index 100%
rename from 63.CUDAInterop/CMakeLists.txt
rename to 63_CUDAInterop/CMakeLists.txt
diff --git a/63.CUDAInterop/main.cpp b/63_CUDAInterop/main.cpp
similarity index 99%
rename from 63.CUDAInterop/main.cpp
rename to 63_CUDAInterop/main.cpp
index 11fae0d70..8fa618e02 100644
--- a/63.CUDAInterop/main.cpp
+++ b/63_CUDAInterop/main.cpp
@@ -329,7 +329,7 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
 	void kernelCallback()
 	{
 		// Make sure we are also done with the readback
-		auto wait = std::array{ILogicalDevice::SSemaphoreWaitInfo{.semaphore = sema.get(), .value = 2}};
+		auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = sema.get(), .value = 2}};
 		logicalDevice->waitForSemaphores(wait, true, -1);
 
 		float* A = reinterpret_cast<float*>(cpubuffers[0]->getPointer());
diff --git a/63.CUDAInterop/pipeline.groovy b/63_CUDAInterop/pipeline.groovy
similarity index 100%
rename from 63.CUDAInterop/pipeline.groovy
rename to 63_CUDAInterop/pipeline.groovy
diff --git a/63.CUDAInterop/vectorAdd_kernel.cu b/63_CUDAInterop/vectorAdd_kernel.cu
similarity index 100%
rename from 63.CUDAInterop/vectorAdd_kernel.cu
rename to 63_CUDAInterop/vectorAdd_kernel.cu

From f9a2d1be9fef023171aabbfc35776ce294c36bbf Mon Sep 17 00:00:00 2001
From: atkurtul <atilkurtulmus@gmail.com>
Date: Sun, 14 Jan 2024 03:25:25 +0300
Subject: [PATCH 12/17] derive from monodevice

---
 63_CUDAInterop/main.cpp | 75 +++++++++--------------------------------
 1 file changed, 15 insertions(+), 60 deletions(-)

diff --git a/63_CUDAInterop/main.cpp b/63_CUDAInterop/main.cpp
index 8fa618e02..9060907d9 100644
--- a/63_CUDAInterop/main.cpp
+++ b/63_CUDAInterop/main.cpp
@@ -9,7 +9,7 @@
 #include "nbl/video/CCUDASharedMemory.h"
 #include "nbl/video/CCUDASharedSemaphore.h"
 
-#include "../common./MonoSystemMonoLoggerApplication.hpp"
+#include "../common/MonoDeviceApplication.hpp"
 
 using namespace nbl;
 using namespace core;
@@ -47,17 +47,16 @@ size_t size = sizeof(float) * numElements;
 static_assert(false);
 #endif
 
-class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
+class CUDA2VKApp : public examples::MonoDeviceApplication
 {
-	using base_t = examples::MonoSystemMonoLoggerApplication;
+	using base_t = examples::MonoDeviceApplication;
 public:
 	// Generally speaking because certain platforms delay initialization from main object construction you should just forward and not do anything in the ctor
 	using base_t::base_t;
 
 	smart_refctd_ptr<CCUDAHandler> cudaHandler;
 	smart_refctd_ptr<CCUDADevice> cudaDevice;
-	// IUtilities* util;
-	smart_refctd_ptr<ILogicalDevice> logicalDevice;
+
 	IQueue* queue;
 
 	std::array<smart_refctd_ptr<ICPUBuffer>, 2> cpubuffers;
@@ -76,63 +75,19 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
 		// Remember to call the base class initialization!
 		if (!base_t::onAppInitialized(std::move(system)))
 			return false;
-		// `system` could have been null (see the comments in `MonoSystemMonoLoggerApplication::onAppInitialized` as for why)
-		// use `MonoSystemMonoLoggerApplication::m_system` throughout the example instead!
-
-		// You should already know Vulkan and come here to save on the boilerplate, if you don't know what instances and instance extensions are, then find out.
-		smart_refctd_ptr<CVulkanConnection> api;
-		{
-			// You generally want to default initialize any parameter structs
-			IAPIConnection::SFeatures apiFeaturesToEnable = {};
-			// generally you want to make your life easier during development
-			apiFeaturesToEnable.validations = true;
-			apiFeaturesToEnable.synchronizationValidation = true;
-			// want to make sure we have this so we can name resources for vieweing in RenderDoc captures
-			apiFeaturesToEnable.debugUtils = true;
-			// create our Vulkan instance
-			if (!(api = CVulkanConnection::create(smart_refctd_ptr(m_system), 0, _NBL_APP_NAME_, smart_refctd_ptr(base_t::m_logger), apiFeaturesToEnable)))
-				return logFail("Failed to crate an IAPIConnection!");
-		}
-
-		// We won't go deep into performing physical device selection in this example, we'll take any device with a compute queue.
-		// Nabla has its own set of required baseline Vulkan features anyway, it won't report any device that doesn't meet them.
-		IPhysicalDevice* physDev = nullptr;
-		ILogicalDevice::SCreationParams params = {};
-		// we will only deal with a single queue in this example
-		params.queueParamsCount = 1;
-		params.queueParams[0].count = 1;
-		params.featuresToEnable;
-		for (auto physDevIt = api->getPhysicalDevices().begin(); physDevIt != api->getPhysicalDevices().end(); physDevIt++)
-		{
-			const auto familyProps = (*physDevIt)->getQueueFamilyProperties();
-			// this is the only "complicated" part, we want to create a queue that supports compute pipelines
-			for (auto i = 0; i < familyProps.size(); i++)
-				if (familyProps[i].queueFlags.hasFlags(IQueue::FAMILY_FLAGS::COMPUTE_BIT))
-				{
-					physDev = *physDevIt;
-					params.queueParams[0].familyIndex = i;
-					break;
-				}
-		}
-		if (!physDev)
-			return logFail("Failed to find any Physical Devices with Compute capable Queue Families!");
 
 		{
-			auto& limits = physDev->getLimits();
+			auto& limits = m_physicalDevice->getLimits();
 			if (!limits.externalMemoryWin32 || !limits.externalFenceWin32 || !limits.externalSemaphoreWin32)
 				return logFail("Physical device does not support the required extensions");
 			
 			cudaHandler = CCUDAHandler::create(system.get(), smart_refctd_ptr<ILogger>(m_logger));
 			assert(cudaHandler);
-			cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast<CVulkanConnection>(api), physDev);
+			cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast<CVulkanConnection>(m_api), m_physicalDevice);
 		}
 
-		// logical devices need to be created form physical devices which will actually let us create vulkan objects and use the physical device
-		logicalDevice = physDev->createLogicalDevice(std::move(params));
-		if (!logicalDevice)
-			return logFail("Failed to create a Logical Device!");
 		
-		queue = logicalDevice->getQueue(params.queueParams[0].familyIndex, 0);
+		queue = base_t::getComputeQueue();
 		
 		createResources();
 
@@ -180,19 +135,19 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
 		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
 		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
 		
-		sema = logicalDevice->createSemaphore({ .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 });
+		sema = m_device->createSemaphore({ .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 });
 		ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cusema, sema.get()));
 		{
-			auto devmemory = mem[2]->exportAsMemory(logicalDevice.get());
+			auto devmemory = mem[2]->exportAsMemory(m_device.get());
 			assert(devmemory);
 			IGPUBuffer::SCreationParams params = {};
 			params.size = devmemory->getAllocationSize();
 			params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT;
 			params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
-			importedbuf = logicalDevice->createBuffer(std::move(params));
+			importedbuf = m_device->createBuffer(std::move(params));
 			assert(importedbuf);
 			ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedbuf.get(), .binding = {.memory = devmemory.get() } };
-			bool re = logicalDevice->bindBufferMemory(1, &bindInfo);
+			bool re = m_device->bindBufferMemory(1, &bindInfo);
 			assert(re);
 		}
 
@@ -208,15 +163,15 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
 			params.usage = IGPUImage::EUF_STORAGE_BIT | IGPUImage::EUF_TRANSFER_SRC_BIT;
 			params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
 			params.tiling = IGPUImage::TILING::LINEAR;
-			importedimg = mem[2]->exportAsImage(logicalDevice.get(), std::move(params));
+			importedimg = mem[2]->exportAsImage(m_device.get(), std::move(params));
 			assert(importedimg);
 		}
 		
-		commandPool = logicalDevice->createCommandPool(queue->getFamilyIndex(), {});
+		commandPool = m_device->createCommandPool(queue->getFamilyIndex(), {});
 		bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd, smart_refctd_ptr(m_logger));
 		assert(re);
 
-		auto createStaging = [logicalDevice=logicalDevice]()
+		auto createStaging = [logicalDevice= m_device]()
 		{
 			auto buf = logicalDevice->createBuffer({ {.size = size, .usage = asset::IBuffer::EUF_TRANSFER_DST_BIT} });
 			auto req = buf->getMemoryReqs();
@@ -330,7 +285,7 @@ class CUDA2VKApp : public examples::MonoSystemMonoLoggerApplication
 	{
 		// Make sure we are also done with the readback
 		auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = sema.get(), .value = 2}};
-		logicalDevice->waitForSemaphores(wait, true, -1);
+		m_device->waitForSemaphores(wait, true, -1);
 
 		float* A = reinterpret_cast<float*>(cpubuffers[0]->getPointer());
 		float* B = reinterpret_cast<float*>(cpubuffers[1]->getPointer());

From 138356a4a5e277859c9d156967c0187e45ca8e49 Mon Sep 17 00:00:00 2001
From: atkurtul <atilkurtulmus@gmail.com>
Date: Sun, 14 Jan 2024 04:39:31 +0300
Subject: [PATCH 13/17] rewrite stuff

---
 63_CUDAInterop/main.cpp | 68 +++++------------------------------------
 1 file changed, 7 insertions(+), 61 deletions(-)

diff --git a/63_CUDAInterop/main.cpp b/63_CUDAInterop/main.cpp
index 9060907d9..a14aac0af 100644
--- a/63_CUDAInterop/main.cpp
+++ b/63_CUDAInterop/main.cpp
@@ -115,6 +115,7 @@ class CUDA2VKApp : public examples::MonoDeviceApplication
 		ASSERT_SUCCESS(cu.pcuModuleUnload(module));
 		ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream));
 
+		m_device->waitIdle();
 		return true;
 	}
 
@@ -160,10 +161,10 @@ class CUDA2VKApp : public examples::MonoDeviceApplication
 			params.extent = { gridDim[0], blockDim[0], 1 };
 			params.mipLevels = 1;
 			params.arrayLayers = 1;
-			params.usage = IGPUImage::EUF_STORAGE_BIT | IGPUImage::EUF_TRANSFER_SRC_BIT;
+			params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT;
 			params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
 			params.tiling = IGPUImage::TILING::LINEAR;
-			importedimg = mem[2]->exportAsImage(m_device.get(), std::move(params));
+			importedimg = mem[2]->createAndBindImage(m_device.get(), std::move(params));
 			assert(importedimg);
 		}
 		
@@ -213,8 +214,6 @@ class CUDA2VKApp : public examples::MonoDeviceApplication
 			IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
 				.barrier = {
 					.dep = {
-						// .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
-						// .srcAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS,
 						.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
 						.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS,
 					},
@@ -226,7 +225,7 @@ class CUDA2VKApp : public examples::MonoDeviceApplication
 
 			bool re = true;
 			re &= cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-			re &= cmd->pipelineBarrier(EDF_NONE, { .bufBarrierCount = 1, .bufBarriers = &bufBarrier}); 
+			re &= cmd->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1} });
 
 			IGPUCommandBuffer::SBufferCopy region = { .size = size };
 			re &= cmd->copyBuffer(importedbuf.get(), stagingbuf.get(), 1, &region);
@@ -234,8 +233,6 @@ class CUDA2VKApp : public examples::MonoDeviceApplication
 			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarrier = {
 				.barrier = { 
 					.dep = { 
-						// .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
-						// .srcAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS,
 						.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, 
 						.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS,
 					},
@@ -248,11 +245,11 @@ class CUDA2VKApp : public examples::MonoDeviceApplication
 					.levelCount = 1u,
 					.layerCount = 1u,
 				},
-				.oldLayout = IImage::LAYOUT::PREINITIALIZED,
+				.oldLayout = IImage::LAYOUT::UNDEFINED,
 				.newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL,
 			};
 
-			re &= cmd->pipelineBarrier(EDF_NONE, {.imgBarrierCount = 1, .imgBarriers = &imgBarrier });
+			re &= cmd->pipelineBarrier(EDF_NONE, { .imgBarriers = {&imgBarrier,&imgBarrier + 1} });
 
 			IImage::SBufferCopy imgRegion = {
 				.imageSubresource = {
@@ -292,7 +289,7 @@ class CUDA2VKApp : public examples::MonoDeviceApplication
 		float* CBuf = reinterpret_cast<float*>(stagingbuf->getBoundMemory().memory->getMappedPointer());
 		float* CImg = reinterpret_cast<float*>(stagingbuf2->getBoundMemory().memory->getMappedPointer());
 
-		 assert(!memcmp(CBuf, CImg, size));
+		assert(!memcmp(CBuf, CImg, size));
 
 		for (auto i = 0; i < numElements; i++)
 		{
@@ -309,56 +306,5 @@ class CUDA2VKApp : public examples::MonoDeviceApplication
 	// Platforms like WASM expect the main entry point to periodically return control, hence if you want a crossplatform app, you have to let the framework deal with your "game loop"
 	void workLoopBody() override {}
 };
-//
-//int main(int argc, char** argv)
-//{
-//	auto initOutput = CommonAPI::InitWithDefaultExt(CommonAPI::InitParams{
-//		.appName = { "63.CUDAInterop" },
-//		.apiType = EAT_VULKAN, 
-//		.swapchainImageUsage = IImage::EUF_NONE,
-//	});
-//
-//	auto& system = initOutput.system;
-//	auto& apiConnection = initOutput.apiConnection;
-//	auto& physicalDevice = initOutput.physicalDevice;
-//	auto& logicalDevice = initOutput.logicalDevice;
-//	auto& utilities = initOutput.utilities;
-//	auto& queues = initOutput.queues;
-//	auto& logger = initOutput.logger;
-//
-//	assert(physicalDevice->getLimits().externalMemory);
-//	auto cudaHandler = CCUDAHandler::create(system.get(), smart_refctd_ptr<ILogger>(logger));
-//	assert(cudaHandler);
-//	auto cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast<CVulkanConnection>(apiConnection), physicalDevice);
-//	auto& cu = cudaHandler->getCUDAFunctionTable();	
-//
-//	smart_refctd_ptr<ICPUBuffer> ptx;
-//	CUmodule   module;
-//	CUfunction kernel;
-//	CUstream   stream;
-//
-//	{
-//		ISystem::future_t<smart_refctd_ptr<IFile>> fut;
-//		system->createFile(fut, "../vectorAdd_kernel.cu", IFileBase::ECF_READ);
-//	/*	auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(fut.copy().get(), cudaDevice->geDefaultCompileOptions());
-//		ASSERT_SUCCESS_NV(res);
-//		ptx = std::move(ptx_);*/
-//	}
-//
-//	//ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr));
-//	//ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"));
-//	//ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
-//
-//	//{
-//	//	auto cuda2vk = CUDA2VK(cudaHandler, cudaDevice, utilities.get(), logicalDevice.get(), queues.data());
-//	//	cuda2vk.launchKernel(kernel, stream);
-//	//	ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));
-//	//}
-//
-//	//ASSERT_SUCCESS(cu.pcuModuleUnload(module));
-//	//ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream));
-//
-//	return 0;
-//}
 
 NBL_MAIN_FUNC(CUDA2VKApp)
\ No newline at end of file

From db2ac71cd2e1a8f63cd3f99ba46bb77616153798 Mon Sep 17 00:00:00 2001
From: atkurtul <atilkurtulmus@gmail.com>
Date: Thu, 18 Jan 2024 19:19:27 +0300
Subject: [PATCH 14/17] reflect nabla changes

---
 63_CUDAInterop/main.cpp | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/63_CUDAInterop/main.cpp b/63_CUDAInterop/main.cpp
index a14aac0af..28c6d6f2f 100644
--- a/63_CUDAInterop/main.cpp
+++ b/63_CUDAInterop/main.cpp
@@ -77,10 +77,6 @@ class CUDA2VKApp : public examples::MonoDeviceApplication
 			return false;
 
 		{
-			auto& limits = m_physicalDevice->getLimits();
-			if (!limits.externalMemoryWin32 || !limits.externalFenceWin32 || !limits.externalSemaphoreWin32)
-				return logFail("Physical device does not support the required extensions");
-			
 			cudaHandler = CCUDAHandler::create(system.get(), smart_refctd_ptr<ILogger>(m_logger));
 			assert(cudaHandler);
 			cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast<CVulkanConnection>(m_api), m_physicalDevice);
@@ -136,7 +132,7 @@ class CUDA2VKApp : public examples::MonoDeviceApplication
 		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
 		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
 		
-		sema = m_device->createSemaphore({ .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 });
+		sema = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 });
 		ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cusema, sema.get()));
 		{
 			auto devmemory = mem[2]->exportAsMemory(m_device.get());
@@ -154,7 +150,7 @@ class CUDA2VKApp : public examples::MonoDeviceApplication
 
 		{
 			
-			IGPUImage::SCreationParams params = {};
+			IImage::SCreationParams params = {};
 			params.type = IGPUImage::ET_2D;
 			params.samples = IGPUImage::ESCF_1_BIT;
 			params.format = EF_R32_SFLOAT;
@@ -162,8 +158,6 @@ class CUDA2VKApp : public examples::MonoDeviceApplication
 			params.mipLevels = 1;
 			params.arrayLayers = 1;
 			params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT;
-			params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
-			params.tiling = IGPUImage::TILING::LINEAR;
 			importedimg = mem[2]->createAndBindImage(m_device.get(), std::move(params));
 			assert(importedimg);
 		}

From 744dd44c3bd6d5bb5734402b85f49fd0e27a46cc Mon Sep 17 00:00:00 2001
From: atkurtul <atilkurtulmus@gmail.com>
Date: Thu, 18 Jan 2024 23:27:02 +0300
Subject: [PATCH 15/17] address pr comments

---
 03_DeviceSelectionAndSharedSources/main.cpp   |   2 +-
 22_CppCompat/test.hlsl.orig                   | 264 ---------------
 63_CUDAInterop/CMakeLists.txt                 |  29 +-
 .../{ => app_resources}/vectorAdd_kernel.cu   |   2 +-
 63_CUDAInterop/main.cpp                       | 312 +++++++++++-------
 5 files changed, 221 insertions(+), 388 deletions(-)
 delete mode 100644 22_CppCompat/test.hlsl.orig
 rename 63_CUDAInterop/{ => app_resources}/vectorAdd_kernel.cu (99%)

diff --git a/03_DeviceSelectionAndSharedSources/main.cpp b/03_DeviceSelectionAndSharedSources/main.cpp
index d9a8d9b1a..28872a5f5 100644
--- a/03_DeviceSelectionAndSharedSources/main.cpp
+++ b/03_DeviceSelectionAndSharedSources/main.cpp
@@ -32,7 +32,7 @@ class DeviceSelectionAndSharedSourcesApp final : public examples::MonoDeviceAppl
 		bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
 		{
 			// Remember to call the base class initialization!
-			if (!device_base_t::onAppInitialized(std::move(system)))
+			if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
 				return false;
 			if (!asset_base_t::onAppInitialized(std::move(system)))
 				return false;
diff --git a/22_CppCompat/test.hlsl.orig b/22_CppCompat/test.hlsl.orig
deleted file mode 100644
index b67b08062..000000000
--- a/22_CppCompat/test.hlsl.orig
+++ /dev/null
@@ -1,264 +0,0 @@
-//// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-//// This file is part of the "Nabla Engine".
-//// For conditions of distribution and use, see copyright notice in nabla.h
-
-#pragma shader_stage(compute)
-
-#define STATIC_ASSERT(C) { nbl::hlsl::conditional<C, int, void>::type a = 0; }
-
-#define IS_SAME(L,R) nbl::hlsl::is_same<L,R>::value
-#define SHADER_CRASHING_ASSERT(expr) \
-{ \
-    bool con = (expr); \
-    do { \
-        [branch] if (!con) \
-          vk::RawBufferStore<uint32_t>(0xdeadbeefBADC0FFbull,0x45u,4u); \
-    } while(!con); \
-} 
-
-
-#include <nbl/builtin/hlsl/cpp_compat.hlsl>
-#include <nbl/builtin/hlsl/type_traits.hlsl>
-
-#include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>
-#include <nbl/builtin/hlsl/cpp_compat/vector.hlsl>
-
-#include <nbl/builtin/hlsl/colorspace/encodeCIEXYZ.hlsl>
-#include <nbl/builtin/hlsl/colorspace/decodeCIEXYZ.hlsl>
-#include <nbl/builtin/hlsl/colorspace/EOTF.hlsl>
-#include <nbl/builtin/hlsl/colorspace/OETF.hlsl>
-
-#include <nbl/builtin/hlsl/random/xoroshiro.hlsl>
-
-#include <nbl/builtin/hlsl/mpl.hlsl>
-#include <nbl/builtin/hlsl/bit.hlsl>
-
-#include <nbl/builtin/hlsl/limits.hlsl>
-
-struct PushConstants
-{
-	uint2 imgSize;
-};
-
-[[vk::push_constant]]
-PushConstants u_pushConstants;
-
-[[vk::binding(0, 0)]] Texture2D<float4> inImage;
-[[vk::binding(1, 0)]] RWTexture2D<float4> outImage;
-[[vk::binding(2, 0)]] Buffer<float4> inBuffer;
-[[vk::binding(3, 0)]] RWStructuredBuffer<float4> outBuffer;
-
-
-template<int A>
-struct Spec
-{
-    static const int value = Spec<A-1>::value + 1;
-};
-
-template<>
-struct Spec<0>
-{
-    static const int value = 0;
-};
-
-Buffer<float32_t4>  unbounded[];
-
-template<class T>
-bool val(T) { return nbl::hlsl::is_unbounded_array<T>::value; }
-
-
-template<typename T, uint32_t N>
-struct array
-{
-  T data[N];
-};
-
-void fill(uint3 invocationID, float val)
-{
-    outImage[invocationID.xy] = float4(val,val,val,val);
-    outBuffer[invocationID.x * invocationID.y] = float4(val,val,val,val);
-}
-
-void fill(uint3 invocationID, float4 val)
-{
-    outImage[invocationID.xy] = val;
-    outBuffer[invocationID.x * invocationID.y] = val;
-}
-
-
-[numthreads(8, 8, 1)]
-void main(uint3 invocationID : SV_DispatchThreadID)
-{
-    fill(invocationID, 1);
-    const float32_t3 TEST_VEC = float32_t3(1.0f, 2.0f, 3.0f);
-
-    fill(invocationID, 2);
-    // test functions from EOTF.hlsl
-    nbl::hlsl::colorspace::eotf::identity<float32_t3>(TEST_VEC);
-    nbl::hlsl::colorspace::eotf::impl_shared_2_4<float32_t3>(TEST_VEC, 0.5f);
-    nbl::hlsl::colorspace::eotf::sRGB<float32_t3>(TEST_VEC);
-    nbl::hlsl::colorspace::eotf::Display_P3<float32_t3>(TEST_VEC);
-    nbl::hlsl::colorspace::eotf::DCI_P3_XYZ<float32_t3>(TEST_VEC);
-    nbl::hlsl::colorspace::eotf::SMPTE_170M<float32_t3>(TEST_VEC);
-    nbl::hlsl::colorspace::eotf::SMPTE_ST2084<float32_t3>(TEST_VEC);
-    nbl::hlsl::colorspace::eotf::HDR10_HLG<float32_t3>(TEST_VEC);
-    nbl::hlsl::colorspace::eotf::AdobeRGB<float32_t3>(TEST_VEC);
-    nbl::hlsl::colorspace::eotf::Gamma_2_2<float32_t3>(TEST_VEC);
-    nbl::hlsl::colorspace::eotf::ACEScc<float32_t3>(TEST_VEC);
-    nbl::hlsl::colorspace::eotf::ACEScct<float32_t3>(TEST_VEC);
-    
-    fill(invocationID, 3);
-
-    // test functions from OETF.hlsl
-    nbl::hlsl::colorspace::oetf::identity<float32_t3>(TEST_VEC);
-    nbl::hlsl::colorspace::oetf::impl_shared_2_4<float32_t3>(TEST_VEC, 0.5f);
-    nbl::hlsl::colorspace::oetf::sRGB<float32_t3>(TEST_VEC);
-    nbl::hlsl::colorspace::oetf::Display_P3<float32_t3>(TEST_VEC);
-    nbl::hlsl::colorspace::oetf::DCI_P3_XYZ<float32_t3>(TEST_VEC);
-    nbl::hlsl::colorspace::oetf::SMPTE_170M<float32_t3>(TEST_VEC);
-    nbl::hlsl::colorspace::oetf::SMPTE_ST2084<float32_t3>(TEST_VEC);
-    nbl::hlsl::colorspace::oetf::HDR10_HLG<float32_t3>(TEST_VEC);
-    nbl::hlsl::colorspace::oetf::AdobeRGB<float32_t3>(TEST_VEC);
-    nbl::hlsl::colorspace::oetf::Gamma_2_2<float32_t3>(TEST_VEC);
-    nbl::hlsl::colorspace::oetf::ACEScc<float32_t3>(TEST_VEC);
-    nbl::hlsl::colorspace::oetf::ACEScct<float32_t3>(TEST_VEC);
-    
-    fill(invocationID, 4);
-    // xoroshiro tests
-    const uint32_t2 state = uint32_t2(12u, 34u);
-    nbl::hlsl::Xoroshiro64Star xoroshiro64Star = nbl::hlsl::Xoroshiro64Star::construct(state);
-    xoroshiro64Star();
-    nbl::hlsl::Xoroshiro64StarStar xoroshiro64StarStar = nbl::hlsl::Xoroshiro64StarStar::construct(state);
-    xoroshiro64StarStar();
-    
-    //nbl::hlsl::mpl::countl_zero<2ull>::value;
-    
-    // TODO: test if std::rotl/r == nbl::hlsl::rotr/l == nbl::hlsl::mpl::rotr/l
-    
-    // uint32_t mplRotlResult0 = nbl::hlsl::mpl::rotl<uint32_t, 2u, 1>::value;
-    // uint32_t mplRotlResult1 = nbl::hlsl::mpl::rotl<uint32_t, 2u, -1>::value;
-    // uint32_t mplRotrResult0 = nbl::hlsl::mpl::rotr<uint32_t, 2u, 1>::value;
-    // uint32_t mplRotrResult1 = nbl::hlsl::mpl::rotr<uint32_t, 2u, -1>::value;
-    
-    // uint32_t rotlResult0 = nbl::hlsl::mpl::rotl<uint32_t, 2u, 1>::value;
-    // uint32_t rotlResult1 = nbl::hlsl::mpl::rotl<uint32_t, 2u, -1>::value;
-    // uint32_t rotrResult0 = nbl::hlsl::mpl::rotr<uint32_t, 2u, 1>::value;
-    // uint32_t rotrResult1 = nbl::hlsl::mpl::rotr<uint32_t, 2u, -1>::value;
-    
-    // SHADER_CRASHING_ASSERT(rotlResult0 == mplRotlResult0);
-    // SHADER_CRASHING_ASSERT(rotlResult1 == mplRotlResult1);
-    // SHADER_CRASHING_ASSERT(rotrResult0 == mplRotrResult0);
-    // SHADER_CRASHING_ASSERT(rotrResult1 == mplRotrResult1);
-
-    // TODO: more tests and compare with cpp version as well
-    fill(invocationID, 5);
-    // countl_zero test
-    {
-        static const uint16_t TEST_VALUE_0 = 5;
-        static const uint32_t TEST_VALUE_1 = 0x80000000u;
-        static const uint32_t TEST_VALUE_2 = 0x8000000000000000u;
-        static const uint32_t TEST_VALUE_3 = 0x00000001u;
-        static const uint32_t TEST_VALUE_4 = 0x0000000000000001u;
-        
-
-        fill(invocationID, 5.01);
-        uint16_t compileTimeCountLZero = nbl::hlsl::mpl::countl_zero<uint16_t, TEST_VALUE_0>::value;
-        uint16_t runTimeCountLZero = nbl::hlsl::countl_zero(TEST_VALUE_0);
-        fill(invocationID, float4(5.1, compileTimeCountLZero, runTimeCountLZero, 0));
-        SHADER_CRASHING_ASSERT(compileTimeCountLZero == runTimeCountLZero);
-
-        compileTimeCountLZero = nbl::hlsl::mpl::countl_zero<uint32_t, TEST_VALUE_1>::value;
-        runTimeCountLZero = nbl::hlsl::countl_zero(TEST_VALUE_1);
-        fill(invocationID, float4(5.2, compileTimeCountLZero, runTimeCountLZero, 0));
-        SHADER_CRASHING_ASSERT(compileTimeCountLZero == runTimeCountLZero);
-
-        compileTimeCountLZero = nbl::hlsl::mpl::countl_zero<uint32_t, TEST_VALUE_2>::value;
-        runTimeCountLZero = nbl::hlsl::countl_zero(TEST_VALUE_2);
-        fill(invocationID, float4(5.3, compileTimeCountLZero, runTimeCountLZero, 0));
-        SHADER_CRASHING_ASSERT(compileTimeCountLZero == runTimeCountLZero);
-
-        compileTimeCountLZero = nbl::hlsl::mpl::countl_zero<uint32_t, TEST_VALUE_3>::value;
-        runTimeCountLZero = nbl::hlsl::countl_zero(TEST_VALUE_3);
-        fill(invocationID, float4(5.4, compileTimeCountLZero, runTimeCountLZero, 0));
-        SHADER_CRASHING_ASSERT(compileTimeCountLZero == runTimeCountLZero);
-
-        compileTimeCountLZero = nbl::hlsl::mpl::countl_zero<uint32_t, TEST_VALUE_4>::value;
-        runTimeCountLZero = nbl::hlsl::countl_zero(TEST_VALUE_4);
-        fill(invocationID, float4(5.5, compileTimeCountLZero, runTimeCountLZero, 0));
-        SHADER_CRASHING_ASSERT(compileTimeCountLZero == runTimeCountLZero);
-    }
-
-    {
-        bool A = Spec<3>::value == 3;
-    }
-    {
-        bool A = nbl::hlsl::is_integral<int>::value;
-    }
-    {
-        bool A = val(unbounded);
-    }
-    fill(invocationID, 6);
-    {
-        float4 v;
-        fill(invocationID, float4(alignof(v.x), alignof(v), 0, 0));
-        SHADER_CRASHING_ASSERT(alignof(v.x) == alignof(v));
-    }
-    
-    {
-        float4 v;
-        const volatile float4 u;
-
-        STATIC_ASSERT(IS_SAME(decltype(v.x), nbl::hlsl::impl::add_lvalue_reference<decltype(v.x)>::type));
-        STATIC_ASSERT(nbl::hlsl::impl::is_reference<decltype(v.x)>::value);
-        STATIC_ASSERT(IS_SAME(float,nbl::hlsl::impl::remove_reference<decltype(v.x)>::type));
-        STATIC_ASSERT(IS_SAME(decltype(v.x),nbl::hlsl::impl::add_lvalue_reference<float>::type));
-        STATIC_ASSERT(IS_SAME(decltype(v.x),nbl::hlsl::impl::add_lvalue_reference<nbl::hlsl::impl::remove_reference<decltype(v.x)>::type>::type));
-        
-        STATIC_ASSERT(IS_SAME(float,nbl::hlsl::remove_cvref<decltype(v.x)>::type));
-        STATIC_ASSERT(IS_SAME(nbl::hlsl::remove_cv<decltype(v.x)>::type,nbl::hlsl::impl::add_lvalue_reference<float>::type));
-        STATIC_ASSERT(IS_SAME(nbl::hlsl::remove_cv<decltype(v.x)>::type,nbl::hlsl::impl::add_lvalue_reference<nbl::hlsl::remove_cvref<decltype(v.x)>::type>::type));
-    }
-    fill(invocationID, 7);
-    {
-        float x[4][4];
-        STATIC_ASSERT(IS_SAME(nbl::hlsl::remove_extent<decltype(x)>::type, float[4]));
-        STATIC_ASSERT(IS_SAME(nbl::hlsl::remove_all_extents<decltype(x)>::type, float));
-    }
-    fill(invocationID, 8);
-    {
-        STATIC_ASSERT(IS_SAME(nbl::hlsl::make_signed<int16_t>::type,   nbl::hlsl::make_signed<uint16_t>::type));
-        STATIC_ASSERT(IS_SAME(nbl::hlsl::make_unsigned<int16_t>::type, nbl::hlsl::make_unsigned<uint16_t>::type));
-
-        STATIC_ASSERT(IS_SAME(nbl::hlsl::make_signed<int32_t>::type,   nbl::hlsl::make_signed<uint32_t>::type));
-        STATIC_ASSERT(IS_SAME(nbl::hlsl::make_unsigned<int32_t>::type, nbl::hlsl::make_unsigned<uint32_t>::type));
-
-        STATIC_ASSERT(IS_SAME(nbl::hlsl::make_signed<int64_t>::type,   nbl::hlsl::make_signed<uint64_t>::type));
-        STATIC_ASSERT(IS_SAME(nbl::hlsl::make_unsigned<int64_t>::type, nbl::hlsl::make_unsigned<uint64_t>::type));
-    }
-
-    {
-        int Q[3][4][5];
-        STATIC_ASSERT(3 == (nbl::hlsl::extent<decltype(Q), 0>::value));
-        STATIC_ASSERT(4 == (nbl::hlsl::extent<decltype(Q), 1>::value));
-        STATIC_ASSERT(5 == (nbl::hlsl::extent<decltype(Q), 2>::value));
-        STATIC_ASSERT(0 == (nbl::hlsl::extent<decltype(Q), 3>::value));
-    }
-    fill(invocationID, 9);
-    {
-        float32_t a;
-
-        a = nbl::hlsl::numeric_limits<float32_t>::min;
-        a = nbl::hlsl::numeric_limits<float32_t>::max;
-        a = nbl::hlsl::numeric_limits<float32_t>::lowest;
-        a = nbl::hlsl::numeric_limits<float32_t>::epsilon;
-        a = nbl::hlsl::numeric_limits<float32_t>::round_error;
-        a = nbl::hlsl::numeric_limits<float32_t>::infinity;
-        a = nbl::hlsl::numeric_limits<float32_t>::quiet_NaN;
-        a = nbl::hlsl::numeric_limits<float32_t>::signaling_NaN;
-        a = nbl::hlsl::numeric_limits<float32_t>::denorm_min;
-    }
-
-    fill(invocationID, 10);
-
-    fill(invocationID, -1);
-}
diff --git a/63_CUDAInterop/CMakeLists.txt b/63_CUDAInterop/CMakeLists.txt
index 2f8f8439f..bc1624875 100644
--- a/63_CUDAInterop/CMakeLists.txt
+++ b/63_CUDAInterop/CMakeLists.txt
@@ -1,17 +1,24 @@
-
 include(common RESULT_VARIABLE RES)
 if(NOT RES)
 	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
 endif()
 
-set(CUDA_INTEROP_EXAMPLE_INCLUDE_DIRS
-	${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
-)
+nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
+
+if(NBL_EMBED_BUILTIN_RESOURCES)
+	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
+	set(RESOURCE_DIR "app_resources")
+
+	get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
+
+    file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
+    foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
+      LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
+    endforeach()
+
+	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
 
-nbl_create_executable_project(
-	""
-	""
-	"${CUDA_INTEROP_EXAMPLE_INCLUDE_DIRS}"
-	""
-	"${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}"
-)
\ No newline at end of file
+	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
+endif()
\ No newline at end of file
diff --git a/63_CUDAInterop/vectorAdd_kernel.cu b/63_CUDAInterop/app_resources/vectorAdd_kernel.cu
similarity index 99%
rename from 63_CUDAInterop/vectorAdd_kernel.cu
rename to 63_CUDAInterop/app_resources/vectorAdd_kernel.cu
index 3baef0123..99c831121 100644
--- a/63_CUDAInterop/vectorAdd_kernel.cu
+++ b/63_CUDAInterop/app_resources/vectorAdd_kernel.cu
@@ -39,4 +39,4 @@ extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C,
   if (i < numElements) {
     C[i] = A[i] + B[i];
   }
-}
\ No newline at end of file
+}
diff --git a/63_CUDAInterop/main.cpp b/63_CUDAInterop/main.cpp
index 28c6d6f2f..af1ec82cf 100644
--- a/63_CUDAInterop/main.cpp
+++ b/63_CUDAInterop/main.cpp
@@ -2,14 +2,12 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
-#define _NBL_STATIC_LIB_
-#include <nabla.h>
-
 #include "nbl/video/CCUDAHandler.h"
 #include "nbl/video/CCUDASharedMemory.h"
 #include "nbl/video/CCUDASharedSemaphore.h"
 
 #include "../common/MonoDeviceApplication.hpp"
+#include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp"
 
 using namespace nbl;
 using namespace core;
@@ -22,77 +20,115 @@ The start of the main function starts like in most other example. We ask the
 user for the desired renderer and start it up.
 */
 
-#define ASSERT_SUCCESS(expr) \
-if (auto re = expr; CUDA_SUCCESS != re) { \
-	const char* name = 0, *str = 0; \
-	cu.pcuGetErrorName(re, &name); \
-	cu.pcuGetErrorString(re, &str); \
-	printf("%s:%d %s:\n\t%s\n", __FILE__, __LINE__, name, str); \
-	abort(); \
+bool check_cuda_err(cudaError_enum err, auto& cu, auto& logger, auto file, auto line)
+{
+	if (auto re = err; CUDA_SUCCESS != re) 
+	{
+		const char* name = 0, * str = 0;
+		cu.pcuGetErrorName(re, &name);
+		cu.pcuGetErrorString(re, &str);
+		logger->log("%s:%d %s:\n\t%s\n", system::ILogger::ELL_ERROR, file, line, name, str);
+		return false;
+	}
+	return true;
 }
 
-#define ASSERT_SUCCESS_NV(expr) \
-if (auto re = expr; NVRTC_SUCCESS != re) { \
-	const char* str = cudaHandler->getNVRTCFunctionTable().pnvrtcGetErrorString(re); \
-	printf("%s:%d %s\n", __FILE__, __LINE__, str); \
-	abort(); \
+bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto line, std::string const& log)
+{
+	if (auto re = err; NVRTC_SUCCESS != re) 
+	{
+		const char* str = cudaHandler->getNVRTCFunctionTable().pnvrtcGetErrorString(re); 
+		logger->log("%s:%d %s\n%s\n", system::ILogger::ELL_ERROR, file, line, str, log.c_str());
+		return false;
+	}
+	return true;
 }
 
-constexpr uint32_t gridDim[3] = { 4096,1,1 };
-constexpr uint32_t blockDim[3] = { 1024,1,1 };
-size_t numElements = gridDim[0] * blockDim[0];
-size_t size = sizeof(float) * numElements;
+#define ASSERT_SUCCESS(expr) { auto re = check_cuda_err((expr), cu, m_logger, __FILE__, __LINE__); assert(re); }
+#define ASSERT_SUCCESS_NV(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); }
 
 #ifndef _NBL_COMPILE_WITH_CUDA_
 static_assert(false);
 #endif
 
-class CUDA2VKApp : public examples::MonoDeviceApplication
+class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication
 {
-	using base_t = examples::MonoDeviceApplication;
+	using device_base_t = examples::MonoDeviceApplication;
+	using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication;
+
+	static constexpr uint32_t gridDim[3] = { 4096,1,1 };
+	static constexpr uint32_t blockDim[3] = { 1024,1,1 };
+	static constexpr size_t numElements = gridDim[0] * blockDim[0];
+	static constexpr size_t size = sizeof(float) * numElements;
 public:
-	// Generally speaking because certain platforms delay initialization from main object construction you should just forward and not do anything in the ctor
-	using base_t::base_t;
+	// Yay thanks to multiple inheritance we cannot forward ctors anymore
+	CUDA2VKApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+		system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
 
 	smart_refctd_ptr<CCUDAHandler> cudaHandler;
 	smart_refctd_ptr<CCUDADevice> cudaDevice;
 
 	IQueue* queue;
 
-	std::array<smart_refctd_ptr<ICPUBuffer>, 2> cpubuffers;
-	std::array<smart_refctd_ptr<CCUDASharedMemory>, 3> mem = {};
-	smart_refctd_ptr<CCUDASharedSemaphore> cusema;
+	// CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory
+	std::array<smart_refctd_ptr<ICPUBuffer>, 2> cpuBufs;
+	// CUDA resources that we input to the kernel 'vectorAdd_kernel.cu'
+	// Kernel writes to cudaMemories[2] which we later use to export and read on nabla side
+	std::array<smart_refctd_ptr<CCUDASharedMemory>, 3> cudaMemories = {};
+	// A semaphore created in CUDA which will be imported into Nabla to help sync between the CUDA kernel and and Nabla device to host transfer
+	smart_refctd_ptr<CCUDASharedSemaphore> cudaSemaphore;
 
-	smart_refctd_ptr<IGPUBuffer> importedbuf, stagingbuf, stagingbuf2;
-	smart_refctd_ptr<IGPUImage> importedimg;
-	smart_refctd_ptr<ISemaphore> sema;
-	smart_refctd_ptr<IGPUCommandPool> commandPool;
-	smart_refctd_ptr<IGPUCommandBuffer> cmd;
+	// our Buffer that is bound to cudaMemories[2]
+	smart_refctd_ptr<IGPUBuffer> importedBuf;
+	// our Image that is also bound to cudaMemories[2]
+	smart_refctd_ptr<IGPUImage> importedImg;
 
+	// host visible buffers that we use to copy from the resources above after CUDA kernel is done writing
+	smart_refctd_ptr<IGPUBuffer> stagingBufs[2];
+
+	// Nabla semaphore that aliases the cudaSemaphore above
+	smart_refctd_ptr<ISemaphore> importedSemaphore;
+
+	smart_refctd_ptr<IGPUCommandPool> commandPool;
+	smart_refctd_ptr<IGPUCommandBuffer> cmd[2];
 
 	bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
 	{
-		// Remember to call the base class initialization!
-		if (!base_t::onAppInitialized(std::move(system)))
-			return false;
-
+        // Remember to call the base class initialization!
+        if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+            return false;
+        if (!asset_base_t::onAppInitialized(std::move(system)))
+            return false;
+        
 		{
-			cudaHandler = CCUDAHandler::create(system.get(), smart_refctd_ptr<ILogger>(m_logger));
-			assert(cudaHandler);
+			cudaHandler = CCUDAHandler::create(m_system.get(), smart_refctd_ptr<ILogger>(m_logger));
+			if (!cudaHandler) return logFail("Could not create a CUDA handler!");
 			cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast<CVulkanConnection>(m_api), m_physicalDevice);
+			if (!cudaDevice) return logFail("Could not create a CUDA Device!");
 		}
 
 		
-		queue = base_t::getComputeQueue();
+		queue = device_base_t::getComputeQueue();
 		
 		createResources();
 
 		smart_refctd_ptr<ICPUBuffer> ptx;
 		{
-			ISystem::future_t<smart_refctd_ptr<IFile>> fut;
-			m_system->createFile(fut, "../vectorAdd_kernel.cu", IFileBase::ECF_READ);
-			auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(fut.copy().get(), cudaDevice->geDefaultCompileOptions());
-			ASSERT_SUCCESS_NV(res);
+			IAssetLoader::SAssetLoadParams lp = {};
+			lp.logger = m_logger.get();
+			lp.workingDirectory = ""; // virtual root
+			// this time we load a shader directly from a file
+			auto assetBundle = m_assetMgr->getAsset("app_resources/vectorAdd_kernel.cu", lp);
+			const auto assets = assetBundle.getContents();
+			if (assets.empty())
+				return logFail("Could not load kernel!");
+
+			smart_refctd_ptr<ICPUBuffer> source = IAsset::castDown<ICPUBuffer>(assets[0]);
+			std::string log;
+			auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()), 
+				"app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log);
+			ASSERT_SUCCESS_NV(res, log);
+
 			ptx = std::move(ptx_);
 		}
 		CUmodule   module;
@@ -119,37 +155,46 @@ class CUDA2VKApp : public examples::MonoDeviceApplication
 	{
 		auto& cu = cudaHandler->getCUDAFunctionTable();
 
-		for (auto& buf : cpubuffers)
+		for (auto& buf : cpuBufs)
 			buf = make_smart_refctd_ptr<ICPUBuffer>(size);
 
 		for (auto j = 0; j < 2; j++)
 			for (auto i = 0; i < numElements; i++)
-				reinterpret_cast<float*>(cpubuffers[j]->getPointer())[i] = rand() / float(RAND_MAX);
-
+				reinterpret_cast<float*>(cpuBufs[j]->getPointer())[i] = rand() / float(RAND_MAX);
 
 
-		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
-		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
-		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&mem[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
+		// create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper
+		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
+		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
+		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
 		
-		sema = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 });
-		ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cusema, sema.get()));
+		importedSemaphore = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 });
+		ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cudaSemaphore, importedSemaphore.get()));
 		{
-			auto devmemory = mem[2]->exportAsMemory(m_device.get());
-			assert(devmemory);
+			// export the CUmem we have just created into a refctd IDeviceMemoryAllocation
+			auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get());
+			if (!devmemory)
+				logFail("Failed to export CUDA memory!");
+
+
+			// create an importing external buffer on Nabla side
 			IGPUBuffer::SCreationParams params = {};
 			params.size = devmemory->getAllocationSize();
 			params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT;
 			params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
-			importedbuf = m_device->createBuffer(std::move(params));
-			assert(importedbuf);
-			ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedbuf.get(), .binding = {.memory = devmemory.get() } };
+			importedBuf = m_device->createBuffer(std::move(params));
+			if (!importedBuf) logFail("Failed to create an external buffer");
+
+			// bind that imported IDeviceMemoryAllocation to the external buffer we've just created
+			ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedBuf.get(), .binding = {.memory = devmemory.get() } };
 			bool re = m_device->bindBufferMemory(1, &bindInfo);
-			assert(re);
+			if (!re) logFail("Failed to bind CUDA memory to buffer");
 		}
 
 		{
-			
+			// same thing as above
+			// we create an external image and bind the imported external memory to it
+			// now we have 2 different resources that are bound to the same memory
 			IImage::SCreationParams params = {};
 			params.type = IGPUImage::ET_2D;
 			params.samples = IGPUImage::ESCF_1_BIT;
@@ -158,48 +203,89 @@ class CUDA2VKApp : public examples::MonoDeviceApplication
 			params.mipLevels = 1;
 			params.arrayLayers = 1;
 			params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT;
-			importedimg = mem[2]->createAndBindImage(m_device.get(), std::move(params));
-			assert(importedimg);
+			importedImg = cudaMemories[2]->createAndBindImage(m_device.get(), std::move(params));
+			if (!importedImg) logFail("Failed to create an external image");
 		}
 		
-		commandPool = m_device->createCommandPool(queue->getFamilyIndex(), {});
-		bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd, smart_refctd_ptr(m_logger));
-		assert(re);
+		commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+		bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 2, cmd, smart_refctd_ptr(m_logger));
 
-		auto createStaging = [logicalDevice= m_device]()
+		auto createStaging = [this,logicalDevice= m_device]()
 		{
 			auto buf = logicalDevice->createBuffer({ {.size = size, .usage = asset::IBuffer::EUF_TRANSFER_DST_BIT} });
 			auto req = buf->getMemoryReqs();
 			req.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDownStreamingMemoryTypeBits();
 			auto allocation = logicalDevice->allocate(req, buf.get());
-			assert(allocation.isValid() && buf->getBoundMemory().memory->isMappable());
 			
-			bool re = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ);
-			assert(re && allocation.memory->getMappedPointer());
-			memset(allocation.memory->getMappedPointer(), 0, req.size);
+			void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ);
+			if (!mapping) 
+				logFail("Failed to map an staging buffer");
+			memset(mapping, 0, req.size);
 			return buf;
 		};
 
-		stagingbuf = createStaging();
-		stagingbuf2 = createStaging();
+		stagingBufs[0] = createStaging();
+		stagingBufs[1] = createStaging();
 	}
 
 	void launchKernel(CUfunction kernel, CUstream stream)
 	{
+
+		// First we record a release ownership transfer to let vulkan know that resources are going to be used in an external API
+		{
+			IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
+					.barrier = {
+						.ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE,
+						.otherQueueFamilyIndex = IQueue::FamilyExternal,
+					},
+					.range = {.buffer = importedBuf, },
+			};
+
+			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarrier = {
+				.barrier = {
+					.ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE,
+					.otherQueueFamilyIndex = IQueue::FamilyExternal,
+				},
+				.image = importedImg.get(),
+				.subresourceRange = {
+					.aspectMask = IImage::EAF_COLOR_BIT,
+					.levelCount = 1u,
+					.layerCount = 1u,
+				}
+			};
+			// start recording
+			bool re = true;
+			re &= cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			re &= cmd[0]->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}, .imgBarriers = {&imgBarrier,&imgBarrier + 1} });
+			re &= cmd[0]->end();
+
+			IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = importedSemaphore.get(), .value = 1 };
+			IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get()};
+			IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1}, .signalSemaphores = {&signalInfo,&signalInfo + 1} };
+			auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 });
+			re &= IQueue::RESULT::SUCCESS == submitRe;
+			if (!re)
+				logFail("Something went wrong readying resources for CUDA");
+		}
+		
 		auto& cu = cudaHandler->getCUDAFunctionTable();
 		// Launch kernel
 		{
 			CUdeviceptr ptrs[] = {
-				mem[0]->getDeviceptr(),
-				mem[1]->getDeviceptr(),
-				mem[2]->getDeviceptr(),
+				cudaMemories[0]->getDeviceptr(),
+				cudaMemories[1]->getDeviceptr(),
+				cudaMemories[2]->getDeviceptr(),
 			};
-			void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numElements };
-			ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpubuffers[0]->getPointer(), size, stream));
-			ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpubuffers[1]->getPointer(), size, stream));
+			auto numEles = numElements;
+			void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numEles };
+			ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), size, stream));
+			ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), size, stream));
+
+			auto semaphore = cudaSemaphore->getInternalObject();
+			CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
+			ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream)); // Wait for release op from vulkan
 			ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr));
-			CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 1 } } };
-			auto semaphore = cusema->getInternalObject();
+			CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } };
 			ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore
 		}
 		
@@ -208,65 +294,66 @@ class CUDA2VKApp : public examples::MonoDeviceApplication
 			IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
 				.barrier = {
 					.dep = {
-						.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
-						.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS,
+						.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
+						.dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT,
 					},
 					.ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
-					.otherQueueFamilyIndex = queue->getFamilyIndex(),
+					.otherQueueFamilyIndex = IQueue::FamilyExternal,
 				},
-				.range = { .buffer = importedbuf, },
+				.range = { .buffer = importedBuf, },
 			};
-
 			bool re = true;
-			re &= cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-			re &= cmd->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1} });
+			re &= cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+
+			re &= cmd[1]->pipelineBarrier(EDF_NONE, {.bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}});
 
 			IGPUCommandBuffer::SBufferCopy region = { .size = size };
-			re &= cmd->copyBuffer(importedbuf.get(), stagingbuf.get(), 1, &region);
+			re &= cmd[1]->copyBuffer(importedBuf.get(), stagingBufs[0].get(), 1, &region);
 
 			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarrier = {
 				.barrier = { 
 					.dep = { 
-						.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, 
-						.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS | ACCESS_FLAGS::MEMORY_WRITE_BITS,
+						.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
+						.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS,
 					},
 					.ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
-					.otherQueueFamilyIndex = queue->getFamilyIndex(),
+					.otherQueueFamilyIndex = IQueue::FamilyExternal,
 				},
-				.image = importedimg.get(),
+				.image = importedImg.get(),
 				.subresourceRange = {
 					.aspectMask = IImage::EAF_COLOR_BIT,
 					.levelCount = 1u,
 					.layerCount = 1u,
 				},
-				.oldLayout = IImage::LAYOUT::UNDEFINED,
+				.oldLayout = IImage::LAYOUT::PREINITIALIZED,
 				.newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL,
 			};
 
-			re &= cmd->pipelineBarrier(EDF_NONE, { .imgBarriers = {&imgBarrier,&imgBarrier + 1} });
+			re &= cmd[1]->pipelineBarrier(EDF_NONE, {.imgBarriers = {&imgBarrier,&imgBarrier + 1}});
 
 			IImage::SBufferCopy imgRegion = {
 				.imageSubresource = {
 					.aspectMask = imgBarrier.subresourceRange.aspectMask,
 					.layerCount = imgBarrier.subresourceRange.layerCount,
 				},
-				.imageExtent = importedimg->getCreationParameters().extent,
+				.imageExtent = importedImg->getCreationParameters().extent,
 			};
 
-			re &= cmd->copyImageToBuffer(importedimg.get(), imgBarrier.newLayout, stagingbuf2.get(), 1, &imgRegion);
-			re &= cmd->end();
+			re &= cmd[1]->copyImageToBuffer(importedImg.get(), imgBarrier.newLayout, stagingBufs[1].get(), 1, &imgRegion);
+			re &= cmd[1]->end();
 			
-			auto waitSemaphores = std::array{IQueue::SSubmitInfo::SSemaphoreInfo{.semaphore = sema.get(), .value = 1, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, }};
-			auto signalSemaphores = std::array{IQueue::SSubmitInfo::SSemaphoreInfo{.semaphore = sema.get(), .value = 2, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, }};
-			auto commandBuffers = std::array{IQueue::SSubmitInfo::SCommandBufferInfo{cmd.get()}};
-			auto submitInfo = std::array{IQueue::SSubmitInfo {
-				.waitSemaphores = waitSemaphores,
-				.commandBuffers = commandBuffers,
-				.signalSemaphores = signalSemaphores,
-			}};
-			auto submitRe = queue->submit(submitInfo);
+			IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { .semaphore = importedSemaphore.get(), .value = 2 };
+			IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = importedSemaphore.get(), .value = 3 };
+			IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() };
+			IQueue::SSubmitInfo submitInfo = { 
+				.waitSemaphores = {&waitInfo,&waitInfo + 1},
+				.commandBuffers = {&cmdInfo, &cmdInfo + 1},  
+				.signalSemaphores = {&signalInfo,&signalInfo + 1} 
+			};
+			auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 });
 			re &= IQueue::RESULT::SUCCESS == submitRe;
-			assert(re);
+			if (!re)
+				logFail("Something went wrong copying results from CUDA");
 		}
         
 		ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, [](void* userData) { decltype(this)(userData)->kernelCallback(); }, this));
@@ -275,20 +362,23 @@ class CUDA2VKApp : public examples::MonoDeviceApplication
 	void kernelCallback()
 	{
 		// Make sure we are also done with the readback
-		auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = sema.get(), .value = 2}};
+		auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = importedSemaphore.get(), .value = 3}};
 		m_device->waitForSemaphores(wait, true, -1);
 
-		float* A = reinterpret_cast<float*>(cpubuffers[0]->getPointer());
-		float* B = reinterpret_cast<float*>(cpubuffers[1]->getPointer());
-		float* CBuf = reinterpret_cast<float*>(stagingbuf->getBoundMemory().memory->getMappedPointer());
-		float* CImg = reinterpret_cast<float*>(stagingbuf2->getBoundMemory().memory->getMappedPointer());
+		float* A = reinterpret_cast<float*>(cpuBufs[0]->getPointer());
+		float* B = reinterpret_cast<float*>(cpuBufs[1]->getPointer());
+
+		float* CBuf = reinterpret_cast<float*>(stagingBufs[0]->getBoundMemory().memory->getMappedPointer());
+		float* CImg = reinterpret_cast<float*>(stagingBufs[1]->getBoundMemory().memory->getMappedPointer());
 
-		assert(!memcmp(CBuf, CImg, size));
+		if(memcmp(CBuf, CImg, size))
+			logFail("Buffer and Image memories do not match!");
 
 		for (auto i = 0; i < numElements; i++)
 		{
 			bool re = (abs(CBuf[i] - A[i] - B[i]) < 0.01f) && (abs(CImg[i] - A[i] - B[i]) < 0.01f);
-			assert(re);
+			if(!re)
+				logFail("Element at index %d is incorrect!", i);
 		}
 		
 		std::cout << "Success\n";

From 59ebd29b4eaf8ce530fd75b0e17478e59b7310d2 Mon Sep 17 00:00:00 2001
From: atkurtul <atilkurtulmus@gmail.com>
Date: Fri, 19 Jan 2024 01:11:33 +0300
Subject: [PATCH 16/17] filter cuda compatible devices

---
 63_CUDAInterop/main.cpp | 701 ++++++++++++++++++++--------------------
 1 file changed, 357 insertions(+), 344 deletions(-)

diff --git a/63_CUDAInterop/main.cpp b/63_CUDAInterop/main.cpp
index af1ec82cf..1d67dbee3 100644
--- a/63_CUDAInterop/main.cpp
+++ b/63_CUDAInterop/main.cpp
@@ -22,26 +22,26 @@ user for the desired renderer and start it up.
 
 bool check_cuda_err(cudaError_enum err, auto& cu, auto& logger, auto file, auto line)
 {
-	if (auto re = err; CUDA_SUCCESS != re) 
-	{
-		const char* name = 0, * str = 0;
-		cu.pcuGetErrorName(re, &name);
-		cu.pcuGetErrorString(re, &str);
-		logger->log("%s:%d %s:\n\t%s\n", system::ILogger::ELL_ERROR, file, line, name, str);
-		return false;
-	}
-	return true;
+    if (auto re = err; CUDA_SUCCESS != re) 
+    {
+        const char* name = 0, * str = 0;
+        cu.pcuGetErrorName(re, &name);
+        cu.pcuGetErrorString(re, &str);
+        logger->log("%s:%d %s:\n\t%s\n", system::ILogger::ELL_ERROR, file, line, name, str);
+        return false;
+    }
+    return true;
 }
 
 bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto line, std::string const& log)
 {
-	if (auto re = err; NVRTC_SUCCESS != re) 
-	{
-		const char* str = cudaHandler->getNVRTCFunctionTable().pnvrtcGetErrorString(re); 
-		logger->log("%s:%d %s\n%s\n", system::ILogger::ELL_ERROR, file, line, str, log.c_str());
-		return false;
-	}
-	return true;
+    if (auto re = err; NVRTC_SUCCESS != re) 
+    {
+        const char* str = cudaHandler->getNVRTCFunctionTable().pnvrtcGetErrorString(re); 
+        logger->log("%s:%d %s\n%s\n", system::ILogger::ELL_ERROR, file, line, str, log.c_str());
+        return false;
+    }
+    return true;
 }
 
 #define ASSERT_SUCCESS(expr) { auto re = check_cuda_err((expr), cu, m_logger, __FILE__, __LINE__); assert(re); }
@@ -53,342 +53,355 @@ static_assert(false);
 
 class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication
 {
-	using device_base_t = examples::MonoDeviceApplication;
-	using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication;
+    using device_base_t = examples::MonoDeviceApplication;
+    using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication;
 
-	static constexpr uint32_t gridDim[3] = { 4096,1,1 };
-	static constexpr uint32_t blockDim[3] = { 1024,1,1 };
-	static constexpr size_t numElements = gridDim[0] * blockDim[0];
-	static constexpr size_t size = sizeof(float) * numElements;
+    static constexpr uint32_t gridDim[3] = { 4096,1,1 };
+    static constexpr uint32_t blockDim[3] = { 1024,1,1 };
+    static constexpr size_t numElements = gridDim[0] * blockDim[0];
+    static constexpr size_t size = sizeof(float) * numElements;
 public:
-	// Yay thanks to multiple inheritance we cannot forward ctors anymore
-	CUDA2VKApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
-		system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
-
-	smart_refctd_ptr<CCUDAHandler> cudaHandler;
-	smart_refctd_ptr<CCUDADevice> cudaDevice;
-
-	IQueue* queue;
-
-	// CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory
-	std::array<smart_refctd_ptr<ICPUBuffer>, 2> cpuBufs;
-	// CUDA resources that we input to the kernel 'vectorAdd_kernel.cu'
-	// Kernel writes to cudaMemories[2] which we later use to export and read on nabla side
-	std::array<smart_refctd_ptr<CCUDASharedMemory>, 3> cudaMemories = {};
-	// A semaphore created in CUDA which will be imported into Nabla to help sync between the CUDA kernel and and Nabla device to host transfer
-	smart_refctd_ptr<CCUDASharedSemaphore> cudaSemaphore;
-
-	// our Buffer that is bound to cudaMemories[2]
-	smart_refctd_ptr<IGPUBuffer> importedBuf;
-	// our Image that is also bound to cudaMemories[2]
-	smart_refctd_ptr<IGPUImage> importedImg;
+    // Yay thanks to multiple inheritance we cannot forward ctors anymore
+    CUDA2VKApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+        system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+
+    smart_refctd_ptr<CCUDAHandler> cudaHandler;
+    smart_refctd_ptr<CCUDADevice> cudaDevice;
+
+    IQueue* queue;
+
+    // CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory
+    std::array<smart_refctd_ptr<ICPUBuffer>, 2> cpuBufs;
+    // CUDA resources that we input to the kernel 'vectorAdd_kernel.cu'
+    // Kernel writes to cudaMemories[2] which we later use to export and read on nabla side
+    std::array<smart_refctd_ptr<CCUDASharedMemory>, 3> cudaMemories = {};
+    // A semaphore created in CUDA which will be imported into Nabla to help sync between the CUDA kernel and and Nabla device to host transfer
+    smart_refctd_ptr<CCUDASharedSemaphore> cudaSemaphore;
+
+    // our Buffer that is bound to cudaMemories[2]
+    smart_refctd_ptr<IGPUBuffer> importedBuf;
+    // our Image that is also bound to cudaMemories[2]
+    smart_refctd_ptr<IGPUImage> importedImg;
+
+    // host visible buffers that we use to copy from the resources above after CUDA kernel is done writing
+    smart_refctd_ptr<IGPUBuffer> stagingBufs[2];
+
+    // Nabla semaphore that aliases the cudaSemaphore above
+    smart_refctd_ptr<ISemaphore> importedSemaphore;
+
+    smart_refctd_ptr<IGPUCommandPool> commandPool;
+    smart_refctd_ptr<IGPUCommandBuffer> cmd[2];
+
+    // a device filter helps you create a set of physical devices that satisfy your requirements in terms of features, limits etc.
+    core::set<video::IPhysicalDevice*> filterDevices(const core::SRange<video::IPhysicalDevice* const>& physicalDevices) const override
+    {
+        auto devices = device_base_t::filterDevices(physicalDevices);
+        auto& cuDevices = cudaHandler->getAvailableDevices();
+        std::erase_if(devices, [&cuDevices](auto pdev) {
+            return cuDevices.end() == std::find_if(cuDevices.begin(), cuDevices.end(), [pdev](auto& cuDev) { return !memcmp(pdev->getProperties().deviceUUID, &cuDev.uuid, 16);  });
+        });
+        return devices;
+    }
+
+    bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+    {
+        // Remember to call the base class initialization!
+        if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
+            return false;
 
-	// host visible buffers that we use to copy from the resources above after CUDA kernel is done writing
-	smart_refctd_ptr<IGPUBuffer> stagingBufs[2];
+        cudaHandler = CCUDAHandler::create(m_system.get(), smart_refctd_ptr<ILogger>(m_logger));
+        if (!cudaHandler) 
+            return logFail("Could not create a CUDA handler!");
 
-	// Nabla semaphore that aliases the cudaSemaphore above
-	smart_refctd_ptr<ISemaphore> importedSemaphore;
+        if (!device_base_t::onAppInitialized(std::move(system)))
+            return false;
 
-	smart_refctd_ptr<IGPUCommandPool> commandPool;
-	smart_refctd_ptr<IGPUCommandBuffer> cmd[2];
+        cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast<CVulkanConnection>(m_api), m_physicalDevice);
+        if (!cudaDevice) 
+            return logFail("Could not create a CUDA Device!");
 
-	bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
-	{
-        // Remember to call the base class initialization!
-        if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
-            return false;
-        if (!asset_base_t::onAppInitialized(std::move(system)))
-            return false;
         
-		{
-			cudaHandler = CCUDAHandler::create(m_system.get(), smart_refctd_ptr<ILogger>(m_logger));
-			if (!cudaHandler) return logFail("Could not create a CUDA handler!");
-			cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast<CVulkanConnection>(m_api), m_physicalDevice);
-			if (!cudaDevice) return logFail("Could not create a CUDA Device!");
-		}
-
-		
-		queue = device_base_t::getComputeQueue();
-		
-		createResources();
-
-		smart_refctd_ptr<ICPUBuffer> ptx;
-		{
-			IAssetLoader::SAssetLoadParams lp = {};
-			lp.logger = m_logger.get();
-			lp.workingDirectory = ""; // virtual root
-			// this time we load a shader directly from a file
-			auto assetBundle = m_assetMgr->getAsset("app_resources/vectorAdd_kernel.cu", lp);
-			const auto assets = assetBundle.getContents();
-			if (assets.empty())
-				return logFail("Could not load kernel!");
-
-			smart_refctd_ptr<ICPUBuffer> source = IAsset::castDown<ICPUBuffer>(assets[0]);
-			std::string log;
-			auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()), 
-				"app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log);
-			ASSERT_SUCCESS_NV(res, log);
-
-			ptx = std::move(ptx_);
-		}
-		CUmodule   module;
-		CUfunction kernel;
-		CUstream   stream;
-
-		auto& cu = cudaHandler->getCUDAFunctionTable();
-
-		ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr));
-		ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"));
-		ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
-
-		launchKernel(kernel, stream);
-
-		ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));
-		ASSERT_SUCCESS(cu.pcuModuleUnload(module));
-		ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream));
-
-		m_device->waitIdle();
-		return true;
-	}
-
-	void createResources()
-	{
-		auto& cu = cudaHandler->getCUDAFunctionTable();
-
-		for (auto& buf : cpuBufs)
-			buf = make_smart_refctd_ptr<ICPUBuffer>(size);
-
-		for (auto j = 0; j < 2; j++)
-			for (auto i = 0; i < numElements; i++)
-				reinterpret_cast<float*>(cpuBufs[j]->getPointer())[i] = rand() / float(RAND_MAX);
-
-
-		// create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper
-		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
-		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
-		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
-		
-		importedSemaphore = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 });
-		ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cudaSemaphore, importedSemaphore.get()));
-		{
-			// export the CUmem we have just created into a refctd IDeviceMemoryAllocation
-			auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get());
-			if (!devmemory)
-				logFail("Failed to export CUDA memory!");
-
-
-			// create an importing external buffer on Nabla side
-			IGPUBuffer::SCreationParams params = {};
-			params.size = devmemory->getAllocationSize();
-			params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT;
-			params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
-			importedBuf = m_device->createBuffer(std::move(params));
-			if (!importedBuf) logFail("Failed to create an external buffer");
-
-			// bind that imported IDeviceMemoryAllocation to the external buffer we've just created
-			ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedBuf.get(), .binding = {.memory = devmemory.get() } };
-			bool re = m_device->bindBufferMemory(1, &bindInfo);
-			if (!re) logFail("Failed to bind CUDA memory to buffer");
-		}
-
-		{
-			// same thing as above
-			// we create an external image and bind the imported external memory to it
-			// now we have 2 different resources that are bound to the same memory
-			IImage::SCreationParams params = {};
-			params.type = IGPUImage::ET_2D;
-			params.samples = IGPUImage::ESCF_1_BIT;
-			params.format = EF_R32_SFLOAT;
-			params.extent = { gridDim[0], blockDim[0], 1 };
-			params.mipLevels = 1;
-			params.arrayLayers = 1;
-			params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT;
-			importedImg = cudaMemories[2]->createAndBindImage(m_device.get(), std::move(params));
-			if (!importedImg) logFail("Failed to create an external image");
-		}
-		
-		commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-		bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 2, cmd, smart_refctd_ptr(m_logger));
-
-		auto createStaging = [this,logicalDevice= m_device]()
-		{
-			auto buf = logicalDevice->createBuffer({ {.size = size, .usage = asset::IBuffer::EUF_TRANSFER_DST_BIT} });
-			auto req = buf->getMemoryReqs();
-			req.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDownStreamingMemoryTypeBits();
-			auto allocation = logicalDevice->allocate(req, buf.get());
-			
-			void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ);
-			if (!mapping) 
-				logFail("Failed to map an staging buffer");
-			memset(mapping, 0, req.size);
-			return buf;
-		};
-
-		stagingBufs[0] = createStaging();
-		stagingBufs[1] = createStaging();
-	}
-
-	void launchKernel(CUfunction kernel, CUstream stream)
-	{
-
-		// First we record a release ownership transfer to let vulkan know that resources are going to be used in an external API
-		{
-			IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
-					.barrier = {
-						.ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE,
-						.otherQueueFamilyIndex = IQueue::FamilyExternal,
-					},
-					.range = {.buffer = importedBuf, },
-			};
-
-			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarrier = {
-				.barrier = {
-					.ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE,
-					.otherQueueFamilyIndex = IQueue::FamilyExternal,
-				},
-				.image = importedImg.get(),
-				.subresourceRange = {
-					.aspectMask = IImage::EAF_COLOR_BIT,
-					.levelCount = 1u,
-					.layerCount = 1u,
-				}
-			};
-			// start recording
-			bool re = true;
-			re &= cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-			re &= cmd[0]->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}, .imgBarriers = {&imgBarrier,&imgBarrier + 1} });
-			re &= cmd[0]->end();
-
-			IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = importedSemaphore.get(), .value = 1 };
-			IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get()};
-			IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1}, .signalSemaphores = {&signalInfo,&signalInfo + 1} };
-			auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 });
-			re &= IQueue::RESULT::SUCCESS == submitRe;
-			if (!re)
-				logFail("Something went wrong readying resources for CUDA");
-		}
-		
-		auto& cu = cudaHandler->getCUDAFunctionTable();
-		// Launch kernel
-		{
-			CUdeviceptr ptrs[] = {
-				cudaMemories[0]->getDeviceptr(),
-				cudaMemories[1]->getDeviceptr(),
-				cudaMemories[2]->getDeviceptr(),
-			};
-			auto numEles = numElements;
-			void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numEles };
-			ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), size, stream));
-			ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), size, stream));
-
-			auto semaphore = cudaSemaphore->getInternalObject();
-			CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
-			ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream)); // Wait for release op from vulkan
-			ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr));
-			CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } };
-			ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore
-		}
-		
-		// After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA
-		{
-			IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
-				.barrier = {
-					.dep = {
-						.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
-						.dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT,
-					},
-					.ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
-					.otherQueueFamilyIndex = IQueue::FamilyExternal,
-				},
-				.range = { .buffer = importedBuf, },
-			};
-			bool re = true;
-			re &= cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-
-			re &= cmd[1]->pipelineBarrier(EDF_NONE, {.bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}});
-
-			IGPUCommandBuffer::SBufferCopy region = { .size = size };
-			re &= cmd[1]->copyBuffer(importedBuf.get(), stagingBufs[0].get(), 1, &region);
-
-			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarrier = {
-				.barrier = { 
-					.dep = { 
-						.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
-						.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS,
-					},
-					.ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
-					.otherQueueFamilyIndex = IQueue::FamilyExternal,
-				},
-				.image = importedImg.get(),
-				.subresourceRange = {
-					.aspectMask = IImage::EAF_COLOR_BIT,
-					.levelCount = 1u,
-					.layerCount = 1u,
-				},
-				.oldLayout = IImage::LAYOUT::PREINITIALIZED,
-				.newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL,
-			};
-
-			re &= cmd[1]->pipelineBarrier(EDF_NONE, {.imgBarriers = {&imgBarrier,&imgBarrier + 1}});
-
-			IImage::SBufferCopy imgRegion = {
-				.imageSubresource = {
-					.aspectMask = imgBarrier.subresourceRange.aspectMask,
-					.layerCount = imgBarrier.subresourceRange.layerCount,
-				},
-				.imageExtent = importedImg->getCreationParameters().extent,
-			};
-
-			re &= cmd[1]->copyImageToBuffer(importedImg.get(), imgBarrier.newLayout, stagingBufs[1].get(), 1, &imgRegion);
-			re &= cmd[1]->end();
-			
-			IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { .semaphore = importedSemaphore.get(), .value = 2 };
-			IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = importedSemaphore.get(), .value = 3 };
-			IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() };
-			IQueue::SSubmitInfo submitInfo = { 
-				.waitSemaphores = {&waitInfo,&waitInfo + 1},
-				.commandBuffers = {&cmdInfo, &cmdInfo + 1},  
-				.signalSemaphores = {&signalInfo,&signalInfo + 1} 
-			};
-			auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 });
-			re &= IQueue::RESULT::SUCCESS == submitRe;
-			if (!re)
-				logFail("Something went wrong copying results from CUDA");
-		}
+        queue = device_base_t::getComputeQueue();
+        
+        createResources();
+
+        smart_refctd_ptr<ICPUBuffer> ptx;
+        {
+            IAssetLoader::SAssetLoadParams lp = {};
+            lp.logger = m_logger.get();
+            lp.workingDirectory = ""; // virtual root
+            // this time we load a shader directly from a file
+            auto assetBundle = m_assetMgr->getAsset("app_resources/vectorAdd_kernel.cu", lp);
+            const auto assets = assetBundle.getContents();
+            if (assets.empty())
+                return logFail("Could not load kernel!");
+
+            smart_refctd_ptr<ICPUBuffer> source = IAsset::castDown<ICPUBuffer>(assets[0]);
+            std::string log;
+            auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()), 
+                "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log);
+            ASSERT_SUCCESS_NV(res, log);
+
+            ptx = std::move(ptx_);
+        }
+        CUmodule   module;
+        CUfunction kernel;
+        CUstream   stream;
+
+        auto& cu = cudaHandler->getCUDAFunctionTable();
+
+        ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr));
+        ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"));
+        ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
+
+        launchKernel(kernel, stream);
+
+        ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));
+        ASSERT_SUCCESS(cu.pcuModuleUnload(module));
+        ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream));
+
+        m_device->waitIdle();
+        return true;
+    }
+
+    void createResources()
+    {
+        auto& cu = cudaHandler->getCUDAFunctionTable();
+
+        for (auto& buf : cpuBufs)
+            buf = make_smart_refctd_ptr<ICPUBuffer>(size);
+
+        for (auto j = 0; j < 2; j++)
+            for (auto i = 0; i < numElements; i++)
+                reinterpret_cast<float*>(cpuBufs[j]->getPointer())[i] = rand() / float(RAND_MAX);
+
+
+        // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper
+        ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
+        ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
+        ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
+        
+        importedSemaphore = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 });
+        ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cudaSemaphore, importedSemaphore.get()));
+        {
+            // export the CUmem we have just created into a refctd IDeviceMemoryAllocation
+            auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get());
+            if (!devmemory)
+                logFail("Failed to export CUDA memory!");
+
+
+            // create an importing external buffer on Nabla side
+            IGPUBuffer::SCreationParams params = {};
+            params.size = devmemory->getAllocationSize();
+            params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT;
+            params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
+            importedBuf = m_device->createBuffer(std::move(params));
+            if (!importedBuf) logFail("Failed to create an external buffer");
+
+            // bind that imported IDeviceMemoryAllocation to the external buffer we've just created
+            ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedBuf.get(), .binding = {.memory = devmemory.get() } };
+            bool re = m_device->bindBufferMemory(1, &bindInfo);
+            if (!re) logFail("Failed to bind CUDA memory to buffer");
+        }
+
+        {
+            // same thing as above
+            // we create an external image and bind the imported external memory to it
+            // now we have 2 different resources that are bound to the same memory
+            IImage::SCreationParams params = {};
+            params.type = IGPUImage::ET_2D;
+            params.samples = IGPUImage::ESCF_1_BIT;
+            params.format = EF_R32_SFLOAT;
+            params.extent = { gridDim[0], blockDim[0], 1 };
+            params.mipLevels = 1;
+            params.arrayLayers = 1;
+            params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT;
+            importedImg = cudaMemories[2]->createAndBindImage(m_device.get(), std::move(params));
+            if (!importedImg) logFail("Failed to create an external image");
+        }
+        
+        commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+        bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 2, cmd, smart_refctd_ptr(m_logger));
+
+        auto createStaging = [this,logicalDevice= m_device]()
+        {
+            auto buf = logicalDevice->createBuffer({ {.size = size, .usage = asset::IBuffer::EUF_TRANSFER_DST_BIT} });
+            auto req = buf->getMemoryReqs();
+            req.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDownStreamingMemoryTypeBits();
+            auto allocation = logicalDevice->allocate(req, buf.get());
+            
+            void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ);
+            if (!mapping) 
+                logFail("Failed to map an staging buffer");
+            memset(mapping, 0, req.size);
+            return buf;
+        };
+
+        stagingBufs[0] = createStaging();
+        stagingBufs[1] = createStaging();
+    }
+
+    void launchKernel(CUfunction kernel, CUstream stream)
+    {
+
+        // First we record a release ownership transfer to let vulkan know that resources are going to be used in an external API
+        {
+            IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
+                    .barrier = {
+                        .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE,
+                        .otherQueueFamilyIndex = IQueue::FamilyExternal,
+                    },
+                    .range = {.buffer = importedBuf, },
+            };
+
+            IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarrier = {
+                .barrier = {
+                    .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE,
+                    .otherQueueFamilyIndex = IQueue::FamilyExternal,
+                },
+                .image = importedImg.get(),
+                .subresourceRange = {
+                    .aspectMask = IImage::EAF_COLOR_BIT,
+                    .levelCount = 1u,
+                    .layerCount = 1u,
+                }
+            };
+            // start recording
+            bool re = true;
+            re &= cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+            re &= cmd[0]->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}, .imgBarriers = {&imgBarrier,&imgBarrier + 1} });
+            re &= cmd[0]->end();
+
+            IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = importedSemaphore.get(), .value = 1 };
+            IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get()};
+            IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1}, .signalSemaphores = {&signalInfo,&signalInfo + 1} };
+            auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 });
+            re &= IQueue::RESULT::SUCCESS == submitRe;
+            if (!re)
+                logFail("Something went wrong readying resources for CUDA");
+        }
+        
+        auto& cu = cudaHandler->getCUDAFunctionTable();
+        // Launch kernel
+        {
+            CUdeviceptr ptrs[] = {
+                cudaMemories[0]->getDeviceptr(),
+                cudaMemories[1]->getDeviceptr(),
+                cudaMemories[2]->getDeviceptr(),
+            };
+            auto numEles = numElements;
+            void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numEles };
+            ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), size, stream));
+            ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), size, stream));
+
+            auto semaphore = cudaSemaphore->getInternalObject();
+            CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
+            ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream)); // Wait for release op from vulkan
+            ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr));
+            CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } };
+            ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore
+        }
         
-		ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, [](void* userData) { decltype(this)(userData)->kernelCallback(); }, this));
-	}
-
-	void kernelCallback()
-	{
-		// Make sure we are also done with the readback
-		auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = importedSemaphore.get(), .value = 3}};
-		m_device->waitForSemaphores(wait, true, -1);
-
-		float* A = reinterpret_cast<float*>(cpuBufs[0]->getPointer());
-		float* B = reinterpret_cast<float*>(cpuBufs[1]->getPointer());
-
-		float* CBuf = reinterpret_cast<float*>(stagingBufs[0]->getBoundMemory().memory->getMappedPointer());
-		float* CImg = reinterpret_cast<float*>(stagingBufs[1]->getBoundMemory().memory->getMappedPointer());
-
-		if(memcmp(CBuf, CImg, size))
-			logFail("Buffer and Image memories do not match!");
-
-		for (auto i = 0; i < numElements; i++)
-		{
-			bool re = (abs(CBuf[i] - A[i] - B[i]) < 0.01f) && (abs(CImg[i] - A[i] - B[i]) < 0.01f);
-			if(!re)
-				logFail("Element at index %d is incorrect!", i);
-		}
-		
-		std::cout << "Success\n";
-	}
-
-	// Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization.
-	bool keepRunning() override { return false; }
-
-	// Platforms like WASM expect the main entry point to periodically return control, hence if you want a crossplatform app, you have to let the framework deal with your "game loop"
-	void workLoopBody() override {}
+        // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA
+        {
+            IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
+                .barrier = {
+                    .dep = {
+                        .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
+                        .dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT,
+                    },
+                    .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
+                    .otherQueueFamilyIndex = IQueue::FamilyExternal,
+                },
+                .range = { .buffer = importedBuf, },
+            };
+            bool re = true;
+            re &= cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+
+            re &= cmd[1]->pipelineBarrier(EDF_NONE, {.bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}});
+
+            IGPUCommandBuffer::SBufferCopy region = { .size = size };
+            re &= cmd[1]->copyBuffer(importedBuf.get(), stagingBufs[0].get(), 1, &region);
+
+            IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarrier = {
+                .barrier = { 
+                    .dep = { 
+                        .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
+                        .dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS,
+                    },
+                    .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
+                    .otherQueueFamilyIndex = IQueue::FamilyExternal,
+                },
+                .image = importedImg.get(),
+                .subresourceRange = {
+                    .aspectMask = IImage::EAF_COLOR_BIT,
+                    .levelCount = 1u,
+                    .layerCount = 1u,
+                },
+                .oldLayout = IImage::LAYOUT::PREINITIALIZED,
+                .newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL,
+            };
+
+            re &= cmd[1]->pipelineBarrier(EDF_NONE, {.imgBarriers = {&imgBarrier,&imgBarrier + 1}});
+
+            IImage::SBufferCopy imgRegion = {
+                .imageSubresource = {
+                    .aspectMask = imgBarrier.subresourceRange.aspectMask,
+                    .layerCount = imgBarrier.subresourceRange.layerCount,
+                },
+                .imageExtent = importedImg->getCreationParameters().extent,
+            };
+
+            re &= cmd[1]->copyImageToBuffer(importedImg.get(), imgBarrier.newLayout, stagingBufs[1].get(), 1, &imgRegion);
+            re &= cmd[1]->end();
+            
+            IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { .semaphore = importedSemaphore.get(), .value = 2 };
+            IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = importedSemaphore.get(), .value = 3 };
+            IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() };
+            IQueue::SSubmitInfo submitInfo = { 
+                .waitSemaphores = {&waitInfo,&waitInfo + 1},
+                .commandBuffers = {&cmdInfo, &cmdInfo + 1},  
+                .signalSemaphores = {&signalInfo,&signalInfo + 1} 
+            };
+            auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 });
+            re &= IQueue::RESULT::SUCCESS == submitRe;
+            if (!re)
+                logFail("Something went wrong copying results from CUDA");
+        }
+        
+        ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, [](void* userData) { decltype(this)(userData)->kernelCallback(); }, this));
+    }
+
+    void kernelCallback()
+    {
+        // Make sure we are also done with the readback
+        auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = importedSemaphore.get(), .value = 3}};
+        m_device->waitForSemaphores(wait, true, -1);
+
+        float* A = reinterpret_cast<float*>(cpuBufs[0]->getPointer());
+        float* B = reinterpret_cast<float*>(cpuBufs[1]->getPointer());
+
+        float* CBuf = reinterpret_cast<float*>(stagingBufs[0]->getBoundMemory().memory->getMappedPointer());
+        float* CImg = reinterpret_cast<float*>(stagingBufs[1]->getBoundMemory().memory->getMappedPointer());
+
+        if(memcmp(CBuf, CImg, size))
+            logFail("Buffer and Image memories do not match!");
+
+        for (auto i = 0; i < numElements; i++)
+        {
+            bool re = (abs(CBuf[i] - A[i] - B[i]) < 0.01f) && (abs(CImg[i] - A[i] - B[i]) < 0.01f);
+            if(!re)
+                logFail("Element at index %d is incorrect!", i);
+        }
+        
+        std::cout << "Success\n";
+    }
+
+    // Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization.
+    bool keepRunning() override { return false; }
+
+    // Platforms like WASM expect the main entry point to periodically return control, hence if you want a crossplatform app, you have to let the framework deal with your "game loop"
+    void workLoopBody() override {}
 };
 
 NBL_MAIN_FUNC(CUDA2VKApp)
\ No newline at end of file

From 73f147941ef5362d0adee47ae72b4088b8c49aa5 Mon Sep 17 00:00:00 2001
From: atkurtul <atilkurtulmus@gmail.com>
Date: Sat, 20 Jan 2024 01:24:13 +0300
Subject: [PATCH 17/17] fix some comments & add test code

---
 63_CUDAInterop/main.cpp | 179 ++++++++++++++++++++++++++++++++++------
 1 file changed, 155 insertions(+), 24 deletions(-)

diff --git a/63_CUDAInterop/main.cpp b/63_CUDAInterop/main.cpp
index 1d67dbee3..9508fa22a 100644
--- a/63_CUDAInterop/main.cpp
+++ b/63_CUDAInterop/main.cpp
@@ -75,7 +75,7 @@ class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::Mono
     // CUDA resources that we input to the kernel 'vectorAdd_kernel.cu'
     // Kernel writes to cudaMemories[2] which we later use to export and read on nabla side
     std::array<smart_refctd_ptr<CCUDASharedMemory>, 3> cudaMemories = {};
-    // A semaphore created in CUDA which will be imported into Nabla to help sync between the CUDA kernel and and Nabla device to host transfer
+    // A semaphore created in CUDA which will alias a Nabla semaphore to help sync between the CUDA kernel and Nabla device to host transfer
     smart_refctd_ptr<CCUDASharedSemaphore> cudaSemaphore;
 
     // our Buffer that is bound to cudaMemories[2]
@@ -86,8 +86,8 @@ class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::Mono
     // host visible buffers that we use to copy from the resources above after CUDA kernel is done writing
     smart_refctd_ptr<IGPUBuffer> stagingBufs[2];
 
-    // Nabla semaphore that aliases the cudaSemaphore above
-    smart_refctd_ptr<ISemaphore> importedSemaphore;
+    // Nabla semaphore for sync
+    smart_refctd_ptr<ISemaphore> semaphore;
 
     smart_refctd_ptr<IGPUCommandPool> commandPool;
     smart_refctd_ptr<IGPUCommandBuffer> cmd[2];
@@ -161,6 +161,9 @@ class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::Mono
         ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream));
 
         m_device->waitIdle();
+        
+        testInterop();
+
         return true;
     }
 
@@ -181,8 +184,8 @@ class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::Mono
         ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
         ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
         
-        importedSemaphore = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 });
-        ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cudaSemaphore, importedSemaphore.get()));
+        semaphore = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 });
+        ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cudaSemaphore, semaphore.get()));
         {
             // export the CUmem we have just created into a refctd IDeviceMemoryAllocation
             auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get());
@@ -196,7 +199,8 @@ class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::Mono
             params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT;
             params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
             importedBuf = m_device->createBuffer(std::move(params));
-            if (!importedBuf) logFail("Failed to create an external buffer");
+            if (!importedBuf) 
+                logFail("Failed to create an external buffer");
 
             // bind that imported IDeviceMemoryAllocation to the external buffer we've just created
             ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedBuf.get(), .binding = {.memory = devmemory.get() } };
@@ -223,24 +227,36 @@ class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::Mono
         commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
         bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 2, cmd, smart_refctd_ptr(m_logger));
 
-        auto createStaging = [this,logicalDevice= m_device]()
-        {
-            auto buf = logicalDevice->createBuffer({ {.size = size, .usage = asset::IBuffer::EUF_TRANSFER_DST_BIT} });
-            auto req = buf->getMemoryReqs();
-            req.memoryTypeBits &= logicalDevice->getPhysicalDevice()->getDownStreamingMemoryTypeBits();
-            auto allocation = logicalDevice->allocate(req, buf.get());
-            
-            void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ);
-            if (!mapping) 
-                logFail("Failed to map an staging buffer");
-            memset(mapping, 0, req.size);
-            return buf;
-        };
-
         stagingBufs[0] = createStaging();
         stagingBufs[1] = createStaging();
     }
 
+    smart_refctd_ptr<IGPUBuffer> createExternalBuffer(IDeviceMemoryAllocation* mem)
+    {
+        IGPUBuffer::SCreationParams params = {};
+        params.size = mem->getAllocationSize();
+        params.usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT;
+        params.externalHandleTypes = mem->getCreationParams().externalHandleType;
+        auto buf = m_device->createBuffer(std::move(params));
+        ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = buf.get(), .binding = {.memory = mem } };
+        m_device->bindBufferMemory(1, &bindInfo);
+        return buf;
+    }
+
+    smart_refctd_ptr<IGPUBuffer> createStaging(size_t sz = size)
+    {
+        auto buf = m_device->createBuffer({ {.size = sz, .usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT} });
+        auto req = buf->getMemoryReqs();
+        req.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits();
+        auto allocation = m_device->allocate(req, buf.get());
+
+        void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ);
+        if (!mapping)
+            logFail("Failed to map an staging buffer");
+        memset(mapping, 0, req.size);
+        return buf;
+    };
+
     void launchKernel(CUfunction kernel, CUstream stream)
     {
 
@@ -272,7 +288,7 @@ class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::Mono
             re &= cmd[0]->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}, .imgBarriers = {&imgBarrier,&imgBarrier + 1} });
             re &= cmd[0]->end();
 
-            IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = importedSemaphore.get(), .value = 1 };
+            IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 1 };
             IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get()};
             IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1}, .signalSemaphores = {&signalInfo,&signalInfo + 1} };
             auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 });
@@ -355,8 +371,8 @@ class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::Mono
             re &= cmd[1]->copyImageToBuffer(importedImg.get(), imgBarrier.newLayout, stagingBufs[1].get(), 1, &imgRegion);
             re &= cmd[1]->end();
             
-            IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { .semaphore = importedSemaphore.get(), .value = 2 };
-            IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = importedSemaphore.get(), .value = 3 };
+            IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { .semaphore = semaphore.get(), .value = 2 };
+            IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 3 };
             IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() };
             IQueue::SSubmitInfo submitInfo = { 
                 .waitSemaphores = {&waitInfo,&waitInfo + 1},
@@ -375,7 +391,7 @@ class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::Mono
     void kernelCallback()
     {
         // Make sure we are also done with the readback
-        auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = importedSemaphore.get(), .value = 3}};
+        auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = semaphore.get(), .value = 3}};
         m_device->waitForSemaphores(wait, true, -1);
 
         float* A = reinterpret_cast<float*>(cpuBufs[0]->getPointer());
@@ -397,6 +413,121 @@ class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::Mono
         std::cout << "Success\n";
     }
 
+
+    void testInterop()
+    {
+        {
+            IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
+                .size = size,
+                .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
+                .alignmentLog2 = 10,
+            };
+
+            for (size_t i = 0; i < (1 << 8); ++i)
+            {
+                auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
+                assert(memory);
+                auto tmpBuf = createExternalBuffer(memory.get());
+            }
+        }
+
+        smart_refctd_ptr<IDeviceMemoryAllocation> escaped;
+        {
+            IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
+                .size = size,
+                .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
+                .alignmentLog2 = 10,
+            };
+
+            auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
+
+            auto tmpBuf = createExternalBuffer(memory.get());
+            auto staging = createStaging();
+
+            auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer();
+            for (uint32_t i = 0; i < size / 4; ++i)
+                ptr[i] = i;
+
+            smart_refctd_ptr<IGPUCommandBuffer> cmd;
+            commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd);
+            cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+            IGPUCommandBuffer::SBufferCopy region = { .size = size };
+            assert(cmd->copyBuffer(staging.get(), tmpBuf.get(), 1, &region));
+            cmd->end();
+            IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() };
+            IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} };
+            queue->submit({ &submitInfo,&submitInfo + 1 });
+            m_device->waitIdle();
+            escaped = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, memory->getCreationParams().externalHandle).memory;
+        }
+
+        //{
+        //    constexpr size_t M = 32;
+        //    auto staging = createStaging(size * M);
+
+        //    auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer();
+        //    for (uint32_t i = 0; i < (M * size) / 4; ++i)
+        //        ptr[i] = rand();
+
+        //    std::vector<smart_refctd_ptr<IGPUCommandBuffer>> cmd(1 << 10);
+        //    commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1 << 10, cmd.data());
+
+        //    for (size_t i = 0; i < 1 << 10; ++i)
+        //    {
+        //        IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
+        //            .size = size * M,
+        //            .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
+        //            .alignmentLog2 = 10,
+        //        };
+        //    RE:
+        //        auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
+
+        //        if (!memory)
+        //        {
+        //            m_device->waitIdle();
+        //            for (size_t j = 0; j < i; ++j)
+        //                cmd[j] = 0;
+        //            goto END;
+        //        }
+        //        assert(memory);
+        //        auto tmpBuf = createExternalBuffer(memory.get());
+
+        //        cmd[i]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+        //        IGPUCommandBuffer::SBufferCopy region = { .size = size * M };
+        //        assert(cmd[i]->copyBuffer(staging.get(), tmpBuf.get(), 1, &region));
+        //        cmd[i]->end();
+        //        IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[i].get() };
+        //        IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} };
+        //        assert(IQueue::RESULT::SUCCESS == queue->submit({ &submitInfo,&submitInfo + 1 }));
+        //    }
+        //END:
+        //    m_device->waitIdle();
+        //}
+
+        {
+            auto tmpBuf = createExternalBuffer(escaped.get());
+            auto staging = createStaging();
+
+            smart_refctd_ptr<IGPUCommandBuffer> cmd;
+            commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd);
+            cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+            IGPUCommandBuffer::SBufferCopy region = { .size = size };
+            assert(cmd->copyBuffer(tmpBuf.get(), staging.get(), 1, &region));
+            cmd->end();
+            IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() };
+            IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} };
+            auto qre = queue->submit({ &submitInfo,&submitInfo + 1 });
+            assert(IQueue::RESULT::SUCCESS == qre);
+            m_device->waitIdle();
+
+            auto& ptr = *(std::array<uint32_t, size>*)staging->getBoundMemory().memory->getMappedPointer();
+            for (uint32_t i = 0; i < size / 4; ++i)
+                assert(ptr[i] == i);
+        }
+
+    }
+
+
     // Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization.
     bool keepRunning() override { return false; }