diff --git a/03_DeviceSelectionAndSharedSources/main.cpp b/03_DeviceSelectionAndSharedSources/main.cpp
index d9a8d9b1a..28872a5f5 100644
--- a/03_DeviceSelectionAndSharedSources/main.cpp
+++ b/03_DeviceSelectionAndSharedSources/main.cpp
@@ -32,7 +32,7 @@ class DeviceSelectionAndSharedSourcesApp final : public examples::MonoDeviceAppl
 	bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
 	{
 		// Remember to call the base class initialization!
-		if (!device_base_t::onAppInitialized(std::move(system)))
+		if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
 			return false;
 		if (!asset_base_t::onAppInitialized(std::move(system)))
 			return false;
diff --git a/22_CppCompat/test.hlsl.orig b/22_CppCompat/test.hlsl.orig
deleted file mode 100644
index b67b08062..000000000
--- a/22_CppCompat/test.hlsl.orig
+++ /dev/null
@@ -1,264 +0,0 @@
-//// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-//// This file is part of the "Nabla Engine".
-//// For conditions of distribution and use, see copyright notice in nabla.h
-
-#pragma shader_stage(compute)
-
-#define STATIC_ASSERT(C) { nbl::hlsl::conditional<C, int, void>::type a = 0; }
-
-#define IS_SAME(L,R) nbl::hlsl::is_same<L,R>::value
-#define SHADER_CRASHING_ASSERT(expr) \
-{ \
-	bool con = (expr); \
-	do { \
-		[branch] if (!con) \
-			vk::RawBufferStore(0xdeadbeefBADC0FFbull,0x45u,4u); \
-	} while(!con); \
-}
-
-
-#include 
-#include 
-
-#include 
-#include 
-
-#include 
-#include 
-#include 
-#include 
-
-#include 
-
-#include 
-#include 
-
-#include 
-
-struct PushConstants
-{
-	uint2 imgSize;
-};
-
-[[vk::push_constant]]
-PushConstants u_pushConstants;
-
-[[vk::binding(0, 0)]] Texture2D inImage;
-[[vk::binding(1, 0)]] RWTexture2D outImage;
-[[vk::binding(2, 0)]] Buffer inBuffer;
-[[vk::binding(3, 0)]] RWStructuredBuffer outBuffer;
-
-
-template<int N>
-struct Spec
-{
-	static const int value = Spec<N - 1>::value + 1;
-};
-
-template<>
-struct Spec<0>
-{
-	static const int value = 0;
-};
-
-Buffer unbounded[];
-
-template<typename T>
-bool val(T) { return nbl::hlsl::is_unbounded_array<T>::value; }
-
-
-template<typename T, uint32_t N>
-struct array
-{
-	T data[N];
-};
-
-void fill(uint3 invocationID, float val)
-{
-	outImage[invocationID.xy] = float4(val, val, val, val);
-	outBuffer[invocationID.x * invocationID.y] = float4(val, val, val, val);
-}
-
-void fill(uint3 invocationID, float4 val)
-{
-	outImage[invocationID.xy] = val;
-	outBuffer[invocationID.x * invocationID.y] = val;
-}
-
-
-[numthreads(8, 8, 1)]
-void main(uint3 invocationID : SV_DispatchThreadID)
-{
-	fill(invocationID, 1);
-	const float32_t3 TEST_VEC = float32_t3(1.0f, 2.0f, 3.0f);
-
-	fill(invocationID, 2);
-	// test functions from EOTF.hlsl
-	nbl::hlsl::colorspace::eotf::identity(TEST_VEC);
-	nbl::hlsl::colorspace::eotf::impl_shared_2_4(TEST_VEC, 0.5f);
-	nbl::hlsl::colorspace::eotf::sRGB(TEST_VEC);
-	nbl::hlsl::colorspace::eotf::Display_P3(TEST_VEC);
-	nbl::hlsl::colorspace::eotf::DCI_P3_XYZ(TEST_VEC);
-	nbl::hlsl::colorspace::eotf::SMPTE_170M(TEST_VEC);
-	nbl::hlsl::colorspace::eotf::SMPTE_ST2084(TEST_VEC);
-	nbl::hlsl::colorspace::eotf::HDR10_HLG(TEST_VEC);
-	nbl::hlsl::colorspace::eotf::AdobeRGB(TEST_VEC);
-	nbl::hlsl::colorspace::eotf::Gamma_2_2(TEST_VEC);
-	nbl::hlsl::colorspace::eotf::ACEScc(TEST_VEC);
-	nbl::hlsl::colorspace::eotf::ACEScct(TEST_VEC);
-
-	fill(invocationID, 3);
-
-	// test functions from OETF.hlsl
-	nbl::hlsl::colorspace::oetf::identity(TEST_VEC);
-	nbl::hlsl::colorspace::oetf::impl_shared_2_4(TEST_VEC, 0.5f);
-	nbl::hlsl::colorspace::oetf::sRGB(TEST_VEC);
-	nbl::hlsl::colorspace::oetf::Display_P3(TEST_VEC);
-	nbl::hlsl::colorspace::oetf::DCI_P3_XYZ(TEST_VEC);
-	nbl::hlsl::colorspace::oetf::SMPTE_170M(TEST_VEC);
-	nbl::hlsl::colorspace::oetf::SMPTE_ST2084(TEST_VEC);
-	nbl::hlsl::colorspace::oetf::HDR10_HLG(TEST_VEC);
-	nbl::hlsl::colorspace::oetf::AdobeRGB(TEST_VEC);
-	nbl::hlsl::colorspace::oetf::Gamma_2_2(TEST_VEC);
-	nbl::hlsl::colorspace::oetf::ACEScc(TEST_VEC);
-	nbl::hlsl::colorspace::oetf::ACEScct(TEST_VEC);
-
-	fill(invocationID, 4);
-	// xoroshiro tests
-	const uint32_t2 state = uint32_t2(12u, 34u);
-	nbl::hlsl::Xoroshiro64Star xoroshiro64Star = nbl::hlsl::Xoroshiro64Star::construct(state);
-	xoroshiro64Star();
-	nbl::hlsl::Xoroshiro64StarStar xoroshiro64StarStar = nbl::hlsl::Xoroshiro64StarStar::construct(state);
-	xoroshiro64StarStar();
-
-	//nbl::hlsl::mpl::countl_zero<2ull>::value;
-
-	// TODO: test if std::rotl/r == nbl::hlsl::rotr/l == nbl::hlsl::mpl::rotr/l
-
-	// uint32_t mplRotlResult0 = nbl::hlsl::mpl::rotl::value;
-	// uint32_t mplRotlResult1 = nbl::hlsl::mpl::rotl::value;
-	// uint32_t mplRotrResult0 = nbl::hlsl::mpl::rotr::value;
-	// uint32_t mplRotrResult1 = nbl::hlsl::mpl::rotr::value;
-
-	// uint32_t rotlResult0 = nbl::hlsl::mpl::rotl::value;
-	// uint32_t rotlResult1 = nbl::hlsl::mpl::rotl::value;
-	// uint32_t rotrResult0 = nbl::hlsl::mpl::rotr::value;
-	// uint32_t rotrResult1 = nbl::hlsl::mpl::rotr::value;
-
-	// SHADER_CRASHING_ASSERT(rotlResult0 == mplRotlResult0);
-	// SHADER_CRASHING_ASSERT(rotlResult1 == mplRotlResult1);
-	// SHADER_CRASHING_ASSERT(rotrResult0 == mplRotrResult0);
-	// SHADER_CRASHING_ASSERT(rotrResult1 == mplRotrResult1);
-
-	// TODO: more tests and compare with cpp version as well
-	fill(invocationID, 5);
-	// countl_zero test
-	{
-		static const uint16_t TEST_VALUE_0 = 5;
-		static const uint32_t TEST_VALUE_1 = 0x80000000u;
-		static const uint32_t TEST_VALUE_2 = 0x8000000000000000u;
-		static const uint32_t TEST_VALUE_3 = 0x00000001u;
-		static const uint32_t TEST_VALUE_4 = 0x0000000000000001u;
-
-
-		fill(invocationID, 5.01);
-		uint16_t compileTimeCountLZero = nbl::hlsl::mpl::countl_zero<TEST_VALUE_0>::value;
-		uint16_t runTimeCountLZero = nbl::hlsl::countl_zero(TEST_VALUE_0);
-		fill(invocationID, float4(5.1, compileTimeCountLZero, runTimeCountLZero, 0));
-		SHADER_CRASHING_ASSERT(compileTimeCountLZero == runTimeCountLZero);
-
-		compileTimeCountLZero = nbl::hlsl::mpl::countl_zero<TEST_VALUE_1>::value;
-		runTimeCountLZero = nbl::hlsl::countl_zero(TEST_VALUE_1);
-		fill(invocationID, float4(5.2, compileTimeCountLZero, runTimeCountLZero, 0));
-		SHADER_CRASHING_ASSERT(compileTimeCountLZero == runTimeCountLZero);
-
-		compileTimeCountLZero = nbl::hlsl::mpl::countl_zero<TEST_VALUE_2>::value;
-		runTimeCountLZero = nbl::hlsl::countl_zero(TEST_VALUE_2);
-		fill(invocationID, float4(5.3, compileTimeCountLZero, runTimeCountLZero, 0));
-		SHADER_CRASHING_ASSERT(compileTimeCountLZero == runTimeCountLZero);
-
-		compileTimeCountLZero = nbl::hlsl::mpl::countl_zero<TEST_VALUE_3>::value;
-		runTimeCountLZero = nbl::hlsl::countl_zero(TEST_VALUE_3);
-		fill(invocationID, float4(5.4, compileTimeCountLZero, runTimeCountLZero, 0));
-		SHADER_CRASHING_ASSERT(compileTimeCountLZero == runTimeCountLZero);
-
-		compileTimeCountLZero = nbl::hlsl::mpl::countl_zero<TEST_VALUE_4>::value;
-		runTimeCountLZero = nbl::hlsl::countl_zero(TEST_VALUE_4);
-		fill(invocationID, float4(5.5, compileTimeCountLZero, runTimeCountLZero, 0));
-		SHADER_CRASHING_ASSERT(compileTimeCountLZero == runTimeCountLZero);
-	}
-
-	{
-		bool A = Spec<3>::value == 3;
-	}
-	{
-		bool A = nbl::hlsl::is_integral::value;
-	}
-	{
-		bool A = val(unbounded);
-	}
-	fill(invocationID, 6);
-	{
-		float4 v;
-		fill(invocationID, float4(alignof(v.x), alignof(v), 0, 0));
-		SHADER_CRASHING_ASSERT(alignof(v.x) == alignof(v));
-	}
-
-	{
-		float4 v;
-		const volatile float4 u;
-
-		STATIC_ASSERT(IS_SAME(decltype(v.x), nbl::hlsl::impl::add_lvalue_reference::type));
-		STATIC_ASSERT(nbl::hlsl::impl::is_reference::value);
-		STATIC_ASSERT(IS_SAME(float, nbl::hlsl::impl::remove_reference::type));
-		STATIC_ASSERT(IS_SAME(decltype(v.x), nbl::hlsl::impl::add_lvalue_reference::type));
-		STATIC_ASSERT(IS_SAME(decltype(v.x), nbl::hlsl::impl::add_lvalue_reference::type>::type));
-
-		STATIC_ASSERT(IS_SAME(float, nbl::hlsl::remove_cvref::type));
-		STATIC_ASSERT(IS_SAME(nbl::hlsl::remove_cv::type, nbl::hlsl::impl::add_lvalue_reference::type));
-		STATIC_ASSERT(IS_SAME(nbl::hlsl::remove_cv::type, nbl::hlsl::impl::add_lvalue_reference::type>::type));
-	}
-	fill(invocationID, 7);
-	{
-		float x[4][4];
-		STATIC_ASSERT(IS_SAME(nbl::hlsl::remove_extent<decltype(x)>::type, float[4]));
-		STATIC_ASSERT(IS_SAME(nbl::hlsl::remove_all_extents<decltype(x)>::type, float));
-	}
-	fill(invocationID, 8);
-	{
-		STATIC_ASSERT(IS_SAME(nbl::hlsl::make_signed::type, nbl::hlsl::make_signed::type));
-		STATIC_ASSERT(IS_SAME(nbl::hlsl::make_unsigned::type, nbl::hlsl::make_unsigned::type));
-
-		STATIC_ASSERT(IS_SAME(nbl::hlsl::make_signed::type, nbl::hlsl::make_signed::type));
-		STATIC_ASSERT(IS_SAME(nbl::hlsl::make_unsigned::type, nbl::hlsl::make_unsigned::type));
-
-		STATIC_ASSERT(IS_SAME(nbl::hlsl::make_signed::type, nbl::hlsl::make_signed::type));
-		STATIC_ASSERT(IS_SAME(nbl::hlsl::make_unsigned::type, nbl::hlsl::make_unsigned::type));
-	}
-
-	{
-		int Q[3][4][5];
-		STATIC_ASSERT(3 == (nbl::hlsl::extent<decltype(Q)>::value));
-		STATIC_ASSERT(4 == (nbl::hlsl::extent<decltype(Q), 1>::value));
-		STATIC_ASSERT(5 == (nbl::hlsl::extent<decltype(Q), 2>::value));
-		STATIC_ASSERT(0 == (nbl::hlsl::extent<decltype(Q), 3>::value));
-	}
-	fill(invocationID, 9);
-	{
-		float32_t a;
-
-		a = nbl::hlsl::numeric_limits<float32_t>::min;
-		a = nbl::hlsl::numeric_limits<float32_t>::max;
-		a = nbl::hlsl::numeric_limits<float32_t>::lowest;
-		a = nbl::hlsl::numeric_limits<float32_t>::epsilon;
-		a = nbl::hlsl::numeric_limits<float32_t>::round_error;
-		a = nbl::hlsl::numeric_limits<float32_t>::infinity;
-		a = nbl::hlsl::numeric_limits<float32_t>::quiet_NaN;
-		a = nbl::hlsl::numeric_limits<float32_t>::signaling_NaN;
-		a = nbl::hlsl::numeric_limits<float32_t>::denorm_min;
-	}
-
-	fill(invocationID, 10);
-
-	fill(invocationID, -1);
-}
diff --git a/63_CUDAInterop/CMakeLists.txt b/63_CUDAInterop/CMakeLists.txt
new file mode 100644
index 000000000..bc1624875
--- /dev/null
+++ b/63_CUDAInterop/CMakeLists.txt
@@ -0,0 +1,24 @@
+include(common RESULT_VARIABLE RES)
+if(NOT RES)
+	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
+endif()
+
+nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
+
+if(NBL_EMBED_BUILTIN_RESOURCES)
+	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
+	set(RESOURCE_DIR "app_resources")
+
+	get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
+
+	file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
+	foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
+		LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
+	endforeach()
+
+	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
+
+	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
+endif()
\ No newline at end of file
diff --git a/63_CUDAInterop/app_resources/vectorAdd_kernel.cu b/63_CUDAInterop/app_resources/vectorAdd_kernel.cu
new file mode 100644
index 000000000..99c831121
--- /dev/null
+++ b/63_CUDAInterop/app_resources/vectorAdd_kernel.cu
@@ -0,0 +1,42 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * CUDA Kernel Device code
+ *
+ * Computes the vector addition of A and B into C. The 3 vectors have the same
+ * number of elements numElements.
+ */
+
+extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C,
+                                     int numElements) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+  if (i < numElements) {
+    C[i] = A[i] + B[i];
+  }
+}
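For orientation before the interop code: the example drives this kernel through the CUDA driver API (NVRTC to PTX, then cuModuleLoadDataEx and cuLaunchKernel). For comparison, a minimal runtime-API sketch of the same launch, assuming nvcc compiles it together with the kernel above; the names here are illustrative, not part of the example:

    // Hypothetical standalone host program; N mirrors numElements in main.cpp.
    #include <cuda_runtime.h>
    #include <cstdio>

    extern "C" __global__ void vectorAdd(const float*, const float*, float*, int);

    int main()
    {
        const int N = 4096 * 1024;
        float *A, *B, *C;
        cudaMallocManaged(&A, N * sizeof(float)); // unified memory, reachable from host and device
        cudaMallocManaged(&B, N * sizeof(float));
        cudaMallocManaged(&C, N * sizeof(float));
        for (int i = 0; i < N; ++i) { A[i] = 1.f; B[i] = 2.f; }
        vectorAdd<<<4096, 1024>>>(A, B, C, N);    // same gridDim/blockDim as the example
        cudaDeviceSynchronize();                  // wait for the kernel to finish
        printf("C[0] = %f\n", C[0]);              // expect 3.0
        cudaFree(A); cudaFree(B); cudaFree(C);
    }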
diff --git a/63_CUDAInterop/main.cpp b/63_CUDAInterop/main.cpp
new file mode 100644
index 000000000..9508fa22a
--- /dev/null
+++ b/63_CUDAInterop/main.cpp
@@ -0,0 +1,538 @@
+// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#include "nbl/video/CCUDAHandler.h"
+#include "nbl/video/CCUDASharedMemory.h"
+#include "nbl/video/CCUDASharedSemaphore.h"
+
+#include "../common/MonoDeviceApplication.hpp"
+#include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+
+using namespace nbl;
+using namespace core;
+using namespace system;
+using namespace asset;
+using namespace video;
+
+/*
+The main function starts like in most other examples: we ask the
+user for the desired renderer and start it up.
+*/
+
+bool check_cuda_err(cudaError_enum err, auto& cu, auto& logger, auto file, auto line)
+{
+	if (auto re = err; CUDA_SUCCESS != re)
+	{
+		const char* name = 0, * str = 0;
+		cu.pcuGetErrorName(re, &name);
+		cu.pcuGetErrorString(re, &str);
+		logger->log("%s:%d %s:\n\t%s\n", system::ILogger::ELL_ERROR, file, line, name, str);
+		return false;
+	}
+	return true;
+}
+
+bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto line, std::string const& log)
+{
+	if (auto re = err; NVRTC_SUCCESS != re)
+	{
+		const char* str = cudaHandler->getNVRTCFunctionTable().pnvrtcGetErrorString(re);
+		logger->log("%s:%d %s\n%s\n", system::ILogger::ELL_ERROR, file, line, str, log.c_str());
+		return false;
+	}
+	return true;
+}
+
+#define ASSERT_SUCCESS(expr) { auto re = check_cuda_err((expr), cu, m_logger, __FILE__, __LINE__); assert(re); }
+#define ASSERT_SUCCESS_NV(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); }
+
+#ifndef _NBL_COMPILE_WITH_CUDA_
+static_assert(false);
+#endif
+
+class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication
+{
+	using device_base_t = examples::MonoDeviceApplication;
+	using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication;
+
+	static constexpr uint32_t gridDim[3] = { 4096,1,1 };
+	static constexpr uint32_t blockDim[3] = { 1024,1,1 };
+	static constexpr size_t numElements = gridDim[0] * blockDim[0];
+	static constexpr size_t size = sizeof(float) * numElements;
+public:
+	// Yay thanks to multiple inheritance we cannot forward ctors anymore
+	CUDA2VKApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+		system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+
+	smart_refctd_ptr<CCUDAHandler> cudaHandler;
+	smart_refctd_ptr<CCUDADevice> cudaDevice;
+
+	IQueue* queue;
+
+	// CPU memory which we fill with random numbers in [0,1] that will be copied to the corresponding cudaMemory
+	std::array<smart_refctd_ptr<ICPUBuffer>, 2> cpuBufs;
+	// CUDA resources that we input to the kernel 'vectorAdd_kernel.cu'
+	// The kernel writes to cudaMemories[2], which we later export and read on the Nabla side
+	std::array<smart_refctd_ptr<CCUDASharedMemory>, 3> cudaMemories = {};
+	// A semaphore created in CUDA which will alias a Nabla semaphore to help sync between the CUDA kernel and the Nabla device-to-host transfer
+	smart_refctd_ptr<CCUDASharedSemaphore> cudaSemaphore;
+
+	// our Buffer that is bound to cudaMemories[2]
+	smart_refctd_ptr<IGPUBuffer> importedBuf;
+	// our Image that is also bound to cudaMemories[2]
+	smart_refctd_ptr<IGPUImage> importedImg;
+
+	// host visible buffers that we use to copy from the resources above after the CUDA kernel is done writing
+	smart_refctd_ptr<IGPUBuffer> stagingBufs[2];
+
+	// Nabla semaphore for sync
+	smart_refctd_ptr<ISemaphore> semaphore;
+
+	smart_refctd_ptr<IGPUCommandPool> commandPool;
+	smart_refctd_ptr<IGPUCommandBuffer> cmd[2];
+
+	// a device filter helps you create a set of physical devices that satisfy your requirements in terms of features, limits etc.
+	core::set<video::IPhysicalDevice*> filterDevices(const core::SRange<video::IPhysicalDevice* const>& physicalDevices) const override
+	{
+		auto devices = device_base_t::filterDevices(physicalDevices);
+		auto& cuDevices = cudaHandler->getAvailableDevices();
+		// only keep physical devices whose UUID matches one of the available CUDA devices
+		std::erase_if(devices, [&cuDevices](auto pdev) {
+			return cuDevices.end() == std::find_if(cuDevices.begin(), cuDevices.end(), [pdev](auto& cuDev) { return !memcmp(pdev->getProperties().deviceUUID, &cuDev.uuid, 16); });
+		});
+		return devices;
+	}
+
+	bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+	{
+		// Remember to call the base class initialization!
+		if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
+			return false;
+
+		cudaHandler = CCUDAHandler::create(m_system.get(), smart_refctd_ptr(m_logger));
+		if (!cudaHandler)
+			return logFail("Could not create a CUDA handler!");
+
+		if (!device_base_t::onAppInitialized(std::move(system)))
+			return false;
+
+		cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast<CVulkanConnection>(m_api), m_physicalDevice);
+		if (!cudaDevice)
+			return logFail("Could not create a CUDA Device!");
+
+		queue = device_base_t::getComputeQueue();
+
+		createResources();
+
+		smart_refctd_ptr<ICPUBuffer> ptx;
+		{
+			IAssetLoader::SAssetLoadParams lp = {};
+			lp.logger = m_logger.get();
+			lp.workingDirectory = ""; // virtual root
+			// this time we load a shader directly from a file
+			auto assetBundle = m_assetMgr->getAsset("app_resources/vectorAdd_kernel.cu", lp);
+			const auto assets = assetBundle.getContents();
+			if (assets.empty())
+				return logFail("Could not load kernel!");
+
+			smart_refctd_ptr<ICPUBuffer> source = IAsset::castDown<ICPUBuffer>(assets[0]);
+			std::string log;
+			auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()),
+				"app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log);
+			ASSERT_SUCCESS_NV(res, log);
+
+			ptx = std::move(ptx_);
+		}
+		CUmodule module;
+		CUfunction kernel;
+		CUstream stream;
+
+		auto& cu = cudaHandler->getCUDAFunctionTable();
+
+		ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr));
+		ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"));
+		ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
+
+		launchKernel(kernel, stream);
+
+		ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));
+		ASSERT_SUCCESS(cu.pcuModuleUnload(module));
+		ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream));
+
+		m_device->waitIdle();
+
+		testInterop();
+
+		return true;
+	}
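compileDirectlyToPTX above wraps NVRTC; a sketch of the raw calls it stands in for, with error handling elided and `src` standing for the .cu text just loaded:

    nvrtcProgram prog;
    nvrtcCreateProgram(&prog, src, "vectorAdd_kernel.cu", 0, nullptr, nullptr);
    nvrtcCompileProgram(prog, 0, nullptr); // options such as --gpu-architecture would go here
    size_t ptxSize;
    nvrtcGetPTXSize(prog, &ptxSize);
    std::string ptxStr(ptxSize, '\0');
    nvrtcGetPTX(prog, ptxStr.data());      // this PTX is what cuModuleLoadDataEx consumes
    nvrtcDestroyProgram(&prog);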
+
+	void createResources()
+	{
+		auto& cu = cudaHandler->getCUDAFunctionTable();
+
+		for (auto& buf : cpuBufs)
+			buf = make_smart_refctd_ptr<ICPUBuffer>(size);
+
+		for (auto j = 0; j < 2; j++)
+			for (auto i = 0; i < numElements; i++)
+				reinterpret_cast<float*>(cpuBufs[j]->getPointer())[i] = rand() / float(RAND_MAX);
+
+		// create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper
+		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
+		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
+		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
+
+		semaphore = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 });
+		ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cudaSemaphore, semaphore.get()));
+		{
+			// export the CUmem we have just created into a refctd IDeviceMemoryAllocation
+			auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get());
+			if (!devmemory)
+				logFail("Failed to export CUDA memory!");
+
+			// create an importing external buffer on the Nabla side
+			IGPUBuffer::SCreationParams params = {};
+			params.size = devmemory->getAllocationSize();
+			params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT;
+			params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
+			importedBuf = m_device->createBuffer(std::move(params));
+			if (!importedBuf)
+				logFail("Failed to create an external buffer");
+
+			// bind that imported IDeviceMemoryAllocation to the external buffer we've just created
+			ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedBuf.get(), .binding = {.memory = devmemory.get() } };
+			bool re = m_device->bindBufferMemory(1, &bindInfo);
+			if (!re)
+				logFail("Failed to bind CUDA memory to buffer");
+		}
+
+		{
+			// same thing as above:
+			// we create an external image and bind the imported external memory to it,
+			// so now we have 2 different resources that are bound to the same memory
+			IImage::SCreationParams params = {};
+			params.type = IGPUImage::ET_2D;
+			params.samples = IGPUImage::ESCF_1_BIT;
+			params.format = EF_R32_SFLOAT;
+			params.extent = { gridDim[0], blockDim[0], 1 };
+			params.mipLevels = 1;
+			params.arrayLayers = 1;
+			params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT;
+			importedImg = cudaMemories[2]->createAndBindImage(m_device.get(), std::move(params));
+			if (!importedImg)
+				logFail("Failed to create an external image");
+		}
+
+		commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+		bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 2, cmd, smart_refctd_ptr(m_logger));
+
+		stagingBufs[0] = createStaging();
+		stagingBufs[1] = createStaging();
+	}
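createResources binds one CUDA allocation (cudaMemories[2]) to both a buffer and an image, so the kernel's output can be read back through two different views. In raw Vulkan, the import half that exportAsMemory/bindBufferMemory wrap looks roughly like this (a sketch of the Win32 handle variant, not the Nabla API; all names are illustrative):

    VkImportMemoryWin32HandleInfoKHR importInfo = {
        .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR,
        .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT,
        .handle = cudaSharedHandle, // hypothetical HANDLE exported by the CUDA side
    };
    VkMemoryAllocateInfo allocInfo = {
        .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
        .pNext = &importInfo,
        .allocationSize = bufferSize,
        .memoryTypeIndex = memTypeIdx, // must satisfy the buffer's memory requirements
    };
    VkDeviceMemory mem;
    vkAllocateMemory(device, &allocInfo, nullptr, &mem); // then vkBindBufferMemory / vkBindImageMemory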
+
+	smart_refctd_ptr<IGPUBuffer> createExternalBuffer(IDeviceMemoryAllocation* mem)
+	{
+		IGPUBuffer::SCreationParams params = {};
+		params.size = mem->getAllocationSize();
+		params.usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT;
+		params.externalHandleTypes = mem->getCreationParams().externalHandleType;
+		auto buf = m_device->createBuffer(std::move(params));
+		ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = buf.get(), .binding = {.memory = mem } };
+		m_device->bindBufferMemory(1, &bindInfo);
+		return buf;
+	}
+
+	smart_refctd_ptr<IGPUBuffer> createStaging(size_t sz = size)
+	{
+		auto buf = m_device->createBuffer({ {.size = sz, .usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT} });
+		auto req = buf->getMemoryReqs();
+		req.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits();
+		auto allocation = m_device->allocate(req, buf.get());
+
+		void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ);
+		if (!mapping)
+			logFail("Failed to map a staging buffer");
+		memset(mapping, 0, req.size);
+		return buf;
+	};
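launchKernel below opens by releasing queue family ownership of the shared resources to the external API, and later acquires them back. The release half of that handshake, in raw synchronization2 Vulkan terms (a sketch, not the Nabla wrapper that is actually used; names are assumptions):

    // Release half of a queue-family ownership transfer to an external API;
    // the matching ACQUIRE barrier must be recorded before Vulkan touches the data again.
    VkBufferMemoryBarrier2 release = {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
        .srcStageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
        .srcAccessMask = VK_ACCESS_2_MEMORY_WRITE_BIT,   // release side: source masks only
        .srcQueueFamilyIndex = computeFamilyIndex,       // assumption: the compute family used above
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_EXTERNAL, // hand off to CUDA
        .buffer = importedBufferHandle,                  // hypothetical VkBuffer behind importedBuf
        .offset = 0,
        .size = VK_WHOLE_SIZE,
    };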
+
+	void launchKernel(CUfunction kernel, CUstream stream)
+	{
+		// First we record a release ownership transfer to let Vulkan know that the resources are going to be used in an external API
+		{
+			IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = {
+				.barrier = {
+					.ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE,
+					.otherQueueFamilyIndex = IQueue::FamilyExternal,
+				},
+				.range = { .buffer = importedBuf, },
+			};
+
+			IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = {
+				.barrier = {
+					.ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE,
+					.otherQueueFamilyIndex = IQueue::FamilyExternal,
+				},
+				.image = importedImg.get(),
+				.subresourceRange = {
+					.aspectMask = IImage::EAF_COLOR_BIT,
+					.levelCount = 1u,
+					.layerCount = 1u,
+				}
+			};
+			// start recording
+			bool re = true;
+			re &= cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			re &= cmd[0]->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}, .imgBarriers = {&imgBarrier,&imgBarrier + 1} });
+			re &= cmd[0]->end();
+
+			IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 1 };
+			IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get() };
+			IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1}, .signalSemaphores = {&signalInfo,&signalInfo + 1} };
+			auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 });
+			re &= IQueue::RESULT::SUCCESS == submitRe;
+			if (!re)
+				logFail("Something went wrong readying resources for CUDA");
+		}
+
+		auto& cu = cudaHandler->getCUDAFunctionTable();
+		// Launch kernel
+		{
+			CUdeviceptr ptrs[] = {
+				cudaMemories[0]->getDeviceptr(),
+				cudaMemories[1]->getDeviceptr(),
+				cudaMemories[2]->getDeviceptr(),
+			};
+			auto numEles = numElements; // copied to a local so the launch below can take its address
+			void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numEles };
+			ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), size, stream));
+			ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), size, stream));
+
+			auto semaphore = cudaSemaphore->getInternalObject();
+			CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
+			ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream)); // Wait for the release op from Vulkan
+			ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr));
+			CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } };
+			ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore
+		}
+
+		// After the CUDA kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA
+		{
+			IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = {
+				.barrier = {
+					.dep = {
+						.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
+						.dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT,
+					},
+					.ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
+					.otherQueueFamilyIndex = IQueue::FamilyExternal,
+				},
+				.range = { .buffer = importedBuf, },
+			};
+			bool re = true;
+			re &= cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+
+			re &= cmd[1]->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1} });
+
+			IGPUCommandBuffer::SBufferCopy region = { .size = size };
+			re &= cmd[1]->copyBuffer(importedBuf.get(), stagingBufs[0].get(), 1, &region);
+
+			IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = {
+				.barrier = {
+					.dep = {
+						.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
+						.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS,
+					},
+					.ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
+					.otherQueueFamilyIndex = IQueue::FamilyExternal,
+				},
+				.image = importedImg.get(),
+				.subresourceRange = {
+					.aspectMask = IImage::EAF_COLOR_BIT,
+					.levelCount = 1u,
+					.layerCount = 1u,
+				},
+				.oldLayout = IImage::LAYOUT::PREINITIALIZED,
+				.newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL,
+			};
+
+			re &= cmd[1]->pipelineBarrier(EDF_NONE, { .imgBarriers = {&imgBarrier,&imgBarrier + 1} });
+
+			IImage::SBufferCopy imgRegion = {
+				.imageSubresource = {
+					.aspectMask = imgBarrier.subresourceRange.aspectMask,
+					.layerCount = imgBarrier.subresourceRange.layerCount,
+				},
+				.imageExtent = importedImg->getCreationParameters().extent,
+			};
+
+			re &= cmd[1]->copyImageToBuffer(importedImg.get(), imgBarrier.newLayout, stagingBufs[1].get(), 1, &imgRegion);
+			re &= cmd[1]->end();
+
+			IQueue::SSubmitInfo::SSemaphoreInfo waitInfo = { .semaphore = semaphore.get(), .value = 2 };
+			IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 3 };
+			IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() };
+			IQueue::SSubmitInfo submitInfo = {
+				.waitSemaphores = {&waitInfo,&waitInfo + 1},
+				.commandBuffers = {&cmdInfo, &cmdInfo + 1},
+				.signalSemaphores = {&signalInfo,&signalInfo + 1}
+			};
+			auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 });
+			re &= IQueue::RESULT::SUCCESS == submitRe;
+			if (!re)
+				logFail("Something went wrong copying results from CUDA");
+		}
+
+		ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, [](void* userData) { decltype(this)(userData)->kernelCallback(); }, this));
+	}
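The pcuLaunchHostFunc call at the end of launchKernel hands a captureless lambda to a C callback API; such lambdas decay to plain function pointers, and all state travels through the void* cookie. The pattern in isolation (stand-in names, not the CUDA API):

    struct Ctx { bool done = false; };
    void enqueueCallback(void (*fn)(void*), void* userData) { fn(userData); } // stand-in
    void example()
    {
        Ctx ctx;
        // a captureless lambda converts to void(*)(void*); state comes in via the cookie
        enqueueCallback([](void* p) { static_cast<Ctx*>(p)->done = true; }, &ctx);
    }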
+
+	void kernelCallback()
+	{
+		// Make sure we are also done with the readback
+		auto wait = std::array{ ISemaphore::SWaitInfo{.semaphore = semaphore.get(), .value = 3} };
+		m_device->waitForSemaphores(wait, true, -1);
+
+		float* A = reinterpret_cast<float*>(cpuBufs[0]->getPointer());
+		float* B = reinterpret_cast<float*>(cpuBufs[1]->getPointer());
+
+		float* CBuf = reinterpret_cast<float*>(stagingBufs[0]->getBoundMemory().memory->getMappedPointer());
+		float* CImg = reinterpret_cast<float*>(stagingBufs[1]->getBoundMemory().memory->getMappedPointer());
+
+		if (memcmp(CBuf, CImg, size))
+			logFail("Buffer and Image memories do not match!");
+
+		for (auto i = 0; i < numElements; i++)
+		{
+			bool re = (abs(CBuf[i] - A[i] - B[i]) < 0.01f) && (abs(CImg[i] - A[i] - B[i]) < 0.01f);
+			if (!re)
+				logFail("Element at index %d is incorrect!", i);
+		}
+
+		std::cout << "Success\n";
+	}
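To recap the synchronization kernelCallback completes: a single timeline semaphore (the Nabla `semaphore`, imported into CUDA as `cudaSemaphore`) sequences all three parties. The value protocol, read off the submits and CUDA calls above:

    // value 1: signaled by the Vulkan ownership-release submit;
    //          cuWaitExternalSemaphoresAsync blocks the kernel on it
    // value 2: signaled by cuSignalExternalSemaphoresAsync after the kernel;
    //          the Vulkan readback submit waits on it
    // value 3: signaled by the readback submit;
    //          kernelCallback's waitForSemaphores blocks on it before reading the staging buffers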
+
+	void testInterop()
+	{
+		{
+			IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
+				.size = size,
+				.memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
+				.alignmentLog2 = 10,
+			};
+
+			for (size_t i = 0; i < (1 << 8); ++i)
+			{
+				auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
+				assert(memory);
+				auto tmpBuf = createExternalBuffer(memory.get());
+			}
+		}
+
+		smart_refctd_ptr<IDeviceMemoryAllocation> escaped;
+		{
+			IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
+				.size = size,
+				.memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
+				.alignmentLog2 = 10,
+			};
+
+			auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
+
+			auto tmpBuf = createExternalBuffer(memory.get());
+			auto staging = createStaging();
+
+			auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer();
+			for (uint32_t i = 0; i < size / 4; ++i)
+				ptr[i] = i;
+
+			smart_refctd_ptr<IGPUCommandBuffer> cmd;
+			commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd);
+			cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			IGPUCommandBuffer::SBufferCopy region = { .size = size };
+			assert(cmd->copyBuffer(staging.get(), tmpBuf.get(), 1, &region));
+			cmd->end();
+			IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() };
+			IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} };
+			queue->submit({ &submitInfo,&submitInfo + 1 });
+			m_device->waitIdle();
+			// re-import the same external handle into a fresh allocation that outlives 'memory'
+			escaped = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, memory->getCreationParams().externalHandle).memory;
+		}
+
+		//{
+		//	constexpr size_t M = 32;
+		//	auto staging = createStaging(size * M);
+
+		//	auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer();
+		//	for (uint32_t i = 0; i < (M * size) / 4; ++i)
+		//		ptr[i] = rand();
+
+		//	std::vector<smart_refctd_ptr<IGPUCommandBuffer>> cmd(1 << 10);
+		//	commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1 << 10, cmd.data());
+
+		//	for (size_t i = 0; i < 1 << 10; ++i)
+		//	{
+		//		IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
+		//			.size = size * M,
+		//			.memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
+		//			.alignmentLog2 = 10,
+		//		};
+		//	RE:
+		//		auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
+
+		//		if (!memory)
+		//		{
+		//			m_device->waitIdle();
+		//			for (size_t j = 0; j < i; ++j)
+		//				cmd[j] = 0;
+		//			goto END;
+		//		}
+		//		assert(memory);
+		//		auto tmpBuf = createExternalBuffer(memory.get());
+
+		//		cmd[i]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+		//		IGPUCommandBuffer::SBufferCopy region = { .size = size * M };
+		//		assert(cmd[i]->copyBuffer(staging.get(), tmpBuf.get(), 1, &region));
+		//		cmd[i]->end();
+		//		IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[i].get() };
+		//		IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} };
+		//		assert(IQueue::RESULT::SUCCESS == queue->submit({ &submitInfo,&submitInfo + 1 }));
+		//	}
+		//END:
+		//	m_device->waitIdle();
+		//}
+
+		{
+			auto tmpBuf = createExternalBuffer(escaped.get());
+			auto staging = createStaging();
+
+			smart_refctd_ptr<IGPUCommandBuffer> cmd;
+			commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd);
+			cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			IGPUCommandBuffer::SBufferCopy region = { .size = size };
+			assert(cmd->copyBuffer(tmpBuf.get(), staging.get(), 1, &region));
+			cmd->end();
+			IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() };
+			IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} };
+			auto qre = queue->submit({ &submitInfo,&submitInfo + 1 });
+			assert(IQueue::RESULT::SUCCESS == qre);
+			m_device->waitIdle();
+
+			auto& ptr = *(std::array<uint32_t, size / 4>*)staging->getBoundMemory().memory->getMappedPointer();
+			for (uint32_t i = 0; i < size / 4; ++i)
+				assert(ptr[i] == i);
+		}
+	}
+
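The 'escaped' allocation in testInterop probes handle lifetime: an exported external handle should keep the underlying memory alive past the original allocation. Schematically, what the blocks above exercise:

    // 1. allocate exportable device memory            -> 'memory' (+ its external handle)
    // 2. upload the pattern 0,1,2,... through a temporary external buffer bound to it
    // 3. re-import the handle into a fresh allocation -> 'escaped'
    // 4. let 'memory' and the temporary buffer die    -> the handle still owns the bytes
    // 5. bind a new buffer to 'escaped', copy back to staging, assert ptr[i] == i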
+	// Whether to keep invoking the above. In this example, because it's headless GPU compute, we do all the work in the app initialization.
+	bool keepRunning() override { return false; }
+
+	// Platforms like WASM expect the main entry point to periodically return control, hence if you want a cross-platform app, you have to let the framework deal with your "game loop"
+	void workLoopBody() override {}
+};
+
+NBL_MAIN_FUNC(CUDA2VKApp)
\ No newline at end of file
diff --git a/63_CUDAInterop/pipeline.groovy b/63_CUDAInterop/pipeline.groovy
new file mode 100644
index 000000000..29909e3d1
--- /dev/null
+++ b/63_CUDAInterop/pipeline.groovy
@@ -0,0 +1,50 @@
+import org.DevshGraphicsProgramming.Agent
+import org.DevshGraphicsProgramming.BuilderInfo
+import org.DevshGraphicsProgramming.IBuilder
+
+class CCUDAInteropBuilder extends IBuilder
+{
+	public CCUDAInteropBuilder(Agent _agent, _info)
+	{
+		super(_agent, _info)
+	}
+
+	@Override
+	public boolean prepare(Map axisMapping)
+	{
+		return true
+	}
+
+	@Override
+	public boolean build(Map axisMapping)
+	{
+		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
+		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
+
+		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
+		def nameOfConfig = getNameOfConfig(config)
+
+		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
+
+		return true
+	}
+
+	@Override
+	public boolean test(Map axisMapping)
+	{
+		return true
+	}
+
+	@Override
+	public boolean install(Map axisMapping)
+	{
+		return true
+	}
+}
+
+def create(Agent _agent, _info)
+{
+	return new CCUDAInteropBuilder(_agent, _info)
+}
+
+return this
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6a20a33a9..feb8307a8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,5 +65,6 @@ if(NBL_BUILD_EXAMPLES)
 	#add_subdirectory(61_UI EXCLUDE_FROM_ALL)
 	add_subdirectory(62_CAD EXCLUDE_FROM_ALL)
 	add_subdirectory(62_SchusslerTest EXCLUDE_FROM_ALL)
+	add_subdirectory(63_CUDAInterop EXCLUDE_FROM_ALL)
 	add_subdirectory(0_ImportanceSamplingEnvMaps EXCLUDE_FROM_ALL) #TODO: integrate back into 42
 endif()
\ No newline at end of file
diff --git a/common/MonoDeviceApplication.hpp b/common/MonoDeviceApplication.hpp
index ca4e6d449..4e6a6bf95 100644
--- a/common/MonoDeviceApplication.hpp
+++ b/common/MonoDeviceApplication.hpp
@@ -245,6 +245,7 @@ class MonoDeviceApplication : public virtual MonoSystemMonoLoggerApplication
 		return retval;
 	}
 
+
 	virtual video::IQueue* getQueue(video::IQueue::FAMILY_FLAGS flags) const
 	{
 		// In the default implementation of everything I asked only for one queue from first compute family
@@ -252,7 +253,6 @@ class MonoDeviceApplication : public virtual MonoSystemMonoLoggerApplication
 		for (auto i = 0u; i < familyProperties.size(); i++)
 			if (familyProperties[i].queueFlags.hasFlags(flags))
 				return m_device->getQueue(i, 0);
-
 		return nullptr;
 	}
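For reference, a hypothetical caller of the reworked getQueue helper; COMPUTE_BIT is assumed to be one of the FAMILY_FLAGS values:

    // returns a queue from the first family whose flags contain the request;
    // the loop falls through to 'return nullptr' when nothing matches
    video::IQueue* q = getQueue(video::IQueue::FAMILY_FLAGS::COMPUTE_BIT);
    if (!q)
        return logFail("No matching queue family!");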