diff --git a/svm/README.md b/svm/README.md index 21a15e03..802373a7 100644 --- a/svm/README.md +++ b/svm/README.md @@ -16,6 +16,12 @@ SYCL for Nvidia backend - CC=clang CXX=clang++ cmake -DUSE_NVIDIA_BACKEND=TRUE - SYCL for AMD backend - CC=clang CXX=clang++ cmake -DUSE_AMD_BACKEND=TRUE -DUSE_AMD_ARCH={flag for hip i.e 90a for MI250} ../ \ make \ + +### In-order queue +The CMake option `-DIN_ORDER_QUEUE` adds the `in_order` property to the SYCL +queue, as well as `discard_events` if available. The default value of this +option is `ON` for NVIDIA and AMD backends, and `OFF` otherwise. + ### Running the workload ./svm_sycl a9a a.m diff --git a/svm/SYCL/CMakeLists.txt b/svm/SYCL/CMakeLists.txt index d17fd105..bc5f3916 100644 --- a/svm/SYCL/CMakeLists.txt +++ b/svm/SYCL/CMakeLists.txt @@ -61,6 +61,12 @@ option(GPU_AOT "Build AOT for Intel GPU" OFF) option(USE_NVIDIA_BACKEND "Build for NVIDIA backend" OFF) option(USE_AMD_BACKEND "Build for AMD HIP backend" OFF) +set(IN_ORDER_QUEUE_DEFAULT OFF) +if (${USE_NVIDIA_BACKEND} OR ${USE_AMD_BACKEND}) + set(IN_ORDER_QUEUE_DEFAULT ON) +endif() +option(IN_ORDER_QUEUE "Use in-order SYCL queue" ${IN_ORDER_QUEUE_DEFAULT}) + set(SOURCES cuSVM/cuSVMSolver.dp.cpp cuSVM/cuSVM_wrapper.cpp @@ -78,7 +84,8 @@ set(SOURCES include_directories(${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/libSVM ${CMAKE_SOURCE_DIR}/cuSVM - ${CMAKE_SOURCE_DIR}/infrastructure) + ${CMAKE_SOURCE_DIR}/infrastructure + ${CMAKE_SOURCE_DIR}/../../infrastructure) # Use either default or user defined CXX flags @@ -91,6 +98,12 @@ set(DEF_INTEL_GENERAL_CXX_FLAGS " -O3 -fsycl ") set(DEF_NVIDIA_GENERAL_CXX_FLAGS " -O3 -fsycl ") set(DEF_AMD_GENERAL_CXX_FLAGS " -O3 -fsycl -D__HIP_PLATFORM_AMD__ ") +if (${IN_ORDER_QUEUE}) + string(APPEND DEF_INTEL_GENERAL_CXX_FLAGS " -DIN_ORDER_QUEUE ") + string(APPEND DEF_NVIDIA_GENERAL_CXX_FLAGS " -DIN_ORDER_QUEUE ") + string(APPEND DEF_AMD_GENERAL_CXX_FLAGS " -DIN_ORDER_QUEUE ") +endif() + if(NOT "${CMAKE_CXX_FLAGS}" STREQUAL "" AND NOT "${OVERRIDE_GENERAL_CXX_FLAGS}" STREQUAL "") message(FATAL_ERROR "Both CMAKE_CXX_FLAGS and OVERRIDE_GENERAL_CXX_FLAGS cannot be passed in together") elseif("${CMAKE_CXX_FLAGS}" STREQUAL "" AND "${OVERRIDE_GENERAL_CXX_FLAGS}" STREQUAL "") diff --git a/svm/SYCL/common/SYCL.cpp b/svm/SYCL/common/SYCL.cpp deleted file mode 100755 index 7e2351a4..00000000 --- a/svm/SYCL/common/SYCL.cpp +++ /dev/null @@ -1,118 +0,0 @@ -/* -MIT License - -Copyright (c) 2015 University of West Bohemia - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -/* -MIT License - -Modifications Copyright (C) 2023 Intel Corporation - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -SPDX-License-Identifier: MIT License -*/ - -#include "SYCL.h" -#include "Utilities.h" -#include - -#define CPP_MODULE "SYCL" -#include "Logging.h" - - -cl::sycl::device SYCL::QueryAndGetDevice(std::string const &sDeviceType) -{ - if (sDeviceType.empty()) { - LOG("Using default selector()"); - return cl::sycl::device(cl::sycl::device(cl::sycl::default_selector())); - } - - LOG("Attemping to use " << sDeviceType << "_selector()"); - std::string const sLowerCaseDeviceType(Utility::ToLowerCase(sDeviceType)); - if (sLowerCaseDeviceType == "cpu") - return cl::sycl::device(cl::sycl::device(cl::sycl::cpu_selector())); - else if (sLowerCaseDeviceType == "gpu") - return cl::sycl::device(cl::sycl::device(cl::sycl::gpu_selector())); - else if (sLowerCaseDeviceType == "acc") - return cl::sycl::device(cl::sycl::device(cl::sycl::accelerator_selector())); - else if (sLowerCaseDeviceType == "host") - return cl::sycl::device(cl::sycl::device(cl::sycl::host_selector())); - else if (sLowerCaseDeviceType == "nvidia") - return cl::sycl::device(cl::sycl::device(nvidia_selector())); // Custom selector - else { - LOG_ERROR("Unknown device " << sDeviceType << "_selector()"); - } -} - -SYCL::SYCL(cl::sycl::device const &SelectedDevice) -{ - std::function lQExceptionHandler([&](cl::sycl::exception_list ExceptionList) { // lambda Q exception handler - try { - for (auto const &Exception : ExceptionList) { - std::rethrow_exception(Exception); - } - } catch (cl::sycl::exception e) { - LOG_ERROR("SYCL exception caught: " << e.what()); - }}); - - m_sySelectedQueue = cl::sycl::queue(SelectedDevice, lQExceptionHandler, {cl::sycl::property::queue::enable_profiling()}); - DisplayDeviceProperties(); -} - -void SYCL::DisplayDeviceProperties() const -{ - LOG("SYCL device initialization successful") - LOG("\t Using SYCL device : " << m_sySelectedQueue.get_device().get_info() << - " (Driver version " << m_sySelectedQueue.get_device().get_info() << ")"); - - std::vector const vLDDPaths(Utility::ExtractLDDPathNameFromProcess({"libOpenCL", "libsycl", "libComputeCpp"})); //[0] OCL, [1] Intel's SYCL, [2] ComputeCpp SYCL - if (vLDDPaths.empty()) { - LOG_WARNING("Unable to print OpenCL and SYCL dependent libraries! The LD_LIBRARY_PATH may be incorrectly set"); // Should not reach to this case - return; - } - - LOG("\t Using OpenCL library : " << (!vLDDPaths[0].empty() ? vLDDPaths[0] : "WARNING! OpenCL library not found!")); - - if (!vLDDPaths[1].empty()) - LOG("\t Using Intel's SYCL library: " << vLDDPaths[1]); - - if (!vLDDPaths[2].empty()) - LOG("\t Found ComputeCPP library : " << vLDDPaths[2]); -} - diff --git a/svm/SYCL/common/SYCL.h b/svm/SYCL/common/SYCL.h deleted file mode 100755 index 46a899f0..00000000 --- a/svm/SYCL/common/SYCL.h +++ /dev/null @@ -1,97 +0,0 @@ -/* -MIT License - -Copyright (c) 2015 University of West Bohemia - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -/* -MIT License - -Modifications Copyright (C) 2023 Intel Corporation - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -SPDX-License-Identifier: MIT License -*/ - -#ifndef SYCL_CLASS_H -#define SYCL_CLASS_H - -#include -#include - -class SYCL -{ -private: - cl::sycl::queue m_sySelectedQueue; - cl::sycl::event m_syEventQueue; - - cl::sycl::platform GetPlatform(unsigned int const uiPlatformIndex) const; - cl::sycl::device GetDevice (std::string const &sDeviceType) const; - cl::sycl::device GetDevice (std::vector const &vSyclDevices, - unsigned int const uiDeviceIndex) const; - - std::string GetDeviceType(cl::sycl::device const &syclDevice) const; - - void DisplayDeviceProperties() const; -public: - // NVIDIA GPU selector - class nvidia_selector : public cl::sycl::device_selector - { - virtual int operator()(cl::sycl::device const &SYCLDevice) const override - { - if (!SYCLDevice.is_gpu()) - return -1; - - std::string const sVendor(SYCLDevice.get_info()); - return sVendor.find("NVIDIA") != std::string::npos ? 1000 : -1; - } - }; - - explicit SYCL(cl::sycl::device const &Device); - ~SYCL(){} - - SYCL (SYCL const &RHS) = delete; - SYCL &operator=(SYCL const &RHS) = delete; - - static cl::sycl::device QueryAndGetDevice(std::string const &sDeviceType); - - cl::sycl::queue &GetQueue() { return m_sySelectedQueue; } - cl::sycl::event &GetEventQueue() { return m_syEventQueue; } -}; - -#endif diff --git a/svm/SYCL/cuSVM/cuSVMSolver.dp.cpp b/svm/SYCL/cuSVM/cuSVMSolver.dp.cpp index 57232293..e38ea9df 100644 --- a/svm/SYCL/cuSVM/cuSVMSolver.dp.cpp +++ b/svm/SYCL/cuSVM/cuSVMSolver.dp.cpp @@ -68,7 +68,7 @@ SPDX-License-Identifier: MIT License #include #include #include "CommandLineParser.h" -//#include "SYCL.h" +#include "SYCL.h" #ifdef USE_CUBLAS #include @@ -487,20 +487,16 @@ float *d_SelfDotProd,const int& m,const int& n,const int &nbrCtas,const int& thr #if USE_CUBLAS cublasHandle_t handle; CHECK_ERROR(cublasCreate(&handle)); + CUstream cuStream; - q_ct1.submit([&](sycl::handler &cgh) { - //auto d_A = b_A.get_access(cgh); - cgh.host_task([=](sycl::interop_handle ih) { + SYCL::ExecNativeCommand(q_ct1, [=, &cuStream](sycl::interop_handle ih) { cuCtxSetCurrent(ih.get_native_context()); - auto cuStream = ih.get_native_queue(); + cuStream = ih.template get_native_queue(); cublasSetStream(handle, cuStream); constexpr float ALPHA = 1.0f; constexpr float BETA = 0.0f; CHECK_ERROR(cublasSgemv (handle, CUBLAS_OP_N, m, n, &ALPHA, d_x, m, d_Kernel_InterRow, 1, &BETA, d_KernelDotProd, 1)); - cudaStreamSynchronize(cuStream); - //cudaDeviceSynchronize(); - }); - }).wait_and_throw(); + }, [&cuStream]{cudaStreamSynchronize(cuStream);}); cublasDestroy(handle); @@ -513,17 +509,15 @@ float *d_SelfDotProd,const int& m,const int& n,const int &nbrCtas,const int& thr hipblasCreate(&handle); - q_ct1.submit([&](sycl::handler &h){ - - h.host_task([=](sycl::interop_handle ih) { + SYCL::ExecNativeCommand(q_ct1, [&cuStream, =](sycl::interop_handle ih) { //hipCtxSetCurrent(ih.get_native_context()); //hipblasSetStream(handle, ih.get_native_queue()); hipblasSgemv (handle, HIPBLAS_OP_N, m, n, &ALPHA, d_x, m, d_Kernel_InterRow, 1, &BETA, d_KernelDotProd, 1); - }); - }).wait_and_throw(); + }, []{}); #else - oneapi::mkl::blas::column_major::gemv(q_ct1, oneapi::mkl::transpose::nontrans, m, n, 1, d_x, m, d_Kernel_InterRow, 1, 0, d_KernelDotProd, 1).wait(); + oneapi::mkl::blas::column_major::gemv(q_ct1, oneapi::mkl::transpose::nontrans, m, n, 1, d_x, m, d_Kernel_InterRow, 1, 0, d_KernelDotProd, 1); + q_ct1.wait(); #endif @@ -806,12 +800,18 @@ extern "C" void SVMTrain(float *mexalpha, float *beta, float *y, float *x, auto max_wgroup_size = selected_device.get_info(); printf("Workgroup Size: %lu\n", max_wgroup_size); - #if KERNEL_USE_PROFILE - auto propList = sycl::property_list{sycl::property::queue::enable_profiling()}; - sycl::queue q_ct1(context, selected_device, propList); - #else - sycl::queue q_ct1(context, selected_device); - #endif + auto propList = sycl::property_list{ + #if IN_ORDER_QUEUE + sycl::property::queue::in_order{}, + #ifdef SYCL_EXT_ONEAPI_DISCARD_QUEUE_EVENTS + sycl::ext::oneapi::property::queue::discard_events{}, + #endif + #endif + #if KERNEL_USE_PROFILE + sycl::property::queue::enable_profiling{}, + #endif + }; + sycl::queue q_ct1(context, selected_device, propList); mxArray *mexelapsed =mxCreateNumericMatrix(1, 1,mxSINGLE_CLASS, mxREAL); @@ -873,7 +873,8 @@ _kernelwidth*=-1; mxCUDA_SAFE_CALL((d_x = sycl::malloc_device(m * n * sizeof(float), q_ct1), 0)); mxCUDA_SAFE_CALL((d_xT = sycl::malloc_device(m * n * sizeof(float), q_ct1), 0)); - mxCUDA_SAFE_CALL((q_ct1.memcpy(d_x, x, sizeof(float) * n * m).wait(), 0)); + mxCUDA_SAFE_CALL((q_ct1.memcpy(d_x, x, sizeof(float) * n * m), 0)); + q_ct1.wait(); sycl::range<3> gridtranspose(ceil((float)m / TRANS_BLOCK_DIM), @@ -937,7 +938,8 @@ _kernelwidth*=-1; float *xT=new float [n*m]; - mxCUDA_SAFE_CALL((q_ct1.memcpy(xT, d_xT, sizeof(float) * m * n).wait(), 0)); + mxCUDA_SAFE_CALL((q_ct1.memcpy(xT, d_xT, sizeof(float) * m * n), 0)); + q_ct1.wait(); (sycl::free(d_xT, q_ct1), 0); @@ -949,10 +951,11 @@ _kernelwidth*=-1; mxCUDA_SAFE_CALL((d_SelfDotProd = sycl::malloc_device(m * sizeof(float), q_ct1), 0)); mxCUDA_SAFE_CALL((d_KernelDotProd = sycl::malloc_device(m * sizeof(float), q_ct1), 0)); - mxCUDA_SAFE_CALL((q_ct1.memcpy(d_y, y, sizeof(float) * m).wait(), 0)); - mxCUDA_SAFE_CALL((q_ct1.memcpy(d_alpha, h_alpha, sizeof(float) * m).wait(), 0)); - mxCUDA_SAFE_CALL((q_ct1.memcpy(d_F, h_F, sizeof(float) * m).wait(), 0)); - mxCUDA_SAFE_CALL((q_ct1.memcpy(d_SelfDotProd, SelfDotProd, sizeof(float) * m).wait(), 0)); + mxCUDA_SAFE_CALL((q_ct1.memcpy(d_y, y, sizeof(float) * m), 0)); + mxCUDA_SAFE_CALL((q_ct1.memcpy(d_alpha, h_alpha, sizeof(float) * m), 0)); + mxCUDA_SAFE_CALL((q_ct1.memcpy(d_F, h_F, sizeof(float) * m), 0)); + mxCUDA_SAFE_CALL((q_ct1.memcpy(d_SelfDotProd, SelfDotProd, sizeof(float) * m), 0)); + q_ct1.wait(); delete [] SelfDotProd; @@ -1058,13 +1061,14 @@ _kernelwidth*=-1; elapsed_kernel_time += (time_end - time_start)/1e9; #endif - mxCUDA_SAFE_CALL((q_ct1.memcpy(value_inter, d_value_inter, sizeof(float) * numBlocks).wait(), 0)); - - mxCUDA_SAFE_CALL((q_ct1.memcpy(index_inter, d_index_inter, sizeof(int) * numBlocks).wait(), 0)); + mxCUDA_SAFE_CALL((q_ct1.memcpy(value_inter, d_value_inter, sizeof(float) * numBlocks), 0)); + mxCUDA_SAFE_CALL((q_ct1.memcpy(index_inter, d_index_inter, sizeof(int) * numBlocks), 0)); + q_ct1.wait(); CpuMaxInd(BIValue,BIIndex,value_inter,index_inter,numBlocks); - q_ct1.memcpy(&Fi, d_F + BIIndex, sizeof(float)).wait(); + q_ct1.memcpy(&Fi, d_F + BIIndex, sizeof(float)); + q_ct1.wait(); if (iter == (NUM_ITERATIONS - 1)) { @@ -1105,7 +1109,8 @@ _kernelwidth*=-1; - mxCUDA_SAFE_CALL((q_ct1.memcpy(value_inter, d_value_inter, sizeof(float) * numBlocks).wait(), 0)); + mxCUDA_SAFE_CALL((q_ct1.memcpy(value_inter, d_value_inter, sizeof(float) * numBlocks), 0)); + q_ct1.wait(); CpuMin(SJValue,value_inter,numBlocks); @@ -1128,7 +1133,8 @@ _kernelwidth*=-1; d_KernelI=d_Kernel_Cache+CacheDiffI*m; - mxCUDA_SAFE_CALL((q_ct1.memcpy(d_KernelInterRow, xT + BIIndex * n, n * sizeof(float)).wait(), 0)); + mxCUDA_SAFE_CALL((q_ct1.memcpy(d_KernelInterRow, xT + BIIndex * n, n * sizeof(float)), 0)); + q_ct1.wait(); RBFKernel(d_KernelI,BIIndex,d_x,d_KernelInterRow,d_KernelDotProd,d_SelfDotProd, m,n,nbrCtas,threadsPerCta, q_ct1, elapsed_kernel_time); @@ -1181,23 +1187,19 @@ _kernelwidth*=-1; - mxCUDA_SAFE_CALL((q_ct1.memcpy(value_inter, d_value_inter, sizeof(float) * numBlocks).wait(), 0)); - - mxCUDA_SAFE_CALL((q_ct1.memcpy(index_inter, d_index_inter, sizeof(int) * numBlocks).wait(), 0)); + mxCUDA_SAFE_CALL((q_ct1.memcpy(value_inter, d_value_inter, sizeof(float) * numBlocks), 0)); + mxCUDA_SAFE_CALL((q_ct1.memcpy(index_inter, d_index_inter, sizeof(int) * numBlocks), 0)); + q_ct1.wait(); CpuMaxInd(BJSecondOrderValue,BJIndex,value_inter,index_inter,numBlocks); - mxCUDA_SAFE_CALL( (q_ct1.memcpy(&Kij, d_KernelI + BJIndex, sizeof(float)).wait(), 0)); - - mxCUDA_SAFE_CALL( (q_ct1.memcpy(&alphai, d_alpha + BIIndex, sizeof(float)).wait(), 0)); - - mxCUDA_SAFE_CALL( (q_ct1.memcpy(&alphaj, d_alpha + BJIndex, sizeof(float)).wait(), 0)); - - mxCUDA_SAFE_CALL((q_ct1.memcpy(&yi, d_y + BIIndex, sizeof(float)).wait(), 0)); - - mxCUDA_SAFE_CALL((q_ct1.memcpy(&yj, d_y + BJIndex, sizeof(float)).wait(), 0)); - - mxCUDA_SAFE_CALL((q_ct1.memcpy(&Fj, d_F + BJIndex, sizeof(float)).wait(), 0)); + mxCUDA_SAFE_CALL( (q_ct1.memcpy(&Kij, d_KernelI + BJIndex, sizeof(float)), 0)); + mxCUDA_SAFE_CALL( (q_ct1.memcpy(&alphai, d_alpha + BIIndex, sizeof(float)), 0)); + mxCUDA_SAFE_CALL( (q_ct1.memcpy(&alphaj, d_alpha + BJIndex, sizeof(float)), 0)); + mxCUDA_SAFE_CALL((q_ct1.memcpy(&yi, d_y + BIIndex, sizeof(float)), 0)); + mxCUDA_SAFE_CALL((q_ct1.memcpy(&yj, d_y + BJIndex, sizeof(float)), 0)); + mxCUDA_SAFE_CALL((q_ct1.memcpy(&Fj, d_F + BJIndex, sizeof(float)), 0)); + q_ct1.wait(); oldalphai=alphai; @@ -1206,9 +1208,9 @@ _kernelwidth*=-1; UpdateAlphas(alphai,alphaj,Kij,yi,yj,Fi,Fj,_C,h_taumin); - mxCUDA_SAFE_CALL((q_ct1.memcpy(d_alpha + BIIndex, &alphai, sizeof(float)).wait(), 0)); - - mxCUDA_SAFE_CALL( (q_ct1.memcpy(d_alpha + BJIndex, &alphaj, sizeof(float)).wait(), 0)); + mxCUDA_SAFE_CALL((q_ct1.memcpy(d_alpha + BIIndex, &alphai, sizeof(float)), 0)); + mxCUDA_SAFE_CALL( (q_ct1.memcpy(d_alpha + BJIndex, &alphaj, sizeof(float)), 0)); + q_ct1.wait(); float deltaalphai = alphai - oldalphai; float deltaalphaj = alphaj - oldalphaj; @@ -1221,7 +1223,8 @@ _kernelwidth*=-1; d_KernelJ=d_Kernel_Cache+CacheDiffJ*m; - mxCUDA_SAFE_CALL( (q_ct1.memcpy(d_KernelInterRow, xT + BJIndex * n, n * sizeof(float)).wait(), 0)); + mxCUDA_SAFE_CALL( (q_ct1.memcpy(d_KernelInterRow, xT + BJIndex * n, n * sizeof(float)), 0)); + q_ct1.wait(); RBFKernel(d_KernelJ,BJIndex,d_x,d_KernelInterRow,d_KernelDotProd, d_SelfDotProd, m,n,nbrCtas,threadsPerCta, q_ct1, elapsed_kernel_time); *(KernelCacheIndices.begin()+CacheDiffJ)=BJIndex; @@ -1278,7 +1281,8 @@ _kernelwidth*=-1; //cout << "Average Kernel Time per run: " << (elapsed/NUM_OF_RUNS) << " sec.\n"; #endif - q_ct1.memcpy(mexalpha, d_alpha, m * sizeof(float)).wait(); + q_ct1.memcpy(mexalpha, d_alpha, m * sizeof(float)); + q_ct1.wait(); stop_ct1 = std::chrono::high_resolution_clock::now(); diff --git a/svm/SYCL/infrastructure/SYCL.cpp b/svm/SYCL/infrastructure/SYCL.cpp deleted file mode 100644 index 634d648a..00000000 --- a/svm/SYCL/infrastructure/SYCL.cpp +++ /dev/null @@ -1,142 +0,0 @@ -/* -MIT License - -Copyright (c) 2015 University of West Bohemia - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -/* -MIT License - -Modifications Copyright (C) 2023 Intel Corporation - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -SPDX-License-Identifier: MIT License -*/ - - -#include "SYCL.h" -#include "Utilities.h" -#include -#include - -#define CPP_MODULE "SYCL" -#include "Logging.h" - -#ifdef USE_INFRASTRUCTURE // Move to test bench base?? -void SYCL::RegisterDeviceSettings(VelocityBench::CommandLineParser &cmdLineParser) -{ - cmdLineParser.AddSetting("--device_type", "SYCL device type to use", false, "", VelocityBench::CommandLineParser::InputType_t::STRING, 1 /* One operand */); - LOG("Registered SYCL device querying settings"); -} -#endif - -cl::sycl::device SYCL::QueryAndGetDevice(std::string const &sDeviceType) -{ - if (sDeviceType.empty()) { - LOG("Using default selector()"); - return cl::sycl::device(cl::sycl::device(cl::sycl::default_selector())); - } - - LOG("Attemping to use " << sDeviceType << "_selector()"); - std::string const sLowerCaseDeviceType(Utility::ToLowerCase(sDeviceType)); - if (sLowerCaseDeviceType == "cpu") - return cl::sycl::device(cl::sycl::cpu_selector()); - else if (sLowerCaseDeviceType == "gpu") - return cl::sycl::device(cl::sycl::gpu_selector()); - else if (sLowerCaseDeviceType == "nvidia") - return cl::sycl::device(nvidia_selector()); // Custom selector - else { - LOG_ERROR("Unknown device " << sDeviceType << "_selector()"); - } -} - -SYCL::SYCL(cl::sycl::device const &SelectedDevice) - : m_syclQueue(SelectedDevice, // Device selected - [](cl::sycl::exception_list ExceptionList) { // Async exception handler - try { - for (auto const &Exception : ExceptionList) { - std::rethrow_exception(Exception); - } - } catch (cl::sycl::exception e) { - LOG_ERROR("SYCL exception caught: " << e.what()); - }} -#ifdef ENABLE_KERNEL_PROFILING - , {cl::sycl::property::queue::enable_profiling()} // Setting profiling if we want to -#endif - ) - , m_syclEvent() -{ - LOG("SYCL Queue initialization successful"); - DisplayDeviceProperties(m_syclQueue.get_device()); -} - -void SYCL::DisplayDeviceProperties(cl::sycl::device const &Device) -{ - LOG("\t Using SYCL device : " << Device.get_info() << " (Driver version " << Device.get_info() << ")"); - LOG("\t Platform : " << Device.get_platform().get_info()); - LOG("\t Vendor : " << Device.get_info()); - LOG("\t Max compute units : " << Device.get_info()); -#ifdef ENABLE_KERNEL_PROFILING - LOG("\t Kernel profiling : enabled"); -#else - LOG("\t Kernel profiling : disabled"); -#endif - - std::vector const vLDDPaths(Utility::ExtractLDDPathNameFromProcess({"libOpenCL", "libsycl", "libComputeCpp", "libze"})); //[0] OCL, [1] Intel's SYCL, [2] ComputeCpp SYCL - if (vLDDPaths.empty()) { - LOG_WARNING("Unable to print OpenCL and SYCL dependent libraries! The LD_LIBRARY_PATH may be incorrectly set"); // Should not reach to this case - return; - } - - LOG("\t Using OpenCL library : " << (!vLDDPaths[0].empty() ? vLDDPaths[0] : "WARNING! OpenCL library not found!")); - - if (!vLDDPaths[1].empty()) { // Implies we are using Intel's DPC++ compiler - LOG("\t Using OneAPI SYCL library : " << vLDDPaths[1]); - LOG("\t Using Level Zero library : " << (!vLDDPaths[3].empty() ? vLDDPaths[3] : "WARNING! Level zero library not found! L0 backend may not be available!")); - } - - if (!vLDDPaths[2].empty()) - LOG("\t Using ComputeCPP library : " << vLDDPaths[2]); -} - -std::chrono::steady_clock::duration SYCL::ConvertKernelTimeToDuration(cl::sycl::event const &Event) -{ - std::chrono::time_point const tpStart (std::chrono::nanoseconds(Event.template get_profiling_info())); - std::chrono::time_point const tpFinish(std::chrono::nanoseconds(Event.template get_profiling_info())); - return std::chrono::steady_clock::duration(tpFinish - tpStart); -} diff --git a/svm/SYCL/infrastructure/SYCL.h b/svm/SYCL/infrastructure/SYCL.h deleted file mode 100644 index a78819e9..00000000 --- a/svm/SYCL/infrastructure/SYCL.h +++ /dev/null @@ -1,99 +0,0 @@ -/* -MIT License - -Copyright (c) 2015 University of West Bohemia - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -/* -MIT License - -Modifications Copyright (C) 2023 Intel Corporation - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -SPDX-License-Identifier: MIT License -*/ - - -#ifndef SYCL_CLASS_H -#define SYCL_CLASS_H - -#include "CommandLineParser.h" - -#include -#include -#include - -class SYCL -{ -private: - cl::sycl::queue m_syclQueue; - cl::sycl::event m_syclEvent; -public: - // NVIDIA GPU selector - class nvidia_selector : public cl::sycl::device_selector - { - virtual int operator()(cl::sycl::device const &SYCLDevice) const override - { - if (!SYCLDevice.is_gpu()) - return -1; - - std::string const sVendor(SYCLDevice.get_info()); - return sVendor.find("NVIDIA") != std::string::npos ? 1000 : -1; - } - }; - - explicit SYCL(cl::sycl::device const &Device); - ~SYCL(){} - - SYCL (SYCL const &RHS) = delete; - SYCL &operator=(SYCL const &RHS) = delete; - - cl::sycl::queue &GetQueue() { return m_syclQueue; } - cl::sycl::event &GetEvent() { return m_syclEvent; } - - static void DisplayDeviceProperties(cl::sycl::device const &Device); - static cl::sycl::device QueryAndGetDevice (std::string const &sDeviceType); - - static std::chrono::steady_clock::duration ConvertKernelTimeToDuration(cl::sycl::event const &Event); - -#ifdef USE_INFRASTRUCTURE // Move to testbench base?? - static void RegisterDeviceSettings (VelocityBench::CommandLineParser &cmdLineParser); -#endif -}; - -#endif