Skip to content
This repository was archived by the owner on Aug 30, 2024. It is now read-only.

Commit dceba67

Browse files
[SYCL] Support SYCL layer for LLaMA2 model (#272)
* fixed all UTs * move sycl benchmark to benchmark project * add q4 UT for sycl prologue_b * sycl gemv case * add UT case * add to trans storage conversion * add sycl context to model context. compile ne_layers with dpcpp * add context sycl memory allocation * inference with data exchange * use backend instead of ne type * new api to assign sycl buffer * add backend parameter for new tensor * add sycl int4 to graph compute * fix sync * compile without sycl * sync with main * fix tensor size bug * refactor layer config * sync ut * support 2 layers of sycl * sync main * revert ISA detect for dpcpp * compile without dpcpp * fix avxvnni intrin code * protect crash if it's a CPU SYCL device * add device mul function * fix the sync issue * run model with all FFN layers on SYCL * fix compile * clang-format * revert model config * fix fun ret * fix the kernel bug * remove all grad tensors. * fix some bugs. * support llama shapes, add new UT case, update new api of dpcpp * support all ffn layers * add sync for CPU Device * clang-format * fix warning * clang-format * add back f32 model support * fix typo, remove unused code * bring more layers to SYCL * add embedding support and use omp in sycl * optimize gemv k iteration * optimize rms_norm, add debug macro for no-mha forward. * add mha ut * prepare for SYCL MHA * add SYCL rope * all device f32 mha * remove unused code * fixed * fixed * refactor sycl context for multiple allocation * support n_gpu_layer * reuse scratch * add new mha version * new version of MHA * lower malloc size * compile without sycl * run llama without sycl build * clang-format * fix clang-tidy * fix py build * fix warning * use std header * update math * update math * revert scratch without SYCL * use cl for c_compiler * compile on linux * Revert "compile on linux" This reverts commit 0ce1574. * Revert "use cl for c_compiler" This reverts commit 7f40ae9. * fix memory leak, set lower extra memory size. * revert embedding size on CPU * clang-format --------- Co-authored-by: luoyu-intel <[email protected]>
1 parent 9652017 commit dceba67

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

57 files changed

+3733
-1266
lines changed

CMakeLists.txt

+6
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ option(NS_AVX512_VNNI "neural_speed: enable AVX512-VNNI"
6161
option(NS_FMA "neural_speed: enable FMA" ON)
6262
option(NS_AMX "neural_speed: enable AMX" OFF)
6363
option(NS_USE_OMP "neural_speed: use OpenMP thread pool." ON)
64+
option(NS_SYCL "neural_speed: enable SYCL for GPUs." OFF)
6465

6566
option(NS_BUILD_TESTS "neural_speed: build tests" ${NS_STANDALONE})
6667
option(NS_BUILD_EXAMPLES "neural_speed: build examples" ${NS_STANDALONE})
@@ -143,6 +144,11 @@ if(NS_USE_OMP)
143144
add_compile_definitions(NS_USE_OMP)
144145
endif()
145146

147+
if(NS_SYCL)
148+
set(BTLA_SYCL ON CACHE BOOL "BesTLA with SYCL")
149+
add_compile_definitions(NS_SYCL)
150+
endif()
151+
146152
add_subdirectory(bestla)
147153

148154
add_subdirectory(neural_speed)

CMakePresets.json

+3-1
Original file line numberDiff line numberDiff line change
@@ -138,8 +138,10 @@
138138
"CMAKE_BUILD_TYPE": "Debug",
139139
"BTLA_UT_DEBUG": "ON",
140140
"BTLA_UT_ALL": "OFF",
141-
"BTLA_SYCL": "ON",
141+
"NS_SYCL": "ON",
142142
"BTLA_UT_BENCHMARK": "ON",
143+
"BTLA_UT_OPENMP": "ON",
144+
"BTLA_ENABLE_OPENMP": "ON",
143145
"CMAKE_CXX_COMPILER": "icx",
144146
"CMAKE_C_COMPILER": "icx"
145147
}

bestla/CMakeLists.txt

+9-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
cmake_minimum_required(VERSION 3.12)
22
project(bestla LANGUAGES CXX VERSION 0.1.0)
33

4+
if(BTLA_SYCL)
5+
include(cmake/sycl.cmake)
6+
endif()
47
include(cmake/FindSIMD.cmake)
58

69
file(GLOB headers ${PROJECT_NAME}/*.h ${PROJECT_NAME}/*.hpp)
@@ -55,11 +58,11 @@ endforeach()
5558
set(sycl_headers)
5659
set(sycl_libs)
5760
if(BTLA_SYCL)
58-
include(cmake/sycl.cmake)
5961
file(GLOB sycl_headers ${PROJECT_NAME}/sycl/*.h ${PROJECT_NAME}/sycl/*.hpp)
6062
target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_SYCL)
6163
list(APPEND sycl_libs IntelSYCL::SYCL_CXX)
62-
add_compile_options(-march=native)
64+
target_compile_options(${PROJECT_NAME} INTERFACE -march=native)
65+
target_link_libraries(${PROJECT_NAME} INTERFACE ${sycl_libs})
6366
#add_link_options(-fsycl-targets=spir64 -Xsycl-target-backend "-options -ze-opt-large-register-file")
6467
endif(BTLA_SYCL)
6568

@@ -103,7 +106,7 @@ function(add_ut_flag UT_OPTION)
103106
endfunction()
104107

105108
set(benchmark_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/bestla_benchmark.cpp)
106-
# list(APPEND benchmark_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/sycl_benchmark.cpp)
109+
list(APPEND benchmark_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/sycl_benchmark.cpp)
107110

108111

109112
if(UT_BUILD)
@@ -150,6 +153,9 @@ endif(UT_BUILD)
150153
if(BTLA_UT_BENCHMARK)
151154
file(GLOB ut_headers ${PROJECT_NAME}/ut/*.h)
152155
include_directories(${PROJECT_NAME})
156+
if(NOT BTLA_SYCL)
157+
list(REMOVE_ITEM benchmark_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/sycl_benchmark.cpp)
158+
endif()
153159
add_executable(${PROJECT_NAME}_benchmark ${benchmark_srcs} ${headers} ${ut_headers})
154160
if(BTLA_UT_OPENMP)
155161
include(FindOpenMP)

bestla/bestla/bestla_prologue_b.h

+22
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,28 @@ class WeightKBlockNInteger {
126126
return tmp;
127127
}
128128

129+
AUTOCALL void convertTransStorage(StorageWeight& srcstor, StorageWeight& dststor, parallel::IThreading* threading) {
130+
auto s8buf = utils::amalloc<int8_t>((size_t)srcstor.mK * srcstor.mN);
131+
auto s8transbuf = utils::amalloc<int8_t>((size_t)srcstor.mKPad * srcstor.mNPad);
132+
unpackWeight(srcstor.mN, srcstor.mK, &srcstor, s8buf, srcstor.mN, threading);
133+
transposeWeight<int8_t>(srcstor.mK, srcstor.mN, s8buf, srcstor.mN, s8transbuf, srcstor.mKPad, threading);
134+
compressWeight(srcstor.mKPad, srcstor.mNPad, s8transbuf, srcstor.mKPad, dststor.WPtr<int8_t>(), srcstor.mDType,
135+
threading);
136+
if (s8buf) {
137+
utils::afree(s8buf);
138+
}
139+
if (s8transbuf) {
140+
utils::afree(s8transbuf);
141+
}
142+
int nk_scale = utils::updiv(srcstor.mKPad, srcstor.mBlockSize);
143+
if (srcstor.mCorrection.mScaEleSize == 4) {
144+
transposeWeight<float>(nk_scale, srcstor.mNPad, srcstor.template SPtr<float>(), srcstor.mNPad,
145+
dststor.template SPtr<float>(), dststor.CStep(), threading);
146+
} else if (srcstor.mCorrection.mScaEleSize == 2) {
147+
transposeWeight<uint16_t>(nk_scale, srcstor.mNPad, srcstor.template SPtr<uint16_t>(), srcstor.mNPad,
148+
dststor.template SPtr<uint16_t>(), dststor.CStep(), threading);
149+
}
150+
}
129151
AUTOCALL void doubleQuantScale(float* scale, size_t scale_size, int dq_blocksize, BTLA_DTYPE qtype,
130152
utils::aligned_vector<float>* dq_buf) {
131153
if (qtype == BTLA_DTYPE::DQ8_BNB) {

bestla/bestla/bestla_storage.h

+16
Original file line numberDiff line numberDiff line change
@@ -706,6 +706,22 @@ class StorageWeightKBlockNInteger : public IWeightKBlockBase {
706706
mPrologueID = BTLA_PROLOGUEB_IDS::WeightKBlockNInteger;
707707
}
708708

709+
StorageWeightKBlockNInteger toTrans() {
710+
StorageWeightKBlockNInteger trans(-1);
711+
trans.mK = mK;
712+
trans.mN = mN;
713+
trans.mNPad = mNPad;
714+
trans.mKPad = mKPad;
715+
trans.mBlockSize = mBlockSize;
716+
trans.mDType = mDType;
717+
trans.mQBuf.resize(mQBuf.size<int8_t>());
718+
int nk_scale = utils::updiv(mKPad, mBlockSize);
719+
trans.mCorrection.resize(mNPad, nk_scale, mCorrection.mScaT, mCorrection.mZpT, mCorrection.mRedT,
720+
mCorrection.mZpBuf.size<int>() > 0, mCorrection.mRedBuf.size<int>() > 0);
721+
trans.update_size();
722+
return trans;
723+
}
724+
709725
size_t resize(int NPad, int KPad, int Block, int N, int K, BTLA_DTYPE qtype, BTLA_DTYPE scalet, BTLA_DTYPE redt,
710726
bool IsAsym) {
711727
BTLA_DTYPE zpt = BTLA_DTYPE::S8;

bestla/bestla/bestla_utils.h

+14
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,17 @@
6262

6363
// As long as the compiler supports the ISA, we will enable it.
6464
// Only the ISA you use in your project will be compiled.
65+
#if defined(_MSC_VER) && defined(__INTEL_LLVM_COMPILER)
66+
#define CompileAVX512F() defined(__AVX512F__)
67+
#define CompileAVX512VNNI() defined(__AVX512VNNI__)
68+
#define CompileAVX2() defined(__AVX2__) && defined(__F16C__) && defined(__FMA__)
69+
#define CompileAVXVNNI() defined(__AVXVNNI__)
70+
#define CompileAMX() defined(__AMX_TILE__)
71+
#define CompileBF16() defined(__AVX512BF16__)
72+
#define CompileFP16() defined(__AVX512FP16__)
73+
#define CompileAMXBF16() (CompileAMX())
74+
#define CompileAMXINT8() (CompileAMX())
75+
#else
6576
#define CompileAVX512F() BTLA_AVX512_FOUND
6677
#define CompileAVX512VNNI() BTLA_AVX512_VNNI_FOUND
6778
#define CompileAVX2() BTLA_AVX2_FOUND
@@ -72,6 +83,7 @@
7283
#define CompileAMXFP16() BTLA_AMX_FP16_FOUND
7384
#define CompileAMXINT8() BTLA_AMX_INT8_FOUND
7485
#define CompileAMX() BTLA_AMX_BF16_FOUND
86+
#endif
7587

7688
// called by launcher, time critical functions
7789
#define TLACALL \
@@ -475,6 +487,8 @@ class isa_base {
475487

476488
static inline int padto_le(int src, int padding) { return src / padding * padding; }
477489

490+
static inline int64_t padto_le(int64_t src, int64_t padding) { return src / padding * padding; }
491+
478492
static inline size_t padto_le(size_t src, int padding) { return src / size_t(padding) * size_t(padding); }
479493

480494
static inline int updiv(int a, int b) { return (a + b - 1) / b; }

bestla/bestla/kernel_avx2.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ namespace avx2 {
2525
#pragma GCC push_options
2626
#pragma GCC target("avx2", "fma", "f16c")
2727
#elif defined(ICX)
28-
#pragma clang attribute push(__attribute__((target("avx,avx2,fma"))), apply_to = function)
28+
//#pragma clang attribute push(__attribute__((target("avx2,fma,f16c"))), apply_to = function)
2929
#endif
3030

3131
static inline void zero_reg() { _mm256_zeroupper(); }

bestla/bestla/sycl/sycl_device.h

+64-14
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ namespace sycl_device {
2323

2424
class SyclDevice {
2525
public:
26-
SyclDevice(bool profile) {
26+
SyclDevice(int gpu_idx, bool profile) {
2727
// Create an exception handler for asynchronous SYCL exceptions
2828
static auto exception_handler = [](sycl::exception_list e_list) {
2929
for (std::exception_ptr const& e : e_list) {
@@ -37,12 +37,38 @@ class SyclDevice {
3737
}
3838
}
3939
};
40+
auto devices = sycl::device::get_devices(sycl::info::device_type::gpu);
41+
assert(gpu_idx < devices.size());
42+
43+
if (profile) {
44+
sycl::property_list prop = {sycl::property::queue::enable_profiling(), sycl::property::queue::in_order()};
45+
mQueue = sycl::queue(devices[gpu_idx], exception_handler, prop);
46+
} else {
47+
sycl::property_list prop = {sycl::property::queue::in_order()};
48+
mQueue = sycl::queue(devices[gpu_idx], exception_handler);
49+
}
50+
}
4051

52+
SyclDevice(bool profile) {
53+
// Create an exception handler for asynchronous SYCL exceptions
54+
static auto exception_handler = [](sycl::exception_list e_list) {
55+
for (std::exception_ptr const& e : e_list) {
56+
try {
57+
std::rethrow_exception(e);
58+
} catch (std::exception const& e) {
59+
#if _DEBUG
60+
std::cout << "Failure" << std::endl;
61+
#endif
62+
std::terminate();
63+
}
64+
}
65+
};
4166
auto d_selector{sycl::default_selector_v};
4267
if (profile) {
43-
sycl::property_list prop = {sycl::property::queue::enable_profiling()};
68+
sycl::property_list prop = {sycl::property::queue::enable_profiling(), sycl::property::queue::in_order()};
4469
mQueue = sycl::queue(d_selector, exception_handler, prop);
4570
} else {
71+
sycl::property_list prop = {sycl::property::queue::in_order()};
4672
mQueue = sycl::queue(d_selector, exception_handler);
4773
}
4874
}
@@ -51,20 +77,44 @@ class SyclDevice {
5177

5278
inline std::string getName() { return mQueue.get_device().get_info<sycl::info::device::name>(); };
5379

80+
size_t getGlobalMemSize() { return mQueue.get_device().get_info<sycl::info::device::global_mem_size>(); }
81+
size_t getMaxMemAllocSize() { return mQueue.get_device().get_info<sycl::info::device::max_mem_alloc_size>(); }
82+
83+
double getGlobalMemSizeGB() { return double(getGlobalMemSize()) / 1e9; }
84+
double getMaxMemAllocSizeMB() { return double(getGlobalMemSize()) / 1e6; }
85+
86+
static inline bool is_cpu(const sycl::device& dev) {
87+
return dev.get_info<sycl::info::device::device_type>() == sycl::info::device_type::cpu;
88+
}
89+
90+
static inline bool is_gpu(const sycl::device& dev) {
91+
return dev.get_info<sycl::info::device::device_type>() == sycl::info::device_type::gpu;
92+
}
93+
94+
static inline bool is_cpu(sycl::queue* q) {
95+
return q->get_device().get_info<sycl::info::device::device_type>() == sycl::info::device_type::cpu;
96+
}
97+
98+
static inline bool is_gpu(sycl::queue* q) {
99+
return q->get_device().get_info<sycl::info::device::device_type>() == sycl::info::device_type::gpu;
100+
}
101+
54102
void print() {
55103
std::cout << "Running on device: " << mQueue.get_device().get_info<sycl::info::device::name>() << "\n";
56-
std::cout << "EU count:" << mQueue.get_device().get_info<sycl::info::device::ext_intel_gpu_eu_count>()
57-
<< "\n"; // 448
58-
std::cout << "EU count per subslice:"
59-
<< mQueue.get_device().get_info<sycl::info::device::ext_intel_gpu_eu_count_per_subslice>() << "\n"; // 8
60-
std::cout << "EU SIMD width:" << mQueue.get_device().get_info<sycl::info::device::ext_intel_gpu_eu_simd_width>()
61-
<< "\n"; // 8
62-
std::cout << "HW threads per EU:"
63-
<< mQueue.get_device().get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>() << "\n"; // 8
64-
std::cout << "GPU slices:" << mQueue.get_device().get_info<sycl::info::device::ext_intel_gpu_slices>()
65-
<< "\n"; // 7
66-
std::cout << "Subslice per slice:"
67-
<< mQueue.get_device().get_info<sycl::info::device::ext_intel_gpu_subslices_per_slice>() << "\n"; // 8
104+
if (is_gpu(mQueue.get_device())) {
105+
std::cout << "EU count:" << mQueue.get_device().get_info<sycl::ext::intel::info::device::gpu_eu_count>() << "\n";
106+
std::cout << "EU count per subslice:"
107+
<< mQueue.get_device().get_info<sycl::ext::intel::info::device::gpu_eu_count_per_subslice>() << "\n";
108+
std::cout << "EU SIMD width:" << mQueue.get_device().get_info<sycl::ext::intel::info::device::gpu_eu_simd_width>()
109+
<< "\n";
110+
std::cout << "HW threads per EU:"
111+
<< mQueue.get_device().get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>() << "\n";
112+
std::cout << "GPU slices:" << mQueue.get_device().get_info<sycl::ext::intel::info::device::gpu_slices>() << "\n";
113+
std::cout << "Subslice per slice:"
114+
<< mQueue.get_device().get_info<sycl::ext::intel::info::device::gpu_subslices_per_slice>() << "\n";
115+
}
116+
std::cout << "Global Memory size: " << getGlobalMemSizeGB() << "\n";
117+
std::cout << "Global Memory size: " << getMaxMemAllocSize() << "\n";
68118
}
69119
sycl::queue mQueue;
70120
};

bestla/bestla/sycl/sycl_gemm.h

+12-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
#ifdef BTLA_SYCL
1717
#include <array>
1818

19-
#include "bestla_utils.h"
19+
#include "bestla/bestla_utils.h"
2020
#include <sycl/sycl.hpp>
2121

2222
namespace bestla {
@@ -64,6 +64,17 @@ class SGemmCoreSharedB {
6464

6565
using SLM_B_Acc = sycl::local_accessor<TB, 1>;
6666

67+
using AType = TA;
68+
using BType = TB;
69+
using CType = TC;
70+
static auto constexpr NTILE = WgNEle;
71+
static auto constexpr MTILE = WgMEle;
72+
static auto constexpr KTILE = TileK;
73+
static auto constexpr PACK_ROW = 1;
74+
static int constexpr PREFERRED_N = NTILE;
75+
static auto constexpr ISA = BTLA_ISA::ISA_COUNT;
76+
static auto constexpr ID = 0;
77+
6778
static inline void compute(const TA* aptr, int lda, const SLM_B_Acc& bacc, TACC* accptr,
6879
const sycl_utils::nd_item_helper<SGemmCoreSharedB<ConfigT>>& helper) {
6980
#pragma unroll(1)

bestla/bestla/sycl/sycl_prologue_a.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
#ifdef BTLA_SYCL
1717
#include <array>
1818

19-
#include "bestla_utils.h"
19+
#include "bestla/bestla_utils.h"
2020
#include <sycl/sycl.hpp>
2121

2222
namespace bestla {

0 commit comments

Comments
 (0)