Commit b186285

JohannesGaessler authored and jordankanter committed
CUDA: fixed tensor cores not being used on RDNA3 (ggml-org#4697)
1 parent 734df81 commit b186285

File tree

1 file changed: +24 −23 lines changed

ggml-cuda.cu

+24 −23
@@ -119,10 +119,29 @@
 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
 #define CC_VOLTA 700
 #define CC_OFFSET_AMD 1000000
+#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
+#define CC_RDNA3 (CC_OFFSET_AMD + 1100)
 
 #define GGML_CUDA_MAX_NODES 8192
 
+// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
+// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
+// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
+// - 7B quantum model: +100-200 MB
+// - 13B quantum model: +200-400 MB
+//
+//#define GGML_CUDA_FORCE_MMQ
+
+// TODO: improve this to be correct for more hardware
+// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
+#if !defined(GGML_CUDA_FORCE_MMQ)
+#define CUDA_USE_TENSOR_CORES
+#endif
+
+// max batch size to use MMQ kernels when tensor cores are available
+#define MMQ_MAX_BATCH_SIZE 32
+
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
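
For context, the new CC_RDNA1/CC_RDNA3 constants sit on the same integer scale as NVIDIA compute capabilities because of the CC_OFFSET_AMD shift, so a single comparison such as min_compute_capability >= CC_RDNA1 covers both vendors. Below is a minimal sketch of how a device's architecture ends up on that scale, assuming the usual 100*major + 10*minor encoding used elsewhere in ggml-cuda.cu; the helper function is illustrative only and not part of this patch.

// Illustrative sketch (not part of the patch): how a device becomes comparable
// to CC_RDNA1/CC_RDNA3. Assumes the capability is encoded as 100*major + 10*minor,
// with CC_OFFSET_AMD added for AMD devices under HIP.
static int example_device_compute_capability(int major, int minor, bool is_amd_hip) {
    int cc = 100*major + 10*minor;
    if (is_amd_hip) {
        cc += CC_OFFSET_AMD; // e.g. an RDNA3 (gfx1100-class) device reporting 11.0 -> 1000000 + 1100 == CC_RDNA3
    }
    return cc;
}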

@@ -189,23 +208,6 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 }
 #endif // defined(GGML_USE_HIPBLAS)
 
-// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
-// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
-// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
-// - 7B quantum model: +100-200 MB
-// - 13B quantum model: +200-400 MB
-//
-//#define GGML_CUDA_FORCE_MMQ
-
-// TODO: improve this to be correct for more hardware
-// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
-#if !defined(GGML_CUDA_FORCE_MMQ) && (!defined(GGML_USE_HIPBLAS) || defined(RDNA3))
-#define CUDA_USE_TENSOR_CORES
-#endif
-
-// max batch size to use MMQ kernels when tensor cores are available
-#define MMQ_MAX_BATCH_SIZE 32
-
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
@@ -8661,13 +8663,12 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 }
 
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    const bool fp16_performance_good = true;
 
-#ifdef RDNA3
-    const bool use_mul_mat_q = false;
-#else
-    const bool use_mul_mat_q = true;
-#endif // RDNA3
+    const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
+    bool use_mul_mat_q = ggml_is_quantized(src0->type);
+#ifdef CUDA_USE_TENSOR_CORES
+    use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
+#endif // CUDA_USE_TENSOR_CORES
 
 #else
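
Net effect of the change: an RDNA3 device reports a capability at or above CC_RDNA3, so fp16_performance_good becomes true and, when CUDA_USE_TENSOR_CORES is defined (the default unless GGML_CUDA_FORCE_MMQ is set), use_mul_mat_q is forced off and the BLAS/FP16 path, the one that actually engages the tensor cores per the commit title, is taken instead of the MMQ kernels. Below is a standalone C++ sketch of that decision; the device capability value and the quantized-src0 flag are assumed example inputs, not values taken from the patch.

#include <cstdio>

// Constants as introduced by this patch (normally defined in ggml-cuda.cu).
#define CC_OFFSET_AMD 1000000
#define CC_RDNA1      (CC_OFFSET_AMD + 1010)
#define CC_RDNA3      (CC_OFFSET_AMD + 1100)
#define CUDA_USE_TENSOR_CORES // defined by default unless GGML_CUDA_FORCE_MMQ is set

int main() {
    // Assumed example inputs: a single RDNA3-class device and a quantized src0 tensor.
    const int  min_compute_capability = CC_OFFSET_AMD + 1100;
    const bool src0_is_quantized      = true;

    // Mirrors the decision added to ggml_cuda_mul_mat for the HIP build.
    const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
    bool use_mul_mat_q = src0_is_quantized;
#ifdef CUDA_USE_TENSOR_CORES
    use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
#endif

    // Expected output for this example: fp16_performance_good=1 use_mul_mat_q=0,
    // i.e. MMQ is skipped and the BLAS path is chosen on RDNA3.
    printf("fp16_performance_good=%d use_mul_mat_q=%d\n", fp16_performance_good, use_mul_mat_q);
    return 0;
}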
