Commit 66906cd

HIP: Enable Matrix cores for MMQ Kernels, Enable stream-K for CDNA 3 (ggml-org#14624)
This commit adds support for MFMA instructions to MMQ. CDNA1/GFX908, CDNA2/GFX90a, and CDNA3/GFX942 are supported by the MFMA-enabled code path added by this commit. For now the code path and stream-K are only enabled on CDNA3, as they do not outperform BLAS in all cases on the other devices. BLAS is currently only consistently outperformed on CDNA3, owing to issues in the AMD-provided BLAS libraries. This commit also makes MMQ aware of different warp sizes; as a side effect this improves the performance on GCN GPUs of all quant formats except q4_0 and q4_1, which regress slightly.
1 parent 11dd5a4 commit 66906cd
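
At the core of the change is the MFMA path: on CDNA3 a wavefront multiplies packed int8 tiles directly on the matrix cores instead of going through dp4a. The following is a minimal, self-contained sketch (not taken from this commit) of that primitive, assuming hipcc targeting gfx942; the kernel and buffer names are made up.

// Hypothetical demo: one wavefront of 64 lanes multiplies a 16x32 int8 A-tile by a
// 32x16 int8 B-tile into a 16x16 int32 accumulator via mfma_i32_16x16x32_i8.
#include <hip/hip_runtime.h>
#include <cstdint>

__global__ void mfma_i8_demo(const int64_t * __restrict__ a,  // 8 packed int8 values per lane
                             const int64_t * __restrict__ b,  // 8 packed int8 values per lane
                             int * __restrict__ d) {          // 4 int32 results per lane
#if defined(__gfx942__)
    using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
    int32x4_t acc = {0, 0, 0, 0};
    // Each lane feeds 8 bytes of A and 8 bytes of B; the 16x16 result is spread
    // across the wavefront as 4 int32 values per lane.
    acc = __builtin_amdgcn_mfma_i32_16x16x32_i8(a[threadIdx.x], b[threadIdx.x], acc, 0, 0, 0);
    for (int l = 0; l < 4; ++l) {
        // Store the lane-local fragment; mapping fragments back to matrix
        // coordinates is what tile::get_i/get_j in mma.cuh take care of.
        d[4 * threadIdx.x + l] = acc[l];
    }
#else
    (void) a; (void) b; (void) d;  // not compiled for CDNA3
#endif
}

Launched as, e.g., mfma_i8_demo<<<1, 64>>>(a, b, d), this is roughly the primitive that the new mma() overloads in ggml/src/ggml-cuda/mma.cuh wrap for MMQ.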

6 files changed, +1297 -702 lines changed

.devops/rocm.Dockerfile

Lines changed: 2 additions & 2 deletions
@@ -1,8 +1,8 @@
 ARG UBUNTU_VERSION=24.04

 # This needs to generally match the container host's environment.
-ARG ROCM_VERSION=6.3
-ARG AMDGPU_VERSION=6.3
+ARG ROCM_VERSION=6.4
+ARG AMDGPU_VERSION=6.4

 # Target the CUDA build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

ggml/src/ggml-cuda/common.cuh

Lines changed: 13 additions & 3 deletions
@@ -56,7 +56,7 @@
 #define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16
 #define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
 #define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
-#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
+#define GGML_CUDA_CC_CDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
 #define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum acc register renameing
 #define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300

@@ -72,8 +72,9 @@
 #define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
 #define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
 #define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
-#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
-#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
+#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)

 // Moore Threads
 #define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000

@@ -226,6 +227,10 @@ typedef float2 dfloat2;
 #define FP16_MMA_AVAILABLE
 #endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))

+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
+#define AMD_MFMA_AVAILABLE
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
+
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define NEW_MMA_AVAILABLE
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING

@@ -288,6 +293,11 @@ static bool fp32_mma_hardware_available(const int cc) {
     return GGML_CUDA_CC_IS_CDNA(cc);
 }

+// AMD CDNA3 matrix cores.. Will add support for other CDNA generations later.
+static bool amd_mfma_available(const int cc) {
+    return cc >= GGML_CUDA_CC_OFFSET_AMD && GGML_CUDA_CC_IS_CDNA3(cc);
+}
+
 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
 static bool new_mma_available(const int cc) {
     return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
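
The two additions above form a pair of gates: AMD_MFMA_AVAILABLE is a compile-time switch for device code, while amd_mfma_available(cc) is the runtime check used when picking kernels on the host. A hypothetical sketch of how they are meant to be combined (only the two gates come from this diff, the helper names are made up):

#include "common.cuh"

// Device side: the macro decides which instructions a kernel is built with.
static __device__ __forceinline__ bool compiled_with_mfma() {
#if defined(AMD_MFMA_AVAILABLE)
    return true;   // CDNA3 build: MFMA tile math from mma.cuh is usable
#else
    return false;  // other targets keep the existing dp4a/WMMA paths
#endif // defined(AMD_MFMA_AVAILABLE)
}

// Host side: the helper decides which kernel variant to launch for a given device.
static bool select_mfma_mmq(const int cc) {
    return amd_mfma_available(cc);  // currently true only for CDNA3 (MI300 family)
}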

ggml/src/ggml-cuda/mma.cuh

Lines changed: 111 additions & 3 deletions
@@ -12,7 +12,8 @@
 // The methods get_i and get_j can be used to get the physical 32 bit index of the lth element of a thread within a tile.
 // All matrix tiles have ne physical 32 bit elements per warp.
 //
-// As described in the documentation, all pointers for load_ldmatrix must be to shared memory and aligned to 16 bytes.
+// As described in the PTX documentation, all pointers for load_ldmatrix must be to shared memory and aligned to 16 bytes.
+// The API in this file also assumes that the pointers for load_generic are aligned to 16 bytes, unaligned pointers are considered undefined behavior.

 #include "common.cuh"

@@ -66,7 +67,44 @@ namespace ggml_cuda_mma {
     struct tile {
         static constexpr int I = I_;
         static constexpr int J = J_;
-        static constexpr int ne = I * J / WARP_SIZE;
+
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+        static constexpr int ne = I * J / 64;
+        T x[ne] = {0};
+
+        static __device__ __forceinline__ int get_i(const int l) {
+            if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
+                return threadIdx.x % 16;
+            } else if constexpr (I == 16 && J == 8) {
+                return threadIdx.x % 16;
+            } else if constexpr (I == 32 && J == 4) {
+                return threadIdx.x % 32;
+            } else if constexpr (I == 16 && J == 16) {
+                return 4 * (threadIdx.x / 16) + l;
+            } else if constexpr (I == 32 && J == 32) {
+                return 4 * (threadIdx.x / 32) + 8 * (l / 4) + (l % 4);
+            } else {
+                static_assert(I == -1 && J == -1, "template specialization not implemented");
+            }
+        }
+
+        static __device__ __forceinline__ int get_j(const int l) {
+            if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
+                return (2 * ((threadIdx.x / 16) % 2) + l);
+            } else if constexpr (I == 16 && J == 8) {
+                return 2 * (threadIdx.x / 16) + l;
+            } else if constexpr (I == 32 && J == 4) {
+                return 2 * (threadIdx.x / 32) + l;
+            } else if constexpr (I == 16 && J == 16) {
+                return threadIdx.x % 16;
+            } else if constexpr (I == 32 && J == 32) {
+                return threadIdx.x % 32;
+            } else {
+                static_assert(I == -1 && J == -1, "template specialization not implemented");
+            }
+        }
+#else
+        static constexpr int ne = I * J / 32;
         T x[ne] = {0};

         static __device__ __forceinline__ int get_i(const int l) {

@@ -94,6 +132,7 @@ namespace ggml_cuda_mma {
                 static_assert(I == -1 && J == -1, "template specialization not implemented");
             }
         }
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
     };

     template <int I_, int J_>

@@ -148,10 +187,23 @@ namespace ggml_cuda_mma {

     template <int I, int J, typename T>
     static __device__ __forceinline__ void load_generic(tile<I, J, T> & t, const T * __restrict__ xs0, const int stride) {
+#if defined(AMD_MFMA_AVAILABLE)
+        if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
+#pragma unroll
+            for (int l = 0; l < t.ne; ++l) {
+                t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
+            }
+        } else {
+            int64_t * xi = (int64_t *) t.x;
+            const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
+            xi[0] = xs[0];
+        }
+#else
 #pragma unroll
         for (int l = 0; l < t.ne; ++l) {
             t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
         }
+#endif // defined(AMD_MFMA_AVAILABLE)
     }

     template <typename T>

@@ -186,7 +238,7 @@ namespace ggml_cuda_mma {
     template <typename T>
     static __device__ __forceinline__ void load_ldmatrix(
             tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) {
-#ifdef NEW_MMA_AVAILABLE
+#if defined(NEW_MMA_AVAILABLE)
         int * xi = (int * ) t.x;
         const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + (threadIdx.x / t.I) * (t.J / 2);
         asm volatile("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"

@@ -393,4 +445,60 @@ namespace ggml_cuda_mma {
         NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
     }
+
+    static __device__ __forceinline__ void mma(
+            tile<16, 16, int> & D, const tile<16, 8, int> & A, const tile<16, 8, int> & B) {
+#if defined(AMD_MFMA_AVAILABLE)
+        using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
+        int32x4_t * acc = (int32x4_t *) D.x;
+#if defined(CDNA3)
+        acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(((int64_t *) A.x)[0],
+                                                       ((int64_t *) B.x)[0],
+                                                       acc[0],
+                                                       0, 0, 0);
+#elif defined(CDNA2) || defined(CDNA)
+        acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0],
+                                                      B.x[0],
+                                                      acc[0],
+                                                      0, 0, 0);
+        acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[1],
+                                                      B.x[1],
+                                                      acc[0],
+                                                      0, 0, 0);
+#endif // defined(CDNA3)
+#else
+        GGML_UNUSED(D);
+        GGML_UNUSED(A);
+        GGML_UNUSED(B);
+        NO_DEVICE_CODE;
+#endif // AMD_MFMA_AVAILABLE
+    }
+
+    static __device__ __forceinline__ void mma(
+            tile<32, 32, int> & D, const tile<32, 4, int> & A, const tile<32, 4, int> & B) {
+#if defined(AMD_MFMA_AVAILABLE)
+        using int32x16_t = __attribute__((__vector_size__(16 * sizeof(int)))) int;
+        int32x16_t * acc = (int32x16_t *) D.x;
+#if defined(CDNA3)
+        acc[0] = __builtin_amdgcn_mfma_i32_32x32x16_i8(((int64_t *) A.x)[0],
+                                                       ((int64_t *) B.x)[0],
+                                                       acc[0],
+                                                       0, 0, 0);
+#elif defined(CDNA2) || defined(CDNA)
+        acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[0],
+                                                     B.x[0],
+                                                     acc[0],
+                                                     0, 0, 0);
+        acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[1],
+                                                     B.x[1],
+                                                     acc[0],
+                                                     0, 0, 0);
+#endif // defined(CDNA3)
+#else
+        GGML_UNUSED(D);
+        GGML_UNUSED(A);
+        GGML_UNUSED(B);
+        NO_DEVICE_CODE;
+#endif // AMD_MFMA_AVAILABLE
+    }
 }
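
For readers of the tile API above, here is a hypothetical usage sketch (not part of the commit): one wavefront loads two int8-packed <16, 8> tiles with load_generic and accumulates into a <16, 16> int32 tile with the new mma() overload. It assumes a CDNA3 build where AMD_MFMA_AVAILABLE is defined; the kernel name and buffer layout are made up.

#include "mma.cuh"

using namespace ggml_cuda_mma;

// Launch with a single wavefront, e.g. tile_mma_demo<<<1, 64>>>(a, b, d).
static __global__ void tile_mma_demo(const int * __restrict__ a,  // 16 rows x 8 ints (32 int8 values per row)
                                     const int * __restrict__ b,  // 16 rows x 8 ints
                                     int * __restrict__ d) {      // 16 x 16 int32 result, row-major
#if defined(AMD_MFMA_AVAILABLE)
    tile<16,  8, int> A;
    tile<16,  8, int> B;
    tile<16, 16, int> D;   // accumulator, zeroed by the tile's default member initializer

    load_generic(A, a, 8); // stride of 8 ints between rows
    load_generic(B, b, 8);
    mma(D, A, B);          // lowers to __builtin_amdgcn_mfma_i32_16x16x32_i8 on CDNA3

#pragma unroll
    for (int l = 0; l < D.ne; ++l) {
        d[D.get_i(l)*16 + D.get_j(l)] = D.x[l];
    }
#else
    GGML_UNUSED(a); GGML_UNUSED(b); GGML_UNUSED(d);
    NO_DEVICE_CODE;
#endif // defined(AMD_MFMA_AVAILABLE)
}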

ggml/src/ggml-cuda/mmq.cu

Lines changed: 6 additions & 4 deletions
@@ -109,7 +109,8 @@ void ggml_cuda_mul_mat_q(
     const int64_t s03 = src0->nb[3] / ts_src0;
     const int64_t s3 = dst->nb[3] / ts_dst;

-    const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
+    const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
+                               || (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc)));

     if (!ids) {
         const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +

@@ -250,8 +251,9 @@ void ggml_cuda_op_mul_mat_q(
     // The stream-k decomposition is only faster for recent NVIDIA GPUs.
     // Also its fixup needs to allocate a temporary buffer in the memory pool.
     // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
-    const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) &&
-        ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11;
+    const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
+                               || (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc)))
+        && src1_ncols == ne11;
     const mmq_args args = {
         src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
         ne00, row_diff, src1_ncols, stride01, ne11, nrows_dst,

@@ -304,7 +306,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return false;
     }

-    if (new_mma_available(cc)) {
+    if (new_mma_available(cc) || amd_mfma_available(cc)) {
        return true;
    }
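
As a quick sanity check of the new gating, here is a hypothetical evaluation for an MI300-class device (the cc value is an example; the macros and helpers are the ones touched by this diff):

#include "common.cuh"

static void gating_example() {
    const int cc = GGML_CUDA_CC_OFFSET_AMD + 0x942;  // gfx942 / MI300 lands in the CDNA3 range

    const bool mmq_ok      = new_mma_available(cc) || amd_mfma_available(cc);       // true, via MFMA
    const bool stream_k_ok = GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc);  // true

    // On CDNA1/CDNA2 both checks stay false, so those devices keep the previous behaviour for now.
    GGML_UNUSED(mmq_ok);
    GGML_UNUSED(stream_k_ok);
}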
