@@ -176,7 +176,7 @@ static const char * cu_get_error_str(CUresult err) {
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
#endif

- #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
+ #if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
#define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
    do { \
        static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; \
@@ -191,7 +191,7 @@ static const char * cu_get_error_str(CUresult err) {
    do { \
        GGML_UNUSED(nbytes); \
    } while (0)
- #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
+ #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)

#if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
@@ -211,9 +211,9 @@ typedef float2 dfloat2;
#define GGML_USE_VMM
#endif // (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))

- #if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+ #if defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
#define FP16_AVAILABLE
- #endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+ #endif // defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL

#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#define FAST_FP16_AVAILABLE
@@ -227,17 +227,17 @@ typedef float2 dfloat2;
#define FP16_MMA_AVAILABLE
#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))

- #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
+ #if defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
#define AMD_MFMA_AVAILABLE
- #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
+ #endif // defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)

- #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
+ #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
#define NEW_MMA_AVAILABLE
- #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
+ #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING

- #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+ #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
#define CP_ASYNC_AVAILABLE
- #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+ #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE

#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
#define FLASH_ATTN_AVAILABLE
@@ -259,7 +259,7 @@ static bool fast_fp16_hardware_available(const int cc) {

// Any FP16 tensor core instructions are available for ggml code.
static bool fp16_mma_available(const int cc) {
- #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
+ #if defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
    return false;
#else
    if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
@@ -275,7 +275,7 @@ static bool fp16_mma_available(const int cc) {
    } else {
        return false;
    }
- #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
+ #endif // defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
}

// To be used for feature selection of external libraries, e.g. cuBLAS.
@@ -312,25 +312,25 @@ static bool cp_async_available(const int cc) {
}

static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
- #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
+ #if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
    return 64;
#else
    return 32;
- #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
+ #endif // defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
}

[[noreturn]]
static __device__ void no_device_code(
    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {

- #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(GGML_USE_HIP)
    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
           file_name, line, function_name, arch);
    GGML_UNUSED(arch_list);
#else
    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
           file_name, line, function_name, arch, arch_list);
- #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+ #endif // defined(GGML_USE_HIP)
    __trap();

    GGML_UNUSED(no_device_code); // suppress unused function warning
@@ -367,15 +367,15 @@ struct ggml_cuda_unroll<1> {

template <int width = WARP_SIZE>
static __device__ __forceinline__ int warp_reduce_sum(int x) {
- #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+ #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
    return __reduce_add_sync(0xffffffff, x);
#else
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, offset, width);
    }
    return x;
- #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+ #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
}

template <int width = WARP_SIZE>
@@ -444,11 +444,11 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
#ifdef FP16_AVAILABLE

- #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+ #if !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX
    return __float2half(fmaxf(__half2float(a), __half2float(b)));
#else
    return __hmax(a, b);
- #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+ #endif // !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX

#else
    NO_DEVICE_CODE;
@@ -476,7 +476,7 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal

template <int width = WARP_SIZE>
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
- #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
+ #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
@@ -485,7 +485,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#else
    GGML_UNUSED(x);
    NO_DEVICE_CODE;
- #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
+ #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
}

#if CUDART_VERSION < CUDART_HMASK
@@ -497,7 +497,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
#endif // CUDART_VERSION < CUDART_HMASK

static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
- #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(GGML_USE_HIP)
#if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
    c = __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(RDNA3) || defined(RDNA4)
@@ -523,7 +523,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
#endif
    return c;

- #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+ #else // defined(GGML_USE_HIP)

#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
    return __dp4a(a, b, c);
@@ -533,7 +533,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)

- #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+ #endif // defined(GGML_USE_HIP)
}

typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);