From 493d9f793c0ed9f2d54d85c922536f7c13e5fe96 Mon Sep 17 00:00:00 2001 From: David Huang <1969802+hjc4869@users.noreply.github.com> Date: Mon, 14 Apr 2025 05:02:15 +0000 Subject: [PATCH 1/5] CUDA/HIP: Share the same unified memory allocation logic. Replace compile-time `GGML_HIP_UMA` with environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY`. This unifies the usage on NVIDIA and AMD GPUs, and allows a single binary to be shared between integrated and dedicated GPUs. --- Makefile | 4 ---- docs/build.md | 6 ++++-- ggml/CMakeLists.txt | 1 - ggml/src/ggml-cuda/ggml-cuda.cu | 21 ++++++--------------- ggml/src/ggml-cuda/vendors/hip.h | 2 ++ ggml/src/ggml-hip/CMakeLists.txt | 4 ---- 6 files changed, 12 insertions(+), 26 deletions(-) diff --git a/Makefile b/Makefile index 1f9455eff0aec..772993ada2707 100644 --- a/Makefile +++ b/Makefile @@ -780,10 +780,6 @@ ifdef GGML_HIP MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA -ifdef GGML_HIP_UMA - MK_CPPFLAGS += -DGGML_HIP_UMA -endif # GGML_HIP_UMA - MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64 MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas diff --git a/docs/build.md b/docs/build.md index 3f1b043992545..c9027c0b580a5 100644 --- a/docs/build.md +++ b/docs/build.md @@ -259,8 +259,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ && cmake --build build --config Release -- -j 16 ``` - On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`. - However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs). To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system. @@ -296,6 +294,10 @@ You can download it from your Linux distro's package manager or from here: [ROCm The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used. If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3. +### Unified Memory + +On Linux it is possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1`. However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs). + ## Vulkan **Windows** diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index d33f843b417cf..438c2a7309191 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -170,7 +170,6 @@ option(GGML_HIP "ggml: use HIP" option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF) option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON) option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF) -option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF) option(GGML_VULKAN "ggml: use Vulkan" OFF) option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF) option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 4af1897017567..cdd8d41c659b8 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -96,31 +96,22 @@ int ggml_cuda_get_device() { static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) { ggml_cuda_set_device(device); -#if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA) - auto res = hipMallocManaged(ptr, size); - if (res == hipSuccess) { - // if error we "need" to know why... - CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device)); - } - return res; -#else - -#if !defined(GGML_USE_HIP) cudaError_t err; if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) { err = cudaMallocManaged(ptr, size); +#if defined(GGML_USE_HIP) + if (err == hipSuccess) { + // if error we "need" to know why... + CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device)); + } +#endif // defined(GGML_USE_HIP) } else { err = cudaMalloc(ptr, size); } return err; -#else - return cudaMalloc(ptr, size); -#endif // !defined(GGML_USE_HIP) - -#endif } #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h index 420b41b8d652d..1a28831b7a96b 100644 --- a/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -71,6 +71,8 @@ #define cudaLaunchHostFunc hipLaunchHostFunc #define cudaMalloc hipMalloc #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) +#define cudaMallocManaged hipMallocManaged +#define cudaMemAdvise hipMemAdvise #define cudaMemcpy hipMemcpy #define cudaMemcpyAsync hipMemcpyAsync #define cudaMemcpyPeerAsync hipMemcpyPeerAsync diff --git a/ggml/src/ggml-hip/CMakeLists.txt b/ggml/src/ggml-hip/CMakeLists.txt index e3762649fd275..1fe8fe3b8d079 100644 --- a/ggml/src/ggml-hip/CMakeLists.txt +++ b/ggml/src/ggml-hip/CMakeLists.txt @@ -89,10 +89,6 @@ endif() add_compile_definitions(GGML_USE_HIP) -if (GGML_HIP_UMA) - add_compile_definitions(GGML_HIP_UMA) -endif() - if (GGML_CUDA_FORCE_MMQ) add_compile_definitions(GGML_CUDA_FORCE_MMQ) endif() From 6fab0d4c6b4d316a9a3f21a722f893c1cb9c6d4d Mon Sep 17 00:00:00 2001 From: David Huang <1969802+hjc4869@users.noreply.github.com> Date: Mon, 14 Apr 2025 21:08:49 +0800 Subject: [PATCH 2/5] Remove comment --- ggml/src/ggml-cuda/ggml-cuda.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index cdd8d41c659b8..ff2573783e380 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -102,7 +102,6 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) err = cudaMallocManaged(ptr, size); #if defined(GGML_USE_HIP) if (err == hipSuccess) { - // if error we "need" to know why... CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device)); } #endif // defined(GGML_USE_HIP) From 2428d6726a57c7fb849d0859600fd6a2d043810a Mon Sep 17 00:00:00 2001 From: David Huang <1969802+hjc4869@users.noreply.github.com> Date: Mon, 14 Apr 2025 22:20:32 +0800 Subject: [PATCH 3/5] Add a fallback to hipMalloc() and print warning once when managed memory is not supported. --- ggml/src/ggml-cuda/ggml-cuda.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index ff2573783e380..912d26fd32acd 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -104,6 +104,17 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) if (err == hipSuccess) { CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device)); } + + // fall back to cudaMalloc if not supported (e.g. on Windows) + if (err == hipErrorNotSupported) { + static bool warnedUnsupported = false; + if (!warnedUnsupported) { + GGML_LOG_WARN("hipMallocManaged unsupported, falling back to hipMalloc.\n"); + warnedUnsupported = true; + } + + err = cudaMalloc(ptr, size); + } #endif // defined(GGML_USE_HIP) } else From a3862d91fa7e170971a710144613fe506e9bc192 Mon Sep 17 00:00:00 2001 From: David Huang <1969802+hjc4869@users.noreply.github.com> Date: Mon, 14 Apr 2025 22:22:08 +0800 Subject: [PATCH 4/5] Fix editorconfig check error --- ggml/src/ggml-cuda/ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 912d26fd32acd..d484fb57e8732 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -109,7 +109,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) if (err == hipErrorNotSupported) { static bool warnedUnsupported = false; if (!warnedUnsupported) { - GGML_LOG_WARN("hipMallocManaged unsupported, falling back to hipMalloc.\n"); + GGML_LOG_WARN("hipMallocManaged unsupported, falling back to hipMalloc.\n"); warnedUnsupported = true; } From e6956caeb9aae320176f96de722048fff6e060ea Mon Sep 17 00:00:00 2001 From: David Huang <1969802+hjc4869@users.noreply.github.com> Date: Mon, 14 Apr 2025 22:56:13 +0800 Subject: [PATCH 5/5] Use snake case variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Johannes Gäßler --- ggml/src/ggml-cuda/ggml-cuda.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index d484fb57e8732..9ced466512788 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -107,10 +107,10 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) // fall back to cudaMalloc if not supported (e.g. on Windows) if (err == hipErrorNotSupported) { - static bool warnedUnsupported = false; - if (!warnedUnsupported) { + static bool warned_unsupported = false; + if (!warned_unsupported) { GGML_LOG_WARN("hipMallocManaged unsupported, falling back to hipMalloc.\n"); - warnedUnsupported = true; + warned_unsupported = true; } err = cudaMalloc(ptr, size);