From 493d9f793c0ed9f2d54d85c922536f7c13e5fe96 Mon Sep 17 00:00:00 2001
From: David Huang <1969802+hjc4869@users.noreply.github.com>
Date: Mon, 14 Apr 2025 05:02:15 +0000
Subject: [PATCH 1/5] CUDA/HIP: Share the same unified memory allocation logic.

Replace compile-time `GGML_HIP_UMA` with environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY`. This unifies the usage on NVIDIA and AMD GPUs, and allows a single binary to be shared between integrated and dedicated GPUs.
---
 Makefile                         |  4 ----
 docs/build.md                    |  6 ++++--
 ggml/CMakeLists.txt              |  1 -
 ggml/src/ggml-cuda/ggml-cuda.cu  | 21 ++++++---------------
 ggml/src/ggml-cuda/vendors/hip.h |  2 ++
 ggml/src/ggml-hip/CMakeLists.txt |  4 ----
 6 files changed, 12 insertions(+), 26 deletions(-)

diff --git a/Makefile b/Makefile
index 1f9455eff0aec..772993ada2707 100644
--- a/Makefile
+++ b/Makefile
@@ -780,10 +780,6 @@ ifdef GGML_HIP
 
 	MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA
 
-ifdef GGML_HIP_UMA
-	MK_CPPFLAGS += -DGGML_HIP_UMA
-endif # GGML_HIP_UMA
-
 	MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
 	MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
 	MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
diff --git a/docs/build.md b/docs/build.md
index 3f1b043992545..c9027c0b580a5 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -259,8 +259,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm
       cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
       && cmake --build build --config Release -- -j 16
   ```
-  On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
-  However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
 
   To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.
 
@@ -296,6 +294,10 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
 If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
 
+### Unified Memory
+
+On Linux it is possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1`. However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
+
 ## Vulkan
 
 **Windows**
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index d33f843b417cf..438c2a7309191 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -170,7 +170,6 @@ option(GGML_HIP                             "ggml: use HIP"
 option(GGML_HIP_GRAPHS                      "ggml: use HIP graph, experimental, slow"         OFF)
 option(GGML_HIP_NO_VMM                      "ggml: do not try to use HIP VMM"                 ON)
 option(GGML_HIP_ROCWMMA_FATTN               "ggml: enable rocWMMA for FlashAttention"         OFF)
-option(GGML_HIP_UMA                         "ggml: use HIP unified memory architecture"       OFF)
 option(GGML_VULKAN                          "ggml: use Vulkan"                                OFF)
 option(GGML_VULKAN_CHECK_RESULTS            "ggml: run Vulkan op checks"                      OFF)
 option(GGML_VULKAN_DEBUG                    "ggml: enable Vulkan debug output"                OFF)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 4af1897017567..cdd8d41c659b8 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -96,31 +96,22 @@ int ggml_cuda_get_device() {
 
 static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
     ggml_cuda_set_device(device);
-#if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
-    auto res = hipMallocManaged(ptr, size);
-    if (res == hipSuccess) {
-        // if error we "need" to know why...
-        CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
-    }
-    return res;
-#else
-
-#if !defined(GGML_USE_HIP)
     cudaError_t err;
     if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
     {
         err = cudaMallocManaged(ptr, size);
+#if defined(GGML_USE_HIP)
+        if (err == hipSuccess) {
+            // if error we "need" to know why...
+            CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
+        }
+#endif // defined(GGML_USE_HIP)
     }
     else
     {
         err = cudaMalloc(ptr, size);
     }
     return err;
-#else
-    return cudaMalloc(ptr, size);
-#endif // !defined(GGML_USE_HIP)
-
-#endif
 }
 
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index 420b41b8d652d..1a28831b7a96b 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -71,6 +71,8 @@
 #define cudaLaunchHostFunc hipLaunchHostFunc
 #define cudaMalloc hipMalloc
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#define cudaMallocManaged hipMallocManaged
+#define cudaMemAdvise hipMemAdvise
 #define cudaMemcpy hipMemcpy
 #define cudaMemcpyAsync hipMemcpyAsync
 #define cudaMemcpyPeerAsync hipMemcpyPeerAsync
diff --git a/ggml/src/ggml-hip/CMakeLists.txt b/ggml/src/ggml-hip/CMakeLists.txt
index e3762649fd275..1fe8fe3b8d079 100644
--- a/ggml/src/ggml-hip/CMakeLists.txt
+++ b/ggml/src/ggml-hip/CMakeLists.txt
@@ -89,10 +89,6 @@ endif()
 
 add_compile_definitions(GGML_USE_HIP)
 
-if (GGML_HIP_UMA)
-    add_compile_definitions(GGML_HIP_UMA)
-endif()
-
 if (GGML_CUDA_FORCE_MMQ)
     add_compile_definitions(GGML_CUDA_FORCE_MMQ)
 endif()

From 6fab0d4c6b4d316a9a3f21a722f893c1cb9c6d4d Mon Sep 17 00:00:00 2001
From: David Huang <1969802+hjc4869@users.noreply.github.com>
Date: Mon, 14 Apr 2025 21:08:49 +0800
Subject: [PATCH 2/5] Remove comment

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index cdd8d41c659b8..ff2573783e380 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -102,7 +102,6 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
         err = cudaMallocManaged(ptr, size);
 #if defined(GGML_USE_HIP)
         if (err == hipSuccess) {
-            // if error we "need" to know why...
             CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
         }
 #endif // defined(GGML_USE_HIP)

From 2428d6726a57c7fb849d0859600fd6a2d043810a Mon Sep 17 00:00:00 2001
From: David Huang <1969802+hjc4869@users.noreply.github.com>
Date: Mon, 14 Apr 2025 22:20:32 +0800
Subject: [PATCH 3/5] Add a fallback to hipMalloc() and print warning once when
 managed memory is not supported.

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index ff2573783e380..912d26fd32acd 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -104,6 +104,17 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
         if (err == hipSuccess) {
             CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
         }
+
+        // fall back to cudaMalloc if not supported (e.g. on Windows)
+        if (err == hipErrorNotSupported) {
+            static bool warnedUnsupported = false;
+            if (!warnedUnsupported) {
+                GGML_LOG_WARN("hipMallocManaged unsupported, falling back to hipMalloc.\n");                
+                warnedUnsupported = true;
+            }
+
+            err = cudaMalloc(ptr, size);
+        }
 #endif // defined(GGML_USE_HIP)
     }
     else

From a3862d91fa7e170971a710144613fe506e9bc192 Mon Sep 17 00:00:00 2001
From: David Huang <1969802+hjc4869@users.noreply.github.com>
Date: Mon, 14 Apr 2025 22:22:08 +0800
Subject: [PATCH 4/5] Fix editorconfig check error

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 912d26fd32acd..d484fb57e8732 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -109,7 +109,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
         if (err == hipErrorNotSupported) {
             static bool warnedUnsupported = false;
             if (!warnedUnsupported) {
-                GGML_LOG_WARN("hipMallocManaged unsupported, falling back to hipMalloc.\n");                
+                GGML_LOG_WARN("hipMallocManaged unsupported, falling back to hipMalloc.\n");
                 warnedUnsupported = true;
             }
 

From e6956caeb9aae320176f96de722048fff6e060ea Mon Sep 17 00:00:00 2001
From: David Huang <1969802+hjc4869@users.noreply.github.com>
Date: Mon, 14 Apr 2025 22:56:13 +0800
Subject: [PATCH 5/5] Use snake case variable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
---
 ggml/src/ggml-cuda/ggml-cuda.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index d484fb57e8732..9ced466512788 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -107,10 +107,10 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
 
         // fall back to cudaMalloc if not supported (e.g. on Windows)
         if (err == hipErrorNotSupported) {
-            static bool warnedUnsupported = false;
-            if (!warnedUnsupported) {
+            static bool warned_unsupported = false;
+            if (!warned_unsupported) {
                 GGML_LOG_WARN("hipMallocManaged unsupported, falling back to hipMalloc.\n");
-                warnedUnsupported = true;
+                warned_unsupported = true;
             }
 
             err = cudaMalloc(ptr, size);