From 14ac344f5e005085a397d9739f5508e4f631edf3 Mon Sep 17 00:00:00 2001
From: dou <15529241576@163.com>
Date: Thu, 10 Apr 2025 07:24:11 +0000
Subject: [PATCH 1/3] feat: add more ways to manage memory allocation

---
 ggml/src/ggml-cann/aclnn_ops.cpp |   7 +-
 ggml/src/ggml-cann/aclnn_ops.h   |  22 +-
 ggml/src/ggml-cann/ggml-cann.cpp | 405 +++++++++++++++++++++++++------
 3 files changed, 354 insertions(+), 80 deletions(-)
diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index 37d4117972358..4357c9a2948ec 100644
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -1209,6 +1209,11 @@ void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
     GGML_CANN_CALL_ACLNN_OP(Sin, acl_src, acl_dst);
 }
 
+void aclnn_geluv2(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                       aclTensor* acl_dst) {
+        GGML_CANN_CALL_ACLNN_OP(GeluV2, acl_src, 0, acl_dst);
+}
+
 void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
                                   ggml_tensor* dst) {
     const ggml_tensor* src = dst->src[0];
@@ -1783,7 +1788,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb,
                 GGML_MAX_DIMS + 1);
             aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
-                src0->data, ACL_FLOAT16, sizeof(float16_t), scale_ne, scale_nb,
+                src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
                 GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
             aclTensor* dequant_tensor = ggml_cann_create_tensor(
                 dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float_t),
diff --git a/ggml/src/ggml-cann/aclnn_ops.h b/ggml/src/ggml-cann/aclnn_ops.h
index b2d1b3c36d238..2669debe43dc1 100644
--- a/ggml/src/ggml-cann/aclnn_ops.h
+++ b/ggml/src/ggml-cann/aclnn_ops.h
@@ -593,6 +593,9 @@ void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
 void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
     aclTensor* acl_dst);
 
+void aclnn_geluv2(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+    aclTensor* acl_dst);
+
 /**
  * @brief Prepares broadcast-compatible ACL tensors for two input tensors and one
  * output tensor.
@@ -840,14 +843,13 @@ void ggml_cann_unary_op(
  * @see ggml_cann_unary_op
  * @see GGML_CANN_CALL_ACLNN_OP
  */
-#define GGML_CANN_CALL_UNARY_OP(OP_NAME)                         \
-    do {                                                         \
-        auto lambda = [](ggml_backend_cann_context& ctx,         \
-            aclTensor* acl_src,                                  \
-            aclTensor* acl_dst) {                                \
-            GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst);  \
-        };                                                       \
-        ggml_cann_unary_op(lambda, ctx, dst);                    \
-    }                                                            \
-    while (0)
+#define GGML_CANN_CALL_UNARY_OP(OP_NAME)                      \
+    do {                                                      \
+        ggml_tensor * src     = dst->src[0];                  \
+        aclTensor *   acl_src = ggml_cann_create_tensor(src); \
+        aclTensor *   acl_dst = ggml_cann_create_tensor(dst); \
+        GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst);   \
+        ACL_CHECK(aclDestroyTensor(acl_src));                 \
+        ACL_CHECK(aclDestroyTensor(acl_dst));                 \
+    } while (0)
 #endif  // CANN_ACLNN_OPS
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index cec36b36e7e92..5a1263dc79134 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -29,6 +29,8 @@
 #include <cstdio>
 #include <cstring>
 #include <mutex>
+#include <queue>
+#include <chrono>
 
 #include "ggml-impl.h"
 #include "ggml-backend-impl.h"
@@ -119,9 +121,10 @@ static ggml_cann_device_info ggml_cann_init() {
         prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
         prop.location.id = id;
         prop.reserve = 0;
-        ACL_CHECK(aclrtMemGetAllocationGranularity(
+        err = aclrtMemGetAllocationGranularity(
             &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
-            &info.devices[id].vmm_granularity));
+            &info.devices[id].vmm_granularity);
+        info.devices[id].vmm = err == ACL_SUCCESS;
 
         size_t free, total;
         ggml_backend_cann_get_device_memory(id, &free, &total);
@@ -148,11 +151,223 @@ const ggml_cann_device_info& ggml_cann_info() {
 
 //#define DEBUG_CANN_MALLOC
 /**
- * @brief A pool of CANN buffers(legacy).
+ * @brief A pool of CANN buffers (free buffers kept in a size-ordered priority queue).
  *
  * This class manages a pool of CANN buffers for a specific device.
  */
-struct ggml_cann_pool_leg : public ggml_cann_pool {
+struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
+        /**
+         * @brief The maximum reuse margin for a buffer.
+         */
+        static const size_t max_reuse_margin = 1ull << 22;  // 4MB
+    
+        /**
+         * @brief The minimum free margin for a buffer.
+         */
+        static const size_t min_free_margin = 1ull << 20;   // 1MB
+    
+    
+        /**
+         * @brief The alignment for buffer allocation.
+         */
+       static const size_t alignment = 128;
+    
+        /**
+         * @brief The device ID associated with this buffer pool.
+         */
+        int device;
+    
+        /**
+         * @brief Whether to disable clean during buffer allocation.
+         */
+        bool disable_clean = false;
+    
+        /**
+         * @brief Structure representing a CANN buffer.
+         */
+        struct ggml_cann_buffer {
+            void* ptr = nullptr;  ///< Pointer to the buffer.
+            size_t size = 0;      ///< Size of the buffer.
+            std::chrono::steady_clock::time_point last_used;  ///< Last used time.
+    
+            bool operator>(const ggml_cann_buffer& other) const {
+                return size > other.size;
+            }
+        };
+    
+        /**
+         * @brief Array of CANN buffers in the pool.
+         */
+        std::unordered_map<void*, size_t> buffer_pool;
+        std::priority_queue<ggml_cann_buffer,
+                            std::vector<ggml_cann_buffer>,
+                            std::greater<>> free_buffers ;
+    
+        /**
+         * @brief Total size of all buffers in the pool.
+         */
+        size_t pool_size = 0;
+    
+        /**
+         * @brief Constructor to initialize the buffer pool for a specific device.
+         *
+         * @param device The device ID to associate with this buffer pool.
+         */
+        explicit ggml_cann_pool_buf_prio(int device) : device(device) {
+            disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
+        }
+    
+        /**
+         * @brief Destructor to free all buffers in the pool.
+         */
+        ~ggml_cann_pool_buf_prio() {
+            ggml_cann_set_device(device);
+            for (auto& [b_ptr, b_size] : buffer_pool) {
+                aclrtFree(b_ptr);
+               pool_size -= b_size;
+            }
+            buffer_pool.clear();
+            GGML_ASSERT(pool_size == 0);
+        }
+    
+        /**
+         * @brief Allocate a buffer of the given size.
+         *
+         * @param size The size of the buffer to allocate.
+         * @param actual_size A pointer to a variable to receive the actual size of
+         * the allocated buffer.
+         * @return A pointer to the allocated buffer.
+         */
+        void* alloc(size_t size, size_t* actual_size) override {
+            size = GGML_PAD(size, alignment);
+            if (size == 0) {
+                size = alignment;
+            }
+    
+            void* ptr = nullptr;
+            auto now = std::chrono::steady_clock::now();
+    
+            std::vector<ggml_cann_buffer> free_buffers_rest;
+            free_buffers_rest.reserve(free_buffers.size());
+            while (!free_buffers.empty()) {
+                auto b = free_buffers.top();
+                free_buffers.pop();
+    
+                if (b.size >= size) {
+                    // reuse the buffer if the size is enough
+                    const size_t margin = b.size - size;
+                    if (margin <= max_reuse_margin) {
+                        *actual_size = b.size;
+                        ptr = b.ptr;
+    #ifdef DEBUG_CANN_MALLOC
+                        GGML_LOG_INFO(
+                            "cann pool[%d]: reused   %p, "
+                            "pool_size = %5u MB, "
+                            "size = %5u MB, "
+                            "margin = %5u MB\n",
+                            device, b.ptr,
+                            (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
+                            (uint32_t)(GGML_PAD(size, 1048576) / 1048576),
+                            (uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
+    #endif
+                        break;
+                    }
+                }
+    
+                bool should_clean = !disable_clean &&
+                                   b.size > min_free_margin &&
+                                   std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
+                if (should_clean) {
+                    // release an idle buffer: larger than min_free_margin and unused for over 100 ms
+                    ACL_CHECK(aclrtFree(b.ptr));
+                    pool_size -= b.size;
+                    buffer_pool.erase(b.ptr);
+    #ifdef DEBUG_CANN_MALLOC
+                    GGML_LOG_INFO(
+                        "cann pool[%d]: clean    %p, "
+                        "pool_size = %5u MB, "
+                        "size = %5u MB\n",
+                        device, b.ptr,
+                        (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
+                        (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
+    #endif
+                    continue;
+                }
+                free_buffers_rest.push_back(b);
+            }
+            for (ggml_cann_buffer &b : free_buffers_rest) {
+                free_buffers.push(std::move(b));
+            }
+    
+    #ifdef DEBUG_CANN_MALLOC
+            GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
+    #endif
+            if (ptr != nullptr) {
+                return ptr;
+            }
+    
+            // allocate a new buffer if no buffer can be reused
+            ggml_cann_set_device(device);
+            ACL_CHECK(aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
+            *actual_size = size;
+            pool_size += size;
+    #ifdef DEBUG_CANN_MALLOC
+            GGML_LOG_INFO(
+                "cann pool[%d]: allocate %p, "
+                "pool_size = %5u MB, "
+                "size = %5u MB\n",
+                device, ptr, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
+                (uint32_t)(GGML_PAD(size, 1048576) / 1048576));
+    #endif
+            buffer_pool.emplace(ptr, size);
+            return ptr;
+        }
+    
+        /**
+         * @brief Free a buffer and return it to the pool.
+         *
+         * @param ptr Pointer to the buffer to free.
+         * @param size Size of the buffer to free.
+         */
+        void free(void* ptr, size_t size) override {
+            auto it = buffer_pool.find(ptr);
+            if (it == buffer_pool.end()) {
+                GGML_ABORT("cann pool[%d]: buffer %p not found in pool\n", device, ptr);
+            }
+    
+            auto now = std::chrono::steady_clock::now();
+            free_buffers.emplace(ggml_cann_buffer{ptr, it->second, now});
+    #ifdef DEBUG_CANN_MALLOC
+            GGML_LOG_INFO(
+                "cann pool[%d]: return   %p, "
+                "pool_size = %5u MB\n",
+                device, ptr,
+                (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
+    #endif
+        }
+    };
+    
+/**
+ * @brief A pool of CANN buffers(segment buffer).
+ *
+ * This class manages a pool of CANN buffers for a specific device.
+ */
+struct ggml_cann_pool_buf : public ggml_cann_pool {
+    /**
+     * @brief The maximum reuse margin for a buffer.
+     */
+    static const size_t max_reuse_margin = 1ull << 22;  // 4MB
+
+    /**
+     * @brief The minimum free margin for a buffer.
+     */
+    static const size_t min_free_margin = 1ull << 20;   // 1MB
+
+    /**
+     * @brief The alignment for buffer allocation.
+     */
+    static const size_t alignment = 128;
+
     /**
      * @brief The maximum number of buffers in the pool.
      */
@@ -163,12 +378,19 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
      */
     int device;
 
+    /**
+     * @brief Whether to disable clean during buffer allocation.
+     */
+    bool disable_clean = false;
+
     /**
      * @brief Structure representing a CANN buffer.
      */
     struct ggml_cann_buffer {
         void* ptr = nullptr;  ///< Pointer to the buffer memory.
         size_t size = 0;      ///< Size of the buffer.
+        bool used = false;    ///< Whether the buffer is currently in use.
+        std::chrono::steady_clock::time_point last_used;  ///< Last used time.
     };
 
     /**
@@ -186,17 +408,19 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
      *
      * @param device The device ID to associate with this buffer pool.
      */
-    explicit ggml_cann_pool_leg(int device) : device(device) {}
+    explicit ggml_cann_pool_buf(int device) : device(device) {
+        disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
+    }
 
     /**
      * @brief Destructor to free all buffers in the pool.
      */
-    ~ggml_cann_pool_leg() {
+    ~ggml_cann_pool_buf() {
         ggml_cann_set_device(device);
         for (int i = 0; i < MAX_BUFFERS; ++i) {
             ggml_cann_buffer& b = buffer_pool[i];
             if (b.ptr != nullptr) {
-                ACL_CHECK(aclrtFree(b.ptr));
+                aclrtFree(b.ptr);
                 pool_size -= b.size;
             }
         }
@@ -212,63 +436,93 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
      * @return A pointer to the allocated buffer.
      */
     void* alloc(size_t size, size_t* actual_size) override {
-        const size_t alignment = 128;
         size = GGML_PAD(size, alignment);
         if (size == 0) {
             size = alignment;
         }
-#ifdef DEBUG_CANN_MALLOC
-        int nnz = 0;
-        size_t max_size = 0;
-#endif
-        size_t best_diff = 1ull << 36;
-        int ibest = -1;
-        for (int i = 0; i < MAX_BUFFERS; ++i) {
+
+        void* ptr = nullptr;
+        auto now = std::chrono::steady_clock::now();
+
+        int i = 0;
+        for (; i < MAX_BUFFERS; ++i) {
             ggml_cann_buffer& b = buffer_pool[i];
-            if (b.ptr != nullptr) {
+            if (b.ptr == nullptr) {
+                break;
+            }
+            if (b.used) {
+                continue;
+            }
+            if (b.size >= size) {
+                // reuse the buffer if the size is enough
+                const size_t margin = b.size - size;
+                if (margin <= max_reuse_margin) {
+                    *actual_size = b.size;
+                    b.used = true;
+                    ptr = b.ptr;
 #ifdef DEBUG_CANN_MALLOC
-                ++nnz;
-                if (b.size > max_size) max_size = b.size;
+                    GGML_LOG_INFO(
+                        "cann pool[%d]: reused   %p, "
+                        "pool_size = %5u MB, "
+                        "size = %5u MB, "
+                        "margin = %5u MB\n",
+                        device, b.ptr,
+                        (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
+                        (uint32_t)(GGML_PAD(size, 1048576) / 1048576),
+                        (uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
 #endif
-                if (b.size >= size) {
-                    size_t diff = b.size - size;
-                    if (diff < best_diff) {
-                        best_diff = diff;
-                        ibest = i;
-                        if (!best_diff) {
-                            void* ptr = b.ptr;
-                            *actual_size = b.size;
-                            b.ptr = nullptr;
-                            b.size = 0;
-                            return ptr;
-                        }
-                    }
+                    break;
                 }
             }
+
+            bool should_clean = !disable_clean &&
+                                b.size > min_free_margin &&
+                                std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
+            if (should_clean) {
+                // release an idle buffer: larger than min_free_margin and unused for over 100 ms
+                ACL_CHECK(aclrtFree(b.ptr));
+                pool_size -= b.size;
+#ifdef DEBUG_CANN_MALLOC
+                GGML_LOG_INFO(
+                    "cann pool[%d]: clean    %p, "
+                    "pool_size = %5u MB, "
+                    "size = %5u MB\n",
+                    device, b.ptr,
+                    (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
+                    (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
+#endif
+                b.ptr = nullptr;
+            }
         }
-        if (ibest >= 0) {
-            ggml_cann_buffer& b = buffer_pool[ibest];
-            void* ptr = b.ptr;
-            *actual_size = b.size;
-            b.ptr = nullptr;
-            b.size = 0;
+        if (ptr != nullptr) {
             return ptr;
         }
-        void* ptr;
-        ggml_cann_set_device(device);
-        ACL_CHECK(
-            aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
-        *actual_size = size;
-        pool_size += size;
+
+        if (i < MAX_BUFFERS) {
+            // allocate a new buffer if no buffer can be reused
+            ggml_cann_buffer& b = buffer_pool[i];
+            ggml_cann_set_device(device);
+            ACL_CHECK(aclrtMalloc(&b.ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
+            pool_size += size;
+            *actual_size = size;
+            b.size = size;
+            b.used = true;
+            if (i >= MAX_BUFFERS - 8) {
+                GGML_LOG_WARN("cann pool[%d]: slots almost full\n", device);
+            }
 #ifdef DEBUG_CANN_MALLOC
-        GGML_LOG_INFO(
-            "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
-            "requested %u MB\n",
-            __func__, device, nnz, (uint32_t)(max_size / 1024 / 1024),
-            (uint32_t)(pool_size / 1024 / 1024),
-            (uint32_t)(size / 1024 / 1024));
+            GGML_LOG_INFO(
+                "cann pool[%d]: allocate %p, "
+                "pool_size = %5u MB, "
+                "size = %5u MB\n",
+                device, b.ptr,
+                (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
+                (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
 #endif
-        return ptr;
+            return b.ptr;
+        }
+
+        GGML_ABORT("cann pool[%d]: slots full\n", device);
     }
 
     /**
@@ -280,16 +534,21 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
     void free(void* ptr, size_t size) override {
         for (int i = 0; i < MAX_BUFFERS; ++i) {
             ggml_cann_buffer& b = buffer_pool[i];
-            if (b.ptr == nullptr) {
-                b.ptr = ptr;
-                b.size = size;
-                return;
+            if (b.ptr != ptr) {
+                continue;
             }
+            b.used = false;
+            b.last_used = std::chrono::steady_clock::now();
+#ifdef DEBUG_CANN_MALLOC
+            GGML_LOG_INFO(
+                "cann pool[%d]: return   %p, "
+                "pool_size = %5u MB\n",
+                device, b.ptr,
+                (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
+#endif
+            return;
         }
-        // memory should always buffered. these memory may still needed by
-        // tasks in stream.
-        // TODO, fix me.
-        GGML_ABORT("Cann buffer pool full, increase MAX_CANN_BUFFERS\n");
+        GGML_ABORT("cann pool[%d]: slots full\n", device);
     }
 };
 
@@ -347,8 +606,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
      * @param device The device ID to associate with this buffer pool.
      */
     explicit ggml_cann_pool_vmm(int device)
-        : device(device),
-          granularity(ggml_cann_info().devices[device].vmm_granularity) {
+    : device(device) {
         auto dev = ggml_cann_info().devices[device];
         granularity = dev.vmm_granularity;
         max_size = dev.total_vram;
@@ -471,7 +729,18 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
  */
 std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
     int device) {
-    return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
+    bool disable_vmm = (getenv("GGML_CANN_DISABLE_VMM_POOL") != nullptr);
+    if (!disable_vmm && ggml_cann_info().devices[device].vmm) {
+        GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
+        return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
+    }
+    bool enable_buf_prio = (getenv("GGML_CANN_ENABLE_BUF_PRIO_POOL") != nullptr);
+    if (enable_buf_prio) {
+        GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
+        return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf_prio(device));
+    }
+    GGML_LOG_INFO("%s: device %d use buffer pool\n", __func__, device);
+    return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf(device));
 }
 
 // cann buffer
@@ -1020,8 +1289,11 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
 
     ggml_cann_set_device(buft_ctx->device);
 
-    size = std::max(size, (size_t)1);
-
+    const size_t alignment = 128;
+    size = GGML_PAD(size, alignment);
+    if (size == 0) {
+        size = alignment;
+    }
     void* dev_ptr;
     aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
     if (err != ACL_SUCCESS) {
@@ -1330,12 +1602,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
                     GGML_CANN_CALL_UNARY_OP(Silu);
                     break;
                 case GGML_UNARY_OP_GELU_QUICK: {
-                    auto lambda = [](ggml_backend_cann_context& ctx,
-                        aclTensor* acl_src,
-                        aclTensor* acl_dst) {
-                        GGML_CANN_CALL_ACLNN_OP(GeluV2, acl_src, 0, acl_dst);
-                    };
-                    ggml_cann_unary_op(lambda, ctx, dst);
+                    ggml_cann_unary_op<aclnn_geluv2>(ctx, dst);
                 } break;
                 case GGML_UNARY_OP_TANH:
                     GGML_CANN_CALL_UNARY_OP(Tanh);

From cc36575fbc8e4b31507d44b3ca402aa94fc4ae9a Mon Sep 17 00:00:00 2001
From: dou <15529241576@163.com>
Date: Fri, 11 Apr 2025 09:27:34 +0000
Subject: [PATCH 2/3] revert the aclnn_geluv2 helper and unary-op macro changes

---
 ggml/src/ggml-cann/aclnn_ops.cpp |  5 -----
 ggml/src/ggml-cann/aclnn_ops.h   | 22 ++++++++++------------
 ggml/src/ggml-cann/ggml-cann.cpp |  7 ++++++-
 3 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index 4357c9a2948ec..f312a620cb69c 100644
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -1209,11 +1209,6 @@ void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
     GGML_CANN_CALL_ACLNN_OP(Sin, acl_src, acl_dst);
 }
 
-void aclnn_geluv2(ggml_backend_cann_context& ctx, aclTensor* acl_src,
-                       aclTensor* acl_dst) {
-        GGML_CANN_CALL_ACLNN_OP(GeluV2, acl_src, 0, acl_dst);
-}
-
 void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
                                   ggml_tensor* dst) {
     const ggml_tensor* src = dst->src[0];
diff --git a/ggml/src/ggml-cann/aclnn_ops.h b/ggml/src/ggml-cann/aclnn_ops.h
index 2669debe43dc1..b2d1b3c36d238 100644
--- a/ggml/src/ggml-cann/aclnn_ops.h
+++ b/ggml/src/ggml-cann/aclnn_ops.h
@@ -593,9 +593,6 @@ void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
 void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
     aclTensor* acl_dst);
 
-void aclnn_geluv2(ggml_backend_cann_context& ctx, aclTensor* acl_src,
-    aclTensor* acl_dst);
-
 /**
  * @brief Prepares broadcast-compatible ACL tensors for two input tensors and one
  * output tensor.
@@ -843,13 +840,14 @@ void ggml_cann_unary_op(
  * @see ggml_cann_unary_op
  * @see GGML_CANN_CALL_ACLNN_OP
  */
-#define GGML_CANN_CALL_UNARY_OP(OP_NAME)                      \
-    do {                                                      \
-        ggml_tensor * src     = dst->src[0];                  \
-        aclTensor *   acl_src = ggml_cann_create_tensor(src); \
-        aclTensor *   acl_dst = ggml_cann_create_tensor(dst); \
-        GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst);   \
-        ACL_CHECK(aclDestroyTensor(acl_src));                 \
-        ACL_CHECK(aclDestroyTensor(acl_dst));                 \
-    } while (0)
+#define GGML_CANN_CALL_UNARY_OP(OP_NAME)                         \
+    do {                                                         \
+        auto lambda = [](ggml_backend_cann_context& ctx,         \
+            aclTensor* acl_src,                                  \
+            aclTensor* acl_dst) {                                \
+            GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst);  \
+        };                                                       \
+        ggml_cann_unary_op(lambda, ctx, dst);                    \
+    }                                                            \
+    while (0)
 #endif  // CANN_ACLNN_OPS
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index 5a1263dc79134..4f80bbd062e1e 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1602,7 +1602,12 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
                     GGML_CANN_CALL_UNARY_OP(Silu);
                     break;
                 case GGML_UNARY_OP_GELU_QUICK: {
-                    ggml_cann_unary_op<aclnn_geluv2>(ctx, dst);
+                    auto lambda = [](ggml_backend_cann_context& ctx,
+                        aclTensor* acl_src,
+                        aclTensor* acl_dst) {
+                        GGML_CANN_CALL_ACLNN_OP(GeluV2, acl_src, 0, acl_dst);
+                    };
+                    ggml_cann_unary_op(lambda, ctx, dst);
                 } break;
                 case GGML_UNARY_OP_TANH:
                     GGML_CANN_CALL_UNARY_OP(Tanh);

From c21bc52f585e73b6f5d636c03fd3323d8d762704 Mon Sep 17 00:00:00 2001
From: dou <15529241576@163.com>
Date: Mon, 14 Apr 2025 12:07:18 +0000
Subject: [PATCH 3/3] fix whitespace and indentation in ggml_cann_pool_buf_prio

---
 ggml/src/ggml-cann/ggml-cann.cpp | 43 ++++++++++++++++----------------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index 4f80bbd062e1e..8f8acaf999cb1 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -160,28 +160,27 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
          * @brief The maximum reuse margin for a buffer.
          */
         static const size_t max_reuse_margin = 1ull << 22;  // 4MB
-    
+
         /**
          * @brief The minimum free margin for a buffer.
          */
         static const size_t min_free_margin = 1ull << 20;   // 1MB
-    
-    
+
         /**
          * @brief The alignment for buffer allocation.
          */
-       static const size_t alignment = 128;
-    
+        static const size_t alignment = 128;
+
         /**
          * @brief The device ID associated with this buffer pool.
          */
         int device;
-    
+
         /**
          * @brief Whether to disable clean during buffer allocation.
          */
         bool disable_clean = false;
-    
+
         /**
          * @brief Structure representing a CANN buffer.
          */
@@ -189,12 +188,12 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
             void* ptr = nullptr;  ///< Pointer to the buffer.
             size_t size = 0;      ///< Size of the buffer.
             std::chrono::steady_clock::time_point last_used;  ///< Last used time.
-    
+
             bool operator>(const ggml_cann_buffer& other) const {
                 return size > other.size;
             }
         };
-    
+
         /**
          * @brief Array of CANN buffers in the pool.
          */
@@ -202,12 +201,12 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
         std::priority_queue<ggml_cann_buffer,
                             std::vector<ggml_cann_buffer>,
                             std::greater<>> free_buffers ;
-    
+
         /**
          * @brief Total size of all buffers in the pool.
          */
         size_t pool_size = 0;
-    
+
         /**
          * @brief Constructor to initialize the buffer pool for a specific device.
          *
@@ -216,7 +215,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
         explicit ggml_cann_pool_buf_prio(int device) : device(device) {
             disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
         }
-    
+
         /**
          * @brief Destructor to free all buffers in the pool.
          */
@@ -229,7 +228,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
             buffer_pool.clear();
             GGML_ASSERT(pool_size == 0);
         }
-    
+
         /**
          * @brief Allocate a buffer of the given size.
          *
@@ -243,16 +242,16 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
             if (size == 0) {
                 size = alignment;
             }
-    
+
             void* ptr = nullptr;
             auto now = std::chrono::steady_clock::now();
-    
+
             std::vector<ggml_cann_buffer> free_buffers_rest;
             free_buffers_rest.reserve(free_buffers.size());
             while (!free_buffers.empty()) {
                 auto b = free_buffers.top();
                 free_buffers.pop();
-    
+
                 if (b.size >= size) {
                     // reuse the buffer if the size is enough
                     const size_t margin = b.size - size;
@@ -273,7 +272,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
                         break;
                     }
                 }
-    
+
                 bool should_clean = !disable_clean &&
                                    b.size > min_free_margin &&
                                    std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
@@ -298,14 +297,14 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
             for (ggml_cann_buffer &b : free_buffers_rest) {
                 free_buffers.push(std::move(b));
             }
-    
+
     #ifdef DEBUG_CANN_MALLOC
             GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
     #endif
             if (ptr != nullptr) {
                 return ptr;
             }
-    
+
             // allocate a new buffer if no buffer can be reused
             ggml_cann_set_device(device);
             ACL_CHECK(aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
@@ -322,7 +321,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
             buffer_pool.emplace(ptr, size);
             return ptr;
         }
-    
+
         /**
          * @brief Free a buffer and return it to the pool.
          *
@@ -334,7 +333,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
             if (it == buffer_pool.end()) {
                 GGML_ABORT("cann pool[%d]: buffer %p not found in pool\n", device, ptr);
             }
-    
+
             auto now = std::chrono::steady_clock::now();
             free_buffers.emplace(ggml_cann_buffer{ptr, it->second, now});
     #ifdef DEBUG_CANN_MALLOC
@@ -346,7 +345,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
     #endif
         }
     };
-    
+
 /**
  * @brief A pool of CANN buffers(segment buffer).
  *