From 8bb6f764e108d6846028b42576c85216eeb3741d Mon Sep 17 00:00:00 2001 From: morris Date: Fri, 17 Jan 2025 16:32:38 +0800 Subject: [PATCH 1/2] refactor(dma): split rx buffer to cache aligned ones --- components/esp_hw_support/dma/esp_dma_utils.c | 127 +++++++++++------- components/esp_hw_support/dma/gdma_link.c | 6 - .../dma/include/esp_private/esp_dma_utils.h | 31 +++-- .../test_apps/dma/main/test_gdma.c | 53 +++----- 4 files changed, 117 insertions(+), 100 deletions(-) diff --git a/components/esp_hw_support/dma/esp_dma_utils.c b/components/esp_hw_support/dma/esp_dma_utils.c index 75c739926f9e..875004198112 100644 --- a/components/esp_hw_support/dma/esp_dma_utils.c +++ b/components/esp_hw_support/dma/esp_dma_utils.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: 2023-2024 Espressif Systems (Shanghai) CO LTD + * SPDX-FileCopyrightText: 2023-2025 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ @@ -24,68 +24,101 @@ static const char *TAG = "dma_utils"; #define ALIGN_UP_BY(num, align) (((num) + ((align) - 1)) & ~((align) - 1)) -#define ALIGN_DOWN_BY(num, align) ((num) & (~((align) - 1))) -esp_err_t esp_dma_split_buffer_to_aligned(void *input_buffer, size_t input_buffer_len, void *stash_buffer, size_t stash_buffer_len, size_t split_alignment, dma_buffer_split_array_t *align_array) +esp_err_t esp_dma_split_rx_buffer_to_cache_aligned(void *rx_buffer, size_t buffer_len, dma_buffer_split_array_t *align_buf_array, uint8_t** ret_stash_buffer) { - esp_err_t ret = ESP_OK; - ESP_RETURN_ON_FALSE(align_array && input_buffer && input_buffer_len && stash_buffer && split_alignment && !(split_alignment & (split_alignment - 1) - && (stash_buffer_len >= 2 * split_alignment)), ESP_ERR_INVALID_ARG, TAG, "invalid argument"); - ESP_RETURN_ON_FALSE(!((uintptr_t)stash_buffer % split_alignment), ESP_ERR_INVALID_ARG, TAG, "extra buffer is not aligned"); - - // calculate head_overflow_len - size_t head_overflow_len = (uintptr_t)input_buffer % split_alignment; - head_overflow_len = head_overflow_len ? 
split_alignment - head_overflow_len : 0;
-    ESP_LOGD(TAG, "head_addr:%p split_alignment:%zu head_overflow_len:%zu", input_buffer, split_alignment, head_overflow_len);
-    // calculate tail_overflow_len
-    size_t tail_overflow_len = ((uintptr_t)input_buffer + input_buffer_len) % split_alignment;
-    ESP_LOGD(TAG, "tail_addr:%p split_alignment:%zu tail_overflow_len:%zu", input_buffer + input_buffer_len - tail_overflow_len, split_alignment, tail_overflow_len);
-
-    uint32_t extra_buf_count = 0;
-    input_buffer = (uint8_t*)input_buffer;
-    stash_buffer = (uint8_t*)stash_buffer;
-    align_array->buf.head.recovery_address = input_buffer;
-    align_array->buf.head.aligned_buffer = stash_buffer + split_alignment * extra_buf_count++;
-    align_array->buf.head.length = head_overflow_len;
-    align_array->buf.body.recovery_address = input_buffer + head_overflow_len;
-    align_array->buf.body.aligned_buffer = input_buffer + head_overflow_len;
-    align_array->buf.body.length = input_buffer_len - head_overflow_len - tail_overflow_len;
-    align_array->buf.tail.recovery_address = input_buffer + input_buffer_len - tail_overflow_len;
-    align_array->buf.tail.aligned_buffer = stash_buffer + split_alignment * extra_buf_count++;
-    align_array->buf.tail.length = tail_overflow_len;
-
-    // special handling when input_buffer length is no more than buffer alignment
-    if(head_overflow_len >= input_buffer_len || tail_overflow_len >= input_buffer_len)
-    {
-        align_array->buf.head.length = input_buffer_len ;
-        align_array->buf.body.length = 0 ;
-        align_array->buf.tail.length = 0 ;
+    ESP_RETURN_ON_FALSE(rx_buffer && buffer_len && align_buf_array, ESP_ERR_INVALID_ARG, TAG, "invalid argument");
+
+    // read the cache line sizes of internal and external memory; we also use them to check whether a given address is behind a cache
+    size_t int_mem_cache_line_size = cache_hal_get_cache_line_size(CACHE_LL_LEVEL_INT_MEM, CACHE_TYPE_DATA);
+    size_t ext_mem_cache_line_size = cache_hal_get_cache_line_size(CACHE_LL_LEVEL_EXT_MEM, CACHE_TYPE_DATA);
+
+    size_t split_line_size = 0;
+    if (esp_ptr_external_ram(rx_buffer)) {
+        split_line_size = ext_mem_cache_line_size;
+    } else if (esp_ptr_internal(rx_buffer)) {
+        split_line_size = int_mem_cache_line_size;
+    }
+    ESP_LOGV(TAG, "split_line_size:%zu", split_line_size);
+
+    // allocate the stash buffer from internal RAM
+    // Note: split_line_size can be 0; in that case the stash_buffer is also NULL, which is fine
+    uint8_t* stash_buffer = heap_caps_calloc(2, split_line_size, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT);
+    ESP_RETURN_ON_FALSE(!(split_line_size && !stash_buffer), ESP_ERR_NO_MEM, TAG, "no mem for stash buffer");
+
+    // clear align_array to avoid garbage data
+    memset(align_buf_array, 0, sizeof(dma_buffer_split_array_t));
+    bool need_cache_sync[3] = {false};
+
+    // if split_line_size is non-zero, split the buffer into head, body and tail
+    if (split_line_size > 0) {
+        // calculate head_overflow_len
+        size_t head_overflow_len = (uintptr_t)rx_buffer % split_line_size;
+        head_overflow_len = head_overflow_len ? 
split_line_size - head_overflow_len : 0;
+        ESP_LOGV(TAG, "head_addr:%p head_overflow_len:%zu", rx_buffer, head_overflow_len);
+        // calculate tail_overflow_len
+        size_t tail_overflow_len = ((uintptr_t)rx_buffer + buffer_len) % split_line_size;
+        ESP_LOGV(TAG, "tail_addr:%p tail_overflow_len:%zu", rx_buffer + buffer_len - tail_overflow_len, tail_overflow_len);
+
+        uint8_t extra_buf_count = 0;
+        uint8_t* input_buffer = (uint8_t*)rx_buffer;
+        align_buf_array->buf.head.recovery_address = input_buffer;
+        align_buf_array->buf.head.aligned_buffer = stash_buffer + split_line_size * extra_buf_count++;
+        align_buf_array->buf.head.length = head_overflow_len;
+        need_cache_sync[0] = int_mem_cache_line_size > 0;
+        align_buf_array->buf.body.recovery_address = input_buffer + head_overflow_len;
+        align_buf_array->buf.body.aligned_buffer = input_buffer + head_overflow_len;
+        align_buf_array->buf.body.length = buffer_len - head_overflow_len - tail_overflow_len;
+        need_cache_sync[1] = true;
+        align_buf_array->buf.tail.recovery_address = input_buffer + buffer_len - tail_overflow_len;
+        align_buf_array->buf.tail.aligned_buffer = stash_buffer + split_line_size * extra_buf_count++;
+        align_buf_array->buf.tail.length = tail_overflow_len;
+        need_cache_sync[2] = int_mem_cache_line_size > 0;
+
+        // special handling when the input buffer length is no more than the cache line size
+        if (head_overflow_len >= buffer_len || tail_overflow_len >= buffer_len) {
+            align_buf_array->buf.head.length = buffer_len;
+            align_buf_array->buf.body.length = 0;
+            align_buf_array->buf.tail.length = 0;
+        }
+    } else {
+        align_buf_array->buf.body.aligned_buffer = rx_buffer;
+        align_buf_array->buf.body.recovery_address = rx_buffer;
+        align_buf_array->buf.body.length = buffer_len;
+        need_cache_sync[1] = false;
     }
 
-    for(int i = 0; i < 3; i++) {
-        if(!align_array->aligned_buffer[i].length) {
-            align_array->aligned_buffer[i].aligned_buffer = NULL;
-            align_array->aligned_buffer[i].recovery_address = NULL;
+    for (int i = 0; i < 3; i++) {
+        if (align_buf_array->aligned_buffer[i].length == 0) {
+            align_buf_array->aligned_buffer[i].aligned_buffer = NULL;
+            align_buf_array->aligned_buffer[i].recovery_address = NULL;
+            need_cache_sync[i] = false;
         }
     }
 
-    return ret;
+    // invalidate the aligned buffer if necessary
+    for (int i = 0; i < 3; i++) {
+        if (need_cache_sync[i]) {
+            esp_cache_msync(align_buf_array->aligned_buffer[i].aligned_buffer, align_buf_array->aligned_buffer[i].length, ESP_CACHE_MSYNC_FLAG_DIR_M2C);
+        }
+    }
+
+    *ret_stash_buffer = stash_buffer;
+    return ESP_OK;
 }
 
-esp_err_t esp_dma_merge_aligned_buffers(dma_buffer_split_array_t *align_array)
+esp_err_t esp_dma_merge_aligned_rx_buffers(dma_buffer_split_array_t *align_array)
 {
-    esp_err_t ret = ESP_OK;
-    ESP_RETURN_ON_FALSE(align_array, ESP_ERR_INVALID_ARG, TAG, "invalid argument");
+    ESP_RETURN_ON_FALSE_ISR(align_array, ESP_ERR_INVALID_ARG, TAG, "invalid argument");
     // only need to copy the head and tail buffer
-    if(align_array->buf.head.length) {
+    if (align_array->buf.head.length) {
         memcpy(align_array->buf.head.recovery_address, align_array->buf.head.aligned_buffer, align_array->buf.head.length);
     }
-    if(align_array->buf.tail.length) {
+    if (align_array->buf.tail.length) {
         memcpy(align_array->buf.tail.recovery_address, align_array->buf.tail.aligned_buffer, align_array->buf.tail.length);
     }
-
-    return ret;
+    return ESP_OK;
 }
 
 esp_err_t esp_dma_capable_malloc(size_t size, const esp_dma_mem_info_t *dma_mem_info, void **out_ptr, size_t *actual_size)
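A standalone sketch of the head/body/tail arithmetic implemented by esp_dma_split_rx_buffer_to_cache_aligned() above; the address, length, and the 64-byte cache line size are illustrative assumptions, not values taken from a real chip:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    const size_t split_line_size = 64;       // assumed cache line size
    const uintptr_t rx_buffer = 0x4080A00C;  // hypothetical RX buffer address, not 64-byte aligned
    const size_t buffer_len = 200;

    size_t head_overflow_len = rx_buffer % split_line_size;                          // 12
    head_overflow_len = head_overflow_len ? split_line_size - head_overflow_len : 0; // head: 52 bytes, received via stash buffer
    size_t tail_overflow_len = (rx_buffer + buffer_len) % split_line_size;           // tail: 20 bytes, received via stash buffer
    size_t body_len = buffer_len - head_overflow_len - tail_overflow_len;            // body: 128 bytes, received in place

    printf("head=%zu body=%zu tail=%zu\n", head_overflow_len, body_len, tail_overflow_len);
    return 0;
}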
diff --git a/components/esp_hw_support/dma/gdma_link.c b/components/esp_hw_support/dma/gdma_link.c
index 5381b150e152..6b141e9cdae1 100644
--- a/components/esp_hw_support/dma/gdma_link.c
+++ b/components/esp_hw_support/dma/gdma_link.c
@@ -6,14 +6,8 @@
 #include 
 #include 
-#include 
 #include 
-#include 
-#include "sdkconfig.h"
-#include "freertos/FreeRTOS.h"
-#include "freertos/task.h"
 #include "soc/soc_caps.h"
-#include "soc/ext_mem_defs.h"
 #include "esp_log.h"
 #include "esp_check.h"
 #include "esp_memory_utils.h"
diff --git a/components/esp_hw_support/dma/include/esp_private/esp_dma_utils.h b/components/esp_hw_support/dma/include/esp_private/esp_dma_utils.h
index b9ed67e93e41..ac89c287f4ee 100644
--- a/components/esp_hw_support/dma/include/esp_private/esp_dma_utils.h
+++ b/components/esp_hw_support/dma/include/esp_private/esp_dma_utils.h
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: 2023-2024 Espressif Systems (Shanghai) CO LTD
+ * SPDX-FileCopyrightText: 2023-2025 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */
@@ -24,6 +24,8 @@ typedef struct {
 /**
  * @brief DMA buffer aligned array
+ *        The array contains three parts: head, body and tail.
+ *        Each part has a length >= 0; in particular, length = 0 means that part does not exist.
  */
 typedef struct {
     union {
@@ -37,22 +39,21 @@ } dma_buffer_split_array_t;
 
 /**
- * @brief Split unaligned DMA buffer to aligned DMA buffer or aligned DMA buffer array
+ * @brief Split DMA RX buffer to cache aligned buffers
 *
- * @note Returned align array contains three parts: head, body and tail. Length of each buffer will be >=0, length 0 means that there is no such part
+ * @note After the original RX buffer is split into an array, the caller should mount the buffer array to the DMA controller in scatter-gather mode.
+ *       Don't read/write the aligned buffers before the DMA has finished using them.
 *
- * @param[in] buffer Origin DMA buffer address
- * @param[in] buffer_len Origin DMA buffer length
- * @param[in] stash_buffer Needed extra buffer to stash aligned buffer, should be allocated with DMA capable memory and aligned to split_alignment
- * @param[in] stash_buffer_len stash_buffer length
- * @param[in] split_alignment Alignment of each buffer required by the DMA
- * @param[out] align_array Aligned DMA buffer array
+ * @param[in] rx_buffer The origin DMA buffer used for receiving data
+ * @param[in] buffer_len Length of rx_buffer, in bytes
+ * @param[out] align_buf_array Aligned DMA buffer array
+ * @param[out] ret_stash_buffer Allocated stash buffer (caller should free it after use)
 * @return
 *      - ESP_OK: Split to aligned buffer successfully
 *      - ESP_ERR_INVALID_ARG: Split to aligned buffer failed because of invalid argument
 *
 * brief sketch:
- *       buffer alignment delimiter  buffer alignment delimiter
+ *       cache alignment delimiter   cache alignment delimiter
 * │ │
 * Origin Buffer │ Origin Buffer │
 * │ │ │ │
@@ -68,17 +69,19 @@ typedef struct {
 * ▼ ▼
 * |xxxxx......| |xxxxx......|
 */
-esp_err_t esp_dma_split_buffer_to_aligned(void *buffer, size_t buffer_len, void *stash_buffer, size_t stash_buffer_len, size_t split_alignment, dma_buffer_split_array_t *align_array);
+esp_err_t esp_dma_split_rx_buffer_to_cache_aligned(void *rx_buffer, size_t buffer_len, dma_buffer_split_array_t *align_buf_array, uint8_t** ret_stash_buffer);
 
 /**
- * @brief Merge aligned buffer array to origin buffer
+ * @brief Merge aligned RX buffer array to origin buffer
 *
- * @param[in] align_array Aligned DMA buffer array
+ * @note This function can be used in the ISR context. 
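+ *
+ * Typical flow together with esp_dma_split_rx_buffer_to_cache_aligned(), shown as a
+ * condensed sketch of the GDMA test in this patch (rx_buf, len and rx_link_list are
+ * placeholders; channel setup and error handling are omitted):
+ *
+ * @code{.c}
+ * dma_buffer_split_array_t align_array = {0};
+ * uint8_t *stash_buffer = NULL;
+ * ESP_ERROR_CHECK(esp_dma_split_rx_buffer_to_cache_aligned(rx_buf, len, &align_array, &stash_buffer));
+ * gdma_buffer_mount_config_t mount_cfg[3] = {0};
+ * for (int i = 0; i < 3; i++) {
+ *     mount_cfg[i].buffer = align_array.aligned_buffer[i].aligned_buffer;
+ *     mount_cfg[i].length = align_array.aligned_buffer[i].length;
+ * }
+ * ESP_ERROR_CHECK(gdma_link_mount_buffers(rx_link_list, 0, mount_cfg, 3, NULL));
+ * // ... start the transfer; in the RX EOF callback:
+ * esp_dma_merge_aligned_rx_buffers(&align_array);
+ * // ... once the transaction is fully done:
+ * free(stash_buffer);
+ * @endcode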
+ * + * @param[in] align_buf_array Aligned DMA buffer array * @return * - ESP_OK: Merge aligned buffer to origin buffer successfully * - ESP_ERR_INVALID_ARG: Merge aligned buffer to origin buffer failed because of invalid argument */ -esp_err_t esp_dma_merge_aligned_buffers(dma_buffer_split_array_t *align_array); +esp_err_t esp_dma_merge_aligned_rx_buffers(dma_buffer_split_array_t *align_buf_array); #ifdef __cplusplus } diff --git a/components/esp_hw_support/test_apps/dma/main/test_gdma.c b/components/esp_hw_support/test_apps/dma/main/test_gdma.c index ed3d8755a5b4..39e89a6cc465 100644 --- a/components/esp_hw_support/test_apps/dma/main/test_gdma.c +++ b/components/esp_hw_support/test_apps/dma/main/test_gdma.c @@ -395,25 +395,18 @@ TEST_CASE("GDMA M2M Mode", "[GDMA][M2M]") typedef struct { SemaphoreHandle_t done_sem; dma_buffer_split_array_t *align_array; - size_t split_alignment; - bool need_invalidate; } test_gdma_context_t; -static bool test_gdma_m2m_unalgined_rx_eof_callback(gdma_channel_handle_t dma_chan, gdma_event_data_t *event_data, void *user_data) +static bool test_gdma_m2m_unaligned_rx_eof_callback(gdma_channel_handle_t dma_chan, gdma_event_data_t *event_data, void *user_data) { BaseType_t task_woken = pdFALSE; test_gdma_context_t *user_ctx = (test_gdma_context_t*)user_data; - for (int i = 0; i < 3; i++) { - if (user_ctx->align_array->aligned_buffer[i].aligned_buffer && user_ctx->need_invalidate) { - TEST_ESP_OK(esp_cache_msync(user_ctx->align_array->aligned_buffer[i].aligned_buffer, ALIGN_UP(user_ctx->align_array->aligned_buffer[i].length, user_ctx->split_alignment), ESP_CACHE_MSYNC_FLAG_DIR_M2C)); - } - } - TEST_ESP_OK(esp_dma_merge_aligned_buffers(user_ctx->align_array)); + TEST_ESP_OK(esp_dma_merge_aligned_rx_buffers(user_ctx->align_array)); xSemaphoreGiveFromISR(user_ctx->done_sem, &task_woken); return task_woken == pdTRUE; } -static void test_gdma_m2m_unalgined_buffer_test(uint8_t *dst_data, uint8_t *src_data, size_t data_length, size_t offset_len, size_t split_alignment) +static void test_gdma_m2m_unaligned_buffer_test(uint8_t *dst_data, uint8_t *src_data, size_t data_length, size_t offset_len) { TEST_ASSERT_NOT_NULL(src_data); TEST_ASSERT_NOT_NULL(dst_data); @@ -458,13 +451,10 @@ static void test_gdma_m2m_unalgined_buffer_test(uint8_t *dst_data, uint8_t *src_ }; TEST_ESP_OK(gdma_link_mount_buffers(tx_link_list, 0, tx_buf_mount_config, sizeof(tx_buf_mount_config) / sizeof(gdma_buffer_mount_config_t), NULL)); - // allocate stash_buffer, should be freed by the user - void *stash_buffer = heap_caps_aligned_calloc(split_alignment, 2, split_alignment, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT); - size_t stash_buffer_len = 2 * split_alignment; dma_buffer_split_array_t align_array = {0}; gdma_buffer_mount_config_t rx_aligned_buf_mount_config[3] = {0}; - - TEST_ESP_OK(esp_dma_split_buffer_to_aligned(dst_data + offset_len, data_length, stash_buffer, stash_buffer_len, split_alignment, &align_array)); + uint8_t* stash_buffer = NULL; + TEST_ESP_OK(esp_dma_split_rx_buffer_to_cache_aligned(dst_data + offset_len, data_length, &align_array, &stash_buffer)); for (int i = 0; i < 3; i++) { rx_aligned_buf_mount_config[i].buffer = align_array.aligned_buffer[i].aligned_buffer; rx_aligned_buf_mount_config[i].length = align_array.aligned_buffer[i].length; @@ -472,15 +462,13 @@ static void test_gdma_m2m_unalgined_buffer_test(uint8_t *dst_data, uint8_t *src_ TEST_ESP_OK(gdma_link_mount_buffers(rx_link_list, 0, rx_aligned_buf_mount_config, 3, NULL)); gdma_rx_event_callbacks_t rx_cbs = { 
- .on_recv_eof = test_gdma_m2m_unalgined_rx_eof_callback, + .on_recv_eof = test_gdma_m2m_unaligned_rx_eof_callback, }; SemaphoreHandle_t done_sem = xSemaphoreCreateBinary(); TEST_ASSERT_NOT_NULL(done_sem); test_gdma_context_t user_ctx = { .done_sem = done_sem, .align_array = &align_array, - .split_alignment = split_alignment, - .need_invalidate = sram_alignment ? true : false, }; TEST_ESP_OK(gdma_register_rx_event_callbacks(rx_chan, &rx_cbs, &user_ctx)); @@ -494,12 +482,12 @@ static void test_gdma_m2m_unalgined_buffer_test(uint8_t *dst_data, uint8_t *src_ TEST_ASSERT_EQUAL(i % 256, dst_data[i + offset_len]); } - free(stash_buffer); TEST_ESP_OK(gdma_del_link_list(tx_link_list)); TEST_ESP_OK(gdma_del_link_list(rx_link_list)); TEST_ESP_OK(gdma_del_channel(tx_chan)); TEST_ESP_OK(gdma_del_channel(rx_chan)); vSemaphoreDelete(done_sem); + free(stash_buffer); } TEST_CASE("GDMA M2M Unaligned RX Buffer Test", "[GDMA][M2M]") @@ -507,29 +495,28 @@ TEST_CASE("GDMA M2M Unaligned RX Buffer Test", "[GDMA][M2M]") uint8_t *sbuf = heap_caps_aligned_calloc(64, 1, 10240, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT); uint8_t *dbuf = heap_caps_aligned_calloc(64, 1, 10240, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT); - size_t split_alignment = 64; // case buffer len less than buffer alignment - test_gdma_m2m_unalgined_buffer_test(dbuf, sbuf, 60, 0, split_alignment); - test_gdma_m2m_unalgined_buffer_test(dbuf, sbuf, 60, 4, split_alignment); - test_gdma_m2m_unalgined_buffer_test(dbuf, sbuf, 60, 2, split_alignment); + test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 60, 0); + test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 60, 4); + test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 60, 2); // case buffer head aligned - test_gdma_m2m_unalgined_buffer_test(dbuf, sbuf, 246, 0, split_alignment); - test_gdma_m2m_unalgined_buffer_test(dbuf, sbuf, 8182, 0, split_alignment); + test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 246, 0); + test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 8182, 0); // case buffer tail aligned - test_gdma_m2m_unalgined_buffer_test(dbuf, sbuf, 246, 10, split_alignment); - test_gdma_m2m_unalgined_buffer_test(dbuf, sbuf, 8182, 10, split_alignment); + test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 246, 10); + test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 8182, 10); // case buffer unaligned - test_gdma_m2m_unalgined_buffer_test(dbuf, sbuf, 100, 10, split_alignment); - test_gdma_m2m_unalgined_buffer_test(dbuf, sbuf, 10, 60, split_alignment); - test_gdma_m2m_unalgined_buffer_test(dbuf, sbuf, 256, 10, split_alignment); - test_gdma_m2m_unalgined_buffer_test(dbuf, sbuf, 8192, 10, split_alignment); + test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 100, 10); + test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 10, 60); + test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 256, 10); + test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 8192, 10); // case buffer full aligned - test_gdma_m2m_unalgined_buffer_test(dbuf, sbuf, 256, 0, split_alignment); - test_gdma_m2m_unalgined_buffer_test(dbuf, sbuf, 8192, 0, split_alignment); + test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 256, 0); + test_gdma_m2m_unaligned_buffer_test(dbuf, sbuf, 8192, 0); free(sbuf); free(dbuf); From 0c7fef8ac9b715eabf4e906cb55c206473dbb552 Mon Sep 17 00:00:00 2001 From: morris Date: Wed, 27 Nov 2024 19:04:13 +0800 Subject: [PATCH 2/2] feat(async_memcpy): support rx buffer unaligned to cache line size --- .../esp_hw_support/dma/async_memcpy_gdma.c | 327 ++++++------- .../dma/esp_async_memcpy_priv.h | 2 - 
.../test_apps/dma/main/test_async_memcpy.c | 436 ++++++++---------- .../test_apps/dma/sdkconfig.defaults.esp32c5 | 2 + .../esp32c5/include/soc/Kconfig.soc_caps.in | 4 + components/soc/esp32c5/include/soc/soc_caps.h | 1 + 6 files changed, 353 insertions(+), 419 deletions(-) create mode 100644 components/esp_hw_support/test_apps/dma/sdkconfig.defaults.esp32c5 diff --git a/components/esp_hw_support/dma/async_memcpy_gdma.c b/components/esp_hw_support/dma/async_memcpy_gdma.c index b2a2db7574b5..b81ce3f1425a 100644 --- a/components/esp_hw_support/dma/async_memcpy_gdma.c +++ b/components/esp_hw_support/dma/async_memcpy_gdma.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: 2020-2024 Espressif Systems (Shanghai) CO LTD + * SPDX-FileCopyrightText: 2020-2025 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ @@ -16,69 +16,49 @@ #include "esp_attr.h" #include "esp_err.h" #include "esp_private/gdma.h" +#include "esp_private/gdma_link.h" +#include "esp_private/esp_dma_utils.h" #include "esp_memory_utils.h" +#include "esp_cache.h" #include "esp_async_memcpy.h" #include "esp_async_memcpy_priv.h" -#include "esp_cache.h" -#include "hal/dma_types.h" #include "hal/cache_hal.h" #include "hal/cache_ll.h" +#include "hal/gdma_ll.h" static const char *TAG = "async_mcp.gdma"; -#ifdef CACHE_LL_L2MEM_NON_CACHE_ADDR -#define MCP_GET_NON_CACHE_ADDR(addr) ((addr) ? CACHE_LL_L2MEM_NON_CACHE_ADDR(addr) : 0) -#else -#define MCP_GET_NON_CACHE_ADDR(addr) (addr) -#endif - -#if SOC_AXI_GDMA_SUPPORTED -#define MCP_DMA_DESC_ALIGN 8 -typedef dma_descriptor_align8_t mcp_dma_descriptor_t; -#elif SOC_AHB_GDMA_SUPPORTED -#define MCP_DMA_DESC_ALIGN 4 -typedef dma_descriptor_align4_t mcp_dma_descriptor_t; -#else -#error "Unsupported GDMA type" -#endif +#define MCP_DMA_DESCRIPTOR_BUFFER_MAX_SIZE 4095 /// @brief Transaction object for async memcpy -/// @note - GDMA requires the DMA descriptors to be 4 or 8 bytes aligned -/// @note - The DMA descriptor link list is allocated dynamically from DMA-able memory -/// @note - Because of the eof_node, the transaction object should also be allocated from DMA-able memory typedef struct async_memcpy_transaction_t { - mcp_dma_descriptor_t eof_node; // this is the DMA node which act as the EOF descriptor (RX path only) - mcp_dma_descriptor_t *tx_desc_link; // descriptor link list, the length of the link is determined by the copy buffer size - mcp_dma_descriptor_t *tx_desc_nc; // non-cacheable version of tx_desc_link - mcp_dma_descriptor_t *rx_desc_link; // descriptor link list, the length of the link is determined by the copy buffer size - mcp_dma_descriptor_t *rx_desc_nc; // non-cacheable version of rx_desc_link - intptr_t tx_start_desc_addr; // TX start descriptor address - intptr_t rx_start_desc_addr; // RX start descriptor address - void *memcpy_dst_addr; // memcpy destination address - size_t memcpy_size; // memcpy size - async_memcpy_isr_cb_t cb; // user callback - void *cb_args; // user callback args + gdma_link_list_handle_t tx_link_list; // DMA link list for TX direction + gdma_link_list_handle_t rx_link_list; // DMA link list for RX direction + dma_buffer_split_array_t rx_buf_array; // Split the destination buffer into cache aligned ones, save the splits in this array + uint8_t* stash_buffer; // Stash buffer for cache aligned buffer + async_memcpy_isr_cb_t cb; // user callback + void *cb_args; // user callback args STAILQ_ENTRY(async_memcpy_transaction_t) idle_queue_entry; // Entry for the idle queue STAILQ_ENTRY(async_memcpy_transaction_t) ready_queue_entry; // 
Entry for the ready queue } async_memcpy_transaction_t; /// @brief Context of async memcpy driver /// @note - It saves two queues, one for idle transaction objects, one for ready transaction objects -/// @note - Transaction objects are allocated from DMA-able memory /// @note - Number of transaction objects are determined by the backlog parameter typedef struct { async_memcpy_context_t parent; // Parent IO interface - size_t rx_int_mem_alignment; // DMA buffer alignment (both in size and address) for internal RX memory - size_t rx_ext_mem_alignment; // DMA buffer alignment (both in size and address) for external RX memory - size_t tx_int_mem_alignment; // DMA buffer alignment (both in size and address) for internal TX memory - size_t tx_ext_mem_alignment; // DMA buffer alignment (both in size and address) for external TX memory - size_t max_single_dma_buffer; // max DMA buffer size by a single descriptor + size_t rx_int_mem_alignment; // Required DMA buffer alignment for internal RX memory + size_t rx_ext_mem_alignment; // Required DMA buffer alignment for external RX memory + size_t tx_int_mem_alignment; // Required DMA buffer alignment for internal TX memory + size_t tx_ext_mem_alignment; // Required DMA buffer alignment for external TX memory int gdma_bus_id; // GDMA bus id (AHB, AXI, etc.) gdma_channel_handle_t tx_channel; // GDMA TX channel handle gdma_channel_handle_t rx_channel; // GDMA RX channel handle portMUX_TYPE spin_lock; // spin lock to avoid threads and isr from accessing the same resource simultaneously _Atomic async_memcpy_fsm_t fsm; // driver state machine, changing state should be atomic - async_memcpy_transaction_t *transaction_pool; // transaction object pool + size_t num_trans_objs; // number of transaction objects + async_memcpy_transaction_t *transaction_pool; // transaction object pool + async_memcpy_transaction_t *current_transaction; // current transaction object STAILQ_HEAD(, async_memcpy_transaction_t) idle_queue_head; // Head of the idle queue STAILQ_HEAD(, async_memcpy_transaction_t) ready_queue_head; // Head of the ready queue } async_memcpy_gdma_context_t; @@ -92,9 +72,23 @@ static esp_err_t mcp_new_etm_event(async_memcpy_context_t *ctx, async_memcpy_etm static esp_err_t mcp_gdma_destroy(async_memcpy_gdma_context_t *mcp_gdma) { + // clean up transaction pool if (mcp_gdma->transaction_pool) { + for (size_t i = 0; i < mcp_gdma->num_trans_objs; i++) { + async_memcpy_transaction_t* trans = &mcp_gdma->transaction_pool[i]; + if (trans->tx_link_list) { + gdma_del_link_list(trans->tx_link_list); + } + if (trans->rx_link_list) { + gdma_del_link_list(trans->rx_link_list); + } + if (trans->stash_buffer) { + free(trans->stash_buffer); + } + } free(mcp_gdma->transaction_pool); } + // clean up GDMA channels if (mcp_gdma->tx_channel) { gdma_disconnect(mcp_gdma->tx_channel); gdma_del_channel(mcp_gdma->tx_channel); @@ -108,19 +102,19 @@ static esp_err_t mcp_gdma_destroy(async_memcpy_gdma_context_t *mcp_gdma) } static esp_err_t esp_async_memcpy_install_gdma_template(const async_memcpy_config_t *config, async_memcpy_handle_t *mcp, - esp_err_t (*new_channel)(const gdma_channel_alloc_config_t *, gdma_channel_handle_t *), + esp_err_t (*new_channel_func)(const gdma_channel_alloc_config_t *, gdma_channel_handle_t *), int gdma_bus_id) { esp_err_t ret = ESP_OK; async_memcpy_gdma_context_t *mcp_gdma = NULL; ESP_RETURN_ON_FALSE(config && mcp, ESP_ERR_INVALID_ARG, TAG, "invalid argument"); - // allocate memory of driver context from internal memory + + // allocate memory of driver 
context from internal memory (because it contains atomic variable) mcp_gdma = heap_caps_calloc(1, sizeof(async_memcpy_gdma_context_t), MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT); ESP_GOTO_ON_FALSE(mcp_gdma, ESP_ERR_NO_MEM, err, TAG, "no mem for driver context"); uint32_t trans_queue_len = config->backlog ? config->backlog : DEFAULT_TRANSACTION_QUEUE_LENGTH; - // allocate memory for transaction pool from internal memory because transaction structure contains DMA descriptor - mcp_gdma->transaction_pool = heap_caps_aligned_calloc(MCP_DMA_DESC_ALIGN, trans_queue_len, sizeof(async_memcpy_transaction_t), - MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT | MALLOC_CAP_DMA); + // allocate memory for transaction pool from internal memory + mcp_gdma->transaction_pool = heap_caps_calloc(trans_queue_len, sizeof(async_memcpy_transaction_t), MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT); ESP_GOTO_ON_FALSE(mcp_gdma->transaction_pool, ESP_ERR_NO_MEM, err, TAG, "no mem for transaction pool"); // create TX channel and RX channel, they should reside in the same DMA pair @@ -128,29 +122,40 @@ static esp_err_t esp_async_memcpy_install_gdma_template(const async_memcpy_confi .flags.reserve_sibling = 1, .direction = GDMA_CHANNEL_DIRECTION_TX, }; - ESP_GOTO_ON_ERROR(new_channel(&tx_alloc_config, &mcp_gdma->tx_channel), err, TAG, "failed to create GDMA TX channel"); + ESP_GOTO_ON_ERROR(new_channel_func(&tx_alloc_config, &mcp_gdma->tx_channel), err, TAG, "failed to alloc GDMA TX channel"); gdma_channel_alloc_config_t rx_alloc_config = { .direction = GDMA_CHANNEL_DIRECTION_RX, .sibling_chan = mcp_gdma->tx_channel, }; - ESP_GOTO_ON_ERROR(new_channel(&rx_alloc_config, &mcp_gdma->rx_channel), err, TAG, "failed to create GDMA RX channel"); + ESP_GOTO_ON_ERROR(new_channel_func(&rx_alloc_config, &mcp_gdma->rx_channel), err, TAG, "failed to alloc GDMA RX channel"); - // initialize GDMA channels - gdma_trigger_t m2m_trigger = GDMA_MAKE_TRIGGER(GDMA_TRIG_PERIPH_M2M, 0); // get a free DMA trigger ID for memory copy + gdma_trigger_t m2m_trigger = GDMA_MAKE_TRIGGER(GDMA_TRIG_PERIPH_M2M, 0); uint32_t free_m2m_id_mask = 0; gdma_get_free_m2m_trig_id_mask(mcp_gdma->tx_channel, &free_m2m_id_mask); m2m_trigger.instance_id = __builtin_ctz(free_m2m_id_mask); ESP_GOTO_ON_ERROR(gdma_connect(mcp_gdma->rx_channel, m2m_trigger), err, TAG, "GDMA rx connect failed"); ESP_GOTO_ON_ERROR(gdma_connect(mcp_gdma->tx_channel, m2m_trigger), err, TAG, "GDMA tx connect failed"); + gdma_strategy_config_t strategy_cfg = { + .owner_check = true, + .auto_update_desc = true, + .eof_till_data_popped = true, + }; + gdma_apply_strategy(mcp_gdma->tx_channel, &strategy_cfg); + gdma_apply_strategy(mcp_gdma->rx_channel, &strategy_cfg); + gdma_transfer_config_t transfer_cfg = { - .max_data_burst_size = config->dma_burst_size ? 
config->dma_burst_size : 16, + .max_data_burst_size = config->dma_burst_size, .access_ext_mem = true, // allow to do memory copy from/to external memory }; ESP_GOTO_ON_ERROR(gdma_config_transfer(mcp_gdma->tx_channel, &transfer_cfg), err, TAG, "config transfer for tx channel failed"); ESP_GOTO_ON_ERROR(gdma_config_transfer(mcp_gdma->rx_channel, &transfer_cfg), err, TAG, "config transfer for rx channel failed"); + // get the buffer alignment required by the GDMA channel + gdma_get_alignment_constraints(mcp_gdma->rx_channel, &mcp_gdma->rx_int_mem_alignment, &mcp_gdma->rx_ext_mem_alignment); + gdma_get_alignment_constraints(mcp_gdma->tx_channel, &mcp_gdma->tx_int_mem_alignment, &mcp_gdma->tx_ext_mem_alignment); + // register rx eof callback gdma_rx_event_callbacks_t cbs = { .on_recv_eof = mcp_gdma_rx_eof_callback, @@ -169,20 +174,14 @@ static esp_err_t esp_async_memcpy_install_gdma_template(const async_memcpy_confi portMUX_INITIALIZE(&mcp_gdma->spin_lock); atomic_init(&mcp_gdma->fsm, MCP_FSM_IDLE); mcp_gdma->gdma_bus_id = gdma_bus_id; + mcp_gdma->num_trans_objs = trans_queue_len; - // get the buffer alignment required by the GDMA channel - gdma_get_alignment_constraints(mcp_gdma->rx_channel, &mcp_gdma->rx_int_mem_alignment, &mcp_gdma->rx_ext_mem_alignment); - gdma_get_alignment_constraints(mcp_gdma->tx_channel, &mcp_gdma->tx_int_mem_alignment, &mcp_gdma->tx_ext_mem_alignment); - - size_t buf_align = MAX(MAX(mcp_gdma->rx_int_mem_alignment, mcp_gdma->rx_ext_mem_alignment), - MAX(mcp_gdma->tx_int_mem_alignment, mcp_gdma->tx_ext_mem_alignment)); - mcp_gdma->max_single_dma_buffer = ALIGN_DOWN(DMA_DESCRIPTOR_BUFFER_MAX_SIZE, buf_align); mcp_gdma->parent.del = mcp_gdma_del; mcp_gdma->parent.memcpy = mcp_gdma_memcpy; #if SOC_GDMA_SUPPORT_ETM mcp_gdma->parent.new_etm_event = mcp_new_etm_event; #endif - // return driver object + // return base object *mcp = &mcp_gdma->parent; return ESP_OK; @@ -227,61 +226,6 @@ static esp_err_t mcp_gdma_del(async_memcpy_context_t *ctx) return mcp_gdma_destroy(mcp_gdma); } -static void mount_tx_buffer_to_dma(async_memcpy_transaction_t *trans, int num_desc, - uint8_t *buf, size_t buf_sz, size_t max_single_dma_buffer) -{ - mcp_dma_descriptor_t *desc_array = trans->tx_desc_link; - mcp_dma_descriptor_t *desc_nc = trans->tx_desc_nc; - uint32_t prepared_length = 0; - size_t len = buf_sz; - for (int i = 0; i < num_desc - 1; i++) { - desc_nc[i].buffer = &buf[prepared_length]; - desc_nc[i].dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_DMA; - desc_nc[i].dw0.suc_eof = 0; - desc_nc[i].dw0.size = max_single_dma_buffer; - desc_nc[i].dw0.length = max_single_dma_buffer; - desc_nc[i].next = &desc_array[i + 1]; - prepared_length += max_single_dma_buffer; - len -= max_single_dma_buffer; - } - // take special care to the EOF descriptor - desc_nc[num_desc - 1].buffer = &buf[prepared_length]; - desc_nc[num_desc - 1].next = NULL; - desc_nc[num_desc - 1].dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_DMA; - desc_nc[num_desc - 1].dw0.suc_eof = 1; - desc_nc[num_desc - 1].dw0.size = len; - desc_nc[num_desc - 1].dw0.length = len; -} - -static void mount_rx_buffer_to_dma(async_memcpy_transaction_t *trans, int num_desc, - uint8_t *buf, size_t buf_sz, size_t max_single_dma_buffer) -{ - mcp_dma_descriptor_t *desc_array = trans->rx_desc_link; - mcp_dma_descriptor_t *desc_nc = trans->rx_desc_nc; - mcp_dma_descriptor_t *eof_desc = &trans->eof_node; - mcp_dma_descriptor_t *eof_nc = (mcp_dma_descriptor_t *)MCP_GET_NON_CACHE_ADDR(eof_desc); - uint32_t prepared_length = 0; - size_t len = buf_sz; - if (desc_array) { - 
assert(num_desc > 0); - for (int i = 0; i < num_desc; i++) { - desc_nc[i].buffer = &buf[prepared_length]; - desc_nc[i].dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_DMA; - desc_nc[i].dw0.size = max_single_dma_buffer; - desc_nc[i].dw0.length = max_single_dma_buffer; - desc_nc[i].next = &desc_array[i + 1]; - prepared_length += max_single_dma_buffer; - len -= max_single_dma_buffer; - } - desc_nc[num_desc - 1].next = eof_desc; - } - eof_nc->buffer = &buf[prepared_length]; - eof_nc->next = NULL; - eof_nc->dw0.owner = DMA_DESCRIPTOR_BUFFER_OWNER_DMA; - eof_nc->dw0.size = len; - eof_nc->dw0.length = len; -} - /// @brief help function to get one transaction from the ready queue /// @note this function is allowed to be called in ISR static async_memcpy_transaction_t *try_pop_trans_from_ready_queue(async_memcpy_gdma_context_t *mcp_gdma) @@ -306,8 +250,9 @@ static void try_start_pending_transaction(async_memcpy_gdma_context_t *mcp_gdma) trans = try_pop_trans_from_ready_queue(mcp_gdma); if (trans) { atomic_store(&mcp_gdma->fsm, MCP_FSM_RUN); - gdma_start(mcp_gdma->rx_channel, trans->rx_start_desc_addr); - gdma_start(mcp_gdma->tx_channel, trans->tx_start_desc_addr); + mcp_gdma->current_transaction = trans; + gdma_start(mcp_gdma->rx_channel, gdma_link_get_head_addr(trans->rx_link_list)); + gdma_start(mcp_gdma->tx_channel, gdma_link_get_head_addr(trans->tx_link_list)); } else { atomic_store(&mcp_gdma->fsm, MCP_FSM_IDLE); } @@ -328,6 +273,7 @@ static async_memcpy_transaction_t *try_pop_trans_from_idle_queue(async_memcpy_gd return trans; } +/// @brief Check if the address and size can meet the requirement of the DMA engine static bool check_buffer_alignment(async_memcpy_gdma_context_t *mcp_gdma, void *src, void *dst, size_t n) { bool valid = true; @@ -355,19 +301,26 @@ static esp_err_t mcp_gdma_memcpy(async_memcpy_context_t *ctx, void *dst, void *s { esp_err_t ret = ESP_OK; async_memcpy_gdma_context_t *mcp_gdma = __containerof(ctx, async_memcpy_gdma_context_t, parent); + size_t dma_link_item_alignment = 4; // buffer location check -#if SOC_AHB_GDMA_SUPPORTED && !SOC_AHB_GDMA_SUPPORT_PSRAM +#if SOC_AHB_GDMA_SUPPORTED if (mcp_gdma->gdma_bus_id == SOC_GDMA_BUS_AHB) { +#if !SOC_AHB_GDMA_SUPPORT_PSRAM ESP_RETURN_ON_FALSE(esp_ptr_internal(src) && esp_ptr_internal(dst), ESP_ERR_INVALID_ARG, TAG, "AHB GDMA can only access SRAM"); +#endif // !SOC_AHB_GDMA_SUPPORT_PSRAM + dma_link_item_alignment = GDMA_LL_AHB_DESC_ALIGNMENT; } -#endif // SOC_AHB_GDMA_SUPPORTED && !SOC_AHB_GDMA_SUPPORT_PSRAM -#if SOC_AXI_GDMA_SUPPORTED && !SOC_AXI_GDMA_SUPPORT_PSRAM +#endif // SOC_AHB_GDMA_SUPPORTED +#if SOC_AXI_GDMA_SUPPORTED if (mcp_gdma->gdma_bus_id == SOC_GDMA_BUS_AXI) { - ESP_RETURN_ON_FALSE(esp_ptr_internal(src) && esp_ptr_internal(dst), ESP_ERR_INVALID_ARG, TAG, "AXI DMA can only access SRAM"); +#if !SOC_AXI_GDMA_SUPPORT_PSRAM + ESP_RETURN_ON_FALSE(esp_ptr_internal(src) && esp_ptr_internal(dst), ESP_ERR_INVALID_ARG, TAG, "AXI GDMA can only access SRAM"); +#endif // !SOC_AXI_GDMA_SUPPORT_PSRAM + dma_link_item_alignment = GDMA_LL_AXI_DESC_ALIGNMENT; } -#endif // SOC_AXI_GDMA_SUPPORTED && !SOC_AXI_GDMA_SUPPORT_PSRAM +#endif // SOC_AXI_GDMA_SUPPORTED // alignment check - ESP_RETURN_ON_FALSE(check_buffer_alignment(mcp_gdma, src, dst, n), ESP_ERR_INVALID_ARG, TAG, "buffer not aligned: %p -> %p, sz=%zu", src, dst, n); + ESP_RETURN_ON_FALSE(check_buffer_alignment(mcp_gdma, src, dst, n), ESP_ERR_INVALID_ARG, TAG, "address|size not aligned: %p -> %p, sz=%zu", src, dst, n); async_memcpy_transaction_t *trans = NULL; // pick one transaction node 
from idle queue
@@ -375,51 +328,84 @@ static esp_err_t mcp_gdma_memcpy(async_memcpy_context_t *ctx, void *dst, void *s
     // check if we get the transaction object successfully
     ESP_RETURN_ON_FALSE(trans, ESP_ERR_INVALID_STATE, TAG, "no free node in the idle queue");
 
-    // calculate how many descriptors we want
-    size_t max_single_dma_buffer = mcp_gdma->max_single_dma_buffer;
-    uint32_t num_desc_per_path = (n + max_single_dma_buffer - 1) / max_single_dma_buffer;
-    // allocate DMA descriptors from internal memory
-    trans->tx_desc_link = heap_caps_aligned_calloc(MCP_DMA_DESC_ALIGN, num_desc_per_path, sizeof(mcp_dma_descriptor_t),
-                                                   MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT | MALLOC_CAP_DMA);
-    ESP_GOTO_ON_FALSE(trans->tx_desc_link, ESP_ERR_NO_MEM, err, TAG, "no mem for DMA descriptors");
-    trans->tx_desc_nc = (mcp_dma_descriptor_t *)MCP_GET_NON_CACHE_ADDR(trans->tx_desc_link);
-    // don't have to allocate the EOF descriptor, we will use trans->eof_node as the RX EOF descriptor
-    if (num_desc_per_path > 1) {
-        trans->rx_desc_link = heap_caps_aligned_calloc(MCP_DMA_DESC_ALIGN, num_desc_per_path - 1, sizeof(mcp_dma_descriptor_t),
-                                                       MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT | MALLOC_CAP_DMA);
-        ESP_GOTO_ON_FALSE(trans->rx_desc_link, ESP_ERR_NO_MEM, err, TAG, "no mem for DMA descriptors");
-        trans->rx_desc_nc = (mcp_dma_descriptor_t *)MCP_GET_NON_CACHE_ADDR(trans->rx_desc_link);
-    } else {
-        // small copy buffer, use the trans->eof_node is sufficient
-        trans->rx_desc_link = NULL;
-        trans->rx_desc_nc = NULL;
+    // clean up the transaction configuration left over from the previous run
+    if (trans->tx_link_list) {
+        gdma_del_link_list(trans->tx_link_list);
+        trans->tx_link_list = NULL;
+    }
+    if (trans->rx_link_list) {
+        gdma_del_link_list(trans->rx_link_list);
+        trans->rx_link_list = NULL;
+    }
+    if (trans->stash_buffer) {
+        free(trans->stash_buffer);
+        trans->stash_buffer = NULL;
     }
 
-    // (preload) mount src data to the TX descriptor
-    mount_tx_buffer_to_dma(trans, num_desc_per_path, src, n, max_single_dma_buffer);
-    // (preload) mount dst data to the RX descriptor
-    mount_rx_buffer_to_dma(trans, num_desc_per_path - 1, dst, n, max_single_dma_buffer);
+    // allocate gdma TX link
+    gdma_link_list_config_t tx_link_cfg = {
+        .buffer_alignment = esp_ptr_internal(src) ? mcp_gdma->tx_int_mem_alignment : mcp_gdma->tx_ext_mem_alignment,
+        .item_alignment = dma_link_item_alignment,
+        .num_items = n / MCP_DMA_DESCRIPTOR_BUFFER_MAX_SIZE + 1,
+        .flags = {
+            .check_owner = true,
+            .items_in_ext_mem = false, // TODO: if the memcpy size is too large, we may need to allocate the link list items from external memory
+        },
+    };
+    ESP_GOTO_ON_ERROR(gdma_new_link_list(&tx_link_cfg, &trans->tx_link_list), err, TAG, "failed to create TX link list");
+    // mount the source buffer to the TX link list
+    gdma_buffer_mount_config_t tx_buf_mount_config[1] = {
+        [0] = {
+            .buffer = src,
+            .length = n,
+            .flags = {
+                .mark_eof = true,   // mark the last item as EOF, so the RX channel can also receive an EOF list item
+                .mark_final = true, // using a singly linked list, so terminate the link here
+            }
+        }
+    };
+    gdma_link_mount_buffers(trans->tx_link_list, 0, tx_buf_mount_config, 1, NULL);
 
-    // if the data is in the cache, write back, then DMA can see the latest data
+    // read the cache line sizes of internal and external memory; we use them to check whether a given address is behind a cache
+    // write back the source data if it's behind the cache
+    size_t int_mem_cache_line_size = cache_hal_get_cache_line_size(CACHE_LL_LEVEL_INT_MEM, CACHE_TYPE_DATA);
+    size_t ext_mem_cache_line_size = cache_hal_get_cache_line_size(CACHE_LL_LEVEL_EXT_MEM, CACHE_TYPE_DATA);
     bool need_write_back = false;
     if (esp_ptr_external_ram(src)) {
-        need_write_back = true;
+        need_write_back = ext_mem_cache_line_size > 0;
     } else if (esp_ptr_internal(src)) {
-#if SOC_CACHE_INTERNAL_MEM_VIA_L1CACHE
-        need_write_back = true;
-#endif
+        need_write_back = int_mem_cache_line_size > 0;
     }
     if (need_write_back) {
-        esp_cache_msync(src, n, ESP_CACHE_MSYNC_FLAG_DIR_C2M);
+        esp_cache_msync(src, n, ESP_CACHE_MSYNC_FLAG_DIR_C2M | ESP_CACHE_MSYNC_FLAG_UNALIGNED);
+    }
+
+    // allocate gdma RX link
+    gdma_link_list_config_t rx_link_cfg = {
+        .buffer_alignment = esp_ptr_internal(dst) ? mcp_gdma->rx_int_mem_alignment : mcp_gdma->rx_ext_mem_alignment,
+        .item_alignment = dma_link_item_alignment,
+        .num_items = n / MCP_DMA_DESCRIPTOR_BUFFER_MAX_SIZE + 3,
+        .flags = {
+            .check_owner = true,
+            .items_in_ext_mem = false, // TODO: if the memcpy size is too large, we may need to allocate the link list items from external memory
+        },
+    };
+    ESP_GOTO_ON_ERROR(gdma_new_link_list(&rx_link_cfg, &trans->rx_link_list), err, TAG, "failed to create RX link list");
+
+    // if the destination buffer address is not cache line aligned, we need to split the buffer into cache line aligned ones
+    ESP_GOTO_ON_ERROR(esp_dma_split_rx_buffer_to_cache_aligned(dst, n, &trans->rx_buf_array, &trans->stash_buffer),
+                      err, TAG, "failed to split RX buffer into aligned ones");
+    // mount the destination buffer to the RX link list
+    gdma_buffer_mount_config_t rx_buf_mount_config[3] = {0};
+    for (int i = 0; i < 3; i++) {
+        rx_buf_mount_config[i].buffer = trans->rx_buf_array.aligned_buffer[i].aligned_buffer;
+        rx_buf_mount_config[i].length = trans->rx_buf_array.aligned_buffer[i].length;
     }
+    gdma_link_mount_buffers(trans->rx_link_list, 0, rx_buf_mount_config, 3, NULL);
 
     // save other transaction context
     trans->cb = cb_isr;
     trans->cb_args = cb_args;
-    trans->memcpy_size = n;
-    trans->memcpy_dst_addr = dst; // save the destination buffer address, because we may need to do data cache invalidate later
-    trans->tx_start_desc_addr = (intptr_t)trans->tx_desc_link;
-    trans->rx_start_desc_addr = trans->rx_desc_link ? (intptr_t)trans->rx_desc_link : (intptr_t)&trans->eof_node;
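+    // NOTE: after the transaction is inserted into the ready queue below, it may be
+    // started at any time, either right here (when the driver is idle) or from the
+    // RX EOF interrupt via try_start_pending_transaction(), so the transaction
+    // object must not be touched by this thread anymore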
 
     portENTER_CRITICAL(&mcp_gdma->spin_lock);
     // insert the trans to ready queue
@@ -433,14 +419,6 @@ static esp_err_t mcp_gdma_memcpy(async_memcpy_context_t *ctx, void *dst, void *s
 
 err:
     if (trans) {
-        if (trans->tx_desc_link) {
-            free(trans->tx_desc_link);
-            trans->tx_desc_link = NULL;
-        }
-        if (trans->rx_desc_link) {
-            free(trans->rx_desc_link);
-            trans->rx_desc_link = NULL;
-        }
         // return back the trans to idle queue
         portENTER_CRITICAL(&mcp_gdma->spin_lock);
         STAILQ_INSERT_TAIL(&mcp_gdma->idle_queue_head, trans, idle_queue_entry);
@@ -453,26 +431,14 @@ static bool mcp_gdma_rx_eof_callback(gdma_channel_handle_t dma_chan, gdma_event_
 {
     bool need_yield = false;
     async_memcpy_gdma_context_t *mcp_gdma = (async_memcpy_gdma_context_t *)user_data;
-    mcp_dma_descriptor_t *eof_desc = (mcp_dma_descriptor_t *)event_data->rx_eof_desc_addr;
-    // get the transaction object address by the EOF descriptor address
-    async_memcpy_transaction_t *trans = __containerof(eof_desc, async_memcpy_transaction_t, eof_node);
+    async_memcpy_transaction_t *trans = mcp_gdma->current_transaction;
+    dma_buffer_split_array_t *rx_buf_array = &trans->rx_buf_array;
 
     // switch driver state from RUN to IDLE
     async_memcpy_fsm_t expected_fsm = MCP_FSM_RUN;
     if (atomic_compare_exchange_strong(&mcp_gdma->fsm, &expected_fsm, MCP_FSM_IDLE_WAIT)) {
-        void *dst = trans->memcpy_dst_addr;
-        // if the data is in the cache, invalidate, then CPU can see the latest data
-        bool need_invalidate = false;
-        if (esp_ptr_external_ram(dst)) {
-            need_invalidate = true;
-        } else if (esp_ptr_internal(dst)) {
-#if SOC_CACHE_INTERNAL_MEM_VIA_L1CACHE
-            need_invalidate = true;
-#endif
-        }
-        if (need_invalidate) {
-            esp_cache_msync(dst, trans->memcpy_size, ESP_CACHE_MSYNC_FLAG_DIR_M2C);
-        }
+        // merge the cache aligned buffers to the original buffer
+        esp_dma_merge_aligned_rx_buffers(rx_buf_array);
 
         // invoke the callback registered by the user
         async_memcpy_isr_cb_t cb = trans->cb;
@@ -482,15 +448,6 @@ static bool mcp_gdma_rx_eof_callback(gdma_channel_handle_t dma_chan, gdma_event_
             };
             need_yield = cb(&mcp_gdma->parent, &e, trans->cb_args);
         }
-        // recycle descriptor memory
-        if (trans->tx_desc_link) {
-            free(trans->tx_desc_link);
-            trans->tx_desc_link = NULL;
-        }
-        if (trans->rx_desc_link) {
-            free(trans->rx_desc_link);
-            trans->rx_desc_link = NULL;
-        }
         trans->cb = NULL;
 
         portENTER_CRITICAL_ISR(&mcp_gdma->spin_lock);
diff --git a/components/esp_hw_support/dma/esp_async_memcpy_priv.h b/components/esp_hw_support/dma/esp_async_memcpy_priv.h
index bf64f83495f6..3b85b77a9e41 100644
--- a/components/esp_hw_support/dma/esp_async_memcpy_priv.h
+++ b/components/esp_hw_support/dma/esp_async_memcpy_priv.h
@@ -13,8 +13,6 @@
 #include "esp_async_memcpy.h"
 #include "soc/soc_caps.h"
 
-#define ALIGN_DOWN(val, align) ((val) & ~((align) - 1))
-
 #define DEFAULT_TRANSACTION_QUEUE_LENGTH 4
 
 #ifdef __cplusplus
diff --git a/components/esp_hw_support/test_apps/dma/main/test_async_memcpy.c b/components/esp_hw_support/test_apps/dma/main/test_async_memcpy.c
index ad097509b89e..e2c7a370ddc0 100644
--- a/components/esp_hw_support/test_apps/dma/main/test_async_memcpy.c
+++ b/components/esp_hw_support/test_apps/dma/main/test_async_memcpy.c
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: 2021-2024 Espressif Systems (Shanghai) CO LTD
+ * SPDX-FileCopyrightText: 2021-2025 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */
@@ -8,27 +8,21 @@
 #include 
 #include 
 #include 
+#include "unity.h"
+#include "soc/soc_caps.h"
 #include 
"esp_heap_caps.h" -#include "esp_rom_sys.h" #include "freertos/FreeRTOS.h" #include "freertos/task.h" #include "freertos/semphr.h" -#include "unity.h" #include "ccomp_timer.h" #include "esp_async_memcpy.h" -#include "soc/soc_caps.h" -#include "hal/dma_types.h" +#if SOC_GDMA_SUPPORTED +#include "hal/gdma_ll.h" +#endif #define IDF_LOG_PERFORMANCE(item, value_fmt, value, ...) \ printf("[Performance][%s]: " value_fmt "\n", item, value, ##__VA_ARGS__) -#define ALIGN_UP(addr, align) (((addr) + (align)-1) & ~((align)-1)) -#define ALIGN_DOWN(size, align) ((size) & ~((align) - 1)) - -#if CONFIG_IDF_TARGET_ESP32P4 -#define TEST_MEMCPY_BUFFER_SIZE_MUST_ALIGN_CACHE 1 -#endif - typedef struct { uint32_t seed; size_t buffer_size; @@ -37,8 +31,9 @@ typedef struct { uint8_t *dst_buf; uint8_t *from_addr; uint8_t *to_addr; - uint32_t align; - uint32_t offset; + uint32_t align; // alignment required by DMA engine + uint32_t src_offset; + uint32_t dst_offset; bool src_in_psram; bool dst_in_psram; } memcpy_testbench_context_t; @@ -46,7 +41,6 @@ typedef struct { static void async_memcpy_setup_testbench(memcpy_testbench_context_t *test_context) { srand(test_context->seed); - printf("allocating memory buffer...\r\n"); size_t buffer_size = test_context->buffer_size; size_t copy_size = buffer_size; uint8_t *src_buf = NULL; @@ -63,13 +57,11 @@ static void async_memcpy_setup_testbench(memcpy_testbench_context_t *test_contex TEST_ASSERT_NOT_NULL(dst_buf); // adding extra offset - from_addr = src_buf + test_context->offset; - to_addr = dst_buf; - copy_size -= test_context->offset; - copy_size &= ~(test_context->align - 1); + from_addr = src_buf + test_context->src_offset; + to_addr = dst_buf + test_context->dst_offset; + copy_size -= MAX(test_context->src_offset, test_context->dst_offset); - printf("...to copy size %zu Bytes, from @%p, to @%p\r\n", copy_size, from_addr, to_addr); - printf("fill src buffer with random data\r\n"); + printf("copy @%p --> @%p, %zu Bytes\r\n", from_addr, to_addr, copy_size); for (int i = 0; i < copy_size; i++) { from_addr[i] = rand() % 256; } @@ -82,28 +74,23 @@ static void async_memcpy_setup_testbench(memcpy_testbench_context_t *test_contex test_context->to_addr = to_addr; } -static void async_memcpy_verify_and_clear_testbench(uint32_t seed, uint32_t copy_size, uint8_t *src_buf, uint8_t *dst_buf, uint8_t *from_addr, uint8_t *to_addr) +static void async_memcpy_verify_and_clear_testbench(uint32_t copy_size, uint8_t *src_buf, uint8_t *dst_buf, uint8_t *from_addr, uint8_t *to_addr) { - srand(seed); // check if source date has been copied to destination and source data not broken for (int i = 0; i < copy_size; i++) { - TEST_ASSERT_EQUAL_MESSAGE(rand() % 256, from_addr[i], "source data doesn't match generator data"); - } - srand(seed); - for (int i = 0; i < copy_size; i++) { - TEST_ASSERT_EQUAL_MESSAGE(rand() % 256, to_addr[i], "destination data doesn't match source data"); + if (from_addr[i] != to_addr[i]) { + printf("location[%d]:s=%d,d=%d\r\n", i, from_addr[i], to_addr[i]); + TEST_FAIL_MESSAGE("destination data doesn't match source data"); + } } free(src_buf); free(dst_buf); } -TEST_CASE("memory copy the same buffer with different content", "[async mcp]") +static void test_memory_copy_with_same_buffer(async_memcpy_handle_t driver) { - async_memcpy_config_t config = ASYNC_MEMCPY_DEFAULT_CONFIG(); - async_memcpy_handle_t driver = NULL; - TEST_ESP_OK(esp_async_memcpy_install(&config, &driver)); - uint8_t *sbuf = heap_caps_aligned_calloc(4, 1, 256, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | 
MALLOC_CAP_8BIT); - uint8_t *dbuf = heap_caps_aligned_calloc(4, 1, 256, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT); + uint8_t *sbuf = heap_caps_calloc(1, 256, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT); + uint8_t *dbuf = heap_caps_calloc(1, 256, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT); TEST_ASSERT_NOT_NULL(sbuf); TEST_ASSERT_NOT_NULL(dbuf); @@ -119,289 +106,274 @@ TEST_CASE("memory copy the same buffer with different content", "[async mcp]") } } } - TEST_ESP_OK(esp_async_memcpy_uninstall(driver)); free(sbuf); free(dbuf); } -static void test_memory_copy_one_by_one(async_memcpy_handle_t driver) +TEST_CASE("memory copy the same buffer with different content", "[async mcp]") { - uint32_t aligned_test_buffer_size[] = {256, 512, 1024, 2048, 4096}; - memcpy_testbench_context_t test_context = { - .align = 4, - }; + async_memcpy_config_t config = ASYNC_MEMCPY_DEFAULT_CONFIG(); + async_memcpy_handle_t driver = NULL; + +#if SOC_AHB_GDMA_SUPPORTED + printf("Testing memcpy by AHB GDMA\r\n"); + TEST_ESP_OK(esp_async_memcpy_install_gdma_ahb(&config, &driver)); + test_memory_copy_with_same_buffer(driver); + TEST_ESP_OK(esp_async_memcpy_uninstall(driver)); +#endif // SOC_AHB_GDMA_SUPPORTED - for (int i = 0; i < sizeof(aligned_test_buffer_size) / sizeof(aligned_test_buffer_size[0]); i++) { - test_context.buffer_size = aligned_test_buffer_size[i]; - test_context.seed = i; - test_context.offset = 0; - async_memcpy_setup_testbench(&test_context); +#if SOC_AXI_GDMA_SUPPORTED + printf("Testing memcpy by AXI GDMA\r\n"); + TEST_ESP_OK(esp_async_memcpy_install_gdma_axi(&config, &driver)); + test_memory_copy_with_same_buffer(driver); + TEST_ESP_OK(esp_async_memcpy_uninstall(driver)); +#endif // SOC_AXI_GDMA_SUPPORTED - TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, NULL, NULL)); - vTaskDelay(pdMS_TO_TICKS(10)); - async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.copy_size, test_context.src_buf, - test_context.dst_buf, test_context.from_addr, test_context.to_addr); - } +#if SOC_CP_DMA_SUPPORTED + printf("Testing memcpy by CP DMA\r\n"); + TEST_ESP_OK(esp_async_memcpy_install_cpdma(&config, &driver)); + test_memory_copy_with_same_buffer(driver); + TEST_ESP_OK(esp_async_memcpy_uninstall(driver)); +#endif // SOC_CP_DMA_SUPPORTED +} -#if !TEST_MEMCPY_BUFFER_SIZE_MUST_ALIGN_CACHE - uint32_t unaligned_test_buffer_size[] = {255, 511, 1023, 2047, 4095, 5011}; - for (int i = 0; i < sizeof(unaligned_test_buffer_size) / sizeof(unaligned_test_buffer_size[0]); i++) { +static bool test_async_memcpy_cb_v1(async_memcpy_handle_t mcp_hdl, async_memcpy_event_t *event, void *cb_args) +{ + SemaphoreHandle_t sem = (SemaphoreHandle_t)cb_args; + BaseType_t high_task_wakeup = pdFALSE; + xSemaphoreGiveFromISR(sem, &high_task_wakeup); + return high_task_wakeup == pdTRUE; +} + +static void test_memory_copy_blocking(async_memcpy_handle_t driver) +{ + SemaphoreHandle_t sem = xSemaphoreCreateBinary(); + const uint32_t test_buffer_size[] = {256, 512, 1024, 2048, 4096, 5012}; + memcpy_testbench_context_t test_context = { + .align = 4, + }; + for (int i = 0; i < sizeof(test_buffer_size) / sizeof(test_buffer_size[0]); i++) { // Test different align edge for (int off = 0; off < 4; off++) { - test_context.buffer_size = unaligned_test_buffer_size[i]; + test_context.buffer_size = test_buffer_size[i]; test_context.seed = i; - test_context.offset = off; + test_context.src_offset = off; + test_context.dst_offset = off; 
             async_memcpy_setup_testbench(&test_context);
-            TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, NULL, NULL));
-            vTaskDelay(pdMS_TO_TICKS(10));
-            async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.copy_size, test_context.src_buf,
-                                                    test_context.dst_buf, test_context.from_addr, test_context.to_addr);
+            TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, test_async_memcpy_cb_v1, sem));
+            TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(10)));
+            async_memcpy_verify_and_clear_testbench(test_context.copy_size, test_context.src_buf, test_context.dst_buf,
+                                                    test_context.from_addr, test_context.to_addr);
         }
     }
-#endif
+    vSemaphoreDelete(sem);
 }
 
-TEST_CASE("memory copy by DMA one by one", "[async mcp]")
+TEST_CASE("memory copy by DMA (blocking)", "[async mcp]")
 {
     async_memcpy_config_t config = {
-        .backlog = 4,
+        .backlog = 1,
+        .dma_burst_size = 0,
     };
     async_memcpy_handle_t driver = NULL;
 #if SOC_AHB_GDMA_SUPPORTED
-    printf("Testing memory by AHB GDMA\r\n");
+    printf("Testing memcpy by AHB GDMA\r\n");
     TEST_ESP_OK(esp_async_memcpy_install_gdma_ahb(&config, &driver));
-    test_memory_copy_one_by_one(driver);
+    test_memory_copy_blocking(driver);
     TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
 #endif // SOC_AHB_GDMA_SUPPORTED
 #if SOC_AXI_GDMA_SUPPORTED
-    printf("Testing memory by AXI GDMA\r\n");
+    printf("Testing memcpy by AXI GDMA\r\n");
     TEST_ESP_OK(esp_async_memcpy_install_gdma_axi(&config, &driver));
-    test_memory_copy_one_by_one(driver);
+    test_memory_copy_blocking(driver);
     TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
 #endif // SOC_AXI_GDMA_SUPPORTED
 #if SOC_CP_DMA_SUPPORTED
-    printf("Testing memory by CP DMA\r\n");
+    printf("Testing memcpy by CP DMA\r\n");
     TEST_ESP_OK(esp_async_memcpy_install_cpdma(&config, &driver));
-    test_memory_copy_one_by_one(driver);
+    test_memory_copy_blocking(driver);
     TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
 #endif // SOC_CP_DMA_SUPPORTED
-
-}
-
-static bool test_async_memcpy_cb_v1(async_memcpy_handle_t mcp_hdl, async_memcpy_event_t *event, void *cb_args)
-{
-    SemaphoreHandle_t sem = (SemaphoreHandle_t)cb_args;
-    BaseType_t high_task_wakeup = pdFALSE;
-    xSemaphoreGiveFromISR(sem, &high_task_wakeup);
-    return high_task_wakeup == pdTRUE;
 }
 
-TEST_CASE("memory copy done callback", "[async mcp]")
+[[maybe_unused]] static void test_memcpy_with_dest_addr_unaligned(async_memcpy_handle_t driver, bool src_in_psram, bool dst_in_psram)
 {
-    async_memcpy_config_t config = {
-        // all default
+    SemaphoreHandle_t sem = xSemaphoreCreateBinary();
+    const uint32_t test_buffer_size[] = {256, 512, 1024, 2048, 4096, 5012};
+    memcpy_testbench_context_t test_context = {
+        .align = 4,
+        .src_in_psram = src_in_psram,
+        .dst_in_psram = dst_in_psram,
     };
-    async_memcpy_handle_t driver = NULL;
-    TEST_ESP_OK(esp_async_memcpy_install(&config, &driver));
-
-    uint8_t *src_buf = heap_caps_aligned_calloc(4, 1, 256, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT);
-    uint8_t *dst_buf = heap_caps_aligned_calloc(4, 1, 256, MALLOC_CAP_DMA | MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT);
-    TEST_ASSERT_NOT_NULL(src_buf);
-    TEST_ASSERT_NOT_NULL(dst_buf);
+    for (int i = 0; i < sizeof(test_buffer_size) / sizeof(test_buffer_size[0]); i++) {
+        // Test different alignment
+        for (int off = 0; off < 4; off++) {
+            test_context.buffer_size = test_buffer_size[i];
+            test_context.seed = i;
+            test_context.src_offset = off;
+            test_context.dst_offset = off + 1;
+            async_memcpy_setup_testbench(&test_context);
-    SemaphoreHandle_t sem = xSemaphoreCreateBinary();
-    TEST_ESP_OK(esp_async_memcpy(driver, dst_buf, src_buf, 256, test_async_memcpy_cb_v1, sem));
-    TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000)));
-    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
-    free(src_buf);
-    free(dst_buf);
+            TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, test_async_memcpy_cb_v1, sem));
+            TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(10)));
+            async_memcpy_verify_and_clear_testbench(test_context.copy_size, test_context.src_buf, test_context.dst_buf,
+                                                    test_context.from_addr, test_context.to_addr);
+        }
+    }
     vSemaphoreDelete(sem);
 }
 
-TEST_CASE("memory copy by DMA on the fly", "[async mcp]")
+TEST_CASE("memory copy with dest address unaligned", "[async mcp]")
 {
-    async_memcpy_config_t config = ASYNC_MEMCPY_DEFAULT_CONFIG();
-    async_memcpy_handle_t driver = NULL;
-    TEST_ESP_OK(esp_async_memcpy_install(&config, &driver));
-
-    uint32_t aligned_test_buffer_size[] = {512, 1024, 2048, 4096, 4608};
-    memcpy_testbench_context_t test_context[5] = {
-        [0 ... 4] = {
-            .align = 4,
-        }
+    [[maybe_unused]] async_memcpy_config_t driver_config = {
+        .backlog = 4,
+        .dma_burst_size = 32,
     };
+    [[maybe_unused]] async_memcpy_handle_t driver = NULL;
-
-    // Aligned case
-    for (int i = 0; i < sizeof(aligned_test_buffer_size) / sizeof(aligned_test_buffer_size[0]); i++) {
-        test_context[i].seed = i;
-        test_context[i].buffer_size = aligned_test_buffer_size[i];
-        async_memcpy_setup_testbench(&test_context[i]);
-    }
-    for (int i = 0; i < sizeof(aligned_test_buffer_size) / sizeof(aligned_test_buffer_size[0]); i++) {
-        TEST_ESP_OK(esp_async_memcpy(driver, test_context[i].to_addr, test_context[i].from_addr, test_context[i].copy_size, NULL, NULL));
-    }
-    for (int i = 0; i < sizeof(aligned_test_buffer_size) / sizeof(aligned_test_buffer_size[0]); i++) {
-        async_memcpy_verify_and_clear_testbench(i, test_context[i].copy_size, test_context[i].src_buf, test_context[i].dst_buf, test_context[i].from_addr, test_context[i].to_addr);
-    }
-#if !TEST_MEMCPY_BUFFER_SIZE_MUST_ALIGN_CACHE
-    uint32_t unaligned_test_buffer_size[] = {511, 1023, 2047, 4095, 5011};
-    // Non-aligned case
-    for (int i = 0; i < sizeof(unaligned_test_buffer_size) / sizeof(unaligned_test_buffer_size[0]); i++) {
-        test_context[i].seed = i;
-        test_context[i].buffer_size = unaligned_test_buffer_size[i];
-        test_context[i].offset = 3;
-        async_memcpy_setup_testbench(&test_context[i]);
-    }
-    for (int i = 0; i < sizeof(unaligned_test_buffer_size) / sizeof(unaligned_test_buffer_size[0]); i++) {
-        TEST_ESP_OK(esp_async_memcpy(driver, test_context[i].to_addr, test_context[i].from_addr, test_context[i].copy_size, NULL, NULL));
-    }
-    for (int i = 0; i < sizeof(unaligned_test_buffer_size) / sizeof(unaligned_test_buffer_size[0]); i++) {
-        async_memcpy_verify_and_clear_testbench(i, test_context[i].copy_size, test_context[i].src_buf, test_context[i].dst_buf, test_context[i].from_addr, test_context[i].to_addr);
-    }
-#endif
+#if SOC_CP_DMA_SUPPORTED
+    printf("Testing memcpy by CP DMA\r\n");
+    TEST_ESP_OK(esp_async_memcpy_install_cpdma(&driver_config, &driver));
+    test_memcpy_with_dest_addr_unaligned(driver, false, false);
+    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
+#endif // SOC_CP_DMA_SUPPORTED
+
+#if SOC_AHB_GDMA_SUPPORTED && !GDMA_LL_AHB_RX_BURST_NEEDS_ALIGNMENT
+    printf("Testing memcpy by AHB GDMA\r\n");
+    TEST_ESP_OK(esp_async_memcpy_install_gdma_ahb(&driver_config, &driver));
+    test_memcpy_with_dest_addr_unaligned(driver, false, false);
+#if SOC_AHB_GDMA_SUPPORT_PSRAM
+    test_memcpy_with_dest_addr_unaligned(driver, true, true);
+#endif // SOC_AHB_GDMA_SUPPORT_PSRAM
+    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
+#endif // SOC_AHB_GDMA_SUPPORTED
+#if SOC_AXI_GDMA_SUPPORTED
+    printf("Testing memcpy by AXI GDMA\r\n");
+    TEST_ESP_OK(esp_async_memcpy_install_gdma_axi(&driver_config, &driver));
+    test_memcpy_with_dest_addr_unaligned(driver, false, false);
+#if SOC_AXI_GDMA_SUPPORT_PSRAM
+    test_memcpy_with_dest_addr_unaligned(driver, true, true);
+#endif // SOC_AXI_GDMA_SUPPORT_PSRAM
     TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
+#endif // SOC_AXI_GDMA_SUPPORTED
 }
 
-#define TEST_ASYNC_MEMCPY_BENCH_COUNTS (8)
-static int s_count = 0;
+#define TEST_ASYNC_MEMCPY_BENCH_COUNTS 16
+
+typedef struct {
+    int perf_count;
+    SemaphoreHandle_t sem;
+} mcp_perf_user_context_t;
 
-static IRAM_ATTR bool test_async_memcpy_isr_cb(async_memcpy_handle_t mcp_hdl, async_memcpy_event_t *event, void *cb_args)
+static IRAM_ATTR bool test_async_memcpy_perf_cb(async_memcpy_handle_t mcp_hdl, async_memcpy_event_t *event, void *cb_args)
 {
-    SemaphoreHandle_t sem = (SemaphoreHandle_t)cb_args;
+    mcp_perf_user_context_t* user = (mcp_perf_user_context_t*)cb_args;
     BaseType_t high_task_wakeup = pdFALSE;
-    s_count++;
-    if (s_count == TEST_ASYNC_MEMCPY_BENCH_COUNTS) {
-        xSemaphoreGiveFromISR(sem, &high_task_wakeup);
+    user->perf_count++;
+    if (user->perf_count == TEST_ASYNC_MEMCPY_BENCH_COUNTS) {
+        xSemaphoreGiveFromISR(user->sem, &high_task_wakeup);
     }
     return high_task_wakeup == pdTRUE;
 }
 
-static void memcpy_performance_test(uint32_t buffer_size)
+static void test_memcpy_performance(async_memcpy_handle_t driver, uint32_t buffer_size, bool src_in_psram, bool dst_in_psram)
 {
-    SemaphoreHandle_t sem = xSemaphoreCreateBinary();
-
-    async_memcpy_config_t config = ASYNC_MEMCPY_DEFAULT_CONFIG();
-    config.backlog = (buffer_size / DMA_DESCRIPTOR_BUFFER_MAX_SIZE + 1) * TEST_ASYNC_MEMCPY_BENCH_COUNTS;
-    config.dma_burst_size = 32; // set a big burst size for performance
-    async_memcpy_handle_t driver = NULL;
     int64_t elapse_us = 0;
     float throughput = 0.0;
-    TEST_ESP_OK(esp_async_memcpy_install(&config, &driver));
-    // 1. SRAM->SRAM
     memcpy_testbench_context_t test_context = {
-        .align = config.dma_burst_size,
+        .align = 32, // set alignment same as the burst size, to achieve the best performance
         .buffer_size = buffer_size,
-        .src_in_psram = false,
-        .dst_in_psram = false,
+        .src_in_psram = src_in_psram,
+        .dst_in_psram = dst_in_psram,
     };
     async_memcpy_setup_testbench(&test_context);
-    s_count = 0;
-    ccomp_timer_start();
-    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
-        TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, test_async_memcpy_isr_cb, sem));
-    }
-    // wait for done semaphore
-    TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000)));
-    elapse_us = ccomp_timer_stop();
-    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
-    IDF_LOG_PERFORMANCE("DMA_COPY", "%.2f MB/s, dir: SRAM->SRAM, size: %zu Bytes", throughput, test_context.buffer_size);
-    ccomp_timer_start();
-    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
-        memcpy(test_context.to_addr, test_context.from_addr, test_context.buffer_size);
-    }
-    elapse_us = ccomp_timer_stop();
-    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
-    IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: SRAM->SRAM, size: %zu Bytes", throughput, test_context.buffer_size);
-    async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.copy_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);
-
-#if SOC_AHB_GDMA_SUPPORT_PSRAM
-    // 2. PSRAM->PSRAM
-    test_context.src_in_psram = true;
-    test_context.dst_in_psram = true;
-    async_memcpy_setup_testbench(&test_context);
-    s_count = 0;
-    ccomp_timer_start();
-    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
-        TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, test_async_memcpy_isr_cb, sem));
-    }
-    // wait for done semaphore
-    TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000)));
-    elapse_us = ccomp_timer_stop();
-    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
-    IDF_LOG_PERFORMANCE("DMA_COPY", "%.2f MB/s, dir: PSRAM->PSRAM, size: %zu Bytes", throughput, test_context.buffer_size);
-    ccomp_timer_start();
-    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
-        memcpy(test_context.to_addr, test_context.from_addr, test_context.buffer_size);
-    }
-    elapse_us = ccomp_timer_stop();
-    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
-    IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: PSRAM->PSRAM, size: %zu Bytes", throughput, test_context.buffer_size);
-    async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.copy_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);
-    // 3. PSRAM->SRAM
-    test_context.src_in_psram = true;
-    test_context.dst_in_psram = false;
-    async_memcpy_setup_testbench(&test_context);
-    s_count = 0;
-    ccomp_timer_start();
-    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
-        TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, test_async_memcpy_isr_cb, sem));
-    }
-    // wait for done semaphore
-    TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000)));
-    elapse_us = ccomp_timer_stop();
-    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
-    IDF_LOG_PERFORMANCE("DMA_COPY", "%.2f MB/s, dir: PSRAM->SRAM, size: %zu Bytes", throughput, test_context.buffer_size);
+    // get CPU memcpy performance
     ccomp_timer_start();
     for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
         memcpy(test_context.to_addr, test_context.from_addr, test_context.buffer_size);
     }
     elapse_us = ccomp_timer_stop();
     throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
-    IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: PSRAM->SRAM, size: %zu Bytes", throughput, test_context.buffer_size);
-    async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.copy_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);
+    IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: %s->%s", throughput, src_in_psram ? "PSRAM" : "SRAM", dst_in_psram ? "PSRAM" : "SRAM");
 
-    // 4. SRAM->PSRAM
-    test_context.src_in_psram = false;
-    test_context.dst_in_psram = true;
-    async_memcpy_setup_testbench(&test_context);
-    s_count = 0;
+    // get DMA memcpy performance
     ccomp_timer_start();
+    mcp_perf_user_context_t user_context = {
+        .perf_count = 0,
+        .sem = xSemaphoreCreateBinary()
+    };
     for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
-        TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, test_async_memcpy_isr_cb, sem));
+        TEST_ESP_OK(esp_async_memcpy(driver, test_context.to_addr, test_context.from_addr, test_context.copy_size, test_async_memcpy_perf_cb, &user_context));
     }
     // wait for done semaphore
-    TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(sem, pdMS_TO_TICKS(1000)));
+    TEST_ASSERT_EQUAL(pdTRUE, xSemaphoreTake(user_context.sem, pdMS_TO_TICKS(1000)));
     elapse_us = ccomp_timer_stop();
-    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
-    IDF_LOG_PERFORMANCE("DMA_COPY", "%.2f MB/s, dir: SRAM->PSRAM, size: %zu Bytes", throughput, test_context.buffer_size);
-    ccomp_timer_start();
-    for (int i = 0; i < TEST_ASYNC_MEMCPY_BENCH_COUNTS; i++) {
-        memcpy(test_context.to_addr, test_context.from_addr, test_context.buffer_size);
-    }
-    elapse_us = ccomp_timer_stop();
-    throughput = (float)test_context.buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
-    IDF_LOG_PERFORMANCE("CPU_COPY", "%.2f MB/s, dir: SRAM->PSRAM, size: %zu Bytes", throughput, test_context.buffer_size);
-    async_memcpy_verify_and_clear_testbench(test_context.seed, test_context.copy_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);
-#endif
+    async_memcpy_verify_and_clear_testbench(test_context.copy_size, test_context.src_buf, test_context.dst_buf, test_context.from_addr, test_context.to_addr);
+    throughput = (float)buffer_size * 1e6 * TEST_ASYNC_MEMCPY_BENCH_COUNTS / 1024 / 1024 / elapse_us;
+    IDF_LOG_PERFORMANCE("DMA_COPY", "%.2f MB/s, dir: %s->%s", throughput, src_in_psram ? "PSRAM" : "SRAM", dst_in_psram ? "PSRAM" : "SRAM");
 
-    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
-    vSemaphoreDelete(sem);
+    vSemaphoreDelete(user_context.sem);
 }
 
-TEST_CASE("memory copy performance test 40KB", "[async mcp]")
+TEST_CASE("memory copy performance 40KB: SRAM->SRAM", "[async mcp]")
 {
-    memcpy_performance_test(40 * 1024);
+    async_memcpy_config_t driver_config = {
+        .backlog = TEST_ASYNC_MEMCPY_BENCH_COUNTS,
+        .dma_burst_size = 32,
+    };
+    async_memcpy_handle_t driver = NULL;
+
+#if SOC_AHB_GDMA_SUPPORTED
+    printf("Testing memcpy by AHB GDMA\r\n");
+    TEST_ESP_OK(esp_async_memcpy_install_gdma_ahb(&driver_config, &driver));
+    test_memcpy_performance(driver, 40 * 1024, false, false);
+    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
+#endif // SOC_AHB_GDMA_SUPPORTED
+
+#if SOC_AXI_GDMA_SUPPORTED
+    printf("Testing memcpy by AXI GDMA\r\n");
+    TEST_ESP_OK(esp_async_memcpy_install_gdma_axi(&driver_config, &driver));
+    test_memcpy_performance(driver, 40 * 1024, false, false);
+    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
+#endif // SOC_AXI_GDMA_SUPPORTED
+
+#if SOC_CP_DMA_SUPPORTED
+    printf("Testing memcpy by CP DMA\r\n");
+    TEST_ESP_OK(esp_async_memcpy_install_cpdma(&driver_config, &driver));
+    test_memcpy_performance(driver, 40 * 1024, false, false);
+    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
+#endif // SOC_CP_DMA_SUPPORTED
 }
 
-TEST_CASE("memory copy performance test 4KB", "[async mcp]")
+#if SOC_SPIRAM_SUPPORTED
+TEST_CASE("memory copy performance 40KB: PSRAM->PSRAM", "[async mcp]")
 {
-    memcpy_performance_test(4 * 1024);
+    [[maybe_unused]] async_memcpy_config_t driver_config = {
+        .backlog = TEST_ASYNC_MEMCPY_BENCH_COUNTS,
+        .dma_burst_size = 32,
+    };
+    [[maybe_unused]] async_memcpy_handle_t driver = NULL;
+
+#if SOC_AHB_GDMA_SUPPORTED && SOC_AHB_GDMA_SUPPORT_PSRAM
+    printf("Testing memcpy by AHB GDMA\r\n");
+    TEST_ESP_OK(esp_async_memcpy_install_gdma_ahb(&driver_config, &driver));
+    test_memcpy_performance(driver, 40 * 1024, true, true);
+    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
+#endif // SOC_AHB_GDMA_SUPPORTED && SOC_AHB_GDMA_SUPPORT_PSRAM
+
+#if SOC_AXI_GDMA_SUPPORTED && SOC_AXI_GDMA_SUPPORT_PSRAM
+    printf("Testing memcpy by AXI GDMA\r\n");
+    TEST_ESP_OK(esp_async_memcpy_install_gdma_axi(&driver_config, &driver));
+    test_memcpy_performance(driver, 40 * 1024, true, true);
+    TEST_ESP_OK(esp_async_memcpy_uninstall(driver));
+#endif // SOC_AXI_GDMA_SUPPORTED && SOC_AXI_GDMA_SUPPORT_PSRAM
 }
+#endif
diff --git a/components/esp_hw_support/test_apps/dma/sdkconfig.defaults.esp32c5 b/components/esp_hw_support/test_apps/dma/sdkconfig.defaults.esp32c5
new file mode 100644
index 000000000000..728fbe8889af
--- /dev/null
+++ b/components/esp_hw_support/test_apps/dma/sdkconfig.defaults.esp32c5
@@ -0,0 +1,2 @@
+CONFIG_SPIRAM=y
+CONFIG_SPIRAM_SPEED_80M=y
diff --git a/components/soc/esp32c5/include/soc/Kconfig.soc_caps.in b/components/soc/esp32c5/include/soc/Kconfig.soc_caps.in
index 77670c4e895a..e62d2d135fe1 100644
--- a/components/soc/esp32c5/include/soc/Kconfig.soc_caps.in
+++ b/components/soc/esp32c5/include/soc/Kconfig.soc_caps.in
@@ -479,6 +479,10 @@ config SOC_GDMA_SUPPORT_SLEEP_RETENTION
     bool
     default y
 
+config SOC_AHB_GDMA_SUPPORT_PSRAM
+    bool
+    default y
+
 config SOC_ETM_GROUPS
     int
     default 1
diff --git a/components/soc/esp32c5/include/soc/soc_caps.h b/components/soc/esp32c5/include/soc/soc_caps.h
index a33e04ba2fa9..78f99228503e 100644
--- a/components/soc/esp32c5/include/soc/soc_caps.h
+++ b/components/soc/esp32c5/include/soc/soc_caps.h
@@ -190,6 +190,7 @@
 #define SOC_GDMA_PAIRS_PER_GROUP_MAX     3
 #define SOC_GDMA_SUPPORT_ETM             1
 #define SOC_GDMA_SUPPORT_SLEEP_RETENTION 1
+#define SOC_AHB_GDMA_SUPPORT_PSRAM       1
 
 /*-------------------------- ETM CAPS --------------------------------------*/
 #define SOC_ETM_GROUPS                   1U // Number of ETM groups