Skip to content

Commit 7fd8a6d

Browse files
projectgusdpgeorge
authored andcommitted
stm32/dma: Add D-cache protection for DMA RX operations, including SPI.
This new DMA API corrects possible cache coherency issues on chips with D-Cache, when working with buffers at arbitrary memory locations (i.e. supplied by Python code). The API is used by SPI to fix an issue with corrupt data when reading from SPI using DMA in certain cases. A regression test is included (it depends on external hardware connection). Explanation: 1) It's necessary to invalidate D-Cache after a DMA RX operation completes in case the CPU reads (or speculatively reads) from the DMA RX region during the operation. This seems to have been the root cause of issue micropython#13471 (only when src==dest for this case). 2) More generally, it is also necessary to temporarily mark the first and last cache lines of a DMA RX operation as "uncached", in case the DMA buffer shares this cache line with unrelated data. The CPU could otherwise write the other data at any time during the DMA operation (for example from an interrupt handler), creating a dirty cache line that's inconsistent with the DMA result. Fixes issue micropython#13471. This work was funded through GitHub Sponsors. Signed-off-by: Angus Gratton <[email protected]>
1 parent 2345c1a commit 7fd8a6d

File tree

6 files changed

+158
-2
lines changed

6 files changed

+158
-2
lines changed

ports/stm32/dma.c

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#include "systick.h"
3434
#include "dma.h"
3535
#include "irq.h"
36+
#include "mpu.h"
3637

3738
// When this option is enabled, the DMA will turn off automatically after
3839
// a period of inactivity.
@@ -1852,3 +1853,55 @@ void dma_external_acquire(uint32_t controller, uint32_t stream) {
18521853
void dma_external_release(uint32_t controller, uint32_t stream) {
18531854
dma_disable_clock(DMA_ID_FROM_CONTROLLER_STREAM(controller, stream));
18541855
}
1856+
1857+
#if __DCACHE_PRESENT
1858+
1859+
void dma_protect_rx_region(void *dest, size_t len) {
1860+
#if __DCACHE_PRESENT
1861+
uint32_t start_addr = (uint32_t)dest;
1862+
uint32_t start_aligned = start_addr & ~(__SCB_DCACHE_LINE_SIZE - 1U);
1863+
uint32_t end_addr = start_addr + len - 1; // Address of last byte in the buffer
1864+
uint32_t end_aligned = end_addr & ~(__SCB_DCACHE_LINE_SIZE - 1U);
1865+
1866+
uint32_t irq_state = mpu_config_start();
1867+
1868+
// Clean (write back) any cached memory in this region, so there's no dirty
1869+
// cache entries that might be written back later after DMA RX is done.
1870+
MP_HAL_CLEAN_DCACHE(dest, len);
1871+
1872+
// The way we protect the whole region is to mark the first and last cache
1873+
// line as UNCACHED using the MPU. This means any unrelated reads/writes in
1874+
// these cache lines will bypass the cache, and can coexist with DMA also
1875+
// writing to parts of these cache lines.
1876+
//
1877+
// This is redundant sometimes (because the DMA region fills the entire cache line, or because
1878+
// the region fits in a single cache line.) However, the implementation is only 3 register writes so
1879+
// it's more efficient to call it every time.
1880+
mpu_config_region(MPU_REGION_DMA_UNCACHED_1, start_aligned, MPU_CONFIG_UNCACHED(MPU_REGION_SIZE_32B));
1881+
mpu_config_region(MPU_REGION_DMA_UNCACHED_2, end_aligned, MPU_CONFIG_UNCACHED(MPU_REGION_SIZE_32B));
1882+
1883+
mpu_config_end(irq_state);
1884+
#endif
1885+
}
1886+
1887+
void dma_unprotect_rx_region(void *dest, size_t len) {
1888+
#if __DCACHE_PRESENT
1889+
uint32_t irq_state = mpu_config_start();
1890+
1891+
// Disabling these regions removes them from the MPU
1892+
mpu_config_region(MPU_REGION_DMA_UNCACHED_1, 0, MPU_CONFIG_DISABLE);
1893+
mpu_config_region(MPU_REGION_DMA_UNCACHED_2, 0, MPU_CONFIG_DISABLE);
1894+
1895+
// Invalidate the whole region in the cache. This may seem redundant, but it
1896+
// is possible that during the DMA operation the CPU read inside this region
1897+
// (excluding the first & last cache lines), and cache lines were filled.
1898+
//
1899+
// (This can happen in SPI if src==dest, for example, possibly due to speculative
1900+
// cache line fills.)
1901+
MP_HAL_CLEANINVALIDATE_DCACHE(dest, len);
1902+
1903+
mpu_config_end(irq_state);
1904+
#endif
1905+
}
1906+
1907+
#endif

ports/stm32/dma.h

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
#ifndef MICROPY_INCLUDED_STM32_DMA_H
2727
#define MICROPY_INCLUDED_STM32_DMA_H
2828

29+
#include "py/mpconfig.h"
30+
2931
typedef struct _dma_descr_t dma_descr_t;
3032

3133
#if defined(STM32H5)
@@ -164,4 +166,40 @@ void dma_nohal_start(const dma_descr_t *descr, uint32_t src_addr, uint32_t dst_a
164166
void dma_external_acquire(uint32_t controller, uint32_t stream);
165167
void dma_external_release(uint32_t controller, uint32_t stream);
166168

169+
#if __DCACHE_PRESENT
170+
// On chips with D-Cache, it's necessary to call this function before using DMA to read
171+
// into a user-supplied buffer. It does two things:
172+
//
173+
// 1) Cleans this region in the cache.
174+
//
175+
// 2) Temporarily disables caching for the first and last cache lines of the
176+
// buffer. This prevents cache coherency issues if the start or end of the
177+
// buffer don't align to a cache line. (For example, this can happen if the CPU
178+
// accesses unrelated memory adjacent to the buffer and in the same cache
179+
// line.)
180+
//
181+
// Only one region can be protected in this way at a time.
182+
void dma_protect_rx_region(void *dest, size_t len);
183+
184+
// Release the region protected by dma_protect_rx_region().
185+
//
186+
// Call this after the DMA operation completes, before the CPU reads any of the
187+
// data that was written.
188+
//
189+
// As well as restoring data caching, this function invalidates D-cache for the
190+
// entire specified region.
191+
void dma_unprotect_rx_region(void *dest, size_t len);
192+
193+
#else
194+
195+
inline static void dma_protect_rx_region(uint8_t *dest, size_t len) {
196+
// No-ops on targets without D-Cache.
197+
}
198+
199+
inline static void dma_unprotect_rx_region(void *dest, size_t len) {
200+
201+
}
202+
203+
#endif // __DCACHE_PRESENT
204+
167205
#endif // MICROPY_INCLUDED_STM32_DMA_H

ports/stm32/mpu.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@
3737
#define MPU_REGION_SDRAM1 (MPU_REGION_NUMBER4)
3838
#define MPU_REGION_SDRAM2 (MPU_REGION_NUMBER5)
3939

40+
// Only relevant on CPUs with D-Cache, must be higher priority than SDRAM
41+
#define MPU_REGION_DMA_UNCACHED_1 (MPU_REGION_NUMBER6)
42+
#define MPU_REGION_DMA_UNCACHED_2 (MPU_REGION_NUMBER7)
43+
4044
// Attribute value to disable a region entirely, remove it from the MPU
4145
// (i.e. the MPU_REGION_ENABLE bit is unset.)
4246
#define MPU_CONFIG_DISABLE 0
@@ -78,6 +82,18 @@
7882
| MPU_REGION_ENABLE << MPU_RASR_ENABLE_Pos \
7983
)
8084

85+
#define MPU_CONFIG_UNCACHED(size) ( \
86+
MPU_INSTRUCTION_ACCESS_DISABLE << MPU_RASR_XN_Pos \
87+
| MPU_REGION_FULL_ACCESS << MPU_RASR_AP_Pos \
88+
| MPU_TEX_LEVEL1 << MPU_RASR_TEX_Pos \
89+
| MPU_ACCESS_NOT_SHAREABLE << MPU_RASR_S_Pos \
90+
| MPU_ACCESS_NOT_CACHEABLE << MPU_RASR_C_Pos \
91+
| MPU_ACCESS_NOT_BUFFERABLE << MPU_RASR_B_Pos \
92+
| 0x00 << MPU_RASR_SRD_Pos \
93+
| (size) << MPU_RASR_SIZE_Pos \
94+
| MPU_REGION_ENABLE << MPU_RASR_ENABLE_Pos \
95+
)
96+
8197
static inline void mpu_init(void) {
8298
MPU->CTRL = MPU_PRIVILEGED_DEFAULT | MPU_CTRL_ENABLE_Msk;
8399
SCB->SHCSR |= SCB_SHCSR_MEMFAULTENA_Msk;

ports/stm32/spi.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -571,6 +571,8 @@ void spi_transfer(const spi_t *self, size_t len, const uint8_t *src, uint8_t *de
571571
// Note: DMA transfers are limited to 65535 bytes at a time.
572572

573573
HAL_StatusTypeDef status;
574+
void *odest = dest; // Original values of dest & len
575+
size_t olen = len;
574576

575577
if (dest == NULL) {
576578
// send only
@@ -613,7 +615,7 @@ void spi_transfer(const spi_t *self, size_t len, const uint8_t *src, uint8_t *de
613615
}
614616
dma_init(&rx_dma, self->rx_dma_descr, DMA_PERIPH_TO_MEMORY, self->spi);
615617
self->spi->hdmarx = &rx_dma;
616-
MP_HAL_CLEANINVALIDATE_DCACHE(dest, len);
618+
dma_protect_rx_region(dest, len);
617619
uint32_t t_start = HAL_GetTick();
618620
do {
619621
uint32_t l = MIN(len, 65535);
@@ -632,6 +634,7 @@ void spi_transfer(const spi_t *self, size_t len, const uint8_t *src, uint8_t *de
632634
dma_deinit(self->tx_dma_descr);
633635
}
634636
dma_deinit(self->rx_dma_descr);
637+
dma_unprotect_rx_region(odest, olen);
635638
}
636639
} else {
637640
// send and receive
@@ -644,7 +647,7 @@ void spi_transfer(const spi_t *self, size_t len, const uint8_t *src, uint8_t *de
644647
dma_init(&rx_dma, self->rx_dma_descr, DMA_PERIPH_TO_MEMORY, self->spi);
645648
self->spi->hdmarx = &rx_dma;
646649
MP_HAL_CLEAN_DCACHE(src, len);
647-
MP_HAL_CLEANINVALIDATE_DCACHE(dest, len);
650+
dma_protect_rx_region(dest, len);
648651
uint32_t t_start = HAL_GetTick();
649652
do {
650653
uint32_t l = MIN(len, 65535);
@@ -662,6 +665,7 @@ void spi_transfer(const spi_t *self, size_t len, const uint8_t *src, uint8_t *de
662665
} while (len);
663666
dma_deinit(self->tx_dma_descr);
664667
dma_deinit(self->rx_dma_descr);
668+
dma_unprotect_rx_region(odest, olen);
665669
}
666670
}
667671

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
from machine import SPI
2+
# Regression test for DMA for DCache coherency bugs with cache line
3+
# written originally for https://github.com/micropython/micropython/issues/13471
4+
5+
# IMPORTANT: This test requires SPI2 MISO (pin Y8 on Pyboard D) to be connected to GND
6+
7+
SPI_NUM = 2
8+
9+
spi = SPI(SPI_NUM, baudrate=5_000_000)
10+
buf = bytearray(1024)
11+
ok = True
12+
13+
for offs in range(0, len(buf)):
14+
v = memoryview(buf)[offs : offs + 128]
15+
spi.readinto(v, 0xFF)
16+
if not all(b == 0x00 for b in v):
17+
print(offs, v.hex())
18+
ok = False
19+
20+
print("Variable offset fixed length " + ("OK" if ok else "FAIL"))
21+
22+
# this takes around 30s to run, so skipped if already failing
23+
if ok:
24+
for op_len in range(1, 66):
25+
wr = b"\xFF" * op_len
26+
for offs in range(1, len(buf) - op_len - 1):
27+
# Place some "sentinel" values before and after the DMA buffer
28+
before = offs & 0xFF
29+
after = (~offs) & 0xFF
30+
buf[offs - 1] = before
31+
buf[offs + op_len] = after
32+
v = memoryview(buf)[offs : offs + op_len]
33+
spi.write_readinto(wr, v)
34+
if (
35+
not all(b == 0x00 for b in v)
36+
or buf[offs - 1] != before
37+
or buf[offs + op_len] != after
38+
):
39+
print(v.hex())
40+
print(hex(op_len), hex(offs), hex(buf[offs - 1]), hex(buf[offs + op_len]))
41+
ok = False
42+
43+
print("Variable offset and lengths " + ("OK" if ok else "FAIL"))
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Variable offset fixed length OK
2+
Variable offset and lengths OK

0 commit comments

Comments
 (0)