cancel
Showing results for 
Search instead for 
Did you mean: 

Missing D-Cache clean in "sd_diskio.c" SD_Read() causes SD card FATFS libraries to fail

mantisrobot
Associate II

I've been working with an F7 processor, using RTOS with the FATFS and an SD card in 4bit mode on SDMMC1.

STM32CubeIDE V1.8

STM32CubeMX V6.4.0

Within the sd_diskio.c file there is the following option to enable D-Cache maintenance when D-Cache is used:

/*
 * when using cacheable memory region, it may be needed to maintain the cache
 * validity. Enable the define below to activate a cache maintenance at each
 * read and write operation.
 * Notice: This is applicable only for cortex M7 based platform.
 */
/* USER CODE BEGIN enableSDDmaCacheMaintenance */
#define ENABLE_SD_DMA_CACHE_MAINTENANCE  1
/* USER CODE END enableSDDmaCacheMaintenance */

However, there seems to be a D-Cache clean missing within SD_read(), this is the generated code:

/* USER CODE BEGIN beforeReadSection */
/* can be used to modify previous code / undefine following code / add new code */
/* USER CODE END beforeReadSection */
/**
  * @brief  Reads Sector(s)
  *         Starts a DMA read, waits for READ_CPLT_MSG on SDQueueID and then
  *         polls the card state until it is ready or SD_TIMEOUT elapses.
  * @param  lun : not used
  * @param  *buff: Data buffer to store read data
  * @param  sector: Sector address (LBA)
  * @param  count: Number of sectors to read (1..128)
  * @retval DRESULT: Operation result (RES_OK on success, RES_ERROR otherwise)
  */
 
DRESULT SD_read(BYTE lun, BYTE *buff, DWORD sector, UINT count)
{
  uint8_t ret;
  /* pessimistic default: only switched to RES_OK once the transfer is confirmed */
  DRESULT res = RES_ERROR;
  uint32_t timer;
  /* the completion message type depends on the CMSIS-RTOS API version in use */
#if (osCMSIS < 0x20000U)
  osEvent event;
#else
  uint16_t event;
  osStatus_t status;
#endif
#if (ENABLE_SD_DMA_CACHE_MAINTENANCE == 1)
  uint32_t alignedAddr;
#endif
  /*
  * ensure the SDCard is ready for a new operation
  */
 
  if (SD_CheckStatusWithTimeout(SD_TIMEOUT) < 0)
  {
    return res;
  }
 
#if defined(ENABLE_SCRATCH_BUFFER)
  /* the direct DMA path is only taken when buff is 4-byte aligned */
  if (!((uint32_t)buff & 0x3))
  {
#endif
    /* Fast path cause destination buffer is correctly aligned */
    /*
     * NOTE(review): no D-Cache operation is performed on buff before the DMA
     * is started; the only maintenance on this path is the invalidate issued
     * after completion below.
     */
    ret = BSP_SD_ReadBlocks_DMA((uint32_t*)buff, (uint32_t)(sector), count);
 
    if (ret == MSD_OK) {
#if (osCMSIS < 0x20000U)
    /* wait for a message from the queue or a timeout */
    event = osMessageGet(SDQueueID, SD_TIMEOUT);
 
    if (event.status == osEventMessage)
    {
      if (event.value.v == READ_CPLT_MSG)
      {
        timer = osKernelSysTick();
        /* block until SDIO IP is ready or a timeout occur */
        while(osKernelSysTick() - timer <SD_TIMEOUT)
#else
          status = osMessageQueueGet(SDQueueID, (void *)&event, NULL, SD_TIMEOUT);
          if ((status == osOK) && (event == READ_CPLT_MSG))
          {
            timer = osKernelGetTickCount();
            /* block until SDIO IP is ready or a timeout occur */
            while(osKernelGetTickCount() - timer <SD_TIMEOUT)
#endif
            {
              /* loop body shared by both RTOS variants: poll the card state */
              if (BSP_SD_GetCardState() == SD_TRANSFER_OK)
              {
                res = RES_OK;
#if (ENABLE_SD_DMA_CACHE_MAINTENANCE == 1)
                /*
                the SCB_InvalidateDCache_by_Addr() requires a 32-Byte aligned address,
                adjust the address and the D-Cache size to invalidate accordingly.
                */
                alignedAddr = (uint32_t)buff & ~0x1F;
                SCB_InvalidateDCache_by_Addr((uint32_t*)alignedAddr, count*BLOCKSIZE + ((uint32_t)buff - alignedAddr));
#endif
                break;
              }
            }
#if (osCMSIS < 0x20000U)
          }
        }
#else
      }
#endif
    }
 
#if defined(ENABLE_SCRATCH_BUFFER)
    }
    else
    {
      /* Slow path, fetch each sector a part and memcpy to destination buffer */
      int i;
 
      for (i = 0; i < count; i++)
      {
        /* one block at a time goes through the scratch buffer */
        ret = BSP_SD_ReadBlocks_DMA((uint32_t*)scratch, (uint32_t)sector++, 1);
        if (ret == MSD_OK )
        {
          /* wait until the read is successful or a timeout occurs */
#if (osCMSIS < 0x20000U)
          /* wait for a message from the queue or a timeout */
          event = osMessageGet(SDQueueID, SD_TIMEOUT);
 
          if (event.status == osEventMessage)
          {
            if (event.value.v == READ_CPLT_MSG)
            {
              timer = osKernelSysTick();
              /* block until SDIO IP is ready or a timeout occur */
              while(osKernelSysTick() - timer <SD_TIMEOUT)
#else
                status = osMessageQueueGet(SDQueueID, (void *)&event, NULL, SD_TIMEOUT);
              if ((status == osOK) && (event == READ_CPLT_MSG))
              {
                timer = osKernelGetTickCount();
                /* block until SDIO IP is ready or a timeout occur */
                ret = MSD_ERROR;
                while(osKernelGetTickCount() - timer < SD_TIMEOUT)
#endif
                {
                  ret = BSP_SD_GetCardState();
 
                  if (ret == MSD_OK)
                  {
                    break;
                  }
                }
 
                /* card did not become ready within SD_TIMEOUT: abort the sector loop */
                if (ret != MSD_OK)
                {
                  break;
                }
#if (osCMSIS < 0x20000U)
              }
            }
#else
          }
#endif
#if (ENABLE_SD_DMA_CACHE_MAINTENANCE == 1)
          /*
          *
          * invalidate the scratch buffer before the next read to get the actual data instead of the cached one
          */
          SCB_InvalidateDCache_by_Addr((uint32_t*)scratch, BLOCKSIZE);
#endif
          /* hand the block over to the caller's (unaligned) buffer */
          memcpy(buff, scratch, BLOCKSIZE);
          buff += BLOCKSIZE;
        }
        else
        {
          break;
        }
      }
 
      /* success only if the loop ran to completion and the last status was OK */
      if ((i == count) && (ret == MSD_OK ))
        res = RES_OK;
    }
#endif
  return res;
}

I have added a cache clean prior to the BSP_SD_ReadBlocks_DMA call which resolved my connection issues:

#if defined(ENABLE_SCRATCH_BUFFER)
  if (!((uint32_t)buff & 0x3))
  {
#endif
#if (ENABLE_SD_DMA_CACHE_MAINTENANCE == 1)
    alignedAddr = (uint32_t)buff & ~0x1F;
    // Clean whole aligned buffer from data cache
    SCB_CleanDCache_by_Addr((uint32_t*)alignedAddr, count*BLOCKSIZE + ((uint32_t)buff - alignedAddr));
#endif
    /* Fast path cause destination buffer is correctly aligned */
    ret = BSP_SD_ReadBlocks_DMA((uint32_t*)buff, (uint32_t)(sector), count);
 
    if (ret == MSD_OK) {
#if (osCMSIS < 0x20000U)
    /* wait for a message from the queue or a timeout */
    event = osMessageGet(SDQueueID, SD_TIMEOUT);
 
    if (event.status == osEventMessage)
    {
      if (event.value.v == READ_CPLT_MSG)
      {
        timer = osKernelSysTick();
        /* block until SDIO IP is ready or a timeout occur */
        while(osKernelSysTick() - timer <SD_TIMEOUT)
#else

This is working for me, without this I can't use SD library with D-Cache enabled.

28 REPLIES 28
Piranha
Chief II

I don't know the details of SD related code, but just in case - here is a clean instruction of how D-cache should be managed:

https://community.st.com/s/question/0D53W00000oXSzySAG/different-cache-behavior-between-stm32h7-and-stm32f7

Problem is likely deeper.

The ByAddr functions expect 32-byte alignment of the buffer. Outside of that collateral damage will occur. DMA also needs 32-bit alignment.​

Tips, Buy me a coffee, or three.. PayPal Venmo
Up vote any posts that you find helpful, it shows what's working..
mantisrobot
Associate II

The buffer passed to SD_Read() *buff is indeed 32byte aligned using the following:

/* requested size of the file I/O buffer, in bytes */
#define BUFFER_SIZE (1024*32)
 
/* array length is BUFFER_SIZE rounded up to the next multiple of 32 bytes;
   ALIGN_32BYTES presumably places the array on a 32-byte boundary -- confirm in the HAL headers */
ALIGN_32BYTES(uint8_t dmaBuffer[(BUFFER_SIZE+31U)&~(uint32_t)0x1F]);

But, without that additional SCB_CleanDCache_by_Addr() call prior to BSP_SD_ReadBlocks_DMA() I could not get the code to work. I did try the other option with ENABLE_SCRATCH_BUFFER. In this mode a scratch buffer is used presumably so that the calling buffer does not need to be 32byte aligned, but this did not work either.

These are the two options described in the sd_diskio.c file:

/*
 * when using cacheable memory region, it may be needed to maintain the cache
 * validity. Enable the define below to activate a cache maintenance at each
 * read and write operation.
 * Notice: This is applicable only for cortex M7 based platform.
 */
/* USER CODE BEGIN enableSDDmaCacheMaintenance */
#define ENABLE_SD_DMA_CACHE_MAINTENANCE  1
/* USER CODE END enableSDDmaCacheMaintenance */
 
/*
* Some DMA requires 4-Byte aligned address buffer to correctly read/write data,
* in FatFs some accesses aren't thus we need a 4-byte aligned scratch buffer to correctly
* transfer data
*/
/* USER CODE BEGIN enableScratchBuffer */
//#define ENABLE_SCRATCH_BUFFER
/* USER CODE END enableScratchBuffer */

So here is my current solution, which avoids adding code to sd_diskio.c (that code is in a non-user-editable area and gets deleted when the project is regenerated from the .ioc file): reimplement the weak BSP_SD_ReadBlocks_DMA() function as follows, and make sure the DMA buffer passed to any f_read / f_mount etc. call is 32-byte aligned!

// My DMA buffer used for SD file IO
// BUFFER_SIZE is the requested size in bytes; the array length rounds it up
// to the next multiple of 32 bytes.
#define BUFFER_SIZE (1024*32)
// ALIGN_32BYTES presumably places the array on a 32-byte boundary -- confirm in the HAL headers
ALIGN_32BYTES(uint8_t dmaBuffer[(BUFFER_SIZE+31U)&~(uint32_t)0x1F]);
 
 
 
// my re-implemented BSP_SD_ReadBlocks_DMA() function
/**
  * @brief  Reads block(s) from a specified address in an SD card, in DMA mode.
  *         The D-Cache lines covering the destination buffer are cleaned
  *         before the DMA transfer is started.
  * @param  pData: Pointer to the buffer that will receive the data read from the card
  * @param  ReadAddr: Address from where data is to be read
  * @param  NumOfBlocks: Number of SD blocks to read
  * @retval SD status (MSD_OK if the DMA read was started, MSD_ERROR otherwise)
  */
uint8_t BSP_SD_ReadBlocks_DMA(uint32_t *pData, uint32_t ReadAddr, uint32_t NumOfBlocks)
{
  uint8_t sd_state = MSD_OK;
  /*
   * Cache maintenance by address works on whole 32-byte lines: start from the
   * line containing pData and extend the length by the leading offset so the
   * last line of the buffer is covered even when pData is not 32-byte aligned
   * (same adjustment as the alignedAddr computation in sd_diskio.c).
   */
  uintptr_t bufAddr = (uintptr_t)pData;
  uintptr_t alignedAddr = bufAddr & ~(uintptr_t)0x1F;

  // **** ADDED ****
  // Clean whole aligned buffer from data cache
  SCB_CleanDCache_by_Addr((uint32_t*)alignedAddr, NumOfBlocks*BLOCKSIZE + (uint32_t)(bufAddr - alignedAddr));
  // **** ADDED ****

  /* Read block(s) in DMA transfer mode */
  if (HAL_SD_ReadBlocks_DMA(&hsd1, (uint8_t *)pData, ReadAddr, NumOfBlocks) != HAL_OK)
  {
    sd_state = MSD_ERROR;
  }

  return sd_state;
}

SStor
Senior

I've pointed out the SD_read problem in thread:

[BUG] SD_read in sd_diskio.c fails with DMA in cached memory areas (st.com)

This occurs especially with odd file pointers (append mode) and unaligned file buffers allocated on the cached heap.

There are also other SD card problems, e.g. no card detection on initialization sometimes.

It hasn't been solved by ST until now.

Simplest and safest way is using an intermediate buffer for one sector (scratch buffer) and place it in DTCM RAM. So you don't need to take care the whole cache maintenance stuff.

The same applies for all other peripheral DMA buffers on M7 core (ADC, UART, SPI, etc): Simply move these buffers to DTCM and use it like on M4 core with no cache.

Good to see nothing has been done about it 🙂

So yeah, I was getting issues with DMA buffers on all my hardware, but I resolved them using the cache clean method. Do you have an example of how to move buffers to DTCM RAM? I'm fairly new to the STM32 processor.

You have to define the different SRAM memory regions in the linker file to use it specifically. Otherwise the whole SRAM is used in one block.

The linker definition depends on your IDE and controller type and has to be adapted to the configuration used.

I use IAR IDE and STM32F777. There are 512kB SRAM and the first 128kB are DTCM. Attached the corresponding linker file for this configuration as example.

In IAR the new defined DTCMRAM memory region can be used then for DMA buffer variables in compiler:

__ALIGN_BEGIN static uint8_t scratch[BLOCKSIZE] @ "DTCMRAM" __ALIGN_END;

It can help to find some hidden cache problems if you disable D cache with SCB_DisableDCache() at initialization first and look if the application behaviour changes. Trouble with cache is responsible for many problems with M7 cores.

Thanks for the tips/help.

Yes the DCache has caused me many head aches! I do turn it off when I have issues, its off right now!

I've spent the entire day trying to debug my SD problems but I haven't got anywhere, other than I can see what is happening. If my write buffer to f_write() call is larger than a sector (512 bytes) then I get corrupt data written to the SD card on each 512 byte boundary. see below:

This is a test string it should repeat 0000
 This is a test string it should repeat 0001
 This is a test string it should repeat 0002
 This is a test string it should repeat 0003
 This is a test stritring it should repeat 0004
 This is a test string it should repeat 0005
 This is a test string it should repeat 0006
 This is a test string it should repeat 0007
 This is a test string it should repeat 0008
 This is a test string it should repeat 0009
 This is a test string it should repeat 0010

From spending hours tracing through the code I think this is due to the DMA needing 32byte aligned data. So within sd_diskio.c file I have enabled the ENABLE_SCRATCH_BUFFER def.

First of all, there is a bug in the SD_write() function when ENABLE_SCRATCH_BUFFER is enabled, which makes me wonder if it was ever tested! The code is supposed to check the buffer alignment (the test is `buff & 0x3`, i.e. 4-byte alignment), and if the buffer is not aligned it should run the scratch-buffer memcpy code, as in SD_read(). However, the braces were in the wrong place, so the code simply skips to the end of the function if the buffer is not aligned! Here is my code with the corrected brace position; marked with ///////////

/**
  * @brief  Writes Sector(s)
  *         Aligned buffers are written directly by DMA; with
  *         ENABLE_SCRATCH_BUFFER, unaligned buffers are copied block by block
  *         into the scratch buffer and written from there.
  * @param  lun : not used
  * @param  *buff: Data to be written
  * @param  sector: Sector address (LBA)
  * @param  count: Number of sectors to write
  * @retval DRESULT: RES_OK once every block was transferred and the card
  *         reported ready again, RES_ERROR otherwise
  */
DRESULT SD_write(BYTE lun, const BYTE *buff, DWORD sector, UINT count)
{
  DRESULT res = RES_ERROR;
  uint32_t timer;

#if (osCMSIS < 0x20000U)
  osEvent event;
#else
  uint16_t event;
  osStatus_t status;
#endif

#if defined(ENABLE_SCRATCH_BUFFER)
  int32_t ret;
#endif

  /*
  * ensure the SDCard is ready for a new operation
  */

  if (SD_CheckStatusWithTimeout(SD_TIMEOUT) < 0)
  {
    return res;
  }

#if defined(ENABLE_SCRATCH_BUFFER)
  /* only a 4-byte aligned buffer is handed to the DMA directly */
  if (!((uint32_t)buff & 0x3))
  {
#endif
#if (ENABLE_SD_DMA_CACHE_MAINTENANCE == 1)
  uint32_t alignedAddr;
  /*
    the SCB_CleanDCache_by_Addr() requires a 32-Byte aligned address
    adjust the address and the D-Cache size to clean accordingly.
  */
  alignedAddr = (uint32_t)buff & ~0x1F;
  SCB_CleanDCache_by_Addr((uint32_t*)alignedAddr, count*BLOCKSIZE + ((uint32_t)buff - alignedAddr));
#endif

  if(BSP_SD_WriteBlocks_DMA((uint32_t*)buff,
                           (uint32_t) (sector),
                           count) == MSD_OK)
  {
#if (osCMSIS < 0x20000U)
    /* Get the message from the queue */
    event = osMessageGet(SDQueueID, SD_TIMEOUT);

    if (event.status == osEventMessage)
    {
      if (event.value.v == WRITE_CPLT_MSG)
      {
#else
    status = osMessageQueueGet(SDQueueID, (void *)&event, NULL, SD_TIMEOUT);
    if ((status == osOK) && (event == WRITE_CPLT_MSG))
    {
#endif
#if (osCMSIS < 0x20000U)
        timer = osKernelSysTick();
        /* block until SDIO IP is ready or a timeout occur */
        while(osKernelSysTick() - timer  < SD_TIMEOUT)
#else
        timer = osKernelGetTickCount();
        /* block until SDIO IP is ready or a timeout occur */
        while(osKernelGetTickCount() - timer  < SD_TIMEOUT)
#endif
        {
          if (BSP_SD_GetCardState() == SD_TRANSFER_OK)
          {
            res = RES_OK;
            break;
          }
        }
#if (osCMSIS < 0x20000U)
      }
    }
#else
    }
#endif
  }
#if defined(ENABLE_SCRATCH_BUFFER)
  /////////// END BRACE HERE /////////
  }
  /////////// END BRACE HERE /////////
  else {
    /* Slow path, copy each sector into the scratch buffer and write it from there */
    int i;
#if (ENABLE_SD_DMA_CACHE_MAINTENANCE == 1)
    /* start of the 32-byte cache line that contains the scratch buffer */
    uint32_t scratchAligned = (uint32_t)scratch & ~0x1F;
#endif

      for (i = 0; i < count; i++)
      {
        memcpy((void *)scratch, buff, BLOCKSIZE);
        buff += BLOCKSIZE;

#if (ENABLE_SD_DMA_CACHE_MAINTENANCE == 1)
        /*
         * The CPU has just written the scratch buffer and the DMA reads it
         * from memory, so the cached data has to be cleaned (written back)
         * before every transfer; an invalidate would not push it to memory.
         */
        SCB_CleanDCache_by_Addr((uint32_t*)scratchAligned, BLOCKSIZE + ((uint32_t)scratch - scratchAligned));
#endif

        ret = BSP_SD_WriteBlocks_DMA((uint32_t*)scratch, (uint32_t)sector++, 1);
        if (ret == MSD_OK )
        {
          /*
           * The DMA has only been started: stay in error until the write
           * completion message arrived AND the card reported ready, so a
           * missing or unexpected message is never treated as success.
           */
          ret = MSD_ERROR;
#if (osCMSIS < 0x20000U)
          /* wait for the write-complete message from the queue or a timeout */
          event = osMessageGet(SDQueueID, SD_TIMEOUT);

          if (event.status == osEventMessage)
          {
            if (event.value.v == WRITE_CPLT_MSG)
            {
              timer = osKernelSysTick();
              /* block until SDIO IP is ready or a timeout occur */
              while(osKernelSysTick() - timer <SD_TIMEOUT)
#else
          /* wait for the write-complete message from the queue or a timeout */
          status = osMessageQueueGet(SDQueueID, (void *)&event, NULL, SD_TIMEOUT);
          if ((status == osOK) && (event == WRITE_CPLT_MSG))
          {
              timer = osKernelGetTickCount();
              /* block until SDIO IP is ready or a timeout occur */
              while(osKernelGetTickCount() - timer < SD_TIMEOUT)
#endif
              {
                ret = BSP_SD_GetCardState();

                if (ret == MSD_OK)
                {
                  break;
                }
              }
#if (osCMSIS < 0x20000U)
            }
          }
#else
          }
#endif
          /* no completion message, or card still busy after SD_TIMEOUT: give up */
          if (ret != MSD_OK)
          {
            break;
          }
        }
        else
        {
          break;
        }
      }

      /* success only if the loop ran to completion and the last block was confirmed */
      if ((i == count) && (ret == MSD_OK ))
      {
        res = RES_OK;
      }
    }
/////////// NO BRACE HERE /////////////
//  }
/////////// NO BRACE HERE /////////////
#endif

  return res;
}
 #endif /* _USE_WRITE == 1 */

Now, with the code fixed as above, I can get the SD writes to work without corruption; however, it seems that it's much slower, and it breaks my system because the file is left in an open state before the next call to f_write(). (I have a double-buffer system with two 32K buffers, filled from one thread while the file write runs in another thread.)

correct data with SCRATCH buffer

This is a test string it should repeat 0000
 This is a test string it should repeat 0001
 This is a test string it should repeat 0002
 This is a test string it should repeat 0003
 This is a test string it should repeat 0004
 This is a test string it should repeat 0005
 This is a test string it should repeat 0006
 This is a test string it should repeat 0007
 This is a test string it should repeat 0008
 This is a test string it should repeat 0009
 This is a test string it should repeat 0010

A bit more debugging: when in SCRATCH mode, the error I'm getting is here:

Thread #1 [main] 1 [core: 0] (Suspended : Breakpoint)	
	HAL_SD_WriteBlocks_DMA() at stm32f7xx_hal_sd.c:1,333 0x8013494	
	BSP_SD_WriteBlocks_DMA() at file.c:554 0x80070e0	
	SD_write() at sd_diskio.c:528 0x8019e2c	
	disk_write() at diskio.c:104 0x801b938	
	f_write() at ff.c:3,677 0x801dd44	
 

within the call to errorstate = SDMMC_CmdWriteSingleBlock(hsd->Instance, add);

returns the error: SDMMC_ERROR_CMD_RSP_TIMEOUT

Yes there were some things wrong in CubeMX generated sd_diskio.c.

I've replaced the complete module by my own sd_diskio_new.c (see attached).

Original module code has to be deactivated in user section of sd_diskio.c:

/* USER CODE BEGIN firstSection */
/* can be used to modify / undefine following code or add new definitions */
#define SD_DISKIO_NEW
#if !defined(SD_DISKIO_NEW)
/* USER CODE END firstSection*/
 
:
 
/* USER CODE BEGIN lastSection */
/* can be used to modify / undefine previous code or add new code */
#endif /* SD_DISKIO_NEW */
/* USER CODE END lastSection */

It is not perfect but it works for me for 2 years now.

Other problem with timeout error seems to be another issue (hardware)...