STM32H7A3 simultaneous SPI buses have delayed Transmit/Receive communication

MBles.1 · ‎2026-01-29

I have an STM32H7A3ZIT running at 280MHz with 4 SPI buses attempting to “simultaneously” communicate with external ADCs. All 4 SPI are set up as full-duplex masters, 16-bit data size, 8.75 MHz clock speed, and software chip select.

I’m using the HAL function HAL_SPI_TransmitReceive_IT to sequentially initiate the data transfer for each SPI bus. I expected each transfer to start quickly one after another, but instead I’m finding large time gaps. The TransmitReceive_IT composite image below shows the chip select line (yellow) and the SPI clock signals (cyan) for each sequential SPI bus. You can see the first 2 buses start clocking quickly, but then there is a time gap before the third and fourth buses start clocking. The code functionally works as the received data is correct, but the end of the transfer for the fourth SPI bus is too close to the next CS transaction, not leaving a lot of time for data processing.

TransmitReceive_IT

Oddly enough, if I change the function to just receiving (HAL_SPI_Receive_IT), then all 4 transactions proceed quickly as I would expect. See the following Receive_IT composite image demonstrating what I would like.

Receive_IT

Here's the code I'm using to start the SPI transmission.

/* Number of 16-bit words exchanged per SPI transaction; passed as the Size
 * argument of HAL_SPI_TransmitReceive_IT and used as the row length of both
 * buffers below. */
#define ADCtxSize16	20
/* Number of ADC rows; each row is served by its own SPI bus. */
#define numRows		4

/* SPI handle for each ADC row, indexed by row number. The array length comes
 * from the initializer, so it must be kept at numRows entries by hand. */
SPI_HandleTypeDef* hSPIrow[] = {&hspi2, &hspi4, &hspi1, &hspi5}; // ADC SPI handle array by rows for easier access

uint16_t bufADCtxData[numRows][ADCtxSize16] = {0};		// SPI transmit data, one row of words per SPI bus; per the author the contents do not change
uint16_t bufADCrx[numRows][ADCtxSize16] = {0};			// SPI receive buffer, one row of words for each row of ADCs

/*
 * Start one interrupt-based full-duplex SPI transfer per ADC row, in row order.
 * The green LED pin is set before the first start call and cleared after the
 * last one. The HAL status returned by each start call is not checked.
 */
void Read_ADC_Data(void)
{
	int row = 0;

	HAL_GPIO_WritePin(LED_GREEN_GPIO_Port, LED_GREEN_Pin, GPIO_PIN_SET);

	while (row < numRows)
	{
		/* The HAL API takes byte pointers even for 16-bit frames */
		uint8_t *txWords = (uint8_t*)(bufADCtxData[row]);
		uint8_t *rxWords = (uint8_t*)(bufADCrx[row]);

		HAL_SPI_TransmitReceive_IT(hSPIrow[row], txWords, rxWords, ADCtxSize16);
		++row;
	}

	HAL_GPIO_WritePin(LED_GREEN_GPIO_Port, LED_GREEN_Pin, GPIO_PIN_RESET);
}

Here is the HAL_SPI_TransmitReceive_IT code. I did find that if I comment out the last bit of code in the function (SET_BIT(hspi->Instance->CR1, SPI_CR1_CSTART);), then the function returns very quickly. To me, this indicates that the HAL overhead of setting up the SPI peripheral with the rest of the function is not much; the delay occurs once the SPI control register 1 master transfer start bit is set. This I do not understand.

/**
  * @brief  Transmit and Receive an amount of data in non-blocking mode with Interrupt.
  *         Records the transfer in the handle, selects the Tx/Rx ISR pair for the
  *         configured data size, pre-fills the transmit register while the TXP
  *         flag is set, enables the SPI interrupts and, in master mode, starts
  *         the transfer by setting CSTART.
  * @param  hspi   : pointer to a SPI_HandleTypeDef structure that contains
  *                  the configuration information for SPI module.
  * @param  pTxData: pointer to transmission data buffer
  * @param  pRxData: pointer to reception data buffer
  * @param  Size   : amount of data to be sent and received, counted in data
  *                  frames (the Tx pointer advances by 1, 2 or 4 bytes per
  *                  frame depending on hspi->Init.DataSize)
  * @retval HAL status: HAL_BUSY if the handle state is not HAL_SPI_STATE_READY,
  *         HAL_ERROR if a buffer pointer is NULL or Size is 0, HAL_OK once the
  *         transfer has been set up. (__HAL_LOCK is a macro defined elsewhere;
  *         it may add its own early return - confirm.)
  */
HAL_StatusTypeDef HAL_SPI_TransmitReceive_IT(SPI_HandleTypeDef *hspi, const uint8_t *pTxData, uint8_t *pRxData,
                                             uint16_t Size)
{
  /* Local copy of the remaining Tx count, used in the FIFO fill loop condition */
  uint32_t tmp_TxXferCount;
#if defined (__GNUC__)
  /* Named 16-bit view of TXDR, used only by the GCC path of the 16-bit fill branch */
  __IO uint16_t *ptxdr_16bits = (__IO uint16_t *)(&(hspi->Instance->TXDR));
#endif /* __GNUC__ */

  /* Check Direction parameter */
  assert_param(IS_SPI_DIRECTION_2LINES(hspi->Init.Direction));

  /* Refuse to start while a previous transfer still owns the handle */
  if (hspi->State != HAL_SPI_STATE_READY)
  {
    return HAL_BUSY;
  }

  if ((pTxData == NULL) || (pRxData == NULL) || (Size == 0UL))
  {
    return HAL_ERROR;
  }

  /* Lock the process */
  __HAL_LOCK(hspi);

  /* Set the transaction information */
  hspi->State       = HAL_SPI_STATE_BUSY_TX_RX;
  hspi->ErrorCode   = HAL_SPI_ERROR_NONE;
  hspi->pTxBuffPtr  = (const uint8_t *)pTxData;
  hspi->TxXferSize  = Size;
  hspi->TxXferCount = Size;
  hspi->pRxBuffPtr  = (uint8_t *)pRxData;
  hspi->RxXferSize  = Size;
  hspi->RxXferCount = Size;
  tmp_TxXferCount   = hspi->TxXferCount;

#if defined(USE_SPI_RELOAD_TRANSFER)
  /* NOTE(review): NULL is assigned to the two *XferSize fields below; these are
     presumably integer counts, in which case 0UL would be the correct value -
     confirm against the Reload structure declaration. */
  hspi->Reload.Requested   = 0UL;
  hspi->Reload.pRxBuffPtr  = NULL;
  hspi->Reload.RxXferSize  = NULL;
  hspi->Reload.pTxBuffPtr  = NULL;
  hspi->Reload.TxXferSize  = NULL;
#endif /* USE_SPI_RELOAD_TRANSFER */

  /* Set the function for IT treatment: pick the ISR pair matching the frame width */
  if (hspi->Init.DataSize > SPI_DATASIZE_16BIT)
  {
    hspi->TxISR     = SPI_TxISR_32BIT;
    hspi->RxISR     = SPI_RxISR_32BIT;
  }
  else if (hspi->Init.DataSize > SPI_DATASIZE_8BIT)
  {
    hspi->RxISR     = SPI_RxISR_16BIT;
    hspi->TxISR     = SPI_TxISR_16BIT;
  }
  else
  {
    hspi->RxISR     = SPI_RxISR_8BIT;
    hspi->TxISR     = SPI_TxISR_8BIT;
  }

  /* Set Full-Duplex mode */
  SPI_2LINES(hspi);

  /* Set the number of data at current transfer */
  MODIFY_REG(hspi->Instance->CR2, SPI_CR2_TSIZE, Size);

  /* Enable SPI peripheral */
  __HAL_SPI_ENABLE(hspi);

  /* Fill in the TxFIFO: keep writing frames while the TXP flag is set and frames
     remain. Frames not written here are presumably supplied later by the
     selected TxISR - the IRQ handler is not part of this function. */
  while ((__HAL_SPI_GET_FLAG(hspi, SPI_FLAG_TXP)) && (tmp_TxXferCount != 0UL))
  {
    /* Transmit data in 32 Bit mode */
    if (hspi->Init.DataSize > SPI_DATASIZE_16BIT)
    {
      *((__IO uint32_t *)&hspi->Instance->TXDR) = *((const uint32_t *)hspi->pTxBuffPtr);
      hspi->pTxBuffPtr += sizeof(uint32_t);
      hspi->TxXferCount--;
      tmp_TxXferCount = hspi->TxXferCount;
    }
    /* Transmit data in 16 Bit mode */
    else if (hspi->Init.DataSize > SPI_DATASIZE_8BIT)
    {
#if defined (__GNUC__)
      *ptxdr_16bits = *((const uint16_t *)hspi->pTxBuffPtr);
#else
      *((__IO uint16_t *)&hspi->Instance->TXDR) = *((const uint16_t *)hspi->pTxBuffPtr);
#endif /* __GNUC__ */
      hspi->pTxBuffPtr += sizeof(uint16_t);
      hspi->TxXferCount--;
      tmp_TxXferCount = hspi->TxXferCount;
    }
    /* Transmit data in 8 Bit mode */
    else
    {
      *((__IO uint8_t *)&hspi->Instance->TXDR) = *((const uint8_t *)hspi->pTxBuffPtr);
      hspi->pTxBuffPtr += sizeof(uint8_t);
      hspi->TxXferCount--;
      tmp_TxXferCount = hspi->TxXferCount;
    }
  }

  /* Unlock the process */
  __HAL_UNLOCK(hspi);

  /* Enable EOT, DXP, UDR, OVR, FRE, MODF and TSERF interrupts.
     This happens before the transfer is started below. */
  __HAL_SPI_ENABLE_IT(hspi, (SPI_IT_EOT | SPI_IT_DXP | SPI_IT_UDR | SPI_IT_OVR |
                             SPI_IT_FRE | SPI_IT_MODF | SPI_IT_TSERF));

  if (hspi->Init.Mode == SPI_MODE_MASTER)
  {
    /* Start Master transfer */
    SET_BIT(hspi->Instance->CR1, SPI_CR1_CSTART);
  }

  return HAL_OK;
}

Lastly, the SPI_Init code.

/* SPI1 configuration fields as posted. The enclosing init function and the
   call that applies these settings are not shown in this excerpt. */
hspi1.Instance = SPI1;
hspi1.Init.Mode = SPI_MODE_MASTER;
hspi1.Init.Direction = SPI_DIRECTION_2LINES;
/* 16-bit frames: selects the 16-bit Tx/Rx ISR pair in HAL_SPI_TransmitReceive_IT */
hspi1.Init.DataSize = SPI_DATASIZE_16BIT;
hspi1.Init.CLKPolarity = SPI_POLARITY_LOW;
hspi1.Init.CLKPhase = SPI_PHASE_1EDGE;
/* Software chip select; the post drives CS from a timer PWM output instead */
hspi1.Init.NSS = SPI_NSS_SOFT;
/* The post states an 8.75 MHz SPI clock; with a /32 prescaler that would imply
   a 280 MHz SPI kernel clock - TODO confirm against the clock configuration */
hspi1.Init.BaudRatePrescaler = SPI_BAUDRATEPRESCALER_32;
hspi1.Init.FirstBit = SPI_FIRSTBIT_MSB;
hspi1.Init.TIMode = SPI_TIMODE_DISABLE;
hspi1.Init.CRCCalculation = SPI_CRCCALCULATION_DISABLE;
hspi1.Init.CRCPolynomial = 0x0;
/* NOTE(review): NSS pulse mode is enabled although NSS is software-managed
   above; presumably it has no effect in that case - confirm */
hspi1.Init.NSSPMode = SPI_NSS_PULSE_ENABLE;
hspi1.Init.NSSPolarity = SPI_NSS_POLARITY_LOW;
/* NOTE(review): presumably a FIFO event per single data frame, i.e. one
   interrupt per 16-bit word in interrupt mode - confirm in the reference manual */
hspi1.Init.FifoThreshold = SPI_FIFO_THRESHOLD_01DATA;
hspi1.Init.TxCRCInitializationPattern = SPI_CRC_INITIALIZATION_ALL_ZERO_PATTERN;
hspi1.Init.RxCRCInitializationPattern = SPI_CRC_INITIALIZATION_ALL_ZERO_PATTERN;
hspi1.Init.MasterSSIdleness = SPI_MASTER_SS_IDLENESS_00CYCLE;
hspi1.Init.MasterInterDataIdleness = SPI_MASTER_INTERDATA_IDLENESS_00CYCLE;
hspi1.Init.MasterReceiverAutoSusp = SPI_MASTER_RX_AUTOSUSP_DISABLE;
hspi1.Init.MasterKeepIOState = SPI_MASTER_KEEP_IO_STATE_DISABLE;
hspi1.Init.IOSwap = SPI_IO_SWAP_DISABLE;

Thank you for any help you can provide! I'm at a loss as to why adding transmission to the SPI transaction would cause the delays over just receiving.

I'm open to alternative methods as well; I just thought this way would be rather straightforward. The chip select signal (yellow) is actually a timer outputting PWM. At the chip select rising edge, all ADCs sample their input so the data is synchronized. I use the timer's HAL_TIM_PWM_PulseFinishedCallback interrupt to know when the chip select went low so that I can start the SPI transactions.

TDK · ‎2026-01-29

There's nothing wrong with the signals shown as far as the SPI protocol goes. You can have gaps. If you don't want gaps, consider using DMA to send or using more optimized code. Compiling in Release configuration will be faster.

The issue is that interrupts take time to run and HAL_SPI_Receive_IT calls an interrupt for each received word.

Only receiving requires less CPU time than sending and receiving.

You can have the STM32H7 handle the CS line as well to avoid having to do it with a TIM.

Commenting out SPI_CR1_CSTART will prevent the transfer from starting. Not sure I understand the point.

If you feel a post has answered your question, please click "Accept as Solution".

Chris21 · ‎2026-01-29

Perhaps the delay is due to having to service all those interrupts from the 1st 2 SPI transfers. There should be less code executed in the handlers for the Receive only case.

(Sorry, I see now that TDK already said the same thing).

MBles.1 · ‎2026-01-29

@TDK wrote:
There's nothing wrong with the signals shown as far as the SPI protocol goes. You can have gaps. If you don't want gaps, consider using DMA to send or using more optimized code. Compiling in Release configuration will be faster.
The issue is that interrupts take time to run and HAL_SPI_Receive_IT calls an interrupt for each received word.

Only receiving requires less CPU time than sending and receiving.

Are you saying all the receive interrupts are causing it to slow down? If so, why is receiving alone running great while transmitting and receiving is bottlenecking?

@TDK wrote:
You can have the STM32H7 handle the CS line as well to avoid having to do it with a TIM.

I have to run it with a timer for fixed-interval sampling. It also has to do with the ADC sampling time.

@TDK wrote:
Commenting out SPI_CR1_CSTART will prevent the transfer from starting. Not sure I understand the point.

The point was to test the HAL efficiency. It's commonly accepted that HAL code is bloated, so by running everything except for the very last line (SPI_CR1_CSTART), I could see that the HAL function runs quickly. It's starting/running the transmission that seems to be the problem.

MBles.1 · ‎2026-01-29

@Chris21 wrote:
Perhaps the delay is due to having to service all those interrupts from the 1st 2 SPI transfers. There should be less code executed in the handlers for the Receive only case.
(Sorry, I see now that TDK already said the same thing).

As far as I can tell, adding transmission only adds the TxISR to the SPI handle. In the end it either calls the HAL_SPI_TxRxCpltCallback or HAL_SPI_RxCpltCallback.

Chris21 · ‎2026-01-29

/**
  * @brief  Receive one 16-bit data item in interrupt mode.
  *         Copies a halfword from RXDR to the current Rx buffer position,
  *         advances the buffer pointer by two bytes and decrements the
  *         remaining Rx count. The RXP interrupt is disabled once the count
  *         reaches zero.
  * @param  hspi: pointer to the SPI handle of the transfer in progress
  * @retval None
  */
static void SPI_RxISR_16BIT(SPI_HandleTypeDef *hspi)
{
  uint16_t *dest = (uint16_t *)hspi->pRxBuffPtr;
  /* Access RXDR through a named halfword pointer on every compiler */
  __IO uint16_t *rxdr = (__IO uint16_t *)(&(hspi->Instance->RXDR));

  /* Receive data in 16 Bit mode */
  *dest = *rxdr;

  hspi->pRxBuffPtr += sizeof(uint16_t);
  hspi->RxXferCount--;

  /* More data expected: leave the RXP interrupt enabled */
  if (hspi->RxXferCount != 0UL)
  {
    return;
  }

  /* Disable RXP interrupts, no more data expected */
  __HAL_SPI_DISABLE_IT(hspi, SPI_IT_RXP);
}

/**
  * @brief  Transmit one 16-bit data item in interrupt mode.
  *         Writes the halfword at the current Tx buffer position to TXDR,
  *         advances the buffer pointer by two bytes and decrements the
  *         remaining Tx count. The TXP interrupt is disabled once the count
  *         reaches zero.
  * @param  hspi: pointer to the SPI handle of the transfer in progress
  * @retval None
  */
static void SPI_TxISR_16BIT(SPI_HandleTypeDef *hspi)
{
  const uint16_t *src = (const uint16_t *)hspi->pTxBuffPtr;
  /* Access TXDR through a named halfword pointer on every compiler */
  __IO uint16_t *txdr = (__IO uint16_t *)(&(hspi->Instance->TXDR));

  /* Transmit data in 16 Bit mode */
  *txdr = *src;

  hspi->pTxBuffPtr += sizeof(uint16_t);
  hspi->TxXferCount--;

  /* More data to send: leave the TXP interrupt enabled */
  if (hspi->TxXferCount != 0UL)
  {
    return;
  }

  /* Disable TXP interrupts, no more data expected */
  __HAL_SPI_DISABLE_IT(hspi, SPI_IT_TXP);
}

If you call both ISR functions it's about double the code...