cancel
Showing results for 
Search instead for 
Did you mean: 

Why does my Tx-only software 3 MBaud UART sometimes send strange characters?

arnold_w
Senior II

I am working with the STM32F769 microcontroller, running the FreeRTOS operating system. The project uses the following clock configuration (216 MHz SystemCoreClock):

/** System Clock Configuration
*/
void SystemClock_Config(void)
{
 
  RCC_OscInitTypeDef RCC_OscInitStruct = {0};
  RCC_ClkInitTypeDef RCC_ClkInitStruct = {0};
  RCC_PeriphCLKInitTypeDef PeriphClkInitStruct = {0};
 
    /**Configure the main internal regulator output voltage
    */
  __HAL_RCC_PWR_CLK_ENABLE();
 
  __HAL_PWR_VOLTAGESCALING_CONFIG(PWR_REGULATOR_VOLTAGE_SCALE3);
 
    /**Initializes the CPU, AHB and APB busses clocks
    */
  RCC_OscInitStruct.OscillatorType = RCC_OSCILLATORTYPE_LSI|RCC_OSCILLATORTYPE_HSE;
  RCC_OscInitStruct.HSEState = RCC_HSE_ON;
  RCC_OscInitStruct.LSIState = RCC_LSI_ON;
  RCC_OscInitStruct.PLL.PLLState = RCC_PLL_ON;
  RCC_OscInitStruct.PLL.PLLSource = RCC_PLLSOURCE_HSE;
  RCC_OscInitStruct.PLL.PLLM = 25;
  RCC_OscInitStruct.PLL.PLLN = 432;
  RCC_OscInitStruct.PLL.PLLQ = 9;
  RCC_OscInitStruct.PLL.PLLP = RCC_PLLP_DIV2;
 
  RCC_OscInitStruct.PLL.PLLR = 2;  /* Even when DSI is disabled PLLR with 2 <= PLLR <= 7 according to RM0410 p.163 */
  if (HAL_RCC_OscConfig(&RCC_OscInitStruct) != HAL_OK)
  {
    Error_Handler();
  }
  /** Activate the Over-Drive mode
  */
  if (HAL_PWREx_EnableOverDrive() != HAL_OK)
  {
    Error_Handler();
  }
 
 
    /**Initializes the CPU, AHB and APB busses clocks
    */
  RCC_ClkInitStruct.ClockType = RCC_CLOCKTYPE_HCLK|RCC_CLOCKTYPE_SYSCLK
                              |RCC_CLOCKTYPE_PCLK1|RCC_CLOCKTYPE_PCLK2;
  RCC_ClkInitStruct.SYSCLKSource = RCC_SYSCLKSOURCE_PLLCLK;
  RCC_ClkInitStruct.AHBCLKDivider = RCC_SYSCLK_DIV1;
  RCC_ClkInitStruct.APB1CLKDivider = RCC_HCLK_DIV4;
  RCC_ClkInitStruct.APB2CLKDivider = RCC_HCLK_DIV2;
 
  if (HAL_RCC_ClockConfig(&RCC_ClkInitStruct, FLASH_LATENCY_7) != HAL_OK)
  {
    Error_Handler();
  }

I have written a software transmit-only UART running at 3 MBaud on any arbitrary GPIO pin. For the most part it works great, but every once in a while, especially in interrupts, it starts "writing in Chinese" and strange characters appear in my terminal window. Does anybody know what's wrong?

/* Level the Tx pin is driven to while the line is idle (also used as the stop-bit level). */
#define IDLE_STATE                      1

/*
 * Saves the current PRIMASK value in a new local variable `old_primask` and
 * then disables interrupts. Because it declares a variable, it can be used
 * only once per scope and must be paired with
 * ENABLE_ALL_INTS_IF_THEY_WERE_ENABLED() in that same scope.
 */
#define DISABLE_ALL_INTS_IF_NECESSARY()             uint32_t old_primask = __get_PRIMASK();  \
                                                    __disable_irq()

/*
 * Re-enables interrupts only if they were enabled (PRIMASK == 0) when
 * DISABLE_ALL_INTS_IF_NECESSARY() ran, so nested critical sections and calls
 * from contexts that already masked interrupts are left untouched.
 * Wrapped in do { } while (0) so the macro behaves as a single statement,
 * e.g. as the body of an unbraced if/else.
 */
#define ENABLE_ALL_INTS_IF_THEY_WERE_ENABLED()      do {                                  \
                                                        if (!old_primask)                 \
                                                        {                                 \
                                                            __enable_irq();               \
                                                        }                                 \
                                                    } while (0)
/* GPIO port and pin used as the software-UART Tx line; both are set by LL_UART_SW_TxOnly_enable(). */
static GPIO_TypeDef* TxPort_;
static uint16_t TxPin_;
 
/**
 * Selects the GPIO pin used as the software-UART Tx line and configures it
 * as a push-pull output resting at the idle level.
 *
 * @param TxPort  GPIO port of the Tx pin (its peripheral clock is assumed to
 *                be enabled already -- TODO confirm at the call site).
 * @param TxPin   Pin mask of the Tx pin within that port.
 */
void LL_UART_SW_TxOnly_enable(GPIO_TypeDef* TxPort, uint16_t TxPin) {
    TxPort_ = TxPort;
    TxPin_ = TxPin;

    /* Latch the idle level in the output data register BEFORE the pin is
     * switched to output mode. Otherwise the pin would briefly drive whatever
     * level the register held before (low after reset), which a receiver can
     * interpret as the start bit of a garbage character. */
    SET_PIN(TxPort_, TxPin_, IDLE_STATE);

    GPIO_InitTypeDef initStruct = {TxPin_, GPIO_MODE_OUTPUT_PP, GPIO_NOPULL, GPIO_SPEED_LOW, 0};
    HAL_GPIO_Init(TxPort_, &initStruct);
}
 
/**
 * Releases the software-UART Tx pin by reconfiguring it as an analog pin
 * without pull resistors. The pin and port remembered by
 * LL_UART_SW_TxOnly_enable() are used.
 */
void LL_UART_SW_TxOnly_disable(void) {
    GPIO_InitTypeDef analogConfig = {
        TxPin_,            /* pin mask        */
        GPIO_MODE_ANALOG,  /* mode            */
        GPIO_NOPULL,       /* pull resistors  */
        GPIO_SPEED_LOW,    /* output speed    */
        0                  /* alternate func. */
    };

    HAL_GPIO_Init(TxPort_, &analogConfig);
}
 
                                             /* 216 MHz system clock */
/*
 * Drives the Tx pin for one bit time by writing BSRRvalues[__BIT_NO__] to
 * *BSRR fourteen times in a row. The repeated, identical stores act as the
 * delay that stretches one bit to the intended length (presumably tuned by
 * measurement for the 216 MHz clock -- TODO confirm).
 *
 * Requirements at the point of use:
 *  - `BSRR` (pointer to the port's BSRR register) and `BSRRvalues` (array of
 *    BSRR words) must be in scope under exactly these names.
 *  - The macro expands to several statements, so it must not be used as the
 *    body of an unbraced if/else/loop.
 */
#define OUTPUT_BIT(__BIT_NO__)               *BSRR = BSRRvalues[__BIT_NO__];   \
                                             *BSRR = BSRRvalues[__BIT_NO__];   \
                                             *BSRR = BSRRvalues[__BIT_NO__];   \
                                             *BSRR = BSRRvalues[__BIT_NO__];   \
                                             *BSRR = BSRRvalues[__BIT_NO__];   \
                                             *BSRR = BSRRvalues[__BIT_NO__];   \
                                             *BSRR = BSRRvalues[__BIT_NO__];   \
                                             *BSRR = BSRRvalues[__BIT_NO__];   \
                                             *BSRR = BSRRvalues[__BIT_NO__];   \
                                             *BSRR = BSRRvalues[__BIT_NO__];   \
                                             *BSRR = BSRRvalues[__BIT_NO__];   \
                                             *BSRR = BSRRvalues[__BIT_NO__];   \
                                             *BSRR = BSRRvalues[__BIT_NO__];   \
                                             *BSRR = BSRRvalues[__BIT_NO__];
 
/*
 * Clocks one complete UART frame out on the Tx pin.
 *
 * BSRR        pointer to the BSRR register of the Tx pin's GPIO port.
 * BSRRvalues  ten BSRR words, one per frame slot: index 0 is the start bit,
 *             indices 1..8 are data bits 0..7, index 9 is the stop bit.
 *
 * Slots 0..8 are each held for one OUTPUT_BIT() period; the stop-bit value is
 * written only once at the end and no further write follows it here.
 *
 * The function is compiled at -O0 regardless of the project setting,
 * presumably so that the instruction sequence (and therefore the tuned bit
 * time) does not change with the optimization level -- TODO confirm.
 * The caller is expected to keep interrupts masked for the duration of the
 * call, since any interruption would stretch a bit.
 */
inline static void __attribute__((optimize("O0"))) outputByte(volatile uint32_t* BSRR, uint32_t BSRRvalues[10]) {
    OUTPUT_BIT(0);   /* start bit  */
    OUTPUT_BIT(1);   /* data bit 0 */
    OUTPUT_BIT(2);   /* data bit 1 */
    OUTPUT_BIT(3);   /* data bit 2 */
    OUTPUT_BIT(4);   /* data bit 3 */
    OUTPUT_BIT(5);   /* data bit 4 */
    OUTPUT_BIT(6);   /* data bit 5 */
    OUTPUT_BIT(7);   /* data bit 6 */
    OUTPUT_BIT(8);   /* data bit 7 */
    *BSRR = BSRRvalues[9];   /* stop bit: single write, no delay needed afterwards */
}
 
/*
 * Superfast (3 MBit/s) software UART: sends one byte as start bit, eight data
 * bits (least significant bit first) and stop bit on the configured Tx pin.
 *
 * The ten BSRR words of the frame are prepared up front so that the
 * timing-critical part, outputByte(), only has to copy them to the port.
 * Interrupts are masked while the frame is on the wire and restored to their
 * previous state afterwards.
 */
void __attribute__((optimize("O3"))) LL_UART_SW_TxOnly_transmitByte(uint8_t byteToSend) {
    /* BSRR: the lower half-word sets the pin, the upper half-word resets it. */
    const uint32_t drivePinHigh = TxPin_;
    const uint32_t drivePinLow  = ((uint32_t)TxPin_) << 16;

    /* The start bit is the inverse of the idle level; the stop bit equals it. */
    const uint32_t startLevel = (IDLE_STATE == 1) ? drivePinLow : drivePinHigh;
    const uint32_t stopLevel  = (IDLE_STATE == 0) ? drivePinLow : drivePinHigh;

    uint32_t BSRRvalues[10];
    BSRRvalues[0] = startLevel;
    for (unsigned bitNo = 0; bitNo < 8; bitNo++) {
        /* A '1' data bit is sent at the stop-bit level, a '0' at the start-bit level. */
        BSRRvalues[bitNo + 1] = ((byteToSend >> bitNo) & 0x01u) ? stopLevel : startLevel;
    }
    BSRRvalues[9] = stopLevel;

    DISABLE_ALL_INTS_IF_NECESSARY();
    outputByte(&TxPort_->BSRR, BSRRvalues);
    ENABLE_ALL_INTS_IF_THEY_WERE_ENABLED();
}
 
 
/**
 * Sends numBytes bytes from pData, in order, over the software UART.
 * Each byte is transmitted by its own LL_UART_SW_TxOnly_transmitByte() call.
 * Nothing is sent when numBytes is 0.
 */
void LL_UART_SW_TxOnly_transmitBuffer(uint8_t* pData, uint16_t numBytes) {
    for (uint16_t sent = 0; sent < numBytes; sent++) {
        LL_UART_SW_TxOnly_transmitByte(pData[sent]);
    }
}
 
/**
 * Sends the characters of a NUL-terminated string over the software UART.
 * The terminating NUL itself is not transmitted; an empty string sends nothing.
 */
void LL_UART_SW_TxOnly_transmitNullTermString(const char* pData) {
    for (const char* cursor = pData; *cursor != 0; cursor++) {
        LL_UART_SW_TxOnly_transmitByte(*cursor);
    }
}

21 REPLIES 21

This is not your friendly 8-bitter with predictable instruction timing.

I'd start with rewriting this into inline/separate asm, then I'd continue with investigating execution jitter when running it from different memories (perhaps TCM RAM would be the best candidate?)

I'd avoid the multiple writes to the register; they may unnecessarily collide with other bus masters (DMA).

And I'd never, never use this in anything else but a debug printout.

JW

Yes, this is for debugging (I'm not a big fan of trace). The code works great in my STM32F405 and STM32F446 projects, which don't have any RTOS, even when called from interrupt handlers. Are the differences between the STM32F769 and the STM32F4 families so big that the microcontroller family (and thus the architecture) makes all the difference? Couldn't the RTOS be causing these problems? I'm calling __disable_irq() before outputting each byte, but I don't know whether the RTOS can still perform context switching.

> I'm calling __disable_irq() before outputting each byte,

> but I don't know if the RTOS can still perform context switching?

No. RTOS is no magic, it simply uses a timer interrupt.

The Cortex-M7, compared to the Cortex-M4, is superscalar and features speculative elements such as branch prediction, which increase execution jitter and decrease the user's control over timing. Plus, the bus fabric is far more complex, adding further to the uncertainties.

JW

> I'm calling __disable_irq() before outputting each byte,

> but I don't know if the RTOS can still perform context switching?

No. RTOS is no magic, it simply uses a timer interrupt.

The Cortex-M7, compared to the Cortex-M4, is superscalar and features speculative elements such as branch prediction, which increase execution jitter and decrease the user's control over timing. Plus, the bus fabric is far more complex, adding further to the uncertainties.

JW

Piranha
Chief II

For 216 MHz the PWR regulator voltage scaling must be set to scale 1 or not changed at all, because scale 1 is the reset value.

arnold_w
Senior II

By adding the word __RAM_FUNC to the outputByte and LL_UART_SW_TxOnly_transmitByte functions, I was able to make it a lot more reliable:

inline static void __attribute__((optimize("O0"))) __RAM_FUNC outputByte(volatile uint32_t* BSRR, uint32_t BSRRvalues[10]) {
 .
.
.
}
 
void __attribute__((optimize("O3"))) __RAM_FUNC LL_UART_SW_TxOnly_transmitByte(uint8_t byteToSend) {
.
.
.
}

However, when I tried to replace

#define OUTPUT_BIT_(__BIT_NO__)              *BSRR = BSRRvalues[__BIT_NO__];   \
                                             *BSRR = BSRRvalues[__BIT_NO__];   \
                                             *BSRR = BSRRvalues[__BIT_NO__];   \
                                             .
                                             .
                                             .

with

#define OUTPUT_BIT(__BIT_NO__)               *BSRR = BSRRvalues[__BIT_NO__];   \
                                             asm("NOP");                       \
                                             asm("NOP");                       \
                                             asm("NOP");                       \
                                             .
                                             .
                                             .

then, to my surprise, it became unreliable again. I really couldn't make any sense anymore of what I saw on the oscilloscope when I measured the bit periods. Does anybody know why and does anybody know what I can replace the dangerous *BSRR = BSRRvalues[__BIT_NO__] assignments with instead?

Check the generated assembly code. What does it look like?

The Cortex-M7 is a dual-issue core: in one clock cycle, the *BSRR write and one __NOP() can both be executed. Your second version seems to take this into account, but the BSRR write trickles through the bus matrix over several clocks, while a NOP has no interaction with the bus matrix.

arnold_w
Senior II

> I'd avoid the multiple writes to the register; they may unnecessarily collide with other bus masters (DMA).

> But the BSRR write trickles through the bus matrix over several clocks, while a NOP has no interaction with the bus matrix.

I assume polling a timer interrupt flag would create just as much traffic on the bus matrix? But what if I polled the DWT cycle counter instead; would that be a better choice?