cancel
Showing results for 
Search instead for 
Did you mean: 

Help needed for tuning 64Mb HYPERRAM (S27KL0642DPBHI020) on STM32H735IGT6

unsigned_char_array
Senior III

I need help with configuring HYPERRAM for my STM32 on a custom PCB. The HYPERRAM pins are about 32mm away from the MCU pins.
The HYPERRAM works, but I get strange flickering on the display even with static data in RAM which I think comes from memory errors of the external RAM. My memory passes some basic memory tests, but I think things go wrong when the LTDC peripheral reads from the memory at high speeds. I could be wrong about this, but I need to tune the configuration of the memory anyway. I want to exclude this possibility.

There are so many settings and contradictory advice that I'm overwhelmed.

Turning on D-cache for the HYPERRAM doesn't help.

If I calibrate the delay block the memory tests fail. If I don't calibrate it, it doesn't fail. If I disable it it also fails.
I've implemented calibration according to the following forum post: https://community.st.com/t5/stm32-mcus-products/stm32h7-octospi-mode-hyperbus-hyperram-access-and-delay-block/m-p/143244/highlight/true#M27659

I can read the registers from the memory chip (ID0, ID1, CR0, CR1) and the values match the default values from the datasheet. But I'm not able to successfully write to a register. I want to set the output driver impedance to 46 Ohm in CR0 (instead of the default 34 Ohm) since our traces have an impedance of 50 Ohm. I don't know if this is going to improve the situation, but it would be good to know how I can set this value.

RAM is connected to OSPI2. Pins are configured with highest drive strength since I want to run the memory at 200MHz eventually. I'm now running it lower to make testing with a logic analyzer easier.

For now I clock the OSPI2 peripheral at 332MHz (333 is the max frequency for peripheral).
This allows me to use a prescaler of 2 to get 166MHz or a prescaler of 4 to get 83MHz. These are the frequencies I want to get working first (with delay block calibration). You need at least a prescaler of 2 (register value of 1) to enable DHQC so I won't be able to run the OSPI at 200MHz with DHQC.

Here is my configuration:

This configuration works, but not if I calibrate the delay block or try to write to CR0 (these blocks have been defined off).

 

 

/* OCTOSPI2 init function */
void MX_OCTOSPI2_Init(void)
{

  /* USER CODE BEGIN OCTOSPI2_Init 0 */
	HAL_Delay(1); // 1-2ms delay after power on
	HAL_GPIO_WritePin(MCU_HYPERBUS_NRESET_GPIO_Port, MCU_HYPERBUS_NRESET_Pin, 1); //de-assert reset
	HAL_Delay(1); // 1-2ms delay after reset
  /* USER CODE END OCTOSPI2_Init 0 */

  OSPIM_CfgTypeDef sOspiManagerCfg = {0};
  OSPI_HyperbusCfgTypeDef sHyperBusCfg = {0};

  /* USER CODE BEGIN OCTOSPI2_Init 1 */

  // calibrate delay block:
#if 0
  //same config as below
  hospi2.Instance = OCTOSPI2;
  hospi2.Init.FifoThreshold = 1;
  hospi2.Init.DualQuad = HAL_OSPI_DUALQUAD_DISABLE;
  hospi2.Init.MemoryType = HAL_OSPI_MEMTYPE_HYPERBUS;
  hospi2.Init.DeviceSize = 23;
  hospi2.Init.ChipSelectHighTime = 1;
  hospi2.Init.FreeRunningClock = HAL_OSPI_FREERUNCLK_DISABLE;
  hospi2.Init.ClockMode = HAL_OSPI_CLOCK_MODE_0;
  hospi2.Init.WrapSize = HAL_OSPI_WRAP_NOT_SUPPORTED;
  hospi2.Init.ClockPrescaler = 4;
  hospi2.Init.SampleShifting = HAL_OSPI_SAMPLE_SHIFTING_NONE;
  hospi2.Init.DelayHoldQuarterCycle = HAL_OSPI_DHQC_ENABLE;
  hospi2.Init.ChipSelectBoundary = 0;
  hospi2.Init.DelayBlockBypass = HAL_OSPI_DELAY_BLOCK_USED;
  hospi2.Init.MaxTran = 0;
  hospi2.Init.Refresh = 83;

  // override clock setting:
  hospi2.Init.FreeRunningClock = HAL_OSPI_FREERUNCLK_ENABLE;
  hospi2.Init.Refresh = 0;

  if (HAL_OSPI_Init(&hospi2) != HAL_OK)
  {
	  Error_Handler() ;
  }

  if (DelayBlock_Enable(DLYB_OCTOSPI2) != HAL_OK)
  {
	  Error_Handler();
  }

  HAL_OSPI_DeInit(&hospi2);
#endif


  /* USER CODE END OCTOSPI2_Init 1 */
  hospi2.Instance = OCTOSPI2;
  hospi2.Init.FifoThreshold = 1;
  hospi2.Init.DualQuad = HAL_OSPI_DUALQUAD_DISABLE;
  hospi2.Init.MemoryType = HAL_OSPI_MEMTYPE_HYPERBUS;
  hospi2.Init.DeviceSize = 23;
  hospi2.Init.ChipSelectHighTime = 1;
  hospi2.Init.FreeRunningClock = HAL_OSPI_FREERUNCLK_DISABLE;
  hospi2.Init.ClockMode = HAL_OSPI_CLOCK_MODE_0;
  hospi2.Init.WrapSize = HAL_OSPI_WRAP_NOT_SUPPORTED;
  hospi2.Init.ClockPrescaler = 4;
  hospi2.Init.SampleShifting = HAL_OSPI_SAMPLE_SHIFTING_NONE;
  hospi2.Init.DelayHoldQuarterCycle = HAL_OSPI_DHQC_ENABLE;
  hospi2.Init.ChipSelectBoundary = 0;
  hospi2.Init.DelayBlockBypass = HAL_OSPI_DELAY_BLOCK_USED;
  hospi2.Init.MaxTran = 0;
  hospi2.Init.Refresh = 83;
  if (HAL_OSPI_Init(&hospi2) != HAL_OK)
  {
    Error_Handler();
  }
  sOspiManagerCfg.ClkPort = 2;
  sOspiManagerCfg.DQSPort = 2;
  sOspiManagerCfg.NCSPort = 2;
  sOspiManagerCfg.IOLowPort = HAL_OSPIM_IOPORT_2_LOW;
  sOspiManagerCfg.IOHighPort = HAL_OSPIM_IOPORT_2_HIGH;
  if (HAL_OSPIM_Config(&hospi2, &sOspiManagerCfg, HAL_OSPI_TIMEOUT_DEFAULT_VALUE) != HAL_OK)
  {
    Error_Handler();
  }
  sHyperBusCfg.RWRecoveryTime = 7;
  sHyperBusCfg.AccessTime = 7;
  sHyperBusCfg.WriteZeroLatency = HAL_OSPI_LATENCY_ON_WRITE;
  sHyperBusCfg.LatencyMode = HAL_OSPI_FIXED_LATENCY;
  if (HAL_OSPI_HyperbusCfg(&hospi2, &sHyperBusCfg, HAL_OSPI_TIMEOUT_DEFAULT_VALUE) != HAL_OK)
  {
    Error_Handler();
  }
  /* USER CODE BEGIN OCTOSPI2_Init 2 */

  OSPI_HyperbusCmdTypeDef sCommand = {0};
  OSPI_MemoryMappedTypeDef sMemMappedCfg = {0};
//
  volatile HAL_StatusTypeDef status;


  volatile uint16_t ID0=0;
  volatile uint16_t ID1=0;
  volatile uint16_t CR0=0;
  volatile uint16_t CR1=0;


  sCommand.AddressSpace = HAL_OSPI_REGISTER_ADDRESS_SPACE;
  sCommand.AddressSize  = HAL_OSPI_ADDRESS_32_BITS; // HAL_OSPI_ADDRESS_32_BITS
  sCommand.DQSMode      = HAL_OSPI_DQS_ENABLE;

#if 0
 ///write to CR0
  volatile uint16_t CR0_new = 0x8f2f | (3<<12);
  sCommand.Address      = 0x800*2; //CR0
  sCommand.NbData       = 2;
  if (HAL_OSPI_HyperbusCmd(&hospi2, &sCommand, HAL_OSPI_TIMEOUT_DEFAULT_VALUE) != HAL_OK)
  {
    Error_Handler();
  }
  status = HAL_OSPI_Transmit(&hospi2, (uint8_t*)&CR0_new, HAL_OSPI_TIMEOUT_DEFAULT_VALUE);
#endif

  // read registers
  sCommand.Address      = 0*2; //ID0
  sCommand.NbData       = 2;
  if (HAL_OSPI_HyperbusCmd(&hospi2, &sCommand, HAL_OSPI_TIMEOUT_DEFAULT_VALUE) != HAL_OK)
  {
    Error_Handler();
  }
  status = HAL_OSPI_Receive(&hospi2, (uint8_t*)&ID0, HAL_OSPI_TIMEOUT_DEFAULT_VALUE);

  sCommand.Address      = 1*2; //ID1
  sCommand.NbData       = 2;
  if (HAL_OSPI_HyperbusCmd(&hospi2, &sCommand, HAL_OSPI_TIMEOUT_DEFAULT_VALUE) != HAL_OK)
  {
    Error_Handler();
  }
  status = HAL_OSPI_Receive(&hospi2, (uint8_t*)&ID1, HAL_OSPI_TIMEOUT_DEFAULT_VALUE);


  sCommand.Address      = 0x800*2; //CR0
  sCommand.NbData       = 2;
  if (HAL_OSPI_HyperbusCmd(&hospi2, &sCommand, HAL_OSPI_TIMEOUT_DEFAULT_VALUE) != HAL_OK)
  {
    Error_Handler();
  }
  status = HAL_OSPI_Receive(&hospi2, (uint8_t*)&CR0, HAL_OSPI_TIMEOUT_DEFAULT_VALUE);

  sCommand.Address      = 0x801*2; //CR1
  sCommand.NbData       = 2;
  if (HAL_OSPI_HyperbusCmd(&hospi2, &sCommand, HAL_OSPI_TIMEOUT_DEFAULT_VALUE) != HAL_OK)
  {
    Error_Handler();
  }
  status = HAL_OSPI_Receive(&hospi2, (uint8_t*)&CR1, HAL_OSPI_TIMEOUT_DEFAULT_VALUE);



  sCommand.AddressSpace = HAL_OSPI_MEMORY_ADDRESS_SPACE;
  sCommand.AddressSize  = HAL_OSPI_ADDRESS_24_BITS;
  sCommand.DQSMode      = HAL_OSPI_DQS_ENABLE;
  sCommand.Address      = 0;
  sCommand.NbData       = 1;

  if (HAL_OSPI_HyperbusCmd(&hospi2, &sCommand, HAL_OSPI_TIMEOUT_DEFAULT_VALUE) != HAL_OK)
  {
    Error_Handler();
  }


  sMemMappedCfg.TimeOutActivation = HAL_OSPI_TIMEOUT_COUNTER_DISABLE;

  if (HAL_OSPI_MemoryMapped(&hospi2, &sMemMappedCfg) != HAL_OK)
  {
    Error_Handler();
  }


  //memory tests
  externalRamValid =  memoryTests16BitTestAll((void*)EXTERNAL_RAM_START_ADDRESS, EXTERNAL_RAM_SIZE_BYTES, hospi2.Init.DeviceSize-1, 9, &memoryTestResult);

  if (!externalRamValid)
  {
	  Error_Handler();
  }

  /* USER CODE END OCTOSPI2_Init 2 */

}

 

 

 

 

 

Kudo posts if you have the same problem and kudo replies if the solution works.
Click "Accept as Solution" if a reply solved your problem. If no solution was posted please answer with your own.
1 ACCEPTED SOLUTION

Accepted Solutions
unsigned_char_array
Senior III

(I still haven't got the DLYB_OSPI_NOR_FastTuning/OSPI_PSRAM_MemoryMapped algorithm working. But I'll park that for now. If anyone knows how to get that working let me know.)

Here is the summary of what I needed to do to get everything to work for the s27kl0642dpbhi020 HyperRAM with the STM32H735IG with a 1024x600 display:

  • On DS13312 Rev 4, page 194 it says the max OSPI output clock frequency is 100MHz
  • The max OSPI peripheral clock frequency is 333 MHz(according to STM32CubeMX clock config), and you need a divider of at least 2 to get Delay hold quarter cycle working(RM0468 Rev 3, page 917), so I run it at 200MHz with divider 2.
  • The HyperRAM is by default configured to have an output drive strength of 34 ohm, but it can be set to 46 ohm, this can be configured in CR0. I've set this to 46 ohm since my traces have an impedance of 50 ohm.
  • The HyperRAM is by default configured to have an Initial latency of 7 clock cycles, but below 104MHz it can be set to 4 clock cycles in CR0. I've set this lower to increase performance.
  • In order to write to a configuration register of the RAM you need to set sHyperBusCfg.WriteZeroLatency = HAL_OSPI_NO_LATENCY_ON_WRITE
  • In order to write to a configuration register of the RAM you need to set sCommand.DQSMode = HAL_OSPI_DQS_DISABLE
  • Register address from the HyperRAM datasheet has to be multiplied by 2, NbData needs to be 2 bytes.
  • To get higher performance you need to enable D-cache and also enable the MPU and enable cache for external RAM (you can find an example of this configuration in TouchGFX examples for stm32h735g-dk)
  • If you enable D-cache you need to clear it prior to the DMA2D accessing it. You need to modify TouchGFXHAL::flushFrameBuffer to call this. Use the _by_Addr variants to only clean the framebuffer in use and not the complete cache. In my case either SCB_CleanDCache_by_Addr or SCB_InvalidateDCache_by_Addr work. I'm not sure which one is needed or if I need SCB_CleanInvalidateDCache_by_Addr. I no longer see artifacts if I enable it.
  • You can read CR1 to check the refresh interval. If it is higher it means you don't have to refresh as often, this can slightly increase performance.
  • I tuned the LTDC porches. I reduced horizontal back and front porches and reduced vertical back porch and increased vertical front porch. This way lockDMAToFrontPorch can be used and it will have more time before the LTDC starts accessing the memory. I'm not using lockDMAToFrontPorch at the moment.
  • I calibrate the delay block. I base it on the example DLYB_OSPI_PSRAM_ExhaustiveTuning. This simply reinitializes the delay block unit value and runs memory tests to see if it works. It applies the middle value which is between the two extremes that work. I realize this may not work if you need more than 1 delay block (sel >1) as delay is not proportional to unit or the product of unit and sel. I did not get other calibration methods working (I get way too high values). I checked the typical delay values in the datasheet to verify if the settings make sense: DS13312 Rev 4, page 196.
  • If you are using Ethernet do not set all of the SRAM to shared. This will slow down the performance of the memory and lead to slow rendering. Ethernet DMA requires its descriptors to be in a part of SRAM that is marked as shared by the MPU.
  • In RM0468 Rev 3, page 943 it says refresh for reads occurs every refresh+4 cycles, so I set the refresh value to 100-4 or 400-4 for respectively a 1us or 4us refresh period.
  • Forcibly reset the OSPI peripheral in OCTOSPI2_MspInit using __HAL_RCC_OSPI2_FORCE_RESET() and __HAL_RCC_OSPI2_RELEASE_RESET(). Otherwise register values will not be correct after deinit + re-init.
  • Do memory performance tests. I got about 101MByte/s write speed and 158MByte/s read speed and 47MByte/s for read-modify-write. With incorrect configurations you can get less than half of that. It might still work, but you get less performance. You will never get 100% of the theoretical bandwidth of 200MByte/s because of various overheads.

 

Kudo posts if you have the same problem and kudo replies if the solution works.
Click "Accept as Solution" if a reply solved your problem. If no solution was posted please answer with your own.

View solution in original post

10 REPLIES 10
KDJEM.1
ST Employee

Hello @unsigned_char_array ,

The max OCTOSPI clock frequency for hyperbus is 100 MHz for that I advise you to refer to STM32H735 datasheet and check the max OCTOSPI frequency.

Please try to decrease the OCTOSPI clock frequency.

KDJEM1_0-1709803958737.png

(4) Note that when using PC2 or PC3 I/O on data bus decreases the frequency to 47 MHz.

For the delay block calibration issue I advise you to refer to AN5050  precisely OCTOSPI application examples section 6 ( IV. HyperFlash and HyperRAM memories with Multiplexed mode example ) may help you.

I hope this help you!

Thank you.

Kaouthar

 

To give better visibility on the answered topics, please click on Accept as Solution on the reply which solved your issue or answered your question.

Thank you. I somehow missed the 100MHz limit. This really sucks. But I'll just have to live with this limit. Luckily we don't use PC2 or PC3. I've set it to 100MHz.

I still haven't got the delay block calibration working in the RAM. It works without this calibration, but I don't want anything to work on the prototypes that will fail in the field, so I don't want to run anything out of spec.

Also I still don't know how to write to the CR0 register of the RAM. At 100MHz I can reduce the latency setting from 7  to 4 clock cycles so that might help with increasing performance. The datasheet mentions the write command has 0x60 in "System address" bits 27-31. However this is mathematically impossible. Can someone tell me how I can use STM32 HAL to write to a HYPERRAM register?

unsigned_char_array_0-1709894160787.png

I made some progress in finding the cause of the display flickering.
In single buffer mode I have diagonal tearing and in double buffering mode I have only horizontal tearing when I'm rendering. When I break the debugger the image looks good. And if I edit memory values with the debugger, then I see some tearing. When I enable LTDC error interrupts I see that I get LTDC fifo underrun errors. This clearly is a memory bandwidth issue.

I've tried a few configurations:

  • DMA2D enabled, double buffer, 20 prescaler: 19.20MHz, extreme tearing in bottom half of display, top half doesn't even render, LTDC error interrupt fifo underrun
  • DMA2D enabled, double buffer, 25 prescaler: 15.36MHz, extreme tearing, LTDC error interrupt fifo underrun
  • DMA2D enabled, double buffer, 28 prescaler: 13.71MHz, shaking tearing, LTDC error interrupt fifo underrun
  • DMA2D enabled, double buffer, 30 prescaler: 12.80MHz: flickering, no tearing, LTDC error interrupt fifo underrun
  • DMA2D enabled, double buffer, 40 prescaler: 9.60MHz: flickering at lower rate, no tearing, didn't check fifo underrun

Using software rendering or DMA2D doesn't make a difference.
Enabling D-cache or not doesn't make a difference.
Using a lower clock for DMA2D (HCLK3) doesn't make a difference.
RAM fifo size other than 1 doesn't make a difference.
Adding hal.lockDMAToFrontPorch(true) doesn't help either.

I'm looking for a way to get the LTDC run without fifo underrun errors.

Kudo posts if you have the same problem and kudo replies if the solution works.
Click "Accept as Solution" if a reply solved your problem. If no solution was posted please answer with your own.

I solved at least one issue: I figured out how to write to the configuration register of the HYPERRAM:

typedef union
{
	uint16_t reg;
	struct
	{
		unsigned int burstLength:2;
		bool hybridBurstEnable:1;
		bool fixedLatencyEnable:1;
		unsigned int initialLatency:4;
		unsigned int reserved:4;
		unsigned int driveStrength:3;
		bool deepPowerDownEnable:1;
	};
}CR0_t;
#if 1
 ///write to CR0
  volatile CR0_t CR0_new = {  .burstLength = 3,
		  	  	  	  	  	  .hybridBurstEnable = true,
							  .fixedLatencyEnable = true,
							  .initialLatency = 15, // 1111 - 4 Clock latency @ 104 MHz Max frequency
							  .reserved = 15,
							  .driveStrength = 3, // 011 - 46 ohm
							  .deepPowerDownEnable = true};

  sHyperBusCfg.WriteZeroLatency = HAL_OSPI_NO_LATENCY_ON_WRITE;
  if (HAL_OSPI_HyperbusCfg(&hospi2, &sHyperBusCfg, HAL_OSPI_TIMEOUT_DEFAULT_VALUE) != HAL_OK)
  {
    Error_Handler();
  }

  sCommand.DQSMode      = HAL_OSPI_DQS_DISABLE;
  sCommand.Address      = 0x800*2; //CR0
  sCommand.NbData       = 2;
  if (HAL_OSPI_HyperbusCmd(&hospi2, &sCommand, HAL_OSPI_TIMEOUT_DEFAULT_VALUE) != HAL_OK)
  {
    Error_Handler();
  }
  status = HAL_OSPI_Transmit(&hospi2, (uint8_t*)&CR0_new, HAL_OSPI_TIMEOUT_DEFAULT_VALUE);


  sHyperBusCfg.RWRecoveryTime = 4;
  sHyperBusCfg.AccessTime = 4;
  sHyperBusCfg.WriteZeroLatency = HAL_OSPI_LATENCY_ON_WRITE;
  if (HAL_OSPI_HyperbusCfg(&hospi2, &sHyperBusCfg, HAL_OSPI_TIMEOUT_DEFAULT_VALUE) != HAL_OK)
  {
    Error_Handler();
  }
#endif

Now I get higher bandwidth and matched impedance, but I still see tearing.

Kudo posts if you have the same problem and kudo replies if the solution works.
Click "Accept as Solution" if a reply solved your problem. If no solution was posted please answer with your own.
Pavel A.
Evangelist III

I am aware that it's not portable and may be affected by compiler flags, but I don't plan to change either of those. I now marked the struct as packed. That part is working fine, so I'm not going to change it (at least for now).

Kudo posts if you have the same problem and kudo replies if the solution works.
Click "Accept as Solution" if a reply solved your problem. If no solution was posted please answer with your own.
unsigned_char_array
Senior III

Glitches and tearing have now been fixed.
Glitches were caused by overloading the memory bus causing LTDC fifo underruns and also caused by uncleared D-Cache (this showed up as artifacts in one of the two frame buffers).

Overloading memory bus was caused by other code accidentally changing OSPI configuration for HYPERRAM and lowered the clock frequency to 22MHz. This has now been fixed. The HYPERRAM runs without issues on 100MHz now.
I had to enable D-Cache to get optimal performance. I also have to manually clean the D-Cache:

 

 

void TouchGFXHAL::flushFrameBuffer(const touchgfx::Rect& rect)
{
    // Calling parent implementation of flushFrameBuffer(const touchgfx::Rect& rect).
    //
    // To overwrite the generated implementation, omit call to parent function
    // and implemented needed functionality here.
    // Please note, HAL::flushFrameBuffer(const touchgfx::Rect& rect) must
    // be called to notify the touchgfx framework that flush has been performed.
    // To calculate he start adress of rect,
    // use advanceFrameBufferToRect(uint8_t* fbPtr, const touchgfx::Rect& rect)
    // defined in TouchGFXGeneratedHAL.cpp

	//TouchGFXGeneratedHAL::flushFrameBuffer(rect);

	InvalidateCache(); // needed to prevent artifacts in frame buffer
	HAL::flushFrameBuffer(rect);
}

void TouchGFXHAL::InvalidateCache()
{
    // If the framebuffer is placed in Write Through cached memory (e.g. SRAM) then
    // the DCache must be flushed prior to DMA2D accessing it. That's done
    // using the function SCB_CleanInvalidateDCache(). Remember to enable "CPU Cache" in the
    // "System Core" settings for "Cortex M7" in CubeMX in order for this function call to work.
    if (SCB->CCR & SCB_CCR_DC_Msk)
    {
    	//https://community.st.com/t5/stm32-mcus-touchgfx-and-gui/stm-keep-getting-reset-at-scb-cleaninvalidatedcache-like-15-25/td-p/205024
    	//"also SCB_CleanInvalidateDCache() is inefficient and mostly dangerous. Flushing buffers before passing them to DMA should be done by SCB_CleanDCache_by_Addr()."

		uint8_t bpp = lcdRef.bitDepth() / 8;
		SCB_CleanDCache_by_Addr((uint32_t *)TouchGFXGeneratedHAL::getTFTFrameBuffer(), FRAME_BUFFER_WIDTH * FRAME_BUFFER_HEIGHT * bpp);
    }
}

 

 

I can clock the LTDC pretty high now. I will lower the clock in the release version just in case.
LTDC=32MHz -> no fifo error
LTDC=38.4MHz -> no fifo error
LTDC=42.6MHz -> no fifo error
LTDC=48MHz -> fifo error
42.6 MHz LTDC -> 47.7 FPS

I also tuned the LTDC porches. I reduced horizontal porches and reduced vertical back porch and increased vertical front porch. This way lockDMAToFrontPorch can be used and it will have more time before the LTDC starts accessing the memory. I'm not using lockDMAToFrontPorch at the moment.

The only issue remaining now is calibrating the delay block. It works fine without it, but I want to be 100% sure that no devices in the field will experience RAM issues due to running out of timing spec. If someone can help me with it that would be great. I see the clock pulse during calibration and I get calibration values in the delay block registers, but I don't know what I should be seeing and how the calibration works.

 

 

Kudo posts if you have the same problem and kudo replies if the solution works.
Click "Accept as Solution" if a reply solved your problem. If no solution was posted please answer with your own.
KDJEM.1
ST Employee

Hello @unsigned_char_array ,

Thank you for updating post and for sharing the solution about  writing to CR0 register and tearing.

About the delay block calibration could you please try to follow these steps:

1- Activate the Free run clock "FRCK"

2- DLYB Calibration 

3- Deactivate the "FRCK" 

May be this example can help you OSPI_PSRAM_MemoryMapped.

Thank you.

Kaouthar

 

 

To give better visibility on the answered topics, please click on Accept as Solution on the reply which solved your issue or answered your question.

unsigned_char_array
Senior III

I followed the instructions in this topic: https://community.st.com/t5/stm32-mcus-products/stm32h7-delay-block-configuration-lngf-does-not-get-set/td-p/157573
I ported the example for DLYB_OSPI_PSRAM_ExhaustiveTuning (not the DLYB_OSPI_NOR_FastTuning example). And I discovered something very interesting. The tuning works. The value from the tuning works if I put it in my init function. But tuning prior to initialization didn't work. I did register by register compare and discovered a severe bug in HAL_OSPI_MspInit/HAL_OSPI_MspDeInit. HAL_OSPI_MspDeInit does NOT fully deinitialize the peripheral, so subsequent calls to HAL_OSPI_MspInit produce different results due to corrupted registers!

I noticed that I had the exact same error when writing a driver for OSPI FLASH about a year ago! The same fix worked. At the end of OCTOSPI2_MspInit I call:

  /* USER CODE BEGIN OCTOSPI2_MspInit 1 */
    // very important! Or calling HAL_OSPI_MspInit() after HAL_OSPI_MspDeInit() 
    // will lead to non-default values in registers
    __HAL_RCC_OSPI2_FORCE_RESET();
    __HAL_RCC_OSPI2_RELEASE_RESET();
  /* USER CODE END OCTOSPI2_MspInit 1 */

 So now I have fixed all problems. I will post a summary post later that will serve as the accepted solution, since the total solution involves several posts.

Kudo posts if you have the same problem and kudo replies if the solution works.
Click "Accept as Solution" if a reply solved your problem. If no solution was posted please answer with your own.

I still have some questions.

The tuning based on DLYB_OSPI_PSRAM_ExhaustiveTuning works. The algorithm is fairly simple/naive: simply initialize the peripheral with a delay value and do some simple memory tests in indirect mode (using HAL functions instead of memory mapped mode). Then deinit the peripheral and repeat the test for other delay values. The original algorithm simply tested every value 100 times. I modified it so it initially finds a coarse min and max value where it works once (max value is the last value that worked before failing after it has found a min value that worked) and then tests these min and max values 100 times to confirm if they are stable. If not stable it increases the min or decreases the max value. Then I take the center value as my setting. This unit value is around 51 for sel=1, which is what I expect for 1/4th cycle+propagation delay (2929ps theoretical, 2983ps calculated from calibration values and typical unit delay).

My questions are about the DLYB_OSPI_NOR_FastTuning/OSPI_PSRAM_MemoryMapped. algorithm.

I do see a free running clock during calibration on my logic analyzer but only sporadically RWDS/DQS. Slave select not is high all the time. I cannot imagine this working. I get very high calibration results: sel=10, unit=30. Which is an order of magnitude too high. Can someone help me out with this? I'd like to get that one working too.


EDIT: I now know why RWDS/DQS is sporadic on my logic analyzer: the signal voltage is cut in half during calibration. Setting the logic analyzer to 1.8V produces a continuous RWDS signal. This only occurs during calibration and somehow even occurs when I hold the RAM in reset. Something very strange is going on.
EDIT2: Most of this was a measuring error. The Saleae Logic probes cause too many issues. Would have been better if they were coax.

Below is the calibration that isn't working:

 

 

LL_DLYB_CfgTypeDef calibrateRam2()
{
	LL_DLYB_CfgTypeDef dlyb_cfg, dlyb_cfg_test;


	//GPIO_InitTypeDef GPIO_InitStruct = {0};



	MX_OCTOSPI2_Init_LowLatency();


	//    GPIO_InitStruct.Pin = MCU_HYPERBUS_NCS_Pin;
	//    GPIO_InitStruct.Mode = GPIO_MODE_OUTPUT_PP;// GPIO_MODE_AF_PP;
	//    GPIO_InitStruct.Pull = GPIO_NOPULL;
	//    GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_VERY_HIGH;
	//    GPIO_InitStruct.Alternate = 0;//GPIO_AF3_OCTOSPIM_P2;
	//    HAL_GPIO_Init(MCU_HYPERBUS_NCS_GPIO_Port, &GPIO_InitStruct);


	if (HAL_OSPI_DLYB_GetClockPeriod(&hospi2,&dlyb_cfg) != HAL_OK)
	{
		//BSP_LED_On(LED_RED);
	}

	//
	//    GPIO_InitStruct.Pin = MCU_HYPERBUS_NCS_Pin;
	//    GPIO_InitStruct.Mode =  GPIO_MODE_AF_PP;
	//    GPIO_InitStruct.Pull = GPIO_NOPULL;
	//    GPIO_InitStruct.Speed = GPIO_SPEED_FREQ_VERY_HIGH;
	//    GPIO_InitStruct.Alternate = GPIO_AF3_OCTOSPIM_P2;
	//    HAL_GPIO_Init(MCU_HYPERBUS_NCS_GPIO_Port, &GPIO_InitStruct);


	/*when DTR, PhaseSel is divided by 4 (emperic value)*/
	dlyb_cfg.PhaseSel /=4;

	/* save the present configuration for check*/
	dlyb_cfg_test = dlyb_cfg;

	/*set delay block configuration*/
	//	HAL_XSPI_DLYB_SetConfig(&hospi2, &dlyb_cfg);
	HAL_OSPI_DLYB_SetConfig(&hospi2, &dlyb_cfg);

	/*check the set value*/
	//	HAL_XSPI_DLYB_GetConfig(&hospi2, &dlyb_cfg);
	HAL_OSPI_DLYB_GetConfig(&hospi2, &dlyb_cfg);
	if ((dlyb_cfg.PhaseSel != dlyb_cfg_test.PhaseSel) || (dlyb_cfg.Units != dlyb_cfg_test.Units))
	{
		//BSP_LED_On(LED_RED);
	}

	HAL_OSPI_DeInit(&hospi2);


	return dlyb_cfg;
}

static void MX_OCTOSPI2_Init_LowLatency(void)
{
	OSPI_HyperbusCfgTypeDef sHyperBusCfg = {0};

	LL_DLYB_CfgTypeDef dlyb_cfg = {0};

	MX_OCTOSPI2_Init();

	sHyperBusCfg.RWRecoveryTime = 4;
	sHyperBusCfg.AccessTime = 4;
	sHyperBusCfg.WriteZeroLatency = HAL_OSPI_LATENCY_ON_WRITE;
	sHyperBusCfg.LatencyMode = HAL_OSPI_FIXED_LATENCY;
	if (HAL_OSPI_HyperbusCfg(&hospi2, &sHyperBusCfg, HAL_OSPI_TIMEOUT_DEFAULT_VALUE) != HAL_OK)
	{
		Error_Handler();
	}

	dlyb_cfg.Units = 0;
	dlyb_cfg.PhaseSel = 0;
	if (HAL_OSPI_DLYB_SetConfig(&hospi2, &dlyb_cfg) != HAL_OK)
	{
		Error_Handler();
	}
}

void LL_DLYB_SetDelay(DLYB_TypeDef* DLYBx, LL_DLYB_CfgTypeDef  *pdlyb_cfg)
{
	DLYBx->CR   = 0U;
	DLYBx->CR   = DLYB_CR_DEN | DLYB_CR_SEN;
	DLYBx->CFGR = pdlyb_cfg->PhaseSel | (pdlyb_cfg->Units << DLYB_CFGR_UNIT_Pos);
	DLYBx->CR   = DLYB_CR_DEN;
}


__STATIC_INLINE void LL_DLYB_Enable(DLYB_TypeDef *DLYBx)
{
	SET_BIT(DLYBx->CR, DLYB_CR_DEN);
}


static HAL_StatusTypeDef HAL_OSPI_DLYB_SetConfig(OSPI_HandleTypeDef *hospi, LL_DLYB_CfgTypeDef  *pdlyb_cfg)
{
	HAL_StatusTypeDef status = HAL_ERROR;

	/* Enable OCTOSPI Free Running Clock (mandatory) */
	SET_BIT(hospi->Instance->DCR1, OCTOSPI_DCR1_FRCK);

	/* Update OCTOSPI state */
	hospi->State = HAL_OSPI_STATE_BUSY_CMD;

	if (hospi->Instance == OCTOSPI1)
	{
		/* Enable the DelayBlock */
		LL_DLYB_Enable(DLYB_OCTOSPI1);

		/* Set the Delay Block configuration */
		LL_DLYB_SetDelay(DLYB_OCTOSPI1, pdlyb_cfg);
		status = HAL_OK;
	}

#if defined (OCTOSPI2)
	else if (hospi->Instance == OCTOSPI2)
	{
		/* Enable the DelayBlock */
		LL_DLYB_Enable(DLYB_OCTOSPI2);

		/* Set the Delay Block configuration */
		LL_DLYB_SetDelay(DLYB_OCTOSPI2, pdlyb_cfg);
		status = HAL_OK;
	}
#endif /* OCTOSPI2 */

	else
	{
		/* Nothing to do */
	}

	/* Abort the current OCTOSPI operation if exist */
	(void)HAL_OSPI_Abort(hospi);

	/* Disable Free Running Clock */
	CLEAR_BIT(hospi->Instance->DCR1, OCTOSPI_DCR1_FRCK);

	/* Return function status */
	return status;
}

/**
 * @brief  Get the clock period.
 *   DLYBx: Pointer to DLYB instance.
 *   pdlyb_cfg: Pointer to DLYB configuration structure.
 * @retval An ErrorStatus enumeration value:
 *          - SUCCESS: there is a valid period detected and stored in pdlyb_cfg.
 *          - ERROR: there is no valid period detected.
 */
uint32_t LL_DLYB_GetClockPeriod(DLYB_TypeDef *DLYBx, LL_DLYB_CfgTypeDef *pdlyb_cfg)
{
	uint32_t i = 0U;
	uint32_t nb ;
	uint32_t lng ;
	uint32_t tickstart;

	/* Check the DelayBlock instance */
	assert_param(IS_DLYB_ALL_INSTANCE(DLYBx));

	/* Enable the length sampling */
	SET_BIT(DLYBx->CR, DLYB_CR_SEN);


	/* Delay line length detection */
	while (i < DLYB_MAX_UNIT)
	{
		/* Set the Delay of the UNIT(s)*/
		DLYBx->CFGR = DLYB_MAX_SELECT | (i << DLYB_CFGR_UNIT_Pos);

		/* Waiting for a LNG valid value */
		tickstart =  HAL_GetTick();
		while ((DLYBx->CFGR & DLYB_CFGR_LNGF) == 0U)
		{
			if ((HAL_GetTick() - tickstart) >=  DLYB_TIMEOUT)
			{
				/* New check to avoid false timeout detection in case of preemption */
				if ((DLYBx->CFGR & DLYB_CFGR_LNGF) == 0U)
				{
					return (uint32_t) HAL_TIMEOUT;
				}
			}
		}

		if ((DLYBx->CFGR & DLYB_LNG_10_0_MASK) != 0U)
		{
			if ((DLYBx->CFGR & (DLYB_CFGR_LNG_11 | DLYB_CFGR_LNG_10)) != DLYB_LNG_11_10_MASK)
			{
				/* Delay line length is configured to one input clock period*/
				break;
			}
		}
		i++;
	}

	if (DLYB_MAX_UNIT != i)
	{
		/* Determine how many unit delays (nb) span one input clock period */
		lng = (DLYBx->CFGR & DLYB_CFGR_LNG) >> 16U;
		nb = 10U;
		while ((nb > 0U) && ((lng >> nb) == 0U))
		{
			nb--;
		}
		if (nb != 0U)
		{
			pdlyb_cfg->PhaseSel = nb ;
			pdlyb_cfg->Units = i ;

			/* Disable the length sampling */
			DLYBx->CR = DLYB_CR_SEN;

			return (uint32_t)SUCCESS;
		}
	}

	/* Disable the length sampling */
	DLYBx->CR = DLYB_CR_SEN;

	return (uint32_t)ERROR;

}

__STATIC_INLINE void LL_DLYB_Disable(DLYB_TypeDef *DLYBx)
{
	CLEAR_BIT(DLYBx->CR, DLYB_CR_DEN);
}

HAL_StatusTypeDef HAL_OSPI_DLYB_GetClockPeriod(OSPI_HandleTypeDef *hospi, LL_DLYB_CfgTypeDef  *const pdlyb_cfg)
{
	HAL_StatusTypeDef status = HAL_ERROR;

	/* Enable XSPI Free Running Clock (mandatory) */
	//  SET_BIT(hxspi->Instance->DCR1, XSPI_DCR1_FRCK);
	SET_BIT(hospi->Instance->DCR1, OCTOSPI_DCR1_FRCK);



	/* Update XSPI state */
	//  hxspi->State = HAL_XSPI_STATE_BUSY_CMD;
	hospi->State = HAL_OSPI_STATE_BUSY_CMD;

	if (hospi->Instance == OCTOSPI2)
	{
		/* Enable the DelayBlock */
		LL_DLYB_Enable(DLYB_OCTOSPI2);

		/* try to detect Period */
		if (LL_DLYB_GetClockPeriod(DLYB_OCTOSPI2, pdlyb_cfg) == (uint32_t)SUCCESS)
		{
			status = HAL_OK;
		}

		/* Disable the DelayBlock */
		LL_DLYB_Disable(DLYB_OCTOSPI2);
	}
	else
	{
		//    hxspi->ErrorCode |= HAL_XSPI_ERROR_INVALID_PARAM;
		hospi->ErrorCode |= HAL_OSPI_ERROR_INVALID_PARAM;
	}

	/* Abort the current XSPI operation if exist */
	//  (void)HAL_XSPI_Abort(hxspi);
	HAL_OSPI_Abort(hospi);

	/* Disable Free Running Clock */
	//  CLEAR_BIT(hxspi->Instance->DCR1, XSPI_DCR1_FRCK);
	CLEAR_BIT(hospi->Instance->DCR1, OCTOSPI_DCR1_FRCK);

	return status;
}

void LL_DLYB_GetDelay(DLYB_TypeDef *DLYBx, LL_DLYB_CfgTypeDef *pdlyb_cfg)
{
	/* Check the DelayBlock instance */
	assert_param(IS_DLYB_ALL_INSTANCE(DLYBx));

	/* Fill the DelayBlock configuration structure with SEL and UNIT value */
	pdlyb_cfg->Units = ((DLYBx->CFGR & DLYB_CFGR_UNIT) >> DLYB_CFGR_UNIT_Pos);
	pdlyb_cfg->PhaseSel = (DLYBx->CFGR & DLYB_CFGR_SEL);
}

HAL_StatusTypeDef HAL_OSPI_DLYB_GetConfig(OSPI_HandleTypeDef *hospi, LL_DLYB_CfgTypeDef  *const pdlyb_cfg)
{
	HAL_StatusTypeDef status = HAL_ERROR;

	if (hospi->Instance == OCTOSPI2)
	{
		LL_DLYB_GetDelay(DLYB_OCTOSPI2, pdlyb_cfg);
		status = HAL_OK;
	}
	else
	{
		hospi->ErrorCode |= HAL_OSPI_ERROR_INVALID_PARAM;
	}

	return status;
}

 

 

 

 

 

 

Kudo posts if you have the same problem and kudo replies if the solution works.
Click "Accept as Solution" if a reply solved your problem. If no solution was posted please answer with your own.