cancel
Showing results for 
Search instead for 
Did you mean: 

STM32H750 assembly: same delay loops, but one takes 5x longer?

bully
Senior

Hello,

I have simple project in assembly and can't understand/explain why one time delay loop takes much longer than the other one (LEDs should be blinking one on, second off and vice versa). But one such state takes approx. 5 times more time than the other, although loops seem to be written equally demanding

Maybe I'm doing something obvious wrong. BTW, we have also spotted such behaviour also on some other projects - one loop taking much more time, although it should take same time. I've tried running the code from Flash or RAM and same happens in both situations.

Any hint, advice? Thanks... 

 

Short description: main loop has two identical SW delay loops, in between on/off state is written to 3 variables. SysTick Handler is triggered each ms and reads the state of variables and reflects to real outputs. Green LED is on approx. 5 times longer that red one.

 

/*
 * Main.s
  */


  .syntax unified
  .cpu cortex-m7
  .thumb


///////////////////////////////////////////////////////////////////////////////
// Definitions
///////////////////////////////////////////////////////////////////////////////
// Definitions section. Define all the registers and
// constants here for code readability.

// Constants

	.equ     LEDDELAY,      6400

// For LOOPTC Software delay
// By default 64MHz internal HSI clock is enabled
// Internal loop takes N cycles

// Register Addresses
// You can find the base addresses for all peripherals from Memory Map section 2.3.2
// RM0433 on page 131. Then the offsets can be found on their relevant sections.

// RCC   base address is 0x58024400
//   AHB4ENR register offset is 0xE0
	.equ     RCC_AHB4ENR,   0x580244E0 // RCC AHB4 peripheral clock reg

// GPIOA base address is 0x58020000
	.equ     GPIOA_BASE,   0x58020000 // GPIOI base address)

// GPIOI base address is 0x58022000
	.equ     GPIOI_BASE,   0x58022000 // GPIOI base address)

// GPIOJ base address is 0x58022000
	.equ     GPIOJ_BASE,   0x58022400 // GPIOJ base address)

//   MODER register offset is 0x00
	.equ     GPIOx_MODER,   0x00 // GPIOx port mode register
//   ODR   register offset is 0x14
	.equ     GPIOx_ODR,     0x14 // GPIOx output data register
//   BSSR   register offset is 0x18
	.equ     GPIOx_BSRR,     0x18 // GPIOx port set/reset register


// Values for BSRR register - pin PI13: LED is on, when GPIO is off (Red)
	.equ     LED2_OFF,       0x00002000   	// Setting pin to 1 -> LED is off
	.equ     LED2_ON,   	 0x20000000   	// Setting pin to 0 -> LED is on

// Values for BSRR register - pin PJ2: LED is on, when GPIO is off (Green)
	.equ     LED1_OFF,       0x00000004   	// Setting pin to 1 -> LED is off
	.equ     LED1_ON,   	 0x00040000   	// Setting pin to 0 -> LED is on

// Values for BSRR register - pin PA3: PA3
	.equ     PA3_ON,         0x00000008   	// Setting pin to 1
	.equ     PA3_OFF,        0x00080000   	// Setting pin to 0

// Vector table offset register definition
// Important for relocated Vector table on running from RAM
	.equ VTOR,0xE000ED08

// SysTick Timer definitions
	.equ     SCS_BASE,0xe000e000
	.equ     SCS_SYST_CSR,0x10// Control/Status register
	.equ     SCS_SYST_RVR,0x14// Value to countdown from
	.equ     SCS_SYST_CVR,0x18// Current value

	.equ	 SYSTICK_RELOAD_1MS,	63999  //1 msec at 64MHz ...


// Start of data section
 		.data

 		.align

LED1:   .word   0		// LED1 State (Green)
LED2:   .word   0		// LED2 State (Red)
PA3:    .word   0		// PA3 pin State



// Start of text section
  .text

  .type  main, %function
  .global main

   	   	.align
main:
 
	    bl 	INIT        // Priprava V/I in sistemskih naprav za kontrolo LED diod in PA3

        ldr r1,=LED1
        ldr r2,=LED2
        ldr r3,=PA3

        mov r4,#0xff    // LED(Pin) On value
        mov r5,#0       // LED(Pin) Off value

loop:

		str r4,[r1]     // Vklop LED1 diode  (Green)
		str r5,[r2]     // Izklop LED2 diode (Red)
		str r4,[r3]     // Vklop PA3
//        bl  WRITEOUT    // Prenesi na prikljucke

@      delay half cycle
        mov r0,#500
ZAN1:   ldr r6, =LEDDELAY
ZAN1n:  subs r6, r6,#1
        bne ZAN1n
        subs r0,r0,#1
        bne ZAN1

		str r5,[r1]     // Izklop LED1 diode  (Green)
		str r4,[r2]     // Vklop LED2 diode   (Red)
		str r5,[r3]     // Izklop PA3
//        bl  WRITEOUT    // Prenesi na prikljucke

@      delay half cycle
       mov r0,#500
ZAN2:  ldr r6, =LEDDELAY
ZAN2n: subs r6, r6,#1
       bne ZAN2n
       subs r0,r0,#1
       bne ZAN2

		b loop          // skok na vrstico loop:


__end: 	b 	__end


INIT:
  		push {r0,r1,lr}

        bl INIT_IO

//      If running code from FLASH comment next 3 lines!!!
        ldr r1, =VTOR // Set Vector table addr. to 0x24000000
		ldr r0, =0x24000000
		str r0, [r1]

		bl INIT_TC_PSP // Priprava SysTick časovnika s prek

	  	pop {r0,r1,pc}

INIT_IO:
  	push {r5, r6, lr}

	// Enable GPIOA,I,J Peripheral Clock (bit 8 in AHB4ENR register)
	ldr r6, = RCC_AHB4ENR       // Load peripheral clock reg address to r6
	ldr r5, [r6]                // Read its content to r5
	orr r5, #0x00000300         // Set bits 8 and 9 to enable GPIOI,J clock
	orr r5, #0x00000001         // Set bits 1 to enable GPIOA clock
	str r5, [r6]                // Store result in peripheral clock register

	// Make GPIOA Pin3 as output pin (bits 7:6 in MODER register)
	ldr r6, =GPIOA_BASE       // Load GPIOA BASE address to r6
	ldr r5, [r6,#GPIOx_MODER]  // Read GPIOA_MODER content to r5
	and r5, #0xFFFFFF3F          // Clear bits 7-6 for PA3
	orr r5, #0x00000040          // Write 01 to bits 7-6 for PA3
	str r5, [r6]                // Store result in GPIO MODER register

	// Make GPIOI Pin13 as output pin (bits 27:26 in MODER register)
	ldr r6, =GPIOI_BASE       // Load GPIOI BASE address to r6
	ldr r5, [r6,#GPIOx_MODER]  // Read GPIOI_MODER content to r5
	and r5, #0xF3FFFFFF          // Clear bits 27-26 for P13
	orr r5, #0x04000000          // Write 01 to bits 27-26 for P13
	str r5, [r6]                // Store result in GPIO MODER register

	// Make GPIOJ Pin2 as output pin (bits 5:4 in MODER register)
	ldr r6, =GPIOJ_BASE       // Load GPIOJ BASE address to r6
	ldr r5, [r6,#GPIOx_MODER]  // Read GPIOJ_MODER content to r5
	and r5, #0xFFFFFFCF          // Clear bits 5-4 for P2
	orr r5, #0x00000010          // Write 01 to bits 5-4 for PJ2
	str r5, [r6]                // Store result in GPIO MODER register

  	pop {r5, r6, pc}

INIT_TC_PSP:
	  	push {r0, r1, lr}
		ldr r1, =SCS_BASE

		ldr r0, =SYSTICK_RELOAD_1MS
		str r0, [r1, #SCS_SYST_RVR]

		mov r0, #0
		str r0, [r1, #SCS_SYST_CVR]

		mov r0, #0b111    // Set TickInt to 1 as well
		str r0, [r1, #SCS_SYST_CSR]

	  	pop {r0, r1, pc}

.global SysTick_Handler
.section .text.SysTick_Handler,"ax",%progbits
.type SysTick_Handler, %function

SysTick_Handler:

		push {r3, r4, r5, r6, lr}

// -----------------------------------
//      Set LED1 from LED1 variable
		ldr r3,=LED1 // Load LED1 value
		ldr r4,[r3]

		cmp r4,#0
		beq L1ON

		mov r5, #LED1_OFF
		b   CONT1
L1ON: 	mov r5, #LED1_ON

CONT1:  // Set GPIOJ Pins through BSRR register
		ldr r6, =GPIOJ_BASE // Load GPIOD BASE address to r6
		str r5, [r6,#GPIOx_BSRR] // Write to BSRR register

// -----------------------------------
//      Set LED2 from LED2 variable
		ldr r3,=LED2 // Load LED1 value
		ldr r4,[r3]

		cmp r4,#0
		beq L2ON

		mov r5, #LED2_OFF
		b   CONT2
L2ON: 	mov r5, #LED2_ON

CONT2:  // Set GPIOI Pins through BSRR register
		ldr r6, =GPIOI_BASE // Load GPIOD BASE address to r6
		str r5, [r6,#GPIOx_BSRR] // Write to BSRR register

// -----------------------------------
//      Set PA3 from PA3 variable
		ldr r3,=PA3 // Load PA3 value
		ldr r4,[r3]

		cmp r4,#0
		beq L3ON

		mov r5, #PA3_OFF
		b   CONT3
L3ON: 	mov r5, #PA3_ON

CONT3:  // Set GPIOA Pins through BSRR register
		ldr r6, =GPIOA_BASE // Load GPIOD BASE address to r6
		str r5, [r6,#GPIOx_BSRR] // Write to BSRR register

RET: 	pop {r3, r4, r5, r6, pc }

 

3 REPLIES 3

When checking timing, drive the pins directly so you can scope them. I'm not sure of the value of establishing different beat frequencies from setting variables, and then actioning them on a 1 KHz tick.

You can measure cycles via DWT CYCCNT

Check alignments of branch targets.

Perhaps put the delay in subroutine, to confirm that the same code in the same location has consistent behaviour.

Tips, Buy me a coffee, or three.. PayPal Venmo
Up vote any posts that you find helpful, it shows what's working..
bully
Senior

Hello,

thanks for tips.

Can I ask for more info about "alignments of branch targets"?

 

I've extended the code with CYCCNT measurements, but I guess I'm doing something wrong, because it measures only for the first time, then it seems to repeat same value.

Is there any more detailed docs about DWT counters? I'm also interested in other counters for pipeline performance. I have never suceeded to read others - only CYCCNT and only first time in the code...

 

Thanks.

/*
 * Main.s
 */


  .syntax unified
  .cpu cortex-m7
  .thumb


///////////////////////////////////////////////////////////////////////////////
// Definitions
///////////////////////////////////////////////////////////////////////////////
// Definitions section. Define all the registers and
// constants here for code readability.

// Constants

	.equ     LEDDELAY,      64000

// For LOOPTC Software delay
// By default 64MHz internal HSI clock is enabled
// Internal loop takes N cycles

// Register Addresses
// You can find the base addresses for all peripherals from Memory Map section 2.3.2
// RM0433 on page 131. Then the offsets can be found on their relevant sections.

// RCC   base address is 0x58024400
//   AHB4ENR register offset is 0xE0
	.equ     RCC_AHB4ENR,   0x580244E0 // RCC AHB4 peripheral clock reg

// GPIOA base address is 0x58020000
	.equ     GPIOA_BASE,   0x58020000 // GPIOI base address)

// GPIOI base address is 0x58022000
	.equ     GPIOI_BASE,   0x58022000 // GPIOI base address)

// GPIOJ base address is 0x58022000
	.equ     GPIOJ_BASE,   0x58022400 // GPIOJ base address)

//   MODER register offset is 0x00
	.equ     GPIOx_MODER,   0x00 // GPIOx port mode register
//   ODR   register offset is 0x14
	.equ     GPIOx_ODR,     0x14 // GPIOx output data register
//   BSSR   register offset is 0x18
	.equ     GPIOx_BSRR,     0x18 // GPIOx port set/reset register


// Values for BSRR register - pin PI13: LED is on, when GPIO is off (Red)
	.equ     LED2_OFF,       0x00002000   	// Setting pin to 1 -> LED is off
	.equ     LED2_ON,   	 0x20000000   	// Setting pin to 0 -> LED is on

// Values for BSRR register - pin PJ2: LED is on, when GPIO is off (Green)
	.equ     LED1_OFF,       0x00000004   	// Setting pin to 1 -> LED is off
	.equ     LED1_ON,   	 0x00040000   	// Setting pin to 0 -> LED is on

// Values for BSRR register - pin PA3: PA3
	.equ     PA3_ON,         0x00000008   	// Setting pin to 1
	.equ     PA3_OFF,        0x00080000   	// Setting pin to 0

// Vector table offset register definition
// Important for relocated Vector table on running from RAM
	.equ VTOR,0xE000ED08

// SysTick Timer definitions
	.equ     SCS_BASE,0xe000e000
	.equ     SCS_SYST_CSR,0x10// Control/Status register
	.equ     SCS_SYST_RVR,0x14// Value to countdown from
	.equ     SCS_SYST_CVR,0x18// Current value

	.equ	 SYSTICK_RELOAD_1MS,	63999  //1 msec at 64MHz ...

// Register Addresses

	.equ     DWT_BASE,   	0xE0001000 // DWT Base address

	.equ     DWT_CTRL,   	0x00 // DWT_CTRL   reg (RM0433, pp.3209)
	.equ     DWT_CYCCNT,   	0x04 // increments on each clock cycle when the processor is not halted in debug state.
	.equ     DWT_CPICNT,   	0x08 // additional cycles required to execute multi-cycle instructions, and instruction fetch stalls
	.equ     DWT_EXCCNT,   	0x0C // count the total cycles spent in interrupt processing (cycles spent performing exception entry and exit procedures)
	.equ     DWT_SLPCNT,   	0x10 // count the total number of cycles during which the processor is sleeping (cycles spent sleeping)
	.equ     DWT_LSUCNT,   	0x14 // counts the total number of cycles that the processor is processing an LSU operation (cycles spent waiting for loads and stores to complete)
								 // For example, an LDR that takes two cycles to complete increments this counter one cycle.
								 // Equivalently, an LDR that stalls for two cycles (and so takes four cycles), increments counter three times.
	.equ     DWT_FOLDCNT,   0x18 // count the total number of folded instructions (cycles saved by instructions which execute in zero cycles)
								 // This counts 1 for each instruction that takes 0 cycles.

// If the processor configuration includes the DWT profiling counters, the instruction count can be calculated as:

// instructions executed = DWT_CYCCNT - DWT_CPICNT - DWT_EXCCNT - DWT_SLEEPCNT - DWT_LSUCNT + DWT_FOLDCNT


	.equ     DWT_LAR,   	0xFB0 // DWT_LAR  DWT_LAR = 0xC5ACCE55; // unlock (CM7)
	.equ     DEMCR,   	    0xE000EDFC // SCB_DEMCR |= 0x01000000;

// Start of data section
 		.data

 		.align

LED1:   .word   0		// LED1 State (Green)
LED2:   .word   0		// LED2 State (Red)
PA3:    .word   0		// PA3 pin State



// Start of text section
  .text

  .type  main, %function
  .global main

   	   	.align
main:
 
	    bl 	INIT        // Priprava V/I in sistemskih naprav za kontrolo LED diod in PA3

		bl  INIT_CNT
		ldr r0, =DWT_BASE

        ldr r1,=LED1
        ldr r2,=LED2
        ldr r3,=PA3

        mov r4,#0xff    // LED(Pin) On value
        mov r5,#0       // LED(Pin) Off value

loop:

		str r4,[r1]     // Vklop LED1 diode  (Green)
		str r5,[r2]     // Izklop LED2 diode (Red)
		str r4,[r3]     // Vklop PA3
//        bl  WRITEOUT    // Prenesi na prikljucke

		bl  INIT_CNT
		bl RESET_CNT
		bl ENABLE_CNT

		// Read DWT Counter before value
		ldr r8, [r0,#DWT_CYCCNT]

@      delay half cycle
        mov r0,#500
ZAN1:   ldr r6, =LEDDELAY
ZAN1n:  subs r6, r6,#1
        bne ZAN1n
        subs r0,r0,#1
        bne ZAN1

// Read DWT Counter after value
		ldr r10, [r0,#DWT_CYCCNT]

		sub r8,r10,r8   // Difference in r0


		str r5,[r1]     // Izklop LED1 diode  (Green)
		str r4,[r2]     // Vklop LED2 diode   (Red)
		str r5,[r3]     // Izklop PA3
//        bl  WRITEOUT    // Prenesi na prikljucke


		bl  INIT_CNT
		bl RESET_CNT
		bl ENABLE_CNT

		// Read DWT Counter before value
		ldr r9, [r0,#DWT_CYCCNT]

@      delay half cycle
       mov r0,#500
ZAN2:  ldr r6, =LEDDELAY
ZAN2n: subs r6, r6,#1
       bne ZAN2n
       subs r0,r0,#1
       bne ZAN2

 // Read DWT Counter after value
		ldr r10, [r0,#DWT_CYCCNT]

		sub r11,r10,r9   // Difference in r0


		b loop          // skok na vrstico loop:


__end: 	b 	__end


INIT:
  		push {r0,r1,lr}

        bl INIT_IO

//      If running code from FLASH comment next 3 lines!!!
        ldr r1, =VTOR // Set Vector table addr. to 0x24000000
		ldr r0, =0x24000000
		str r0, [r1]

		bl INIT_TC_PSP // Priprava SysTick časovnika s prek

	  	pop {r0,r1,pc}

INIT_IO:
  	push {r5, r6, lr}

	// Enable GPIOA,I,J Peripheral Clock (bit 8 in AHB4ENR register)
	ldr r6, = RCC_AHB4ENR       // Load peripheral clock reg address to r6
	ldr r5, [r6]                // Read its content to r5
	orr r5, #0x00000300         // Set bits 8 and 9 to enable GPIOI,J clock
	orr r5, #0x00000001         // Set bits 1 to enable GPIOA clock
	str r5, [r6]                // Store result in peripheral clock register

	// Make GPIOA Pin3 as output pin (bits 7:6 in MODER register)
	ldr r6, =GPIOA_BASE       // Load GPIOA BASE address to r6
	ldr r5, [r6,#GPIOx_MODER]  // Read GPIOA_MODER content to r5
	and r5, #0xFFFFFF3F          // Clear bits 7-6 for PA3
	orr r5, #0x00000040          // Write 01 to bits 7-6 for PA3
	str r5, [r6]                // Store result in GPIO MODER register

	// Make GPIOI Pin13 as output pin (bits 27:26 in MODER register)
	ldr r6, =GPIOI_BASE       // Load GPIOI BASE address to r6
	ldr r5, [r6,#GPIOx_MODER]  // Read GPIOI_MODER content to r5
	and r5, #0xF3FFFFFF          // Clear bits 27-26 for P13
	orr r5, #0x04000000          // Write 01 to bits 27-26 for P13
	str r5, [r6]                // Store result in GPIO MODER register

	// Make GPIOJ Pin2 as output pin (bits 5:4 in MODER register)
	ldr r6, =GPIOJ_BASE       // Load GPIOJ BASE address to r6
	ldr r5, [r6,#GPIOx_MODER]  // Read GPIOJ_MODER content to r5
	and r5, #0xFFFFFFCF          // Clear bits 5-4 for P2
	orr r5, #0x00000010          // Write 01 to bits 5-4 for PJ2
	str r5, [r6]                // Store result in GPIO MODER register

  	pop {r5, r6, pc}

INIT_TC_PSP:
	  	push {r0, r1, lr}
		ldr r1, =SCS_BASE

		ldr r0, =SYSTICK_RELOAD_1MS
		str r0, [r1, #SCS_SYST_RVR]

		mov r0, #0
		str r0, [r1, #SCS_SYST_CVR]

		mov r0, #0b111    // Set TickInt to 1 as well
		str r0, [r1, #SCS_SYST_CSR]

	  	pop {r0, r1, pc}

.global SysTick_Handler
.section .text.SysTick_Handler,"ax",%progbits
.type SysTick_Handler, %function

SysTick_Handler:

		push {r3, r4, r5, r6, lr}

// -----------------------------------
//      Set LED1 from LED1 variable
		ldr r3,=LED1 // Load LED1 value
		ldr r4,[r3]

		cmp r4,#0
		beq L1ON

		mov r5, #LED1_OFF
		b   CONT1
L1ON: 	mov r5, #LED1_ON

CONT1:  // Set GPIOJ Pins through BSRR register
		ldr r6, =GPIOJ_BASE // Load GPIOD BASE address to r6
		str r5, [r6,#GPIOx_BSRR] // Write to BSRR register

// -----------------------------------
//      Set LED2 from LED2 variable
		ldr r3,=LED2 // Load LED1 value
		ldr r4,[r3]

		cmp r4,#0
		beq L2ON

		mov r5, #LED2_OFF
		b   CONT2
L2ON: 	mov r5, #LED2_ON

CONT2:  // Set GPIOI Pins through BSRR register
		ldr r6, =GPIOI_BASE // Load GPIOD BASE address to r6
		str r5, [r6,#GPIOx_BSRR] // Write to BSRR register

// -----------------------------------
//      Set PA3 from PA3 variable
		ldr r3,=PA3 // Load PA3 value
		ldr r4,[r3]

		cmp r4,#0
		beq L3ON

		mov r5, #PA3_OFF
		b   CONT3
L3ON: 	mov r5, #PA3_ON

CONT3:  // Set GPIOA Pins through BSRR register
		ldr r6, =GPIOA_BASE // Load GPIOD BASE address to r6
		str r5, [r6,#GPIOx_BSRR] // Write to BSRR register

RET: 	pop {r3, r4, r5, r6, pc }

INIT_CNT:
	  	push {r0-r2, lr}


		// Added in 2024 :
		ldr r1,=DWT_BASE
		ldr r0,=0xC5ACCE55
		str r0,[r1,#DWT_LAR]   // *DWT_LAR = 0xC5ACCE55; // unlock (CM7)

		ldr r1,=DEMCR
		ldr r0,[r1]
		orr r0,r0,#0x01000000
		str r0,[r1]            // *SCB_DEMCR |= 0x01000000;
		// End: Added in 2024 :


    	mov r0,#0
    	ldr r1, =DWT_BASE

// Disable DWT Counters
		ldr r2, [r1,#DWT_CTRL]
		bic r2,r2,#1      // Disabling CYCCNTENA bit
		str r2, [r1,#DWT_CTRL]

// Reset DWT Counters
		str r0, [r1,#DWT_CYCCNT]
		str r0, [r1,#DWT_CPICNT]
		str r0, [r1,#DWT_EXCCNT]
		str r0, [r1,#DWT_SLPCNT]
		str r0, [r1,#DWT_LSUCNT]
		str r0, [r1,#DWT_FOLDCNT]

	  	pop {r0-r2, pc}

RESET_CNT:
	  	push {r0-r2, lr}

	  	mov r0,#0
	  	ldr r1, =DWT_BASE


// Disable DWT Counters
		ldr r2, [r1,#DWT_CTRL]
		bic r2,r2,#1      // Disabling CYCCNTENA bit
		str r2, [r1,#DWT_CTRL]

// Reset DWT Counters
		str r0, [r1,#DWT_CYCCNT]
		str r0, [r1,#DWT_CPICNT]
		str r0, [r1,#DWT_EXCCNT]
		str r0, [r1,#DWT_SLPCNT]
		str r0, [r1,#DWT_LSUCNT]
		str r0, [r1,#DWT_FOLDCNT]

	  	pop {r0-r2, pc}

ENABLE_CNT:
	  	push {r0-r2, lr}

		ldr r0,=0xC5ACCE55
		str r0,[r1,#DWT_LAR]   // *DWT_LAR = 0xC5ACCE55; // unlock (CM7)

	  	ldr r1, =DWT_BASE

// Enable DWT Counters
		ldr r2, [r1,#DWT_CTRL]
		orr r2,r2,#1      // Enabling CYCCNTENA bit
		str r2, [r1,#DWT_CTRL]

	  	pop {r0-r2, pc}

 

Hello,

I've found some errors in my code, but still cannot explain weird behaviour.

If I uncomment two lines of reading CYCCNT counter before entering SW delay nested loop (there are two such loops in program), then both LEDs blink for the same time.

If I comment thoise sections, green LED is on much longer (aprox. 8x) than red LED (which stays in the same time interval).

I really can't understand or explain such behaviour?

Also, I've managed to activate also other counters and get surprising results.

I've measured 32 050 210 cycles on second delay loop and have calculated number of executed instructions according to formula:



 

instructions executed = DWT_CYCCNT - DWT_CPICNT - DWT_EXCCNT - DWT_SLEEPCNT - DWT_LSUCNT + DWT_FOLDCNT

 

and get 32 049 740 instructions, which is approx. CPI of 1.

But if I calculate briefly, the number in one nested delay loop should be above 64 000 000.

Is this difference because of two-issue pipeline?

 

Anyone has an idea what might be wrong?

Thanks in advance.

 

I'm including corrected code :

 

/*
 * Main.s
 *
 * Comment: both time delays appear equal if
 * at least one of CYCCNT reading two lines codes are uncommented:
 * Read DWT Counter before value 1 or Read DWT Counter before value 2.
 *
 * Otherwise: Green LED is switched on much more time that Red LED.
 */


  .syntax unified
  .cpu cortex-m7
  .thumb


///////////////////////////////////////////////////////////////////////////////
// Definitions
///////////////////////////////////////////////////////////////////////////////
// Definitions section. Define all the registers and
// constants here for code readability.

// Constants

	.equ     LEDDELAY,      64000

// For LOOPTC Software delay
// By default 64MHz internal HSI clock is enabled
// Internal loop takes N cycles

// Register Addresses
// You can find the base addresses for all peripherals from Memory Map section 2.3.2
// RM0433 on page 131. Then the offsets can be found on their relevant sections.

// RCC   base address is 0x58024400
//   AHB4ENR register offset is 0xE0
	.equ     RCC_AHB4ENR,   0x580244E0 // RCC AHB4 peripheral clock reg

// GPIOA base address is 0x58020000
	.equ     GPIOA_BASE,   0x58020000 // GPIOI base address)

// GPIOI base address is 0x58022000
	.equ     GPIOI_BASE,   0x58022000 // GPIOI base address)

// GPIOJ base address is 0x58022000
	.equ     GPIOJ_BASE,   0x58022400 // GPIOJ base address)

//   MODER register offset is 0x00
	.equ     GPIOx_MODER,   0x00 // GPIOx port mode register
//   ODR   register offset is 0x14
	.equ     GPIOx_ODR,     0x14 // GPIOx output data register
//   BSSR   register offset is 0x18
	.equ     GPIOx_BSRR,     0x18 // GPIOx port set/reset register


// Values for BSRR register - pin PI13: LED is on, when GPIO is off (Red)
	.equ     LED2_OFF,       0x00002000   	// Setting pin to 1 -> LED is off
	.equ     LED2_ON,   	 0x20000000   	// Setting pin to 0 -> LED is on

// Values for BSRR register - pin PJ2: LED is on, when GPIO is off (Green)
	.equ     LED1_OFF,       0x00000004   	// Setting pin to 1 -> LED is off
	.equ     LED1_ON,   	 0x00040000   	// Setting pin to 0 -> LED is on

// Values for BSRR register - pin PA3: PA3
	.equ     PA3_ON,         0x00000008   	// Setting pin to 1
	.equ     PA3_OFF,        0x00080000   	// Setting pin to 0

// Vector table offset register definition
// Important for relocated Vector table on running from RAM
	.equ VTOR,0xE000ED08

// SysTick Timer definitions
	.equ     SCS_BASE,0xe000e000
	.equ     SCS_SYST_CSR,0x10// Control/Status register
	.equ     SCS_SYST_RVR,0x14// Value to countdown from
	.equ     SCS_SYST_CVR,0x18// Current value

	.equ	 SYSTICK_RELOAD_1MS,	63999  //1 msec at 64MHz ...

// Register Addresses

	.equ     DWT_BASE,   	0xE0001000 // DWT Base address

	.equ     DWT_CTRL,   	0x00 // DWT_CTRL   reg (RM0433, pp.3209)
	.equ     DWT_CYCCNT,   	0x04 // increments on each clock cycle when the processor is not halted in debug state.
	.equ     DWT_CPICNT,   	0x08 // additional cycles required to execute multi-cycle instructions, and instruction fetch stalls
	.equ     DWT_EXCCNT,   	0x0C // count the total cycles spent in interrupt processing (cycles spent performing exception entry and exit procedures)
	.equ     DWT_SLPCNT,   	0x10 // count the total number of cycles during which the processor is sleeping (cycles spent sleeping)
	.equ     DWT_LSUCNT,   	0x14 // counts the total number of cycles that the processor is processing an LSU operation (cycles spent waiting for loads and stores to complete)
								 // For example, an LDR that takes two cycles to complete increments this counter one cycle.
								 // Equivalently, an LDR that stalls for two cycles (and so takes four cycles), increments counter three times.
	.equ     DWT_FOLDCNT,   0x18 // count the total number of folded instructions (cycles saved by instructions which execute in zero cycles)
								 // This counts 1 for each instruction that takes 0 cycles.

	.equ     DWT_CTRL_ENABLE_CNTs, 0x003f0001  // Enable bits 16-21 and 1
// If the processor configuration includes the DWT profiling counters, the instruction count can be calculated as:

// instructions executed = DWT_CYCCNT - DWT_CPICNT - DWT_EXCCNT - DWT_SLEEPCNT - DWT_LSUCNT + DWT_FOLDCNT
	.equ     DWT_LAR,   	0xFB0 // DWT_LAR  DWT_LAR = 0xC5ACCE55; // unlock (CM7)
	.equ     DEMCR,   	    0xE000EDFC // SCB_DEMCR |= 0x01000000;

// Start of data section
 		.data

 		.align
LED1:   .word   0		// LED1 State (Green)
LED2:   .word   0		// LED2 State (Red)
PA3:    .word   0		// PA3 pin State



// Start of text section
  .text

  .type  main, %function
  .global main

   	   	.align
main:

	    bl 	INIT        // Priprava V/I in sistemskih naprav za kontrolo LED diod in PA3

		//bl  INIT_CNT
		//bl RESET_CNT

        ldr r1,=LED1
        ldr r2,=LED2
        ldr r3,=PA3

        mov r4,#0xff    // LED(Pin) On value
        mov r5,#0       // LED(Pin) Off value

loop:

		str r4,[r1]     // Izklop LED1 diode  (Green)
		str r5,[r2]     // Vklop LED2 diode (Red)
		str r4,[r3]     // Vklop PA3 High
//        bl  WRITEOUT    // Prenesi na prikljucke

		//bl RESET_CNT

		// Read DWT Counter before value 1
		// Uncommenting following 2 lines code works (both delays are same,
		//											otherwise one delay takes much longer ???
		//ldr r0, =DWT_BASE
		//ldr r8, [r0,#DWT_CYCCNT]

@      delay half cycle
        mov r0,#500
ZAN1:   ldr r6, =LEDDELAY
ZAN1n:  subs r6, r6,#1
        bne ZAN1n
        subs r0,r0,#1
        bne ZAN1

// Read DWT Counter after value
		//ldr r0, =DWT_BASE
		//ldr r10, [r0,#DWT_CYCCNT]
		//sub r8,r10,r8   // Difference in r8

// Calculate and read other counters
		//mov r0,r8
		//bl CALC_CNT
		//mov r11,r0    // Number of instr. in r11


		str r5,[r1]     // Vklop LED1 diode  (Green)
		str r4,[r2]     // Izklop LED2 diode   (Red)
		str r5,[r3]     // Izklop PA3 Low
//        bl  WRITEOUT    // Prenesi na prikljucke


		// bl RESET_CNT

		// Read DWT Counter before value 2
		// Uncommenting following 2 lines code works (both delays are same,
		//											otherwise one delay takes much longer ???
		ldr r0, =DWT_BASE
		ldr r9, [r0,#DWT_CYCCNT]

@      delay half cycle
       mov r0,#500
ZAN2:  ldr r6, =LEDDELAY
ZAN2n: subs r6, r6,#1
       bne ZAN2n
       subs r0,r0,#1
       bne ZAN2

 // Read DWT Counter after value
//		ldr r0, =DWT_BASE
//		ldr r10, [r0,#DWT_CYCCNT]

//    	sub r10,r10,r9   // Difference in r10

// Calculate and read other counters
//		mov r0,r10
//		bl CALC_CNT
//		mov r12,r0    // Number of instr. in r12

		b loop          // skok na vrstico loop:


__end: 	b 	__end


INIT:
  		push {r0,r1,lr}

        bl INIT_IO

//      If running code from FLASH comment next 3 lines!!!
        ldr r1, =VTOR // Set Vector table addr. to 0x24000000
		ldr r0, =0x24000000
		str r0, [r1]

		bl INIT_TC_PSP // Priprava SysTick časovnika s prek

	  	pop {r0,r1,pc}

INIT_IO:
  	push {r5, r6, lr}

	// Enable GPIOA,I,J Peripheral Clock (bit 8 in AHB4ENR register)
	ldr r6, = RCC_AHB4ENR       // Load peripheral clock reg address to r6
	ldr r5, [r6]                // Read its content to r5
	orr r5, #0x00000300         // Set bits 8 and 9 to enable GPIOI,J clock
	orr r5, #0x00000001         // Set bits 1 to enable GPIOA clock
	str r5, [r6]                // Store result in peripheral clock register

	// Make GPIOA Pin3 as output pin (bits 7:6 in MODER register)
	ldr r6, =GPIOA_BASE       // Load GPIOA BASE address to r6
	ldr r5, [r6,#GPIOx_MODER]  // Read GPIOA_MODER content to r5
	and r5, #0xFFFFFF3F          // Clear bits 7-6 for PA3
	orr r5, #0x00000040          // Write 01 to bits 7-6 for PA3
	str r5, [r6]                // Store result in GPIO MODER register

	// Make GPIOI Pin13 as output pin (bits 27:26 in MODER register)
	ldr r6, =GPIOI_BASE       // Load GPIOI BASE address to r6
	ldr r5, [r6,#GPIOx_MODER]  // Read GPIOI_MODER content to r5
	and r5, #0xF3FFFFFF          // Clear bits 27-26 for P13
	orr r5, #0x04000000          // Write 01 to bits 27-26 for P13
	str r5, [r6]                // Store result in GPIO MODER register

	// Make GPIOJ Pin2 as output pin (bits 5:4 in MODER register)
	ldr r6, =GPIOJ_BASE       // Load GPIOJ BASE address to r6
	ldr r5, [r6,#GPIOx_MODER]  // Read GPIOJ_MODER content to r5
	and r5, #0xFFFFFFCF          // Clear bits 5-4 for P2
	orr r5, #0x00000010          // Write 01 to bits 5-4 for PJ2
	str r5, [r6]                // Store result in GPIO MODER register

  	pop {r5, r6, pc}

INIT_TC_PSP:
	  	push {r0, r1, lr}
		ldr r1, =SCS_BASE

		ldr r0, =SYSTICK_RELOAD_1MS
		str r0, [r1, #SCS_SYST_RVR]

		mov r0, #0
		str r0, [r1, #SCS_SYST_CVR]

		mov r0, #0b111    // Set TickInt to 1 as well
		str r0, [r1, #SCS_SYST_CSR]

	  	pop {r0, r1, pc}

.global SysTick_Handler
.section .text.SysTick_Handler,"ax",%progbits
.type SysTick_Handler, %function

SysTick_Handler:

		push {r3, r4, r5, r6, lr}

// -----------------------------------
//      Set LED1 from LED1 variable
		ldr r3,=LED1 // Load LED1 value
		ldr r4,[r3]

		cmp r4,#0
		beq L1ON

		mov r5, #LED1_OFF
		b   CONT1
L1ON: 	mov r5, #LED1_ON

CONT1:  // Set GPIOJ Pins through BSRR register
		ldr r6, =GPIOJ_BASE // Load GPIOD BASE address to r6
		str r5, [r6,#GPIOx_BSRR] // Write to BSRR register

// -----------------------------------
//      Set LED2 from LED2 variable
		ldr r3,=LED2 // Load LED1 value
		ldr r4,[r3]

		cmp r4,#0
		beq L2ON

		mov r5, #LED2_OFF
		b   CONT2
L2ON: 	mov r5, #LED2_ON

CONT2:  // Set GPIOI Pins through BSRR register
		ldr r6, =GPIOI_BASE // Load GPIOD BASE address to r6
		str r5, [r6,#GPIOx_BSRR] // Write to BSRR register

// -----------------------------------
//      Set PA3 from PA3 variable
		ldr r3,=PA3 // Load PA3 value
		ldr r4,[r3]

		cmp r4,#0
		beq L3ON

		mov r5, #PA3_OFF
		b   CONT3
L3ON: 	mov r5, #PA3_ON

CONT3:  // Set GPIOA Pins through BSRR register
		ldr r6, =GPIOA_BASE // Load GPIOD BASE address to r6
		str r5, [r6,#GPIOx_BSRR] // Write to BSRR register

RET: 	pop {r3, r4, r5, r6, pc }

INIT_CNT:
	  	push {r0-r2, lr}


		// Added in 2024 (but it seems not needed for H7):
//		ldr r1,=DWT_BASE
//		ldr r0,=0xC5ACCE55
//		str r0,[r1,#DWT_LAR]   // *DWT_LAR = 0xC5ACCE55; // unlock (CM7)

//		bit [24]	TRCENA Global enable for all DWT and ITM features:
//				0 = DWT and ITM blocks disabled.
//				1 = DWT and ITM blocks enabled.
		ldr r1,=DEMCR
		ldr r0,[r1]
		orr r0,r0,#0x01000000
		str r0,[r1]            // *SCB_DEMCR |= 0x01000000;
		// End: Added in 2024 :


    	ldr r1, =DWT_BASE

// Disable DWT Counters
		ldr r2, [r1,#DWT_CTRL]
		// bic r2,r2,#1      // Disabling CYCCNTENA bit
		ldr r0,=DWT_CTRL_ENABLE_CNTs // Mask for enabling all Counters bits
		bic r2,r2,r0     // Disabling all counter bits
		str r2, [r1,#DWT_CTRL]

// Reset DWT Counters
    	mov r0,#0
		str r0, [r1,#DWT_CYCCNT]
		str r0, [r1,#DWT_CPICNT]
		str r0, [r1,#DWT_EXCCNT]
		str r0, [r1,#DWT_SLPCNT]
		str r0, [r1,#DWT_LSUCNT]
		str r0, [r1,#DWT_FOLDCNT]

	  	pop {r0-r2, pc}

RESET_CNT:
	  	push {r0-r2, lr}

	  	ldr r1, =DWT_BASE


// Disable DWT Counters
		ldr r2, [r1,#DWT_CTRL]
		// bic r2,r2,#1      // Disabling CYCCNTENA bit
		ldr r0,=DWT_CTRL_ENABLE_CNTs // Enabling all Counters bits
		bic r2,r2,r0     // Disabling all counter bits
		str r2, [r1,#DWT_CTRL]

// Reset DWT Counters
	  	mov r0,#0
		str r0, [r1,#DWT_CYCCNT]
		str r0, [r1,#DWT_CPICNT]
		str r0, [r1,#DWT_EXCCNT]
		str r0, [r1,#DWT_SLPCNT]
		str r0, [r1,#DWT_LSUCNT]
		str r0, [r1,#DWT_FOLDCNT]

        bl  ENABLE_CNT
	  	pop {r0-r2, pc}

CALC_CNT: // DWT Counter is in r0
	  	push {r1-r6,r8, lr}

    	ldr r1, =DWT_BASE

// Disable DWT Counters
		ldr r2, [r1,#DWT_CTRL]
		// bic r2,r2,#1      // Disabling CYCCNTENA bit
		ldr r3,=DWT_CTRL_ENABLE_CNTs // Enabling all Counters bits
		bic r2,r2,r3     // Disabling all counter bits
		str r2, [r1,#DWT_CTRL]

// instructions executed = DWT_CYCCNT - DWT_CPICNT - DWT_EXCCNT - DWT_SLEEPCNT - DWT_LSUCNT + DWT_FOLDCNT
// Read other DWT Counters

		ldr r2, [r1,#DWT_CPICNT]
		sub r8,r0,r2

		ldr r3, [r1,#DWT_EXCCNT]
		sub r8,r8,r3

		ldr r4, [r1,#DWT_SLPCNT]
		sub r8,r8,r4

		ldr r5, [r1,#DWT_LSUCNT]
		sub r8,r8,r5

		ldr r6, [r1,#DWT_FOLDCNT]
		add r8,r8,r6

		// r8 contains number of instructions
		mov r0,r8

	  	pop {r1-r6,r8, pc}


ENABLE_CNT:
	  	push {r0-r2, lr}

	  	ldr r1, =DWT_BASE

// Enable DWT Counters
		ldr r2, [r1,#DWT_CTRL]
//		orr r2,r2,#1      // Enabling CYCCNTENA bit
		ldr r0,=DWT_CTRL_ENABLE_CNTs // Enabling all Counters bits
		orr r2,r2,r0
		str r2, [r1,#DWT_CTRL]

	  	pop {r0-r2, pc}