Answered (Assumed Answered)

Question about the assembly code generated for a floating-point computation: is the FPU actually not being used?

Question asked by Lingjun Kong on Aug 3, 2017
Latest reply on Aug 4, 2017 by Clive One

I'm using the STM32F746-Disco board to do a floating-point computation. The function is shown here:

/*
 * Fully-connected layer: for each of the W_ROW outputs, C[row] is set to
 * *bias plus the dot product of X[0 .. W_COLUM-1] with the next W_COLUM
 * weights read from W.
 *
 * W        weights, consumed sequentially (W_COLUM values per row, row after row)
 * W_ROW    number of output values written to C
 * W_COLUM  number of inputs read from X for every output
 * X        input vector, re-read from its start for every row
 * C        output vector; one element is written per row
 * bias     initial value of every accumulator; only *bias is read
 *
 * NOTE(review): bias is never advanced, so every row starts from bias[0] -
 * confirm that a single shared bias is intended.
 */
void LK_FullyConnect(LK_Accuarcy * W, int W_ROW, int W_COLUM, LK_Accuarcy * X, LK_Accuarcy * C,LK_Accuarcy *bias)
{
    LK_Accuarcy *weight = W;   /* running cursor over the weights */
    int out_idx = 0;

    while (out_idx < W_ROW) {
        LK_Accuarcy *acc = C + out_idx;   /* output element for this row */
        int in_idx;

        *acc = *bias;
        for (in_idx = 0; in_idx < W_COLUM; ++in_idx) {
            /* Accumulate directly in the output element, one product at a time. */
            *acc += X[in_idx] * (*weight);
            ++weight;
        }
        ++out_idx;
    }
}

But when I debug it, the generated assembly code is:

383: { 
384:
0x08002B98 E92D5FF0 PUSH {r4-r12,lr}
0x08002B9C ED2D8B02 VPUSH.64 {d8-d8}
0x08002BA0 4606 MOV r6,r0
0x08002BA2 4688 MOV r8,r1
0x08002BA4 4617 MOV r7,r2
0x08002BA6 461C MOV r4,r3
0x08002BA8 E9DD5B0C LDRD r5,r11,[sp,#0x30]
385: for (int row = 0; row < W_ROW; row++)
386: {
387:
0x08002BAC F04F0900 MOV r9,#0x00
0x08002BB0 E031 B 0x08002C16
388: *C=*bias;
389:
0x08002BB2 ED9B0B00 VLDR d0,[r11,#0x00]
0x08002BB6 ED850B00 VSTR d0,[r5,#0x00]
390: for (int num = 0; num < W_COLUM; num++)
391: {
0x08002BBA F04F0A00 MOV r10,#0x00
0x08002BBE E023 B 0x08002C08
392: *C += (*X )*(*W);
393: //
0x08002BC0 ED962B00 VLDR d2,[r6,#0x00]
0x08002BC4 EEB00A42 VMOV.F32 s0,s4
0x08002BC8 EEF00A62 VMOV.F32 s1,s5
0x08002BCC ED942B00 VLDR d2,[r4,#0x00]
0x08002BD0 EEB01A42 VMOV.F32 s2,s4
0x08002BD4 EEF01A62 VMOV.F32 s3,s5
0x08002BD8 EC532B10 VMOV r2,r3,d0
0x08002BDC EC510B11 VMOV r0,r1,d1
0x08002BE0 F7FDFBB5 BL.W __aeabi_dmul (0x0800034E)
0x08002BE4 EC410B18 VMOV d8,r0,r1
0x08002BE8 ED951B00 VLDR d1,[r5,#0x00]
0x08002BEC EEB00A41 VMOV.F32 s0,s2
0x08002BF0 EEF00A61 VMOV.F32 s1,s3
0x08002BF4 EC532B10 VMOV r2,r3,d0
0x08002BF8 F7FDFB02 BL.W __aeabi_dadd (0x08000200)
0x08002BFC E9C50100 STRD r0,r1,[r5,#0]
394: X++;
0x08002C00 3408 ADDS r4,r4,#0x08
395: W++;
396: }
0x08002C02 3608 ADDS r6,r6,#0x08
0x08002C04 F10A0A01 ADD r10,r10,#0x01
0x08002C08 45BA CMP r10,r7
0x08002C0A DBD9 BLT 0x08002BC0
397: X-=W_COLUM;
0x08002C0C EBA404C7 SUB r4,r4,r7,LSL #3
398: C++;
399: }
0x08002C10 3508 ADDS r5,r5,#0x08
385: for (int row = 0; row < W_ROW; row++)
386: {
387:
388: *C=*bias;
389:
390: for (int num = 0; num < W_COLUM; num++)
391: {
392: *C += (*X )*(*W);
393: //
394: X++;
395: W++;
396: }
397: X-=W_COLUM;
398: C++;
399: }
0x08002C12 F1090901 ADD r9,r9,#0x01
0x08002C16 45C1 CMP r9,r8
0x08002C18 DBCB BLT 0x08002BB2
400: }
0x08002C1A ECBD8B02 VPOP.64 {d8-d8}

Please look at the code I bolded:

 

First, it loads the data from the address in r6 (which holds the W parameter) into the FPU register d2. This looks fine.

0x08002BC0 VLDR d2,[r6,#0x00]: 

 

But then it moves the data from d2 to d0 (s0, s1).

This is my first question: why doesn't it just load the data at the address in r6 directly into the FPU register d0?

0x08002BC4 VMOV.F32 s0,s4:
0x08002BC8 VMOV.F32 s1,s5:

 

In this part, it loads the data at the address in r4 (which is actually the X parameter) into d2 again, and likewise moves it to d1 (s2, s3).
0x08002BCC VLDR d2,[r4,#0x00]
0x08002BD0 VMOV.F32 s2,s4
0x08002BD4 VMOV.F32 s3,s5

 

The following code makes the whole program look stupid: it moves the data in d0 and d1 out of the FPU, and then uses the software floating-point library (the __aeabi routines) to do the computation.

This is my second question: why doesn't it use VMUL (or a similar FPU instruction) to do that?
0x08002BD8 VMOV r2,r3,d0
0x08002BDC VMOV r0,r1,d1
0x08002BE0 BL.W __aeabi_dmul (0x0800034E)

 

In this part, it loads the multiplication result back into the FPU from the CPU registers,
0x08002BE4 VMOV d8,r0,r1

 

and loads the value at the address in r5 (C) into d1.
0x08002BE8 VLDR d1,[r5,#0x00]

 

Then it moves the data from d1 to d0. Why not just load it into d0 directly? This is the same as my first question.
0x08002BEC VMOV.F32 s0,s2
0x08002BF0 VMOV.F32 s1,s3

 

And then it moves that data (the copy of d1 now held in d0) to the CPU registers. Gosh!
0x08002BF4 VMOV r2,r3,d0

 

Then the CPU computes the floating-point addition in software:

0x08002BF8 BL.W __aeabi_dadd (0x08000200)
0x08002BFC STRD r0,r1,[r5,#0]

 

 

I really can't understand what the compiler is thinking. Is there any book or manual that would help me learn FPU programming? I've checked the ARM help documents, the ARMv7 manual, and the FPU instruction reference, but I can't find anything useful.

 

The IDE I'm using is Keil MDK v5, 2.3. I have already enabled the "Use FPU (single precision)" option, and the FPU is enabled in the SystemInit function. The ARM compiler I've chosen is "the default compiler version"; I don't know exactly which version that is.

; Reset handler: calls SystemInit, then branches to __main.
Reset_Handler PROC
EXPORT Reset_Handler [WEAK] ; exported as a weak symbol
IMPORT SystemInit
IMPORT __main

LDR R0, =SystemInit ; R0 = address of SystemInit
BLX R0 ; call SystemInit; execution continues below when it returns
LDR R0, =__main ; R0 = address of __main
BX R0 ; branch to __main without linking (no return address is saved)
ENDP

 

/*
 * Early system setup, called from Reset_Handler before __main.
 *
 * Grants access to the FPU coprocessors (only when the build defines both
 * __FPU_PRESENT and __FPU_USED as 1), puts the RCC clock configuration
 * registers back to fixed reset values, and programs the vector table
 * base address (SRAM or FLASH, selected by VECT_TAB_SRAM).
 */
void SystemInit(void)
{
/* FPU settings ------------------------------------------------------------*/
#if (__FPU_PRESENT == 1) && (__FPU_USED == 1)
SCB->CPACR |= ((3UL << 10*2)|(3UL << 11*2)); /* set CP10 and CP11 Full Access (CPACR bits 20-23) */
#endif
/* Reset the RCC clock configuration to the default reset state ------------*/
/* Set HSION bit (bit 0) */
RCC->CR |= (uint32_t)0x00000001;

/* Reset CFGR register */
RCC->CFGR = 0x00000000;

/* Reset HSEON, CSSON and PLLON bits (mask clears bits 16, 19 and 24) */
RCC->CR &= (uint32_t)0xFEF6FFFF;

/* Reset PLLCFGR register */
RCC->PLLCFGR = 0x24003010;

/* Reset HSEBYP bit (mask clears bit 18) */
RCC->CR &= (uint32_t)0xFFFBFFFF;

/* Disable all interrupts */
RCC->CIR = 0x00000000;

/* Configure the Vector Table location add offset address ------------------*/
#ifdef VECT_TAB_SRAM
SCB->VTOR = RAMDTCM_BASE | VECT_TAB_OFFSET; /* Vector Table Relocation in Internal SRAM */
#else
SCB->VTOR = FLASH_BASE | VECT_TAB_OFFSET; /* Vector Table Relocation in Internal FLASH */
#endif
}

Thanks for your help.

 

 

By contrast, this is the code generated when I disable the FPU option in Keil:

383: {
384:
0x08002B98 E92D5FFF PUSH {r0-r12,lr}
0x08002B9C 4606 MOV r6,r0
0x08002B9E 4688 MOV r8,r1
0x08002BA0 4617 MOV r7,r2
0x08002BA2 461C MOV r4,r3
0x08002BA4 9D0E LDR r5,[sp,#0x38]
385: for (int row = 0; row < W_ROW; row++)
386: {
387:
0x08002BA6 F04F0900 MOV r9,#0x00
0x08002BAA E023 B 0x08002BF4
388: *C=*bias;
389:
0x08002BAC 990F LDR r1,[sp,#0x3C]
0x08002BAE C903 LDM r1,{r0-r1}
0x08002BB0 E9C50100 STRD r0,r1,[r5,#0]
390: for (int num = 0; num < W_COLUM; num++)
391: {
0x08002BB4 F04F0A00 MOV r10,#0x00
0x08002BB8 E015 B 0x08002BE6
392: *C += (*X )*(*W);
393: //printf_s(" the W is %f X is %f C is %f \r\n",*(W),*(X ),*C);
0x08002BBA E9D6B300 LDRD r11,r3,[r6,#0]
0x08002BBE E9D40100 LDRD r0,r1,[r4,#0]
0x08002BC2 465A MOV r2,r11
0x08002BC4 E9CD0100 STRD r0,r1,[sp,#0]
0x08002BC8 F7FDFBC1 BL.W __aeabi_dmul (0x0800034E)
0x08002BCC E9CD0102 STRD r0,r1,[sp,#0x08]
0x08002BD0 E9D5B300 LDRD r11,r3,[r5,#0]
0x08002BD4 465A MOV r2,r11
0x08002BD6 F7FDFB13 BL.W __aeabi_dadd (0x08000200)
0x08002BDA E9C50100 STRD r0,r1,[r5,#0]
394: X++;
0x08002BDE 3408 ADDS r4,r4,#0x08
395: W++;
396: }
0x08002BE0 3608 ADDS r6,r6,#0x08
0x08002BE2 F10A0A01 ADD r10,r10,#0x01
0x08002BE6 45BA CMP r10,r7
0x08002BE8 DBE7 BLT 0x08002BBA
397: X-=W_COLUM;
0x08002BEA EBA404C7 SUB r4,r4,r7,LSL #3
398: C++;
399: }
0x08002BEE 3508 ADDS r5,r5,#0x08
385: for (int row = 0; row < W_ROW; row++)
386: {
387:
388: *C=*bias;
389:
390: for (int num = 0; num < W_COLUM; num++)
391: {
392: *C += (*X )*(*W);
393: //printf_s(" the W is %f X is %f C is %f \r\n",*(W),*(X ),*C);
394: X++;
395: W++;
396: }
397: X-=W_COLUM;
398: C++;
399: }
0x08002BF0 F1090901 ADD r9,r9,#0x01
0x08002BF4 45C1 CMP r9,r8
0x08002BF6 DBD9 BLT 0x08002BAC
400: }
0x08002BF8 E8BD9FFF POP {r0-r12,pc}

It looks simpler; after all, the data doesn't need to travel to the FPU only to end up being computed by the CPU anyway.

Outcomes