cancel
Showing results for 
Search instead for 
Did you mean: 

Speeding up complex computation

megahercas6
Senior
Posted on October 28, 2014 at 19:00

For my university I am building a camera-based laser spot position detector.

By simply calculating the centre of mass I can get the position of the laser spot on the CCD camera. The problem is that the array is large (1024x1024x16 bit), and the computation takes a bit too long. With maximum optimization I can get 7 FPS; it would be extremely good to get 20-30 calculations per second. For now, the data is already inside SRAM, and the DCMI doesn't do anything.

/* Scratch intensity sums used by pozicija(): y_eilutes[yy] is the sum of row
   yy, x_eilutes[xx] is the sum of column xx. */
uint32_t y_eilutes[1024],x_eilutes[1024];
/* One image row (1024 pixels) copied out of the frame at SRAM_BANK_ADDR. */
uint16_t data[1024];
/* Centroid computed by pozicija(), before truncation to calc_x / calc_y. */
float pozicija_x = 0.0f,pozicija_y=0.0f;
/* xx / yy: column and row loop counters; pointer: byte offset into the frame. */
uint32_t xx=0,yy=0,pointer = 0;
/* Sums of all entries of x_eilutes[] and y_eilutes[] (the centroid divisors). */
uint32_t sum_x = 0,sum_y=0;
/* Index-weighted sums of x_eilutes[] and y_eilutes[] (the centroid numerators). */
float coef_x = 0,coef_y=0;
/*
 * Locate the laser spot: compute the intensity-weighted centroid of one
 * 1024x1024 frame of 16-bit pixels read from SRAM_BANK_ADDR.
 *
 * Takes no arguments and returns nothing; results are published through
 * globals:
 *   pozicija_x, pozicija_y - centroid as floats (column, row)
 *   calc_x, calc_y         - the same values truncated to uint16_t
 *   sum_x, sum_y           - low 32 bits of the summed column / row totals
 * x_eilutes[] / y_eilutes[] are used as scratch column/row sums and are
 * zeroed again before returning; data[] is left holding the last row.
 *
 * A frame whose total intensity is zero has no centroid; 0 is reported for
 * that axis instead of dividing by zero.
 */
void pozicija(void)
{
    /*
     * Frame totals. 2^20 pixels of up to 16 bits each can need 36 bits, so
     * a 32-bit accumulator may wrap on bright frames and corrupt the result.
     */
    uint64_t total_x = 0;
    uint64_t total_y = 0;

    pointer = 0;
    coef_x = 0;
    coef_y = 0;

    for (yy = 0; yy < 1024; yy++) {
        /* One row sums to at most 1024 * 65535, which fits in 32 bits. */
        uint32_t row_sum = 0;

        for (xx = 0; xx < 1024; xx++) {
            /* pointer is a byte offset into the frame, hence += 2 per pixel. */
            uint16_t pixel = *(__IO uint16_t *)(SRAM_BANK_ADDR + pointer);
            pointer += 2;

            data[xx] = pixel;
            x_eilutes[xx] += pixel;
            row_sum += pixel;
        }

        /* Store the row total once instead of updating memory per pixel. */
        y_eilutes[yy] += row_sum;
    }
    yy = 0;

    /* Totals and first moments of the column / row sums. */
    for (xx = 0; xx < 1024; xx++) {
        total_x += x_eilutes[xx];
        total_y += y_eilutes[xx];
        coef_x += (float)xx * x_eilutes[xx];
        coef_y += (float)xx * y_eilutes[xx];
    }

    /* Keep the legacy 32-bit globals populated for any external reader. */
    sum_x = (uint32_t)total_x;
    sum_y = (uint32_t)total_y;

    /* 0/0 would yield NaN, and converting NaN to uint16_t is undefined. */
    pozicija_x = (total_x != 0) ? coef_x / (float)total_x : 0.0f;
    pozicija_y = (total_y != 0) ? coef_y / (float)total_y : 0.0f;
    calc_x = (uint16_t)pozicija_x;
    calc_y = (uint16_t)pozicija_y;

    /* Leave the scratch sums cleared for the next frame. */
    for (xx = 0; xx < 1024; xx++) {
        x_eilutes[xx] = 0;
        y_eilutes[xx] = 0;
    }
}

So this is my starting point, and it does work fine. Now, since I am not a good programmer, maybe someone can spot how I can speed this up? (The last resort will be rewriting parts of the code in assembler.) 1) The first step is obvious: I need to copy the data to internal memory using DMA or DMA2D. Since DMA2D is a bit simpler, I rewrote the code that copies a line of data to internal SRAM.

this part:
/* Copy one 1024-pixel row from the frame at SRAM_BANK_ADDR into data[]. */
while(xx<1024)
{
data[xx]=(*(__IO uint16_t*) (SRAM_BANK_ADDR + pointer));
pointer+=2; /* pointer is a byte offset; each pixel is 16 bits wide */
xx++;
}
changed to :
/*
 * Program the DMA2D peripheral to copy one 1024-pixel line from the frame
 * at SRAM_BANK_ADDR into data[].
 *
 * offset: index of the first pixel of the line, counted in 16-bit pixels;
 *         it is doubled below to obtain the byte address of the source.
 *
 * Only the configuration is written here; the caller starts the transfer.
 * DMA2D_DeInit() is not called.
 */
void DMA2D_Config(uint32_t offset)
{
    DMA2D_InitTypeDef out_cfg;
    DMA2D_FG_InitTypeDef fg_cfg;

    /* The DMA2D clock must be running before its registers are written. */
    RCC_AHB1PeriphClockCmd(RCC_AHB1Periph_DMA2D, ENABLE);

    /* Output side: memory-to-memory copy, RGB565 colour mode. */
    out_cfg.DMA2D_Mode = DMA2D_M2M;
    out_cfg.DMA2D_CMode = DMA2D_RGB565;

    /* Destination is the start of data[], written as a single line. */
    out_cfg.DMA2D_OutputMemoryAdd = (uint32_t) &data[0];
    out_cfg.DMA2D_OutputOffset = 0;
    out_cfg.DMA2D_NumberOfLine = 1;
    out_cfg.DMA2D_PixelPerLine = 1024;

    /* Output colour registers are all set to zero. */
    out_cfg.DMA2D_OutputRed = 0;
    out_cfg.DMA2D_OutputGreen = 0;
    out_cfg.DMA2D_OutputBlue = 0;
    out_cfg.DMA2D_OutputAlpha = 0;

    DMA2D_Init(&out_cfg);

    /* Foreground (source) side: library defaults, then format and address. */
    DMA2D_FG_StructInit(&fg_cfg);
    fg_cfg.DMA2D_FGCM = DMA2D_RGB565;
    fg_cfg.DMA2D_FGMA = SRAM_BANK_ADDR+offset*2;
    DMA2D_FGConfig(&fg_cfg);
}
and inside while loop:
/* For each of the 1024 rows: DMA2D copies the row into data[], then the CPU
   adds it to the column sums (x_eilutes) and the row sum (y_eilutes). */
while(yy<1024)
{
DMA2D_Config(yy*1024); /* offset of row yy, counted in pixels */
DMA2D_StartTransfer();
/* Busy-wait until the transfer-complete flag is set.
   NOTE(review): nothing in this excerpt clears DMA2D_FLAG_TC between rows;
   confirm the wait still blocks after the first row. */
while(DMA2D_GetFlagStatus(DMA2D_FLAG_TC) == RESET);
while(xx<1024)
{
x_eilutes[xx]+=data[xx];
y_eilutes[yy]+=data[xx];
xx++;
}
xx=0;
yy++;
}

At this point I don't know how I can speed up the code any further using plain C; any ideas? DMA2D did give a boost from 7 FPS to 12.5 FPS, but I still need to double the calculation speed. 2) Since I am using an STM32F429, I will replace it with an ARM Cortex-A7 for better performance, but this can only be done next year.
10 REPLIES 10
megahercas6
Senior
Posted on October 30, 2014 at 10:25

No, this cut performance by half, from 18.8 FPS to 9.4477 FPS.

(It is overclocked to 225 MHz.) What did help boost performance is a temp variable inside the loop.

/* Accumulate the row sum in temp and store it to y_eilutes[yy] once after
   the loop, instead of updating y_eilutes[yy] for every pixel. */
temp=0;
while(xx<1024)
{
x_eilutes[xx]+=data[xx];
temp+=data[xx];
xx++;
}
y_eilutes[yy]=temp;

And right now I can finally let the compiler optimize for speed with no code size limit, because in the beginning type-based alias analysis gave me errors in transfers and computations; now it boosts efficiency from 18 FPS to 22 FPS. At this point only rewriting in asm could boost performance further. Also, I was unable to find the DMA2D pointer location, so I can't do calculations while I am loading data. I will try to add a half-full watermark interrupt and do the calculations on one half of the buffer while the other half is being loaded by DMA2D.

/* Scratch intensity sums used by pozicija(): y_eilutes[yy] is the sum of row
   yy, x_eilutes[xx] is the sum of column xx. */
uint32_t y_eilutes[1024],x_eilutes[1024];
/* Centroid computed by pozicija(), before truncation to calc_x / calc_y. */
float pozicija_x = 0.0f,pozicija_y=0.0f;
/* Index-weighted sums of x_eilutes[] and y_eilutes[] (the centroid numerators). */
float coef_x = 0,coef_y=0;
/* Column and row loop counters used by pozicija(). */
uint16_t xx=0,yy=0;
/*
 * Locate the laser spot: compute the intensity-weighted centroid of one
 * 1024x1024 frame of 16-bit pixels. Each row is fetched into data[] by
 * DMA2D and then accumulated by the CPU.
 *
 * Takes no arguments and returns nothing; results are published through
 * globals:
 *   pozicija_x, pozicija_y - centroid as floats (column, row)
 *   calc_x, calc_y         - the same values truncated to uint16_t
 * x_eilutes[] is used as scratch and zeroed again before returning;
 * y_eilutes[] is overwritten on every call.
 *
 * A frame whose total intensity is zero has no centroid; (0, 0) is
 * reported instead of dividing by zero.
 */
void pozicija(void)
{
    /*
     * Total intensity of the frame. 2^20 pixels of up to 16 bits each can
     * need 36 bits, so a 32-bit accumulator may wrap on bright frames.
     */
    uint64_t total = 0;

    coef_x = 0;
    coef_y = 0;

    for (yy = 0; yy < 1024; yy++) {
        /* One row sums to at most 1024 * 65535, which fits in 32 bits. */
        uint32_t row_sum = 0;

        DMA2D_Config(yy * 1024);
        /*
         * The transfer-complete flag stays set until it is cleared
         * explicitly. Without this, every wait after the first row would
         * fall straight through and data[] could be read mid-transfer.
         */
        DMA2D_ClearFlag(DMA2D_FLAG_TC);
        DMA2D_StartTransfer();
        while (DMA2D_GetFlagStatus(DMA2D_FLAG_TC) == RESET) {
            /* busy-wait for the row to arrive */
        }

        for (xx = 0; xx < 1024; xx++) {
            uint16_t pixel = data[xx];

            x_eilutes[xx] += pixel;
            row_sum += pixel;
        }

        /* Store the row total once instead of updating memory per pixel. */
        y_eilutes[yy] = row_sum;
        total += row_sum;
    }
    yy = 0;

    /* First moments of the column and row sums. */
    for (xx = 0; xx < 1024; xx++) {
        coef_x += (float)xx * x_eilutes[xx];
        coef_y += (float)xx * y_eilutes[xx];
    }

    /* 0/0 would yield NaN, and converting NaN to uint16_t is undefined. */
    if (total != 0) {
        float total_f = (float)total;

        pozicija_x = coef_x / total_f;
        pozicija_y = coef_y / total_f;
    } else {
        pozicija_x = 0.0f;
        pozicija_y = 0.0f;
    }
    calc_x = (uint16_t)pozicija_x;
    calc_y = (uint16_t)pozicija_y;

    /* Column sums are accumulated with +=, so clear them for the next frame. */
    for (xx = 0; xx < 1024; xx++) {
        x_eilutes[xx] = 0;
    }
}