2021-04-28 06:51 AM
Hello!
I'm trying to write the "fastest as possible" code for a 3P3Z (3 pole, 3 zero) controller, but even with SIMD32 instructions, the C code+compiler optimizations seems to win this battle.
Here's the C code:
static __INLINE int16_t q15_cntrl3z3p(q15_3z3pStruct_TypeDef_t *controller)
{
q63_t acc; /* Accumlator */
q31_t fcurr; /* Variaveis temporarias */
// y(k) = u(k) * g0 + u(k-1) * g1 + u(k-2) * g2 + u(k-3) * g3
// - y(k-1) * h1 - y(k-2) * h2 - y(k-3) * h3;
controller->umem[0] = controller->setpoint - controller->input;
acc = (controller->umem[0]*controller->gCoefs[0]<<controller->gShifts[0]);
acc += (controller->umem[1]*controller->gCoefs[1]<<controller->gShifts[1]);
acc += (controller->umem[2]*controller->gCoefs[2]<<controller->gShifts[2]);
acc += (controller->umem[3]*controller->gCoefs[3]<<controller->gShifts[3]);
acc -= (controller->ymem[1]*controller->hCoefs[1]<<controller->hShifts[1]);
acc -= (controller->ymem[2]*controller->hCoefs[2]<<controller->hShifts[2]);
acc -= (controller->ymem[3]*controller->hCoefs[3]<<controller->hShifts[3]);
fcurr = __SSAT(acc>>(controller->QNbase),16);
fcurr = limitador(fcurr,controller->minmax[0],controller->minmax[1]);
controller->ymem[0] = fcurr;
controller->ymem[3] = controller->ymem[2];
controller->ymem[2] = controller->ymem[1];
controller->ymem[1] = controller->ymem[0];
controller->umem[3] = controller->umem[2];
controller->umem[2] = controller->umem[1];
controller->umem[1] = controller->umem[0];
};
And Here's my Mixed ASM version ... *MAY contain bugs*
static __INLINE int16_t q15_cntrl3z3pFAST(q15_3z3pStruct_TypeDef_t *controller)
{
static uint8_t u8_init = 0;
// Mantém as variáveis que apontam para o controlador de forma estática...
static q31_t x0, x1, x2, x3;
static q31_t c0, c1, c2, c3;
q31_t *g_coeffPtr = (q31_t *)&(controller->gCoefs); // É um ponteiro para numeros em q31
q31_t *h_coeffPtr = (q31_t *)&(controller->hCoefs); // É um ponteiro para numeros em q31
q31_t *u_statePtr = (q15_t *)&(controller->umem); // É um ponteiro para números em q15
q31_t *y_statePtr = (q15_t *)&(controller->ymem); // É um ponteiro para números em q15
volatile q63_t sum0, sum1, sum2, sum3;
sum0 = sum1 = sum2 = sum3 = 0;
if (u8_init != 5)
{
u8_init = 5;
x0 = *(q31_t *)(g_coeffPtr); // x0 = 0x00020001
x1 = *(q31_t *)(g_coeffPtr+1);// x1 = 0x00040003
x2 = *(q31_t *)(h_coeffPtr); // x0 = 0x00020001
x3 = *(q31_t *)(h_coeffPtr+1);// x1 = 0x00040003
}
// Só esse bloco toma 250nS
// HAL_GPIO_WritePin(GPIOB, KEY_PB2_Pin, GPIO_PIN_SET);
//HAL_GPIO_WritePin(GPIOB, KEY_PB2_Pin, GPIO_PIN_RESET);
// Só esse bloco toma 250nS
// *****************************************
// Esse bloco ocupa 250ns para execução
// *****************************************
c0 = *(q31_t *)(u_statePtr); // c0 = 0x001e0028
c1 = *(q31_t *)(u_statePtr+1);// c1 = 0x000a0014
c2 = *(q31_t *)(y_statePtr); // c2 = 0x001e0028
c3 = *(q31_t *)(y_statePtr+1);// c3 = 0x000a0014
// *****************************************
// Esse bloco ocupa 250ns para execução
// *****************************************
// *****************************************
// Esse bloco ocupa 500ns para execução
// *****************************************
sum0 = __SMLALD(x0, c0, sum0);
// sum0 = p0*q0 + p1*q1 + sum0;
// sum0 = g0*u0 + g1*u1 + sum0;
sum1 = __SMLALD(x1, c1, sum0);
// sum1 = p2*q2 + p3*q3 + sum0;
// sum1 = g2*u2 + g3*u4 + g0*u0 + g1*u1 + sum0;
// *****************************************
// Esse bloco ocupa 500ns para execução
// *****************************************
sum2 = __SMLALD(x2, c2, sum2);
// sum2 = h0*y0 + h1*y1 + sum1;
sum3 = __SMLALD(x3, c3, sum2);
// sum3 = h2*y2 + h3*y3 + sum2;
// sum3 = g2*u2 + g3*u4 + g0*u0 + g1*u1 + h2*y2 + h3*h3 + h0*y0 + h1*y1;
sum0 = __SSAT((sum1-sum3)>>15, 16);
// *****************************************
// Esse bloco ocupa 500ns para execução
// *****************************************
HAL_GPIO_WritePin(GPIOB, KEY_PB2_Pin, GPIO_PIN_RESET);
}
The C version uses abou 2.0 us , and the ASM (and not so complete version) take 2.2us (+200ns more) and also do not have the output limits... (on a F334 - 72MHz machine).
I'm trying to figure out many ways to NOT write pure ASM/THUMB/ARM, but ... any help?
Thanks.
2021-05-24 12:36 AM
Just dont USE HAL GPIO
2021-10-16 09:19 PM
the point is, HAL is used on both codes and yet there is an increase of 200nS.
That is the question.
Why not use G474 instead?