FMAC Cascade for SOS

Marc_Hochuli · ‎2026-03-05

Hi Together,

I am currently trying to use the FMAC hardware accelerator. Normal usage already works. Now for numerical stability I want to implement the filter as second order sections (sos). According to RM0481:

Cascaded filters must either be combined into a single stage, or implemented as
separate filters. In the latter case, multiple sets of filter coefficients can be pre-loaded
into the memory, one set per stage, and only the X2_BASE address changed to select
which set is used.

To do this I currently:

Concatenate all the stage coeffs into one vector:

static int16_t const b_Coeffs_0[b_Coeff_len_0] = { 55, 111, 55 };
static int16_t const a_Coeffs_0[a_Coeff_len_0] = { -3745, -258 };
static int16_t const b_Coeffs_1[b_Coeff_len_1] = { 16384, 32767, 16384 };
...

static int16_t all_coeffs[sos_Coeff_Count] = {
b_Coeffs_0[0], b_Coeffs_0[1], b_Coeffs_0[2],
a_Coeffs_0[0],
...

During setup, I pass the concatenated vector as the B coefficients:

// FMAC configuration used for the first cascade attempt: every stage's
// coefficients are handed to the driver as one concatenated "B" vector.
filter_Config.Clip = FMAC_CLIP_ENABLED;
filter_Config.Filter = FMAC_FUNC_IIR_DIRECT_FORM_1;

// No separate A vector is supplied; the a coefficients are interleaved in all_coeffs.
filter_Config.pCoeffA = NULL;
filter_Config.CoeffASize = 0;

filter_Config.pCoeffB = all_coeffs;
filter_Config.CoeffBSize = sos_Coeff_Count;

// Coefficient area starts at address 0 and spans all stages.
filter_Config.CoeffBaseAddress = 0UL;
filter_Config.CoeffBufferSize = sos_Coeff_Count; //X2 in reference manual

// NOTE(review): the input buffer below starts at sos_Coeff_Count + 1 and is
// b_Coeff_len_0 + input_headroom long, so its last slot is
// sos_Coeff_Count + b_Coeff_len_0 + input_headroom, which is exactly this
// output base address. The two regions appear to overlap by one slot; confirm.
filter_Config.OutputBaseAddress = b_Coeff_len_0 + sos_Coeff_Count + input_headroom;
filter_Config.OutputBufferSize = a_Coeff_len_0 + output_headroom; //Y in reference manual
filter_Config.OutputAccess = FMAC_BUFFER_ACCESS_DMA;

// Leaves one unused slot between the coefficient area and the input buffer.
filter_Config.InputBaseAddress = sos_Coeff_Count + 1;
filter_Config.InputBufferSize = b_Coeff_len_0 + input_headroom;	//X1 in reference manual
filter_Config.InputAccess = FMAC_BUFFER_ACCESS_DMA;

// P and Q are the per-stage coefficient counts, not the concatenated total.
filter_Config.P = b_Coeff_len_0;
filter_Config.Q = a_Coeff_len_0;
filter_Config.R = fmac_Gain;

Then I run my entire 15k-sample data vector through one stage. After that I advance the X2 base address to the next stage:

MODIFY_REG(hfmac.Instance->X2BUFCFG,
	           FMAC_X2BUFCFG_X2_BASE,
	           ((uint32_t)(filter_Config.CoeffBaseAddress + coeff_per_stage * stage) << FMAC_X2BUFCFG_X2_BASE_Pos)
	           & FMAC_X2BUFCFG_X2_BASE);

I copy data from one iteration to the next this way:

memcpy(fData.x_q15, fData.y_q15, fData.yLen * sizeof(int16_t));
				fData.xLen = fData.yLen;
				fmacAppendData(fData.x_q15, &fData.xLen, fData.y_q15, &fData.yLen, &fMetadata.fCalcFinished);

Where:

/**
 * Feed a new chunk of input samples to the FMAC and, except on the very
 * first call, re-arm the output buffer.
 *
 * @param dataIn        Input samples handed to HAL_FMAC_AppendFilterData().
 * @param inLen         Pointer to the input length, passed through to the HAL.
 * @param dataOut       Destination buffer for the filtered samples.
 * @param outLen        Pointer to the output length; half of this value is
 *                      given to HAL_FMAC_ConfigFilterOutputBuffer().
 * @param pCalcFinished Flag pointer stored in calcFinished
 *                      (presumably set by a completion callback; confirm).
 * @return The first non-HAL_OK status reported by the HAL, otherwise HAL_OK.
 */
HAL_StatusTypeDef fmacAppendData(int16_t *dataIn, uint16_t *inLen, int16_t *dataOut, uint16_t *outLen, volatile bool *pCalcFinished) {
	// Remember where completion has to be reported before any transfer starts.
	calcFinished = pCalcFinished;

	HAL_StatusTypeDef result = HAL_FMAC_AppendFilterData(&hfmac, dataIn, inLen);
	if (result != HAL_OK) {
		return result;
	}

	// The first chunk only clears the flag; the output buffer is not reconfigured.
	if (isInitialData) {
		isInitialData = false;
		return HAL_OK;
	}

	// For explanation on why output length is halved, see function fmacStart(...)
	// NOTE(review): outputLen lives on this stack frame; confirm the HAL does
	// not keep the pointer after the call returns.
	uint16_t outputLen = (*outLen) / 2;
	return HAL_FMAC_ConfigFilterOutputBuffer(&hfmac, dataOut, &outputLen);
}

The append function and everything around it is already tested and works for non-cascaded filters. Only when I try to cascade does it fail (the output looks like gibberish).

Unfortunately, apart from this one sentence in the reference manual, I can't find any application note or other documentation on how to cascade filters this way.

Thanks in advance for any useful hints.

Marc_Hochuli · ‎2026-03-11

Hello @Saket_Om

Thanks for your reply. By now I made it work. I configured as follows:

I preload all values into the coeffB vector:

/* Each second-order section has 3 'b' and 2 'a' coefficients. */
#define sos_stage_a_coeff_len 2
#define sos_stage_b_coeff_len 3

/*
 * Per-stage coefficient values, written once as list macros.
 *
 * In ISO C the initializer of a static object must be a constant expression,
 * and an element of another const array (e.g. sos_b_Coeffs_0[0]) is not one.
 * Expanding the same literal lists into both the per-stage arrays and the
 * concatenated vector keeps a single source of truth and is valid C and C++.
 */
#define SOS_B_VALUES_0 6291, 12583, 6291
#define SOS_A_VALUES_0 -3883, -325
#define SOS_B_VALUES_1 6291, 12583, 6291
#define SOS_A_VALUES_1 -4579, -1208
#define SOS_B_VALUES_2 6291, 12583, 6291
#define SOS_A_VALUES_2 -6270, -3277
#define SOS_B_VALUES_3 6291, 12583, 6291
#define SOS_A_VALUES_3 -9973, -7521

/* Per-stage tables, kept under their original names. */
static const int16_t sos_b_Coeffs_0[sos_stage_b_coeff_len] = { SOS_B_VALUES_0 };
static const int16_t sos_a_Coeffs_0[sos_stage_a_coeff_len] = { SOS_A_VALUES_0 };
static const int16_t sos_b_Coeffs_1[sos_stage_b_coeff_len] = { SOS_B_VALUES_1 };
static const int16_t sos_a_Coeffs_1[sos_stage_a_coeff_len] = { SOS_A_VALUES_1 };
static const int16_t sos_b_Coeffs_2[sos_stage_b_coeff_len] = { SOS_B_VALUES_2 };
static const int16_t sos_a_Coeffs_2[sos_stage_a_coeff_len] = { SOS_A_VALUES_2 };
static const int16_t sos_b_Coeffs_3[sos_stage_b_coeff_len] = { SOS_B_VALUES_3 };
static const int16_t sos_a_Coeffs_3[sos_stage_a_coeff_len] = { SOS_A_VALUES_3 };

#define sos_Stage_Count 4
/* Derived from the stage layout so it cannot drift from the tables (evaluates to 20). */
#define sos_Coeff_Count (sos_Stage_Count * (sos_stage_b_coeff_len + sos_stage_a_coeff_len))
static uint8_t sos_fmac_Gain = 1;

/*
 * All stages concatenated as [b0 b1 b2 a0 a1] per stage, in stage order.
 * Left non-const because it is assigned to filter_Config.pCoeffB.
 * NOTE(review): could become const if that field accepts a const pointer.
 */
static int16_t sos_Coeffs[sos_Coeff_Count] = {
    SOS_B_VALUES_0, SOS_A_VALUES_0,

    SOS_B_VALUES_1, SOS_A_VALUES_1,

    SOS_B_VALUES_2, SOS_A_VALUES_2,

    SOS_B_VALUES_3, SOS_A_VALUES_3,
};

// FMAC configuration for the cascaded filter: the coefficients of all stages
// are loaded in one go as a single "B" vector (sos_Coeffs).
filter_Config.Clip = FMAC_CLIP_ENABLED;
	filter_Config.Filter = FMAC_FUNC_IIR_DIRECT_FORM_1;

	// No separate A vector: the a coefficients are interleaved in sos_Coeffs.
	filter_Config.pCoeffA = NULL;
	filter_Config.CoeffASize = 0;
	filter_Config.pCoeffB = sos_Coeffs;
	filter_Config.CoeffBSize = sos_Coeff_Count;

	// Coefficient area: addresses 0 .. sos_Coeff_Count - 1, covering every stage.
	filter_Config.CoeffBaseAddress = 0UL;
	filter_Config.CoeffBufferSize = sos_Coeff_Count;							//X2 in reference manual

	// Input buffer starts directly after the coefficient area.
	filter_Config.InputBaseAddress = sos_Coeff_Count;
	filter_Config.InputBufferSize = sos_stage_b_coeff_len + input_headroom;		//X1 in reference manual
	filter_Config.InputAccess = FMAC_BUFFER_ACCESS_DMA;

	// Output buffer starts directly after the input buffer (input base + input size).
	filter_Config.OutputBaseAddress = sos_Coeff_Count + sos_stage_b_coeff_len + input_headroom;
	filter_Config.OutputBufferSize = sos_stage_a_coeff_len + output_headroom;	//Y in reference manual
	filter_Config.OutputAccess = FMAC_BUFFER_ACCESS_DMA;

	// P and Q are the per-stage coefficient counts, not the concatenated total.
	filter_Config.P = sos_stage_b_coeff_len;
	filter_Config.Q = sos_stage_a_coeff_len;
	filter_Config.R = sos_fmac_Gain;

This works because CoeffBSize is apparently only used for transferring the coefficients into the FMAC memory, while P and Q determine the actual calculation.

HAL_StatusTypeDef fmacSOSStageSetup(uint8_t stage) {
	//@formatter:off
	MODIFY_REG(hfmac.Instance->X2BUFCFG,
	           FMAC_X2BUFCFG_X2_BASE,
	           ((uint32_t)((sos_stage_a_coeff_len + sos_stage_b_coeff_len) * stage) << FMAC_X2BUFCFG_X2_BASE_Pos)
	           & FMAC_X2BUFCFG_X2_BASE);
	//@formatter:on
	return HAL_OK;
}

After each stage, I advance the X2 base address with this function. This works now (verified against a reference
that I calculate with Python).

View solution in original post

Saket_Om · ‎2026-03-11

Hello @Marc_Hochuli

Let me check that I understand your use case correctly.
You have an IIR filter composed of n stages of 3 'b' coefs & 2 'a' coefs like this [b10, b11, b12, a10, a11]
Your first algorithm is:
IIR_STAGES_NUMBER /* Number of IIR stages */
IIR_STAGE_COEF_B_SIZE 3U /* Number of coefficient B in a stage */
IIR_STAGE_COEF_A_SIZE 2U /* Number of coefficient A in a stage */
IIR_STAGE_COEF_SIZE (IIR_STAGE_COEF_B_SIZE + IIR_STAGE_COEF_A_SIZE)
For each series of samples
copy samples into local X_Buffer
for each stage
Configure the FMAC with this configuration:

filter_Config.Clip = FMAC_CLIP_ENABLED;
      filter_Config.Filter = FMAC_FUNC_IIR_DIRECT_FORM_1;
      filter_Config.pCoeffA = stage_coefA;
      filter_Config.CoeffASize = IIR_STAGE_COEF_A_SIZE;



      filter_Config.pCoeffB = stage_coefB;
      filter_Config.CoeffBSize = IIR_STAGE_COEF_B_SIZE;



      filter_Config.CoeffBaseAddress = 0UL;
      filter_Config.CoeffBufferSize = IIR_STAGE_COEF_SIZE; //X2 in reference manual



      filter_Config.OutputBaseAddress = IIR_STAGE_COEF_SIZE + INPUT_SAMPLES_SIZE + 1;
      filter_Config.OutputBufferSize = OUTPUT_SAMPLES_SIZE; //Y in reference manual
      filter_Config.OutputAccess = FMAC_BUFFER_ACCESS_DMA;



      filter_Config.InputBaseAddress = IIR_STAGE_COEF_SIZE + 1;
      filter_Config.InputBufferSize = INPUT_SAMPLES_SIZE;  //X1 in reference manual
      filter_Config.InputAccess = FMAC_BUFFER_ACCESS_DMA;



      filter_Config.P = IIR_STAGE_COEF_B_SIZE;
      filter_Config.Q = IIR_STAGE_COEF_A_SIZE;
      filter_Config.R = fmac_Gain;

    HAL_FMAC_FilterConfig_DMA(hfmac, &filter_config);

Start the process with HAL_FMAC_FilterStart

Move results Y into X_Buffer for next stage
...

And you would like to preload all IIR stages into the FMAC and select each stage by changing the X2_BASE address, right ?

Well, the HAL driver is not implemented to do that.
As you can see in the HAL_FMAC_FilterConfigTypeDef structure, the driver requires two pointers to coefficient buffers: one for the A coefficients and one for the B coefficients.
Looking at your code, the configuration is wrong because you don't initialize the structure the way the driver expects.
If you enable assertions, you should therefore hit an assertion at line 1791, because the driver requires a non-null pointer for CoefA in the case of an IIR filter.

But I think your issue is not there, but in the configuration of the X2BUFCFG register.
Indeed, HAL_FMAC_FilterConfig_DMA configures the "X2_BUF_SIZE" bitfield with the value of filter_config.CoeffBufferSize (see line 1722 in the driver).
In your usecase, this size is set with the value of "sos_Coeff_Count" which is the global size of all coefficients buffers, while it should be the size of only one coefficient buffer.

Your results are gibberish because for each stage, each input sample is filtered by IIR_STAGES_NUMBER * IIR_STAGE_COEF_SIZE coefficients and not by the expected stage...

So after calling HAL_FMAC_FilterConfig_DMA, I recommend that you check the coefficient size in X2BUFCFG and change it with:

MODIFY_REG(hfmac.Instance->X2BUFCFG, FMAC_X2BUFCFG_X2_BUF_SIZE, IIR_STAGE_COEF_SIZE << FMAC_X2BUFCFG_X2_BUF_SIZE_Pos);

To give better visibility on the answered topics, please click on "Accept as Solution" on the reply which solved your issue or answered your question.
Saket_Om

Marc_Hochuli · ‎2026-03-11

Hello @Saket_Om

Thanks for your reply. By now I made it work. I configured as follows:

I preload all values into the coeffB vector:

/* Each second-order section has 3 'b' and 2 'a' coefficients. */
#define sos_stage_a_coeff_len 2
#define sos_stage_b_coeff_len 3

/*
 * Per-stage coefficient values, written once as list macros.
 *
 * In ISO C the initializer of a static object must be a constant expression,
 * and an element of another const array (e.g. sos_b_Coeffs_0[0]) is not one.
 * Expanding the same literal lists into both the per-stage arrays and the
 * concatenated vector keeps a single source of truth and is valid C and C++.
 */
#define SOS_B_VALUES_0 6291, 12583, 6291
#define SOS_A_VALUES_0 -3883, -325
#define SOS_B_VALUES_1 6291, 12583, 6291
#define SOS_A_VALUES_1 -4579, -1208
#define SOS_B_VALUES_2 6291, 12583, 6291
#define SOS_A_VALUES_2 -6270, -3277
#define SOS_B_VALUES_3 6291, 12583, 6291
#define SOS_A_VALUES_3 -9973, -7521

/* Per-stage tables, kept under their original names. */
static const int16_t sos_b_Coeffs_0[sos_stage_b_coeff_len] = { SOS_B_VALUES_0 };
static const int16_t sos_a_Coeffs_0[sos_stage_a_coeff_len] = { SOS_A_VALUES_0 };
static const int16_t sos_b_Coeffs_1[sos_stage_b_coeff_len] = { SOS_B_VALUES_1 };
static const int16_t sos_a_Coeffs_1[sos_stage_a_coeff_len] = { SOS_A_VALUES_1 };
static const int16_t sos_b_Coeffs_2[sos_stage_b_coeff_len] = { SOS_B_VALUES_2 };
static const int16_t sos_a_Coeffs_2[sos_stage_a_coeff_len] = { SOS_A_VALUES_2 };
static const int16_t sos_b_Coeffs_3[sos_stage_b_coeff_len] = { SOS_B_VALUES_3 };
static const int16_t sos_a_Coeffs_3[sos_stage_a_coeff_len] = { SOS_A_VALUES_3 };

#define sos_Stage_Count 4
/* Derived from the stage layout so it cannot drift from the tables (evaluates to 20). */
#define sos_Coeff_Count (sos_Stage_Count * (sos_stage_b_coeff_len + sos_stage_a_coeff_len))
static uint8_t sos_fmac_Gain = 1;

/*
 * All stages concatenated as [b0 b1 b2 a0 a1] per stage, in stage order.
 * Left non-const because it is assigned to filter_Config.pCoeffB.
 * NOTE(review): could become const if that field accepts a const pointer.
 */
static int16_t sos_Coeffs[sos_Coeff_Count] = {
    SOS_B_VALUES_0, SOS_A_VALUES_0,

    SOS_B_VALUES_1, SOS_A_VALUES_1,

    SOS_B_VALUES_2, SOS_A_VALUES_2,

    SOS_B_VALUES_3, SOS_A_VALUES_3,
};

// FMAC configuration for the cascaded filter: the coefficients of all stages
// are loaded in one go as a single "B" vector (sos_Coeffs).
filter_Config.Clip = FMAC_CLIP_ENABLED;
	filter_Config.Filter = FMAC_FUNC_IIR_DIRECT_FORM_1;

	// No separate A vector: the a coefficients are interleaved in sos_Coeffs.
	filter_Config.pCoeffA = NULL;
	filter_Config.CoeffASize = 0;
	filter_Config.pCoeffB = sos_Coeffs;
	filter_Config.CoeffBSize = sos_Coeff_Count;

	// Coefficient area: addresses 0 .. sos_Coeff_Count - 1, covering every stage.
	filter_Config.CoeffBaseAddress = 0UL;
	filter_Config.CoeffBufferSize = sos_Coeff_Count;							//X2 in reference manual

	// Input buffer starts directly after the coefficient area.
	filter_Config.InputBaseAddress = sos_Coeff_Count;
	filter_Config.InputBufferSize = sos_stage_b_coeff_len + input_headroom;		//X1 in reference manual
	filter_Config.InputAccess = FMAC_BUFFER_ACCESS_DMA;

	// Output buffer starts directly after the input buffer (input base + input size).
	filter_Config.OutputBaseAddress = sos_Coeff_Count + sos_stage_b_coeff_len + input_headroom;
	filter_Config.OutputBufferSize = sos_stage_a_coeff_len + output_headroom;	//Y in reference manual
	filter_Config.OutputAccess = FMAC_BUFFER_ACCESS_DMA;

	// P and Q are the per-stage coefficient counts, not the concatenated total.
	filter_Config.P = sos_stage_b_coeff_len;
	filter_Config.Q = sos_stage_a_coeff_len;
	filter_Config.R = sos_fmac_Gain;

This works because CoeffBSize is apparently only used for transferring the coefficients into the FMAC memory, while P and Q determine the actual calculation.

HAL_StatusTypeDef fmacSOSStageSetup(uint8_t stage) {
	//@formatter:off
	MODIFY_REG(hfmac.Instance->X2BUFCFG,
	           FMAC_X2BUFCFG_X2_BASE,
	           ((uint32_t)((sos_stage_a_coeff_len + sos_stage_b_coeff_len) * stage) << FMAC_X2BUFCFG_X2_BASE_Pos)
	           & FMAC_X2BUFCFG_X2_BASE);
	//@formatter:on
	return HAL_OK;
}

After each stage, I advance the X2 base address with this function. This works now (verified against a reference
that I calculate with Python).