2025-03-28 7:00 AM
Hello all,
I have been working on a bare-metal implementation of a driver for the PKA engine of the STM32WBA55CG microcontroller using Rust. So far, I have been able to run modular addition and subtraction, modular exponentiation, the arithmetic operations, the ECC point check, ECC multiplication, and ECDSA sign and verify. However, there is an issue with modular multiplication: whenever I try to compute operand_a * operand_b mod N, I get zero (or some other incorrect value) as the result. According to the reference manual:
Simple modular multiplication A x B mod n
a) Compute r2modn using Montgomery parameter computation.
b) Compute AR = A x r2modn mod n. Output is in the Montgomery domain.
c) Compute AB= AR x B mod n. Output is in natural domain
And the memory addresses are as follows:
I have tried two approaches:
1. Loading operand_a and operand_b into the PKA RAM and performing the multiplication directly using mode 0x10.
2. Performing the steps mentioned above, i.e., computing AR and then computing AB from AR.
In both cases, I get a zero result in the end. Below you can find my code, followed by the output I get:
#![no_std]
#![no_main]
// Test vectors: https://github.com/scogliani/ecc-test-vectors?tab=readme-ov-file
// Reference Manual: file:///C:/Users/elopezpe/OneDrive/Documentos/PhD/micro/stm32eba55cg/rm0493-multiprotocol-wireless-bluetooth-low-energy-and-ieee802154-stm32wba5xxx-arm-based-32-bit-mcus-stmicroelectronics-en.pdf
use stm32wba::stm32wba55::{self};
use {defmt_rtt as _, panic_probe as _};
use cortex_m_rt::entry;
use cortex_m::asm;
use defmt::info;
use core::{
mem::size_of,
ptr::{read_volatile, write_volatile},
};
// Base address from which every PKA RAM address in this file is derived.
const BASE: usize = 0x520C_2000;
// Byte offset, relative to BASE, at which the PKA RAM window starts.
const PKA_RAM_OFFSET: usize = 0x400;
// Absolute address of the first PKA RAM word.
const RAM_BASE: usize = BASE + PKA_RAM_OFFSET;
// Value written to the MODE field of PKA_CR for the multiplication under test.
const MODE: u8 = 0x10;
// Entry count used by `zero_ram` to size the region it clears.
// NOTE(review): presumably the PKA RAM size in 64-bit double words — confirm
// against the reference manual.
const RAM_NUM_DW: usize = 667;
// PKA RAM locations for multiplication.
// Despite the `_OFFSET` suffix these are absolute addresses: BASE is already
// added, so they can be passed straight to `write_ram` / `read_ram`.
const OPERAND_LENGTH_OFFSET: usize = BASE + 0x408 ;
const OPERAND_A_OFFSET: usize = BASE + 0xA50;
const OPERAND_B_OFFSET: usize = BASE + 0xC68;
const MODULUS_OFFSET: usize = BASE + 0x1088;
const RESULT_OFFSET: usize = BASE + 0xE78;
// Not referenced by the code visible in this file.
const MONTGOMERY_OFFSET: usize = BASE + 0x620;
// Modulus. Words are listed most-significant first; `write_ram` stores the
// last array element at the lowest address.
const N: [u32; 8] = [
0xffffffff, 0x00000001, 0x00000000, 0x00000000,
0x00000000, 0xffffffff, 0xffffffff, 0xffffffff,
];
// First operand: identical to N except for the last word, i.e. N - 1.
const A: [u32; 8] = [
0xffffffff, 0x00000001, 0x00000000, 0x00000000,
0x00000000, 0xffffffff, 0xffffffff, 0xfffffffe,
];
// Second operand: the value 2.
const B: [u32; 8] = [
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000002,
];
// Precomputed Montgomery parameter for N, loaded as the second operand of the
// first multiplication in `main`.
// NOTE(review): value and word order are taken as given — verify them against
// the output of the PKA Montgomery parameter computation.
const R2MODN: [u32; 8] = [
0xFFFFFFFC, 0xFFFFFFFC, 0xFFFFFFFB, 0xFFFFFFF9,
0xFFFFFFFE, 0x00000003, 0x00000005, 0x00000002
];
// Operand length written to OPERAND_LENGTH_OFFSET: 8 words of 32 bits = 256.
const OPERAND_LENGTH: u32 = 8 * 32;
// Number of 32-bit words per operand; sizes the result buffers in `main`.
const WORD_LENGTH: usize = 8; //(OPERAND_LENGTH as usize)/32;
/// Stores `buf` into memory starting at the absolute address `offset`, using
/// one volatile 32-bit write per element.
///
/// The slice is walked back to front: the last element of `buf` lands at
/// `offset`, the one before it at `offset + 4`, and so on.
///
/// # Panics
///
/// With debug assertions enabled, panics if `offset` is not a multiple of 4 or
/// if `offset + 4 * buf.len()` is not below `0x520C_33FF`.
///
/// # Safety
///
/// Every 32-bit word in `offset .. offset + 4 * buf.len()` must be valid for
/// volatile writes.
unsafe fn write_ram(offset: usize, buf: &[u32]) {
    const WORD_BYTES: usize = size_of::<u32>();

    debug_assert_eq!(offset % 4, 0);
    debug_assert!(offset + buf.len() * WORD_BYTES < 0x520C_33FF);

    let mut addr = offset;
    for &word in buf.iter().rev() {
        // SAFETY: the caller guarantees the whole destination range is
        // writable; `addr` never leaves that range while words remain.
        unsafe { write_volatile(addr as *mut u32, word) };
        addr += WORD_BYTES;
    }
}
/// Fills `buf` from memory starting at the absolute address `offset`, using
/// one volatile 32-bit read per element.
///
/// The slice is filled back to front: the word at `offset` goes into the last
/// element of `buf`, the word at `offset + 4` into the one before it, and so
/// on. This mirrors the layout produced by `write_ram`.
///
/// # Panics
///
/// With debug assertions enabled, panics if `offset` is not a multiple of 4 or
/// if `offset + 4 * buf.len()` is not below `0x520C_33FF`.
///
/// # Safety
///
/// Every 32-bit word in `offset .. offset + 4 * buf.len()` must be valid for
/// volatile reads.
unsafe fn read_ram(offset: usize, buf: &mut [u32]) {
    const WORD_BYTES: usize = size_of::<u32>();

    debug_assert_eq!(offset % 4, 0);
    debug_assert!(offset + buf.len() * WORD_BYTES < 0x520C_33FF);

    let mut addr = offset;
    for slot in buf.iter_mut().rev() {
        // SAFETY: the caller guarantees the whole source range is readable;
        // `addr` never leaves that range while slots remain.
        *slot = unsafe { read_volatile(addr as *const u32) };
        addr += WORD_BYTES;
    }
}
unsafe fn zero_ram() {
(0..RAM_NUM_DW)
.into_iter()
.for_each(|dw| unsafe { write_volatile((dw * 4 + RAM_BASE) as *mut u32, 0) });
}
#[entry]
unsafe fn main() -> ! {
let p = stm32wba55::Peripherals::take().unwrap();
let pka = &p.PKA;
let clock = &p.RCC;
let rng = &p.RNG;
// Enable HSI as a stable clock source
clock.rcc_cr().modify(|_, w| w
.hseon().set_bit()
);
while clock.rcc_cr().read().hserdy().bit_is_clear() {
asm::nop();
}
// Enable RNG clock. Select the source clock. Select the AHB clock
clock.rcc_ccipr2().write(|w| w.rngsel().b_0x2());
clock.rcc_ahb2enr().modify(|_, w| w.rngen().set_bit());
while clock.rcc_ahb2enr().read().rngen().bit_is_clear() {
asm::nop();
}
// Configure RNG
// To configure, CONDRST bit is set to 1 in the same access and CONFIGLOCK remains at 0
rng.rng_cr().write(|w| w
.rngen().clear_bit()
.condrst().set_bit()
.configlock().clear_bit()
.nistc().clear_bit() // Hardware default values for NIST compliant RNG
.ced().clear_bit() // Clock error detection enabled
);
// First clear CONDRST while keeping RNGEN disabled
rng.rng_cr().modify(|_, w| w
.condrst().clear_bit()
);
// Then enable RNG in a separate step
rng.rng_cr().modify(|_, w| w
.rngen().set_bit()
.ie().set_bit()
);
while rng.rng_sr().read().drdy().bit_is_clear() {
asm::nop();
}
info!("RNG enabled successfully");
// Enable PKA peripheral clock via RCC_AHB2ENR register
clock.rcc_ahb2enr().modify(|_, w| w.pkaen().set_bit());
// Reset PKA before enabling (sometimes helps with initialization)
pka.pka_cr().modify(|_, w| w.en().clear_bit());
for _ in 0..10 {
asm::nop();
}
// Enable PKA peripheral
pka.pka_cr().write(|w| w
.en().set_bit()
.mode().bits(MODE)
);
// Wait for PKA to initialize
while pka.pka_sr().read().initok().bit_is_clear() {
asm::nop();
}
info!("PKA initialized successfully!");
// Clear any previous error flags
pka.pka_clrfr().write(|w| w
.addrerrfc().set_bit()
.ramerrfc().set_bit()
.procendfc().set_bit()
);
// First compute AR = A x r2modn mod n
zero_ram();
write_ram(OPERAND_LENGTH_OFFSET, &[OPERAND_LENGTH]);
write_ram(OPERAND_A_OFFSET, &A);
write_ram(OPERAND_B_OFFSET, &R2MODN);
write_ram(MODULUS_OFFSET, &N);
// Configure PKA operation mode and start
info!("Starting PKA operation...");
pka.pka_cr().modify(|_, w| w
.mode().bits(MODE)
.start().set_bit() // Start the operation
);
// Wait for processing to complete - PROCENDF is 1 when done
info!("Waiting for operation to complete...");
while pka.pka_sr().read().procendf().bit_is_clear() {
asm::nop();
}
info!("Operation complete!");
// Add error checking after PKA operations
if pka.pka_sr().read().addrerrf().bit_is_set() {
info!("Address Error detected");
} else if pka.pka_sr().read().ramerrf().bit_is_set() {
info!("RAM Error detected");
} else {
info!("No errors");
}
// Read the result
let mut AR = [0u32; WORD_LENGTH];
read_ram(RESULT_OFFSET, &mut AR);
info!("AR = A({:#X}) * R2MODN({:#X}) (mod {:#X}) = {:#X}", A, R2MODN, N, AR);
// Clear the completion flag
pka.pka_clrfr().write(|w| w.procendfc().set_bit());
// Compute AB= AR x B mod n
zero_ram();
write_ram(OPERAND_LENGTH_OFFSET, &[OPERAND_LENGTH]);
write_ram(OPERAND_A_OFFSET, &AR);
write_ram(OPERAND_B_OFFSET, &B);
write_ram(MODULUS_OFFSET, &N);
// Configure PKA operation mode and start
info!("Starting PKA operation...");
pka.pka_cr().modify(|_, w| w
.mode().bits(MODE)
.start().set_bit() // Start the operation
);
// Wait for processing to complete - PROCENDF is 1 when done
info!("Waiting for operation to complete...");
while pka.pka_sr().read().procendf().bit_is_clear() {
asm::nop();
}
info!("Operation complete!");
// Read the result
let mut result = [0u32; WORD_LENGTH];
read_ram(RESULT_OFFSET, &mut result);
info!("AB = AR({:#X}) * B({:#X}) (mod {:#X}) = {:#X}", AR, B, N, result);
// Clear the completion flag
pka.pka_clrfr().write(|w| w.procendfc().set_bit());
loop {}
}
INFO RNG enabled successfully
INFO PKA initialized successfully!
INFO Starting PKA operation...
INFO Waiting for operation to complete...
INFO Operation complete!
INFO No errors
INFO AR = A([0xFFFFFFFF, 0x1, 0x0, 0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE]) * R2MODN([0xFFFFFFFC, 0xFFFFFFFC, 0xFFFFFFFB, 0xFFFFFFF9, 0xFFFFFFFE, 0x3, 0x5, 0x2]) (mod [0xFFFFFFFF, 0x1, 0x0, 0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF]) = [0x0, 0xFFFFFFFC, 0x0, 0xFFFFFFFB, 0xFFFFFFFB, 0x0, 0xFFFFFFFB, 0xFFFFFFFC]
INFO Starting PKA operation...
INFO Waiting for operation to complete...
INFO Operation complete!
INFO AB = AR([0x0, 0xFFFFFFFC, 0x0, 0xFFFFFFFB, 0xFFFFFFFB, 0x0, 0xFFFFFFFB, 0xFFFFFFFC]) * B([0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2]) (mod [0xFFFFFFFF, 0x1, 0x0, 0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF]) = [0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0]
R2MODN was calculated using the corresponding operation (Montgomery parameter computation) of the PKA engine for the modulus N of the P-256 curve.
Any help will be appreciated. Thanks!