2025-09-25 1:08 PM
Hi,
I would like to check if there is anyone who could guide me through the jungle of using TFLM in my software. It seems that it doesn't matter what I do when building TFLM as a static library and linking it to my simple test program. I have tried a lot of configurations and made many attempts to 16-byte align both the model and the arena, but I am still getting a hard fault because of an unaligned access in the AllocateTensors call.
My alignment of the model looks like this:
alignas(16) const unsigned char g_model_data[] = ...
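(For context, the whole generated model source looks roughly like the sketch below; the flatbuffer bytes come from the converter/xxd output, and the alignas sits on the definition of the array itself, not only on an extern declaration in a header.)
// Sketch of the crack_model_quant.h contents -- byte values elided, layout assumed.
alignas(16) const unsigned char g_model_data[] = {
  /* ... flatbuffer bytes from the converter ... */
};
const unsigned int g_model_data_len = sizeof(g_model_data);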
My code using TFLM looks as follows:
#include <cstring>
//#include "tensorflow/lite/micro/micro_op_resolver.h"
#include "tensorflow/lite/micro/micro_mutable_op_resolver.h"
//#include "tensorflow/lite/micro/kernels/micro_ops.h"
//#include "tensorflow/lite/micro/micro_log.h"
//#include "tensorflow/lite/micro/micro_error_reporter.h"
#include "tensorflow/lite/micro/micro_interpreter.h"
#include "tensorflow/lite/schema/schema_generated.h"
#include "tensorflow/lite/micro/system_setup.h"
#include "crack_model_quant.h"
#include "test_signals.h"
#include "TFRunner.h"
// Check whether pointer p is aligned to an n-byte boundary (n must be a power of 2).
static inline bool is_aligned(const void* p, size_t n) {
  return (reinterpret_cast<uintptr_t>(p) & (n - 1)) == 0;
}
// Round a pointer up to the next N-byte boundary (N must be power of 2)
template <size_t N>
static inline uint8_t* align_ptr(uint8_t* p) {
  return reinterpret_cast<uint8_t*>(
      (reinterpret_cast<uintptr_t>(p) + (N - 1)) & ~static_cast<uintptr_t>(N - 1));
}
// If you can’t regenerate easily, you can point to the next aligned address:
// const unsigned char* g_model = reinterpret_cast<const unsigned char*>(align_ptr<16>(const_cast<uint8_t*>(g_model_data)));
// const int g_model_len = g_model_data_len - (g_model - g_model_data);
// const unsigned char* g_model = g_model_data;
// const int g_model_len = g_model_data_len;
// Globals, used for compatibility with Arduino-style sketches.
namespace {
const tflite::Model* model = nullptr;
tflite::MicroInterpreter* interpreter = nullptr;
//static tflite::MicroErrorReporter micro_error_reporter;
int inference_count = 0;
//double scale = 0.032983943819999695;
//int zero_point = -1;
// float qScale = 0.032983943819999695;
// int8_t qZero = -1;
// constexpr int kTensorArenaSize = 100 * 1024;
// uint8_t tensor_arena[kTensorArenaSize];
// Choose a size that fits your model; increase if you see kTfLiteArenaRw overflow
constexpr size_t kArenaSize = 70 * 1024;
// Over-allocate and then align the working pointer to 16 bytes
// #if defined(__GNUC__)
// __attribute__((aligned(16)))
// #endif
alignas(16) static uint8_t g_arena_raw[kArenaSize + 16];
static uint8_t* g_arena = align_ptr<16>(g_arena_raw);
static size_t g_arena_len = kArenaSize; // keep logical size the same
} // namespace
TfLiteTensor* input = nullptr;
TfLiteTensor* output = nullptr;
float in_scale;
int in_zero;
int TFRunner::run()
{
  // Enable the DBGMCU clock
  __HAL_RCC_DBGMCU_CLK_ENABLE();
  // Enable the debugger during Sleep mode
  HAL_DBGMCU_EnableDBGSleepMode();
  // Enable the debugger during Stop mode
  HAL_DBGMCU_EnableDBGStopMode();
  // Enable the debugger during Standby mode
  HAL_DBGMCU_EnableDBGStandbyMode();

  // Copy the model into a 16-byte-aligned RAM buffer so the flatbuffer base
  // address is guaranteed to be aligned.
  alignas(16) static uint8_t g_model_aligned[g_model_data_len];
  memcpy(g_model_aligned, g_model_data, g_model_data_len);
  const unsigned char* g_model = g_model_aligned;
  const int g_model_len = g_model_data_len;
  tflite::InitializeTarget();

  // Map the model into a usable data structure. This doesn't involve any
  // copying or parsing, it's a very lightweight operation.
  model = tflite::GetModel(g_model);
  if (model->version() != TFLITE_SCHEMA_VERSION) {
    MicroPrintf(
        "Model provided is schema version %d not equal "
        "to supported version %d.",
        model->version(), TFLITE_SCHEMA_VERSION);
    return -1;
  }
  // Pull in only the operation implementations we need.
  // This relies on a complete list of all the ops needed by this graph.
  // An easier approach is to just use the AllOpsResolver, but this will
  // incur some penalty in code space for op implementations that are not
  // needed by this graph.
  //
  // static tflite::AllOpsResolver resolver;
  // NOLINTNEXTLINE(runtime-global-variables)
  static tflite::MicroMutableOpResolver<11> resolver;
  if (resolver.AddConv2D() != kTfLiteOk) {
    return -1;
  }
  if (resolver.AddMaxPool2D() != kTfLiteOk) {
    return -1;
  }
  if (resolver.AddFullyConnected() != kTfLiteOk) {
    return -1;
  }
  if (resolver.AddSoftmax() != kTfLiteOk) {
    return -1;
  }
  if (resolver.AddReshape() != kTfLiteOk) {
    return -1;
  }
  if (resolver.AddExpandDims() != kTfLiteOk) {
    return -1;
  }
  // if (resolver.AddQuantize() != kTfLiteOk) {
  //   return -1;
  // }
  // if (resolver.AddDequantize() != kTfLiteOk) {
  //   return -1;
  // }
  // if (resolver.AddMul() != kTfLiteOk) {
  //   return -1;
  // }
  // if (resolver.AddAdd() != kTfLiteOk) {
  //   return -1;
  // }
  // if (resolver.AddLogistic() != kTfLiteOk) {
  //   return -1;
  // }
  // Build an interpreter to run the model with.
  static tflite::MicroInterpreter static_interpreter(
      model, resolver, g_arena, g_arena_len);
  interpreter = &static_interpreter;

  // Sanity: the model must be 16B aligned (TFLM will then keep alignment for sub-allocs)
  if (!is_aligned(g_model, 16)) {
    MicroPrintf("Model not 16B aligned -> will likely hard fault");
    return -2;
  }
  // Sanity: the arena base must be 16B aligned (TFLM will then keep alignment for sub-allocs)
  if (!is_aligned(g_arena, 16)) {
    MicroPrintf("Arena not 16B aligned -> will likely hard fault");
    return -2;
  }

  // Allocate memory from the tensor_arena for the model's tensors.
  TfLiteStatus allocate_status = interpreter->AllocateTensors();
  if (allocate_status != kTfLiteOk) {
    MicroPrintf("AllocateTensors() failed");
    return -1;
  }
  // Obtain pointers to the model's input and output tensors.
  input = interpreter->input(0);
  output = interpreter->output(0);
  in_scale = input->params.scale;
  in_zero = input->params.zero_point;
  // const float out_scale = output->params.scale;
  // const int out_zero = output->params.zero_point;

  // Keep track of how many inferences we have performed.
  inference_count = 0;

  MicroPrintf("Input dims %d", input->dims->size);
  MicroPrintf("Output dims %d\n", output->dims->size);
  MicroPrintf("Input type %d", input->type);
  MicroPrintf("Output type %d", output->type);
  // Do an inference for every row in the test array
  for (int r = 0; r < (int)TEST_SIGNALS_ROWS; r++)
  {
    for (int c = 0; c < (int)TEST_SIGNALS_COLS - 1; c++) {
      //int8_t quantized_value = static_cast<int8_t>(TEST_SIGNALS_INT8[r][c] / in_scale) + qZero;
      input->data.int8[c] = quantizeValue(TEST_SIGNALS[r][c]);
      //input->data.f[c] = TEST_SIGNALS_INT8[r][c];
    }

    // Run the model on the data input and make sure it succeeds.
    TfLiteStatus invoke_status = interpreter->Invoke();
    if (invoke_status != kTfLiteOk) {
      MicroPrintf("Invoke failed");
      return -1;
    }

    auto should_be = (int8_t)TEST_SIGNALS[r][1000];
    MicroPrintf("Result should be: %d", should_be);
    for (int x = 0; x < 3; x++)
      MicroPrintf("Result %d is %d", x, static_cast<int8_t>(output->data.int8[x]));
    MicroPrintf("\n\n");
  }

  return 0;
}
int8_t TFRunner::quantizeValue(float_t value)
{
  // Clamp in the wider int range *before* narrowing to int8_t, otherwise the
  // saturation checks below can never trigger.
  int result = static_cast<int>((value / in_scale) + in_zero);
  if (result > 127) result = 127;
  if (result < -128) result = -128;
  return static_cast<int8_t>(result);
}
extern "C" void* TFRunner_init()
{
return new TFRunner();
}
extern "C" int TFRunner_run(void* ptr)
{
TFRunner* instance = static_cast<TFRunner*>(ptr);
return instance->run();
}
Any ideas on this would be really appreciated.
The fault looks like this:
HardFault!
HFSR: 0x40000000
CFSR: 0x01000000
MMFAR: 0x7E265275
BFAR: 0x7E265275
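If I read the ARMv7-M fault status registers correctly, CFSR 0x01000000 is the UsageFault UNALIGNED flag (bit 24), HFSR 0x40000000 (FORCED) just means the fault escalated to HardFault, and since neither MMARVALID (CFSR bit 7) nor BFARVALID (CFSR bit 15) is set, the MMFAR/BFAR values are probably stale. For reference, a minimal sketch of how these values can be read out via the CMSIS SCB registers (the device header and the print call here are assumptions, not necessarily exactly what my handler does):
// Sketch of a HardFault handler that dumps the fault status registers.
// Assumes a CMSIS device header is available; adjust to the actual part.
#include "stm32f4xx.h"  // assumption: replace with the real device header
#include <cstdio>

extern "C" void HardFault_Handler(void)
{
  const uint32_t hfsr  = SCB->HFSR;   // 0x40000000 -> FORCED (escalated fault)
  const uint32_t cfsr  = SCB->CFSR;   // 0x01000000 -> UFSR.UNALIGNED (bit 24)
  const uint32_t mmfar = SCB->MMFAR;  // valid only if CFSR bit 7 (MMARVALID) is set
  const uint32_t bfar  = SCB->BFAR;   // valid only if CFSR bit 15 (BFARVALID) is set
  printf("HardFault!\nHFSR: 0x%08lX\nCFSR: 0x%08lX\nMMFAR: 0x%08lX\nBFAR: 0x%08lX\n",
         (unsigned long)hfsr, (unsigned long)cfsr,
         (unsigned long)mmfar, (unsigned long)bfar);
  while (true) {}
}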