2025-09-25 1:08 PM
Hi,
I would like to check if there is anyone who could guide me through the jungle of using TFLM in my software. It seems that it doesn't matter what I do when building TFLM as a static library and linking it to my simple test program. I have tried a lot of configurations and made many attempts to 16-byte align both the model and the arena, but I am still getting a hard fault because of an unaligned access in the AllocateTensors call.
My alignment of the model looks like this:
alignas(16) const unsigned char g_model_data[] = ...
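(For context, the whole generated model source looks roughly like the sketch below; the flatbuffer bytes come from the converter/xxd output, and the alignas sits on the definition of the array itself, not only on an extern declaration in a header.)
// Sketch of the crack_model_quant.h contents -- byte values elided, layout assumed.
alignas(16) const unsigned char g_model_data[] = {
  /* ... flatbuffer bytes from the converter ... */
};
const unsigned int g_model_data_len = sizeof(g_model_data);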
My code using TFLM looks as follows:
#include <cstring>
//#include "tensorflow/lite/micro/micro_op_resolver.h"
#include "tensorflow/lite/micro/micro_mutable_op_resolver.h"
//#include "tensorflow/lite/micro/kernels/micro_ops.h"
//#include "tensorflow/lite/micro/micro_log.h"
//#include "tensorflow/lite/micro/micro_error_reporter.h"
#include "tensorflow/lite/micro/micro_interpreter.h"
#include "tensorflow/lite/schema/schema_generated.h"
#include "tensorflow/lite/micro/system_setup.h"
#include "crack_model_quant.h"
#include "test_signals.h"
#include "TFRunner.h"
// Check whether pointer p is aligned to an n-byte boundary (n must be a power of 2).
static inline bool is_aligned(const void* p, size_t n) {
  return (reinterpret_cast<uintptr_t>(p) & (n - 1)) == 0;
}
// Round a pointer up to the next N-byte boundary (N must be power of 2)
template <size_t N>
static inline uint8_t* align_ptr(uint8_t* p) {
  return reinterpret_cast<uint8_t*>(
      (reinterpret_cast<uintptr_t>(p) + (N - 1)) & ~static_cast<uintptr_t>(N - 1));
}
// If you can’t regenerate easily, you can point to the next aligned address:
// const unsigned char* g_model = reinterpret_cast<const unsigned char*>(align_ptr<16>(const_cast<uint8_t*>(g_model_data)));
// const int g_model_len = g_model_data_len - (g_model - g_model_data);
// const unsigned char* g_model = g_model_data;
// const int g_model_len = g_model_data_len;
// Globals, used for compatibility with Arduino-style sketches.
namespace {
const tflite::Model* model = nullptr;
tflite::MicroInterpreter* interpreter = nullptr;
//static tflite::MicroErrorReporter micro_error_reporter;
int inference_count = 0;
//double scale = 0.032983943819999695;
//int zero_point = -1;
// float qScale = 0.032983943819999695;
// int8_t qZero = -1;
// constexpr int kTensorArenaSize = 100 * 1024;
// uint8_t tensor_arena[kTensorArenaSize];
// Choose a size that fits your model; increase if you see kTfLiteArenaRw overflow
constexpr size_t kArenaSize = 70 * 1024;
// Over-allocate and then align the working pointer to 16 bytes
// #if defined(__GNUC__)
// __attribute__((aligned(16)))
// #endif
alignas(16) static uint8_t g_arena_raw[kArenaSize + 16];
static uint8_t* g_arena = align_ptr<16>(g_arena_raw);
static size_t g_arena_len = kArenaSize; // keep logical size the same
} // namespace
TfLiteTensor* input = nullptr;
TfLiteTensor* output = nullptr;
float in_scale;
int in_zero;
int TFRunner::run()
{
  // Enable the DBGMCU clock
  __HAL_RCC_DBGMCU_CLK_ENABLE();
  // Enable the debugger during Sleep mode
  HAL_DBGMCU_EnableDBGSleepMode();
  // Enable the debugger during Stop mode
  HAL_DBGMCU_EnableDBGStopMode();
  // Enable the debugger during Standby mode
  HAL_DBGMCU_EnableDBGStandbyMode();

  // Copy the model into a 16-byte-aligned RAM buffer so the flatbuffer base
  // address is guaranteed to be aligned.
  alignas(16) static uint8_t g_model_aligned[g_model_data_len];
  memcpy(g_model_aligned, g_model_data, g_model_data_len);
  const unsigned char* g_model = g_model_aligned;
  const int g_model_len = g_model_data_len;
  tflite::InitializeTarget();

  // Map the model into a usable data structure. This doesn't involve any
  // copying or parsing, it's a very lightweight operation.
  model = tflite::GetModel(g_model);
  if (model->version() != TFLITE_SCHEMA_VERSION) {
    MicroPrintf(
        "Model provided is schema version %d not equal "
        "to supported version %d.",
        model->version(), TFLITE_SCHEMA_VERSION);
    return -1;
  }
  // Pull in only the operation implementations we need.
  // This relies on a complete list of all the ops needed by this graph.
  // An easier approach is to just use the AllOpsResolver, but this will
  // incur some penalty in code space for op implementations that are not
  // needed by this graph.
  //
  // static tflite::AllOpsResolver resolver;
  // NOLINTNEXTLINE(runtime-global-variables)
  static tflite::MicroMutableOpResolver<11> resolver;
  if (resolver.AddConv2D() != kTfLiteOk) {
    return -1;
  }
  if (resolver.AddMaxPool2D() != kTfLiteOk) {
    return -1;
  }
  if (resolver.AddFullyConnected() != kTfLiteOk) {
    return -1;
  }
  if (resolver.AddSoftmax() != kTfLiteOk) {
    return -1;
  }
  if (resolver.AddReshape() != kTfLiteOk) {
    return -1;
  }
  if (resolver.AddExpandDims() != kTfLiteOk) {
    return -1;
  }
  // if (resolver.AddQuantize() != kTfLiteOk) {
  //   return -1;
  // }
  // if (resolver.AddDequantize() != kTfLiteOk) {
  //   return -1;
  // }
  // if (resolver.AddMul() != kTfLiteOk) {
  //   return -1;
  // }
  // if (resolver.AddAdd() != kTfLiteOk) {
  //   return -1;
  // }
  // if (resolver.AddLogistic() != kTfLiteOk) {
  //   return -1;
  // }
  // Build an interpreter to run the model with.
  static tflite::MicroInterpreter static_interpreter(
      model, resolver, g_arena, g_arena_len);
  interpreter = &static_interpreter;

  // Sanity: the model must be 16B aligned (TFLM will then keep alignment for sub-allocs)
  if (!is_aligned(g_model, 16)) {
    MicroPrintf("Model not 16B aligned -> will likely hard fault");
    return -2;
  }
  // Sanity: the arena base must be 16B aligned (TFLM will then keep alignment for sub-allocs)
  if (!is_aligned(g_arena, 16)) {
    MicroPrintf("Arena not 16B aligned -> will likely hard fault");
    return -2;
  }

  // Allocate memory from the tensor_arena for the model's tensors.
  TfLiteStatus allocate_status = interpreter->AllocateTensors();
  if (allocate_status != kTfLiteOk) {
    MicroPrintf("AllocateTensors() failed");
    return -1;
  }
  // Obtain pointers to the model's input and output tensors.
  input = interpreter->input(0);
  output = interpreter->output(0);
  in_scale = input->params.scale;
  in_zero = input->params.zero_point;
  // const float out_scale = output->params.scale;
  // const int out_zero = output->params.zero_point;

  // Keep track of how many inferences we have performed.
  inference_count = 0;

  MicroPrintf("Input dims %d", input->dims->size);
  MicroPrintf("Output dims %d\n", output->dims->size);
  MicroPrintf("Input type %d", input->type);
  MicroPrintf("Output type %d", output->type);
  // Do an inference for every row in the test array
  for (int r = 0; r < (int)TEST_SIGNALS_ROWS; r++)
  {
    for (int c = 0; c < (int)TEST_SIGNALS_COLS - 1; c++) {
      //int8_t quantized_value = static_cast<int8_t>(TEST_SIGNALS_INT8[r][c] / in_scale) + qZero;
      input->data.int8[c] = quantizeValue(TEST_SIGNALS[r][c]);
      //input->data.f[c] = TEST_SIGNALS_INT8[r][c];
    }

    // Run the model on the data input and make sure it succeeds.
    TfLiteStatus invoke_status = interpreter->Invoke();
    if (invoke_status != kTfLiteOk) {
      MicroPrintf("Invoke failed");
      return -1;
    }

    auto should_be = (int8_t)TEST_SIGNALS[r][1000];
    MicroPrintf("Result should be: %d", should_be);
    for (int x = 0; x < 3; x++)
      MicroPrintf("Result %d is %d", x, static_cast<int8_t>(output->data.int8[x]));
    MicroPrintf("\n\n");
  }

  return 0;
}
int8_t TFRunner::quantizeValue(float_t value)
{
  // Clamp in the wider int range *before* narrowing to int8_t, otherwise the
  // saturation checks below can never trigger.
  int result = static_cast<int>((value / in_scale) + in_zero);
  if (result > 127) result = 127;
  if (result < -128) result = -128;
  return static_cast<int8_t>(result);
}
extern "C" void* TFRunner_init()
{
return new TFRunner();
}
extern "C" int TFRunner_run(void* ptr)
{
TFRunner* instance = static_cast<TFRunner*>(ptr);
return instance->run();
}
Any ideas on this would be really appreciated.
The fault looks like this:
HardFault!
HFSR: 0x40000000
CFSR: 0x01000000
MMFAR: 0x7E265275
BFAR: 0x7E265275
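If I read the ARMv7-M fault status registers correctly, CFSR 0x01000000 is the UsageFault UNALIGNED flag (bit 24), HFSR 0x40000000 (FORCED) just means the fault escalated to HardFault, and since neither MMARVALID (CFSR bit 7) nor BFARVALID (CFSR bit 15) is set, the MMFAR/BFAR values are probably stale. For reference, a minimal sketch of how these values can be read out via the CMSIS SCB registers (the device header and the print call here are assumptions, not necessarily exactly what my handler does):
// Sketch of a HardFault handler that dumps the fault status registers.
// Assumes a CMSIS device header is available; adjust to the actual part.
#include "stm32f4xx.h"  // assumption: replace with the real device header
#include <cstdio>

extern "C" void HardFault_Handler(void)
{
  const uint32_t hfsr  = SCB->HFSR;   // 0x40000000 -> FORCED (escalated fault)
  const uint32_t cfsr  = SCB->CFSR;   // 0x01000000 -> UFSR.UNALIGNED (bit 24)
  const uint32_t mmfar = SCB->MMFAR;  // valid only if CFSR bit 7 (MMARVALID) is set
  const uint32_t bfar  = SCB->BFAR;   // valid only if CFSR bit 15 (BFARVALID) is set
  printf("HardFault!\nHFSR: 0x%08lX\nCFSR: 0x%08lX\nMMFAR: 0x%08lX\nBFAR: 0x%08lX\n",
         (unsigned long)hfsr, (unsigned long)cfsr,
         (unsigned long)mmfar, (unsigned long)bfar);
  while (true) {}
}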