Typical workflow for neural network layers:
// 1. Initialize weights (once)
float *weights = load_weights();
// 2. Reorder weights for optimal performance (once)
size_t reorder_size = aocl_get_reorder_buf_size_f32f32f32of32(...);
float *weights_reordered = malloc(reorder_size);
aocl_reorder_f32f32f32of32(..., weights, weights_reordered, ...);
// 3. Set up post-operations (bias + activation)
aocl_post_op post_ops;
setup_post_ops(&post_ops, bias, activation_type);
// 4. Process inputs (repeated)
for (int batch = 0; batch < num_batches; batch++) {
aocl_gemm_f32f32f32of32(
'R', 'N', 'N', batch_size, output_dim, input_dim,
1.0f, input[batch], input_dim, 'N',
weights_reordered, output_dim, 'R',
0.0f, output[batch], output_dim,
&post_ops
);
}