Batch GEMM efficiently processes multiple matrix multiplications of the same shape in a single call:
// Prepare batch data
float **a_array = malloc(batch_count * sizeof(float*));
float **b_array = malloc(batch_count * sizeof(float*));
float **c_array = malloc(batch_count * sizeof(float*));
// Fill arrays with matrix pointers
for (int i = 0; i < batch_count; i++) {
a_array[i] = &input_matrices[i * m * k];
b_array[i] = &weight_matrices[i * k * n];
c_array[i] = &output_matrices[i * m * n];
}
// Process batch
aocl_batch_gemm_f32f32f32of32(
'R', 'N', 'N', m, n, k,
1.0f, a_array, k,
b_array, n,
0.0f, c_array, n,
batch_count, NULL
);