#include <aie_api/aie_adf.hpp>
#include "kernel.hpp"
// SecondOrderSection: window-based kernel that, for each of burst_cnt
// iterations, reads 8 input samples, evaluates a matrix-vector product against
// 12 coefficient columns of 8 rows each (6 even columns in C_e, 6 odd columns
// in C_o), writes the resulting 8 samples, and carries state to the next
// iteration (and, through a function-local static, to the next call).
//
//   idata : input window; 8 floats are consumed per iteration
//   odata : output window; 8 floats are produced per iteration
//   C_e   : run-time parameter, even coefficient columns, 8 floats per column
//   C_o   : run-time parameter, odd coefficient columns, 8 floats per column
//
// NOTE(review): `id` is not referenced in the body; presumably it only serves
// to give each instantiation its own static state register - confirm.
// NOTE(review): burst_cnt, Vector8f, Vector16f and VAcc8f are not declared in
// this block; presumably they come from kernel.hpp - confirm.
template<unsigned id>
void SecondOrderSection(
    input_window<float> *idata,
    output_window<float> *odata,
    const float (&C_e)[48], // run-time parameter: SIMD matrix of coefficients (even columns)
    const float (&C_o)[48]  // run-time parameter: SIMD matrix of coefficients (odd columns)
) {
    constexpr int column_pairs = 6; // one even + one odd column per inner-loop pass
    constexpr int column_len   = 8; // floats per coefficient column
    constexpr int first_lane   = 4; // lowest lane of the 16-lane operand that is read

    // Persistent state, zeroed once. Lanes 4-5 are overwritten below with the
    // last two output lanes; lanes 6-7 keep the last two input lanes; lanes 0-3
    // are carried along but never read by the products below.
    static Vector8f history = aie::zeros<float, 8>();

    for (int burst = 0; burst < burst_cnt; ++burst)
    //chess_prepare_for_pipelining
    {
        VAcc8f sum_even = aie::zeros<accfloat, 8>(); // partial sum over even columns
        VAcc8f sum_odd  = aie::zeros<accfloat, 8>(); // partial sum over odd columns

        Vector8f fresh = window_readincr_v<8>(idata);         // next 8 input samples
        Vector16f operand = aie::concat(history, fresh);      // lanes 0-7: state, lanes 8-15: new input

        // Matrix-vector multiplication: each operand lane (4..15) scales one
        // coefficient column; even and odd columns accumulate separately.
        for (int pair = 0; pair < column_pairs; ++pair) {
            const int coeff_offset = column_len * pair;
            const int even_lane    = 2 * pair + first_lane;
            const int odd_lane     = even_lane + 1;

            Vector8f col_even = aie::load_v<8>(&C_e[coeff_offset]);
            float scale_even = operand[even_lane];
            sum_even = aie::mac(sum_even, scale_even, col_even);

            Vector8f col_odd = aie::load_v<8>(&C_o[coeff_offset]);
            float scale_odd = operand[odd_lane];
            sum_odd = aie::mac(sum_odd, scale_odd, col_odd);
        }

        // Combine the two partial sums (odd += even) and convert to a vector.
        VAcc8f total = aie::add(sum_odd, sum_even.to_vector());
        Vector8f result = total.to_vector();

        window_writeincr(odata, result);

        // State for the next iteration: start from the new input samples, then
        // replace lanes 4 and 5 with the last two lanes of this output.
        history = fresh;
        history[4] = result[6];
        history[5] = result[7];
    }
}
Note the two nested loops in the function:
for (auto i = 0; i < burst_cnt; i++) { // process more samples to reduce overhead
...
for (auto j = 0; j < 6; j++) { // matrix-vector multiplication
...
}
}
The outer `for` loop is added so that more samples can be processed during each function call, thereby reducing overhead and improving throughput.