Note that part of the initial data is discarded in subsequent kernels. For example, the second kernel will discard the first eight inputs.
Four lanes and eight points are chosen for aie::sliding_mul. Data reads and writes are interleaved with computation.
The first kernel code is as follows:
// Eight complex coefficients loaded by fir_32tap_core0. The initializer is
// abbreviated ("...") in this listing; all eight values must be supplied.
alignas(aie::vector_decl_align) static cint16 eq_coef0[8]={{1,2},{3,4},...};
// Sample window kept between graph iterations: seeded by
// fir_32tap_core0_init() and written back at the end of every
// fir_32tap_core0() call.
static aie::vector<cint16,16> delay_line;
// First kernel of the cascade. Applies the eight coefficients in eq_coef0 to
// the incoming samples and sends 4-lane partial sums to the next kernel.
//   sig_in     - stream of cint16 input samples
//   cascadeout - cascade stream that receives the cacc48 partial accumulators
// The 16-sample window is handled as four 4-sample slots that are refilled in
// rotation (2, 3, 0, 1) while the sliding_mul start offset advances by four.
__attribute__((noinline)) void fir_32tap_core0(input_stream<cint16> * sig_in,
        output_stream<cacc48> * cascadeout){
    const cint16_t * restrict tap_ptr = eq_coef0;
    const aie::vector<cint16,8> taps = aie::load_v<8>(tap_ptr);
    // Resume from the window saved by the previous invocation (or by the init function).
    aie::vector<cint16,16> window = delay_line;
    aie::accum<cacc48,4> partial;
    // One loop iteration consumes 16 samples (4 steps of 4 samples each).
    // SAMPLES is assumed to be an integer power of 2 and greater than 16.
    const unsigned iterations = (SAMPLES/4/4);
main_loop:for (unsigned int iter = 0; iter < iterations; ++iter)
    chess_prepare_for_pipelining
    {
        // Step 1: refill slot 2, multiply starting at sample 0 -> 4 partial outputs
        window.insert(2,readincr_v<4>(sig_in));
        partial = aie::sliding_mul<4,8>(taps,0,window,0);
        writeincr(cascadeout,partial);

        // Step 2: refill slot 3, multiply starting at sample 4
        window.insert(3,readincr_v<4>(sig_in));
        partial = aie::sliding_mul<4,8>(taps,0,window,4);
        writeincr(cascadeout,partial);

        // Step 3: refill slot 0, multiply starting at sample 8
        window.insert(0,readincr_v<4>(sig_in));
        partial = aie::sliding_mul<4,8>(taps,0,window,8);
        writeincr(cascadeout,partial);

        // Step 4: refill slot 1, multiply starting at sample 12
        window.insert(1,readincr_v<4>(sig_in));
        partial = aie::sliding_mul<4,8>(taps,0,window,12);
        writeincr(cascadeout,partial);
    }
    // Keep the window for the next graph iteration.
    delay_line = window;
}
// Initialization function for the first kernel: fills the first eight entries
// of delay_line from the input stream before the kernel runs.
void fir_32tap_core0_init(){
    // The first kernel discards nothing; the (empty) skip loop is kept so all
    // four init functions share the same shape.
    const int skip_count = 0;
    for (int remaining = skip_count; remaining > 0; --remaining){
        get_ss(0);
    }
    // Seed window entries 0..7. Each value returned by get_ss(0) is
    // reinterpreted as one cint16 sample.
    for (int slot = 0; slot < 8; ++slot){
        int word = get_ss(0);
        delay_line.set(*(cint16*)&word, slot);
    }
}
- __attribute__((noinline)) is optional; it keeps the function hierarchy.
- chess_prepare_for_pipelining is optional, as the tools can do automatic pipelining.
- Each aie::sliding_mul<4,8> performs a four-lane, eight-point MAC, and the partial result is sent through the cascade chain to the next kernel.
- Data in buff is read starting from the data_start parameter of aie::sliding_mul. When the read reaches the end of the buffer, it wraps around to the beginning in a circular fashion.
The compilation log for the kernel can be found in Work/aie/<COL_ROW>/<COL_ROW>.log; the -v option is needed to generate the verbose report. In the log, search for keywords such as do-loop to find the initiation interval of the loop. In the following example log file, you can see that the initiation interval of the loop is 16:
(resume algo) -> after folding: 16 (folded over 1 iterations)
-> HW do-loop #128 in ".../Vitis/2023.2/aietools/include/adf/stream/me/stream_utils.h", line 1192: (loop #3) : 16 cycles
The kernel code above takes roughly 16 (cycles) / 16 (partial results) = 1 cycle to produce a partial output.
// Eight complex coefficients loaded by the second kernel, fir_32tap_core1.
// The array is named eq_coef1 so that it matches the aie::load_v<8>(eq_coef1)
// call in that kernel and does not clash with the third kernel's eq_coef2.
// The initializer is abbreviated ("...") in this listing; all eight values
// must be supplied.
alignas(aie::vector_decl_align) static cint16 eq_coef1[8]={{17,18},{19,20},...};
// Sample window kept between graph iterations: seeded by
// fir_32tap_core1_init() and written back at the end of every
// fir_32tap_core1() call.
alignas(aie::vector_decl_align) static aie::vector<cint16,16> delay_line;
// Second kernel of the cascade. Adds its own eight-coefficient contribution to
// the partial sums received over the cascade and forwards the result.
//   sig_in     - stream of cint16 input samples
//   cascadein  - cascade stream delivering cacc48 partial sums from the previous kernel
//   cascadeout - cascade stream that receives the updated partial sums
// The 16-sample window is handled as four 4-sample slots that are refilled in
// rotation (2, 3, 0, 1) while the sliding_mac start offset advances by four.
__attribute__((noinline)) void fir_32tap_core1(input_stream<cint16> * sig_in, input_stream<cacc48> * cascadein,
        output_stream<cacc48> * cascadeout){
    const aie::vector<cint16,8> taps = aie::load_v<8>(eq_coef1);
    // Resume from the window saved by the previous invocation (or by the init function).
    aie::vector<cint16,16> window = delay_line;
    aie::accum<cacc48,4> partial;
    // One loop iteration consumes 16 samples (4 steps of 4 samples each).
    // SAMPLES is assumed to be an integer power of 2 and greater than 16.
    const unsigned iterations = (SAMPLES/4/4);
    for (unsigned int iter = 0; iter < iterations; ++iter)
    chess_prepare_for_pipelining
    {
        // Step 1: take upstream partial sums, refill slot 2, MAC from sample 0
        partial = readincr_v4(cascadein);
        window.insert(2,readincr_v<4>(sig_in));
        partial = aie::sliding_mac<4,8>(partial,taps,0,window,0);
        writeincr_v4(cascadeout,partial);

        // Step 2: refill slot 3, MAC from sample 4
        partial = readincr_v4(cascadein);
        window.insert(3,readincr_v<4>(sig_in));
        partial = aie::sliding_mac<4,8>(partial,taps,0,window,4);
        writeincr_v4(cascadeout,partial);

        // Step 3: refill slot 0, MAC from sample 8
        partial = readincr_v4(cascadein);
        window.insert(0,readincr_v<4>(sig_in));
        partial = aie::sliding_mac<4,8>(partial,taps,0,window,8);
        writeincr_v4(cascadeout,partial);

        // Step 4: refill slot 1, MAC from sample 12
        partial = readincr_v4(cascadein);
        window.insert(1,readincr_v<4>(sig_in));
        partial = aie::sliding_mac<4,8>(partial,taps,0,window,12);
        writeincr_v4(cascadeout,partial);
    }
    // Keep the window for the next graph iteration.
    delay_line = window;
}
// Initialization function for the second kernel: discards the first eight
// stream values, then fills the first eight entries of delay_line.
void fir_32tap_core1_init()
{
    // Values at the head of the stream that this kernel does not use.
    const int skip_count = 8;
    for (int remaining = skip_count; remaining > 0; --remaining){
        get_ss(0);
    }
    // Seed window entries 0..7. Each value returned by get_ss(0) is
    // reinterpreted as one cint16 sample.
    for (int slot = 0; slot < 8; ++slot){
        int word = get_ss(0);
        delay_line.set(*(cint16*)&word, slot);
    }
}
// Eight complex coefficients loaded by the third kernel, fir_32tap_core2.
// The initializer is abbreviated ("...") in this listing; all eight values
// must be supplied.
alignas(aie::vector_decl_align) static cint16 eq_coef2[8]={{33,34},{35,36},...};
// Sample window kept between graph iterations: seeded by
// fir_32tap_core2_init() and written back at the end of every
// fir_32tap_core2() call.
alignas(aie::vector_decl_align) static aie::vector<cint16,16> delay_line;
// Third kernel of the cascade. Adds its own eight-coefficient contribution to
// the partial sums received over the cascade and forwards the result.
//   sig_in     - stream of cint16 input samples
//   cascadein  - cascade stream delivering cacc48 partial sums from the previous kernel
//   cascadeout - cascade stream that receives the updated partial sums
// The 16-sample window is handled as four 4-sample slots that are refilled in
// rotation (2, 3, 0, 1) while the sliding_mac start offset advances by four.
__attribute__((noinline)) void fir_32tap_core2(input_stream<cint16> * sig_in, input_stream<cacc48> * cascadein,
        output_stream<cacc48> * cascadeout){
    const aie::vector<cint16,8> taps = aie::load_v<8>(eq_coef2);
    // Resume from the window saved by the previous invocation (or by the init function).
    aie::vector<cint16,16> window = delay_line;
    aie::accum<cacc48,4> partial;
    // One loop iteration consumes 16 samples (4 steps of 4 samples each).
    // SAMPLES is assumed to be an integer power of 2 and greater than 16.
    const unsigned iterations = (SAMPLES/4/4);
    for (unsigned int iter = 0; iter < iterations; ++iter)
    chess_prepare_for_pipelining
    {
        // Step 1: take upstream partial sums, refill slot 2, MAC from sample 0
        partial = readincr_v4(cascadein);
        window.insert(2,readincr_v<4>(sig_in));
        partial = aie::sliding_mac<4,8>(partial,taps,0,window,0);
        writeincr_v4(cascadeout,partial);

        // Step 2: refill slot 3, MAC from sample 4
        partial = readincr_v4(cascadein);
        window.insert(3,readincr_v<4>(sig_in));
        partial = aie::sliding_mac<4,8>(partial,taps,0,window,4);
        writeincr_v4(cascadeout,partial);

        // Step 3: refill slot 0, MAC from sample 8
        partial = readincr_v4(cascadein);
        window.insert(0,readincr_v<4>(sig_in));
        partial = aie::sliding_mac<4,8>(partial,taps,0,window,8);
        writeincr_v4(cascadeout,partial);

        // Step 4: refill slot 1, MAC from sample 12
        partial = readincr_v4(cascadein);
        window.insert(1,readincr_v<4>(sig_in));
        partial = aie::sliding_mac<4,8>(partial,taps,0,window,12);
        writeincr_v4(cascadeout,partial);
    }
    // Keep the window for the next graph iteration.
    delay_line = window;
}
// Initialization function for the third kernel: discards the first sixteen
// stream values, then fills the first eight entries of delay_line.
void fir_32tap_core2_init(){
    // Values at the head of the stream that this kernel does not use.
    const int skip_count = 16;
    for (int remaining = skip_count; remaining > 0; --remaining)
    {
        get_ss(0);
    }
    // Seed window entries 0..7. Each value returned by get_ss(0) is
    // reinterpreted as one cint16 sample.
    for (int slot = 0; slot < 8; ++slot){
        int word = get_ss(0);
        delay_line.set(*(cint16*)&word, slot);
    }
}
// Eight complex coefficients loaded by the fourth (last) kernel,
// fir_32tap_core3. The initializer is abbreviated ("...") in this listing;
// all eight values must be supplied.
alignas(aie::vector_decl_align) static cint16 eq_coef3[8]={{49,50},{51,52},...};
// Sample window kept between graph iterations: seeded by
// fir_32tap_core3_init() and written back at the end of every
// fir_32tap_core3() call.
alignas(aie::vector_decl_align) static aie::vector<cint16,16> delay_line;
// Last kernel of the cascade. Adds its own eight-coefficient contribution to
// the partial sums received over the cascade, then converts each accumulator
// to cint16 with acc.to_vector<cint16>(SHIFT) and writes it to the output stream.
//   sig_in    - stream of cint16 input samples
//   cascadein - cascade stream delivering cacc48 partial sums from the previous kernel
//   data_out  - stream that receives the final cint16 results
// The 16-sample window is handled as four 4-sample slots that are refilled in
// rotation (2, 3, 0, 1) while the sliding_mac start offset advances by four.
__attribute__((noinline)) void fir_32tap_core3(input_stream<cint16> * sig_in, input_stream<cacc48> * cascadein,
        output_stream<cint16> * data_out){
    const aie::vector<cint16,8> taps = aie::load_v<8>(eq_coef3);
    // Resume from the window saved by the previous invocation (or by the init function).
    aie::vector<cint16,16> window = delay_line;
    aie::accum<cacc48,4> total;
    // One loop iteration consumes 16 samples (4 steps of 4 samples each).
    // SAMPLES is assumed to be an integer power of 2 and greater than 16.
    const unsigned iterations = (SAMPLES/4/4);
    for (unsigned int iter = 0; iter < iterations; ++iter)
    chess_prepare_for_pipelining
    {
        // Step 1: take upstream partial sums, refill slot 2, MAC from sample 0 -> 4 outputs
        total = readincr_v4(cascadein);
        window.insert(2,readincr_v<4>(sig_in));
        total = aie::sliding_mac<4,8>(total,taps,0,window,0);
        writeincr_v4(data_out,total.to_vector<cint16>(SHIFT));

        // Step 2: refill slot 3, MAC from sample 4
        total = readincr_v4(cascadein);
        window.insert(3,readincr_v<4>(sig_in));
        total = aie::sliding_mac<4,8>(total,taps,0,window,4);
        writeincr_v4(data_out,total.to_vector<cint16>(SHIFT));

        // Step 3: refill slot 0, MAC from sample 8
        total = readincr_v4(cascadein);
        window.insert(0,readincr_v<4>(sig_in));
        total = aie::sliding_mac<4,8>(total,taps,0,window,8);
        writeincr_v4(data_out,total.to_vector<cint16>(SHIFT));

        // Step 4: refill slot 1, MAC from sample 12
        total = readincr_v4(cascadein);
        window.insert(1,readincr_v<4>(sig_in));
        total = aie::sliding_mac<4,8>(total,taps,0,window,12);
        writeincr_v4(data_out,total.to_vector<cint16>(SHIFT));
    }
    // Keep the window for the next graph iteration.
    delay_line = window;
}
// Initialization function for the last kernel: discards the first twenty-four
// stream values, then fills the first eight entries of delay_line.
void fir_32tap_core3_init()
{
    // Values at the head of the stream that this kernel does not use.
    const int skip_count = 24;
    for (int remaining = skip_count; remaining > 0; --remaining){
        get_ss(0);
    }
    // Seed window entries 0..7. Each value returned by get_ss(0) is
    // reinterpreted as one cint16 sample.
    for (int slot = 0; slot < 8; ++slot){
        int word = get_ss(0);
        delay_line.set(*(cint16*)&word, slot);
    }
}
The last kernel writes results to the output stream using acc.to_vector<cint16>(SHIFT).
Each kernel takes one cycle to produce a partial output. When they are working simultaneously, the system performance is one cycle to produce one output, which meets the design goal.
For more information about graph construction, stream broadcast, DMA FIFO insertion, profiling in simulation and hardware, and the design stall and deadlock analysis that may be needed in system design, see the AI Engine Tools and Flows User Guide (UG1076).