Implementing an IIR Filter on the AI Engine - Part 2b
Version: Vitis 2024.2
Preliminaries
In Part 2a, we examined the generated assembly code and found a NOP (no operation) between the VFPMAC (vector floating-point multiply-accumulate) mnemonics. This NOP is unavoidable when each VFPMAC depends on the result of the previous one, because a floating-point accumulation requires two cycles (see Fig. 26 of AM009).
We can split the matrix-vector multiplication into two separate multiply-accumulate operations to perform a floating-point accumulation on each cycle.
Note: Instead of the “traditional” method of multiplying each row of the matrix by the column vector, we effectively scale each column of the matrix by the corresponding element in the vector with the multiply-accumulate API.
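Thus, splitting the vector additions into even and odd parts allows us to perform two independent multiply-accumulate operations:
$$\mathbf{y} = \underbrace{\sum_{k=0}^{5} x_{2k}\,\mathbf{c}_{2k}}_{\text{even columns}} + \underbrace{\sum_{k=0}^{5} x_{2k+1}\,\mathbf{c}_{2k+1}}_{\text{odd columns}}$$
where $\mathbf{c}_j$ is column $j$ of the coefficient matrix and $x_j$ is element $j$ of the vector. The two partial sums do not depend on each other, so their accumulations can be issued on alternating cycles and are added together only once at the end.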
Also, the AI Engine has two load units. The Julia program aie_iir_2b.jl is modified to split the matrix into even and odd columns and generate two separate header files, one for each set of columns.
We start by using the AI Engine APIs.
Kernel Header
#ifndef __KERNEL_HPP__ // include guard to prevent multiple inclusion
#define __KERNEL_HPP__

#include <adf.h> // Adaptive DataFlow header
#include <aie_api/aie.hpp> // header files for high-level intrinsics

using Vector8f = aie::vector<float, 8>; // vector of 8 floating-point elements
using Vector16f = aie::vector<float, 16>; // vector of 16 floating-point elements
using VAcc8f = aie::accum<accfloat, 8>; // accumulator with 8 floating-point elements

// Must be a real preprocessor directive (leading '#'); a bare `define USE_API`
// is not valid C++ and breaks every translation unit that includes this header.
#define USE_API // comment out to use low-level intrinsics

const unsigned burst_cnt = 256; // process burst_cnt * 8 samples per function invocation

// Second-order IIR section operating on blocks of 8 samples.
//
// Template parameter:
//   id    - compile-time tag; the graph instantiates SecondOrderSection<1>.
//
// Parameters:
//   idata - input buffer; burst_cnt vectors of 8 floats are consumed per call
//   odata - output buffer; burst_cnt vectors of 8 floats are produced per call
//   C_e   - run-time parameter: even columns of the SIMD coefficient matrix
//           (48 floats, read by the kernel as 6 vectors of 8)
//   C_o   - run-time parameter: odd columns of the SIMD coefficient matrix
//           (48 floats, read by the kernel as 6 vectors of 8)
template<unsigned id>
void SecondOrderSection(
    adf::input_buffer<float> & __restrict idata, // 8 input samples per iteration
    adf::output_buffer<float> & __restrict odata, // 8 output samples per iteration
    const float (&C_e)[48], // run-time parameter: SIMD matrix of coefficients (even columns)
    const float (&C_o)[48] // run-time parameter: SIMD matrix of coefficients (odd columns)
);

#endif // __KERNEL_HPP__
Kernel Code (AI Engine API)
#include <aie_api/aie_adf.hpp>
#include "kernel.hpp"
// Second-order IIR section (AI Engine API version).
//
// Each outer iteration reads 8 input samples, forms a 16-lane vector from the
// saved state (lanes 0-7) and the new samples (lanes 8-15), and computes 8
// outputs as a matrix-vector product. The product is evaluated column-wise:
// lanes 4..15 each scale one 8-element coefficient column, with even lanes
// accumulated from C_e and odd lanes from C_o in two independent accumulators
// that are summed once at the end.
//
//   idata - input buffer, consumed as burst_cnt vectors of 8 floats
//   odata - output buffer, written as burst_cnt vectors of 8 floats
//   C_e   - 6 coefficient columns (8 floats each) paired with lanes 4, 6, ..., 14
//   C_o   - 6 coefficient columns (8 floats each) paired with lanes 5, 7, ..., 15
template<unsigned id>
void SecondOrderSection(
    adf::input_buffer<float> & __restrict idata,
    adf::output_buffer<float> & __restrict odata,
    const float (&C_e)[48],
    const float (&C_o)[48]
) {
    // Filter state; static so it persists across calls. Zero on first use.
    static Vector8f state_reg = aie::zeros<float, 8>();

    // Vector-wise cursors over the input and output buffers.
    auto src = aie::begin_vector<8>(idata);
    auto dst = aie::begin_vector<8>(odata);

    for (auto burst = 0; burst < burst_cnt; ++burst) {
        // Lanes 0-7: saved state, lanes 8-15: the 8 new input samples.
        Vector8f fresh = *src++;
        Vector16f window = aie::concat(state_reg, fresh);

        // Restart at the first coefficient column for every block of samples.
        auto even_col = aie::begin_vector<8>(&C_e[0]);
        auto odd_col = aie::begin_vector<8>(&C_o[0]);

        // Two separate accumulators so consecutive MACs do not depend on each other.
        VAcc8f even_sum = aie::zeros<accfloat, 8>();
        VAcc8f odd_sum = aie::zeros<accfloat, 8>();

        for (auto pair = 0; pair < 6; ++pair) {
            const auto even_lane = 4 + 2 * pair; // 4, 6, ..., 14
            even_sum = aie::mac(even_sum, window.get(even_lane), *even_col++);
            odd_sum = aie::mac(odd_sum, window.get(even_lane + 1), *odd_col++);
        }

        // Merge the partial sums: odd_sum += even_sum.
        odd_sum = aie::add(odd_sum, even_sum.to_vector());
        Vector8f yout = odd_sum.to_vector();

        // Next state: start from the current inputs (so lanes 6-7 keep the last
        // two input samples), then place the last two outputs in lanes 4-5.
        state_reg = fresh;
        state_reg[4] = yout[6];
        state_reg[5] = yout[7];

        *dst++ = yout;
    }
}
Note the two loops in the function:
for (auto i = 0; i < burst_cnt; i++) { // process more samples to reduce overhead
...
for (auto j = 0; j < 6; j++) { // matrix-vector multiplication
...
}
}
The outer for loop is added so that more samples can be processed during each function call, thereby reducing the ratio of function-call overhead cycles to processing cycles and improving throughput.
Graph Code
#ifndef __GRAPH_H__ // include guard to prevent multiple inclusion
#define __GRAPH_H__
#include <adf.h> // Adaptive DataFlow header
#include "kernel.hpp"
using namespace adf;
// dataflow graph declaration
class the_graph : public graph { // inherit all properties of the adaptive dataflow graph
public:
input_plio pl_in;
output_plio pl_out;
kernel section1;
input_port cmtx_e; // input port for SIMD matrix coefficients (even columns)
input_port cmtx_o; // input port for SIMD matrix coefficients (odd columns)
// constructor
the_graph() {
// associate the kernel with the function to be executed
section1 = kernel::create(SecondOrderSection<1>);
pl_in = input_plio::create("Input", plio_32_bits, "data/input.dat");
pl_out = output_plio::create("Output", plio_32_bits, "output.dat");
const unsigned num_samples = 8 * burst_cnt;
// declare buffer sizes
dimensions(section1.in[0]) = {num_samples};
dimensions(section1.out[0]) = {num_samples};
// establish connections
connect<parameter>(cmtx_e, adf::async(section1.in[1]));
connect<parameter>(cmtx_o, adf::async(section1.in[2]));
connect(pl_in.out[0], section1.in[0]);
connect(section1.out[0], pl_out.in[0]);
// specify which source code file contains the kernel function
source(section1) = "kernel.cpp";
// !!! temporary value: assumes this kernel dominates the AI engine tile !!!
runtime<ratio>(section1) = 1.0;
} // end the_graph()
}; // end class the_graph
#endif // __GRAPH_H__
Testbench Code
#include "kernel.hpp"
#include "graph.hpp"
#include "C1_e.h"
#include "C1_o.h"
using namespace std;
using namespace adf;
// specify the DFG
the_graph my_graph;
// Simulation entry point: initialise the graph, load both coefficient
// matrices through the run-time parameter ports, run one iteration, shut down.
int main() {
    constexpr int coeff_count = 48; // floats per coefficient matrix (C1_e / C1_o)

    my_graph.init(); // load the DFG into the AI engine array, establish connectivity, etc.

    // Push the coefficients before the kernel starts executing.
    my_graph.update(my_graph.cmtx_e, C1_e, coeff_count);
    my_graph.update(my_graph.cmtx_o, C1_o, coeff_count);

    my_graph.run(1); // a single graph iteration
    my_graph.end();  // housekeeping

    return 0;
}