Part2b - 2024.1 English

Vitis Tutorials: AI Engine

Document ID
XD100
Release Date
2024-10-30
Version
2024.1 English

Implementing an IIR Filter on the AI Engine - Part 2b

Version: Vitis 2024.2

Preliminaries

In Part 2a, we examined the generated assembler code and found a NOP (no operation) between the VFPMAC (vector floating-point multiply-accumulate) mnemonics. This NOP is unavoidable as a floating-point accumulation requires two cycles (see Fig. 26 of AM009).

We can split the matrix-vector multiplication into two separate multiply-accumulate operations to perform a floating-point accumulation on each cycle.

Note: Instead of the “traditional” method of multiplying each row of the matrix by the column vector, we effectively scale each column of the matrix by the corresponding element in the vector with the multiply-accumulate API.

Fig. 1

Thus, splitting the vector additions into even and odd parts allows us to perform independent multiply-accumulate operations:

Fig. 2

Also, the AI Engine has two load units. The Julia program aie_iir_2b.jl is modified to split the matrix into even and odd columns and generate two separate header files.

We start by using the AI Engine APIs.

Kernel Header

#ifndef __KERNEL_HPP__	// include guard to prevent multiple inclusion

	#define __KERNEL_HPP__

	#include <adf.h>			// Adaptive DataFlow header
	#include <aie_api/aie.hpp>	// header files for high-level intrinsics

	using Vector8f = aie::vector<float, 8>;		// vector of 8 floating-point elements
	using Vector16f = aie::vector<float, 16>;	// vector of 16 floating-point elements
	using VAcc8f = aie::accum<accfloat, 8>;		// accumulator with 8 floating-point elements

	// build-time switch between the AI Engine API and the low-level intrinsics
	#define USE_API	// comment out to use low-level intrinsics

	const unsigned burst_cnt = 256;	// process burst_cnt * 8 samples per function invocation

	// Second-order IIR section kernel.
	//
	// Each invocation consumes burst_cnt vectors of 8 input samples from idata and
	// writes burst_cnt vectors of 8 output samples to odata.
	// The coefficient matrix is supplied as two run-time parameters holding its
	// even and odd columns; each is 48 floats (6 column vectors of 8 elements).
	//
	// id: compile-time tag used to create distinct kernel instances
	template<unsigned id>
	void SecondOrderSection(
		adf::input_buffer<float> & __restrict idata,	// 8 input samples per iteration
		adf::output_buffer<float> & __restrict odata,	// 8 output samples per iteration
		const float (&C_e)[48],		// run-time parameter: SIMD matrix of coefficients (even columns)
		const float (&C_o)[48]		// run-time parameter: SIMD matrix of coefficients (odd columns)
	);

#endif // __KERNEL_HPP__

Kernel Code (AI Engine API)

#include <aie_api/aie_adf.hpp>

#include "kernel.hpp"

// Second-order IIR section, AI Engine API version.
//
// Per call, processes burst_cnt vectors of 8 samples. For every vector the
// output is a matrix-vector product computed column-wise: each column of the
// coefficient matrix is scaled by one element of the state/input vector and
// accumulated. Even and odd columns go into two separate accumulators so the
// two multiply-accumulate chains do not depend on each other; the partial
// sums are added once at the end.
//
// The filter state lives in a function-local static, so it is carried over
// from one call to the next.
template<unsigned id>
void SecondOrderSection(
	adf::input_buffer<float> & __restrict idata,	// 8 input samples per iteration
	adf::output_buffer<float> & __restrict odata,	// 8 output samples per iteration
	const float (&C_e)[48],		// run-time parameter: SIMD matrix of coefficients (even columns)
	const float (&C_o)[48]		// run-time parameter: SIMD matrix of coefficients (odd columns)
) {
	static Vector8f state_reg = aie::zeros<float, 8>();	// filter states: zeroed once, then kept across iterations and calls
	// input/output iterators (step through the buffers 8 floats at a time)
	auto inIter = aie::begin_vector<8>(idata);
	auto outIter = aie::begin_vector<8>(odata);
	for (auto i = 0; i < burst_cnt; i++) {
		Vector8f xreg_hi = *inIter++;		// fetch input samples
		// states followed by the new samples; only elements 4..15 are read below
		Vector16f xreg = aie::concat(state_reg, xreg_hi);
		// restart at the first coefficient column for every input vector
		auto ecoeff_iter = aie::begin_vector<8>(&C_e[0]);
		auto ocoeff_iter = aie::begin_vector<8>(&C_o[0]);
		VAcc8f acc_e = aie::zeros<accfloat, 8>();	// even accumulator
		VAcc8f acc_o = aie::zeros<accfloat, 8>();	// odd accumulator
		// 6 even + 6 odd columns, each scaled by one element of xreg (elements 4..15)
		for (auto j = 0; j < 6; j++) {
			acc_e = aie::mac(acc_e, xreg.get(2 * j + 4), *ecoeff_iter++);	// even columns
			acc_o = aie::mac(acc_o, xreg.get(2 * j + 5), *ocoeff_iter++);	// odd columns
		} // end for (auto j = 0; j < 6; j ++)
		acc_o = aie::add(acc_o, acc_e.to_vector());	// acc_o += acc_e
		Vector8f yout = acc_o.to_vector();
		// update states: elements 6 and 7 keep the last two inputs,
		// elements 4 and 5 take the last two outputs
		state_reg = xreg_hi;
		state_reg[4] = yout[6];
		state_reg[5] = yout[7];
		*outIter++ = yout;
	} // end for (auto i = 0; i < burst_cnt; i++)
} // end SecondOrderSection()

Note the two loops in the function:

for (auto i = 0; i < burst_cnt; i++) {	// process more samples to reduce overhead
	...
	for (auto j = 0; j < 6; j++) {	// matrix-vector multiplication
		...
	}
}

The outer for loop is added such that more samples can be processed during each function call, thereby reducing the ratio of function call cycles to processing cycles and improving throughput.

Graph Code

#ifndef __GRAPH_H__			// include guard to prevent multiple inclusion

	#define __GRAPH_H__

	#include <adf.h>		// Adaptive DataFlow header

	#include "kernel.hpp"

	using namespace adf;

	// dataflow graph declaration
	// dataflow graph with a single SecondOrderSection kernel: samples stream in and
	// out through PLIO ports, and the two coefficient matrices (even/odd columns)
	// are exposed as graph-level run-time parameter ports
	class the_graph : public graph {	// inherit all properties of the adaptive dataflow graph

		public:
			input_plio pl_in;	// sample input
			output_plio pl_out;	// sample output

			kernel section1;	// the second-order section kernel
			input_port cmtx_e;	// input port for SIMD matrix coefficients (even columns)
			input_port cmtx_o;	// input port for SIMD matrix coefficients (odd columns)

			// constructor
			the_graph() {

				// associate the kernel with the function to be executed
				section1 = kernel::create(SecondOrderSection<1>);

				// 32-bit PLIO ports with their associated data files
				pl_in = input_plio::create("Input", plio_32_bits, "data/input.dat");
				pl_out = output_plio::create("Output", plio_32_bits, "output.dat");

				// the kernel handles burst_cnt vectors of 8 samples per invocation
				const unsigned num_samples = 8 * burst_cnt;

				// declare buffer sizes
				dimensions(section1.in[0]) = {num_samples};
				dimensions(section1.out[0]) = {num_samples};

				// establish connections

				// coefficient matrices: asynchronous run-time parameters (kernel inputs 1 and 2)
				connect<parameter>(cmtx_e, adf::async(section1.in[1]));
				connect<parameter>(cmtx_o, adf::async(section1.in[2]));

				// sample data path: PLIO in -> kernel -> PLIO out
				connect(pl_in.out[0], section1.in[0]);
				connect(section1.out[0], pl_out.in[0]);

				// specify which source code file contains the kernel function
				source(section1) = "kernel.cpp";

				// !!! temporary value: assumes this kernel dominates the AI engine tile !!!
				runtime<ratio>(section1) = 1.0;

			} // end the_graph()

	}; // end class the_graph

#endif // __GRAPH_H__

Testbench Code

#include "kernel.hpp"
#include "graph.hpp"
#include "C1_e.h"
#include "C1_o.h"

using namespace std;

using namespace adf;

// the dataflow graph (DFG) instance driven by this testbench
the_graph my_graph;

// simulation entry point
int main() {

	// load the DFG into the AI engine array, establish connectivity, etc.
	my_graph.init();

	// send the run-time parameters: even- and odd-column coefficient matrices, 48 floats each
	my_graph.update(my_graph.cmtx_e, C1_e, 48);
	my_graph.update(my_graph.cmtx_o, C1_o, 48);

	// run the DFG for a single iteration
	my_graph.run(1);

	// housekeeping
	my_graph.end();

	return 0;

} // end main()

Analysis (using AI Engine API)

Generated Code