To support AI Engine to PL to DDR memory connections in the AI Engine tools, a PL kernel that has an AXI4-Stream interface and an AXI4 Master interface is required inside the graph. The AXI4-Stream interface of the PL kernel is connected to the AI Engine window or stream data interface, and the AXI4 Master interface is connected to the GMIO ports in the graph. The ADF APIs take care of buffer object (BO) allocations and DMA data movement events in the DDR memory.
The following is an example GMIO specification in the graph.
class mygraph_pl : public adf::graph
{
private:
  adf::kernel k;
  adf::kernel k_mm2s;
  adf::kernel k_s2mm;

public:
  adf::port<adf::direction::inout> inoutMem;
  adf::port<adf::direction::in> inMem;

  mygraph_pl()
  {
    // AI Engine kernel
    k = adf::kernel::create(vec_incr);
    adf::source(k) = "vec_incr.cc";
    adf::runtime<adf::ratio>(k) = 1;

    // PL kernels: MM2S reads DDR memory and streams to the AI Engine;
    // S2MM receives the AI Engine stream and writes it back to DDR memory
    k_mm2s = adf::kernel::create(mm2s);
    adf::source(k_mm2s) = "fpga/mm2s.cpp";
    adf::fabric<adf::pl>(k_mm2s);
    k_s2mm = adf::kernel::create(s2mm);
    adf::source(k_s2mm) = "fpga/s2mm.cpp";
    adf::fabric<adf::pl>(k_s2mm);

    // adf::gmem connections declare the PL AXI4 Master interfaces as GMIO
    adf::connect<adf::gmem>(inMem, k_mm2s.in[0]);
    adf::connect<adf::stream, adf::window<256>>(k_mm2s.out[0], k.in[0]);
    adf::connect<adf::window<256>, adf::stream>(k.out[0], k_s2mm.in[0]);
    adf::connect<adf::gmem>(k_s2mm.inout[0], inoutMem);
  };
};
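The AI Engine kernel source vec_incr.cc is not shown in this section. A minimal hypothetical sketch, consistent with the 256-byte window connections above and with the outMem[j] == j+1 check in the host code later in this section, could look like the following.
// Hypothetical sketch of vec_incr (the actual vec_incr.cc is not shown).
// Assumes 256-byte windows of int32 (64 samples) and increments each value.
#include <adf.h>

void vec_incr(input_window<int32>* in, output_window<int32>* out)
{
    for (int i = 0; i < 64; i++)
    {
        int32 v = window_readincr(in);  // read the next int32 from the window
        window_writeincr(out, v + 1);   // write the incremented value
    }
}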
In the graph example code above, adf::gmem indicates that the connection type between the PL kernels (MM2S and S2MM) and the graph ports is GMIO, which tells the AI Engine tools to automatically use the NoC master unit (NMU) for DMA transactions. Example code for the HLS MM2S and S2MM modules is as follows.
#include <ap_int.h>
#include <ap_axi_sdata.h>
#include <hls_stream.h>

// Read 128 x 128-bit words (2048 bytes) from DDR memory and stream them out
void mm2s(const ap_int<128>* mem, hls::stream<ap_axis<128, 0, 0, 0> >& s) {
#pragma HLS INTERFACE m_axi port=mem offset=slave bundle=gmem
#pragma HLS INTERFACE axis port=s
#pragma HLS INTERFACE s_axilite port=mem bundle=control
#pragma HLS INTERFACE s_axilite port=return bundle=control
    for (int i = 0; i < 128; i++) { // 2048 bytes per kernel invocation
#pragma HLS PIPELINE II=1
        ap_axis<128, 0, 0, 0> x;
        x.data = mem[i];
        x.keep = -1; // all bytes valid
        x.last = 0;
        s.write(x);
    }
}

// Receive the AXI4-Stream data and write 128 x 128-bit words to DDR memory
void s2mm(ap_int<128>* mem, hls::stream<ap_axis<128, 0, 0, 0> >& s) {
#pragma HLS INTERFACE m_axi port=mem offset=slave bundle=gmem
#pragma HLS INTERFACE axis port=s
#pragma HLS INTERFACE s_axilite port=mem bundle=control
#pragma HLS INTERFACE s_axilite port=return bundle=control
    for (int i = 0; i < 128; i++) { // 2048 bytes per kernel invocation
#pragma HLS PIPELINE II=1
        ap_axis<128, 0, 0, 0> x = s.read();
        mem[i] = x.data;
    }
}
The preceding MM2S and S2MM kernels transfer 2048 bytes per invocation, but in the graph code the window size between the PL kernels and the AI Engine kernel is 256 bytes. Therefore, eight graph iterations (8 × 256 = 2048 bytes) are needed to match one PL kernel invocation. The following is example host code for the GMIO specification.
#include <adf.h>
#include <iostream>
// The mygraph_pl class definition shown above is assumed to be in scope.

using namespace adf;

GMIO gmioInPL("gmioInPL", 256, 1000);
GMIO gmioOutPL("gmioOutPL", 256, 100);
adf::simulation::platform<1, 1> platform(&gmioInPL, &gmioOutPL);
mygraph_pl gr_pl;
adf::connect<> c0(platform.src[0], gr_pl.inMem);
adf::connect<> c1(gr_pl.inoutMem, platform.sink[0]);

const int ITERATION = 8;
const int BLOCK_SIZE_in_Bytes = 2048;

int main(int argc, char** argv) {
    gr_pl.init();
    int error = 0;
    int32* inMem;
    int32* outMem;

    // Allocate DDR memory for the GMIO transactions
    inMem = (int32*)GMIO::malloc(BLOCK_SIZE_in_Bytes);
    outMem = (int32*)GMIO::malloc(BLOCK_SIZE_in_Bytes);

    // Pre-processing: initialize the input buffer
    for (int j = 0; j < ITERATION * 256 / sizeof(int32); j++) {
        inMem[j] = j;
    }

    std::cout << "Graph PL run start" << std::endl;

    // Enqueue the DMA transfers for the PL GMIO ports
    gmioInPL.pl_gm(inMem, 2048);
    gmioOutPL.pl_gm(outMem, 2048);
    std::cout << "GMIO::pl_gm enqueuing completed" << std::endl;

    // For PL GMIO, graph::run() must be called after GMIO::pl_gm()
    gr_pl.run(ITERATION);
    gr_pl.wait();
    std::cout << "Graph PL run end" << std::endl;

    // Post-processing: check the output buffer
    for (int j = 0; j < ITERATION * 256 / sizeof(int32); j++) {
        if (outMem[j] != j + 1) {
            std::cout << "ERROR:dout[" << j << "]=" << outMem[j] << std::endl;
            error++;
        }
    }

    GMIO::free(inMem);
    GMIO::free(outMem);
    std::cout << "GMIO::free completed" << std::endl;

    if (error == 0) {
        std::cout << "PASS!" << std::endl;
    } else {
        std::cout << "ERROR!" << std::endl;
    }
    return error;
}
This example declares two GMIO objects, gmioInPL and gmioOutPL. The constructor specifies the logical name of the GMIO, the burst length of the memory-mapped AXI4 transaction, and the required bandwidth (in MB/s). In the PS program, GMIO::malloc must be used to allocate memory in the DDR memory, and the memory should be initialized before GMIO transactions happen. GMIO::pl_gm is specific to the PL GMIO and performs the DMA data transfer. The ADF API calls the Xilinx Runtime Library (XRT) behind the scenes to perform BO management. One important difference between the PL GMIO and the AI Engine GMIO is that graph::run() must be called after GMIO::pl_gm for the PL GMIO. However, for the AI Engine GMIO, graph::run() can be called before or after the GMIO operations (GMIO::gm2aie, GMIO::aie2gm, GMIO::gm2aie_nb, and GMIO::aie2gm_nb). After graph::run(), graph::wait() can be called to wait for the graph to complete. At that point the GMIO output to DDR memory has also been synchronized, and post-processing of the DDR memory data can be done. After the data processing, GMIO::free must be called to free the allocated memory.
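For intuition only, the following is a rough sketch of the native XRT C++ calls that the ADF API roughly corresponds to for the MM2S kernel when running on hardware. This is not the actual ADF implementation; the device index and xclbin name are placeholders.
// Illustrative sketch only: approximately what GMIO::malloc() and
// GMIO::pl_gm() delegate to XRT for. Not the actual ADF implementation;
// the xclbin name is hypothetical.
#include <cstdint>
#include <xrt/xrt_bo.h>
#include <xrt/xrt_device.h>
#include <xrt/xrt_kernel.h>

void pl_gmio_xrt_sketch()
{
    xrt::device device(0);                       // open the first device
    auto uuid = device.load_xclbin("a.xclbin");  // hypothetical xclbin
    xrt::kernel mm2s(device, uuid, "mm2s");      // the MM2S PL kernel

    // Rough equivalent of GMIO::malloc(2048): allocate a BO in the memory
    // bank connected to the kernel's m_axi argument 0.
    xrt::bo in_bo(device, 2048, mm2s.group_id(0));
    int32_t* in = in_bo.map<int32_t*>();
    for (int j = 0; j < 2048 / 4; j++)
        in[j] = j;

    // Rough equivalent of GMIO::pl_gm(inMem, 2048): make the host data
    // visible to the device, then start the kernel with the BO
    // (nullptr stands in for the AXI4-Stream argument).
    in_bo.sync(XCL_BO_SYNC_BO_TO_DEVICE);
    auto run = mm2s(in_bo, nullptr);
    run.wait();
}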
The AI Engine compiler option --pl-axi-lite=true is required for this flow.
In the preceding example, the MM2S and S2MM PL kernels transfer a fixed number of data samples. It is possible to introduce a parameter in MM2S and S2MM to specify the number of samples to be transferred. This parameter is used as a scalar run-time parameter (RTP) in the graph. The code is as follows:
#include <ap_int.h>
#include <ap_axi_sdata.h>
#include <hls_stream.h>

// Read `size` 128-bit words (size*128/8 bytes) from DDR memory and stream them out
void mm2s(const ap_int<128>* mem, hls::stream<ap_axis<128, 0, 0, 0> >& s, const int size) {
#pragma HLS INTERFACE m_axi port=mem offset=slave bundle=gmem
#pragma HLS INTERFACE axis port=s
#pragma HLS INTERFACE s_axilite port=mem bundle=control
#pragma HLS INTERFACE s_axilite port=size bundle=control
#pragma HLS INTERFACE s_axilite port=return bundle=control
    for (int i = 0; i < size; i++) { // size*128/8 bytes per kernel invocation
#pragma HLS PIPELINE II=1
        ap_axis<128, 0, 0, 0> x;
        x.data = mem[i];
        x.keep = -1; // all bytes valid
        x.last = 0;
        s.write(x);
    }
}

// Receive the AXI4-Stream data and write `size` 128-bit words to DDR memory
void s2mm(ap_int<128>* mem, hls::stream<ap_axis<128, 0, 0, 0> >& s, const int size) {
#pragma HLS INTERFACE m_axi port=mem offset=slave bundle=gmem
#pragma HLS INTERFACE axis port=s
#pragma HLS INTERFACE s_axilite port=mem bundle=control
#pragma HLS INTERFACE s_axilite port=size bundle=control
#pragma HLS INTERFACE s_axilite port=return bundle=control
    for (int i = 0; i < size; i++) { // size*128/8 bytes per kernel invocation
#pragma HLS PIPELINE II=1
        ap_axis<128, 0, 0, 0> x = s.read();
        mem[i] = x.data;
    }
}
class mygraph_pl : public adf::graph
{
private:
  adf::kernel k;

public:
  // The PL kernels are public so the host code can update their RTP ports
  adf::kernel k_mm2s;
  adf::kernel k_s2mm;
  adf::port<adf::direction::inout> inoutMem;
  adf::port<adf::direction::in> inMem;

  mygraph_pl()
  {
    k = adf::kernel::create(vec_incr);
    adf::source(k) = "vec_incr.cc";
    adf::runtime<adf::ratio>(k) = 1;
    k_mm2s = adf::kernel::create(mm2s);
    adf::source(k_mm2s) = "fpga/mm2s.cpp";
    adf::fabric<adf::pl>(k_mm2s);
    k_s2mm = adf::kernel::create(s2mm);
    adf::source(k_s2mm) = "fpga/s2mm.cpp";
    adf::fabric<adf::pl>(k_s2mm);
    adf::connect<adf::gmem>(inMem, k_mm2s.in[0]);
    adf::connect<adf::stream, adf::window<256>>(k_mm2s.out[0], k.in[0]);
    adf::connect<adf::window<256>, adf::stream>(k.out[0], k_s2mm.in[0]);
    adf::connect<adf::gmem>(k_s2mm.inout[0], inoutMem);

    // Declare the size inputs (port in[1] of each PL kernel) as
    // asynchronous scalar RTPs
    adf::async(k_mm2s.in[1]);
    adf::async(k_s2mm.in[1]);
  };
};
#include <adf.h>
#include <iostream>
// The mygraph_pl class definition shown above is assumed to be in scope.

using namespace adf;

GMIO gmioInPL("gmioInPL_0", 256, 1000);
GMIO gmioOutPL("gmioOutPL_0", 256, 100);
adf::simulation::platform<1, 1> platform(&gmioInPL, &gmioOutPL);
mygraph_pl gr_pl;
adf::connect<> c0(platform.src[0], gr_pl.inMem);
adf::connect<> c1(gr_pl.inoutMem, platform.sink[0]);

const int ITERATION = 8;
const int BLOCK_SIZE_in_Bytes = 2048;

int main(int argc, char** argv) {
    gr_pl.init();
    int error = 0;
    int32* inMem;
    int32* outMem;
    inMem = (int32*)GMIO::malloc(BLOCK_SIZE_in_Bytes);
    outMem = (int32*)GMIO::malloc(BLOCK_SIZE_in_Bytes);

    // Pre-processing: initialize the input buffer
    for (int j = 0; j < ITERATION * 256 / sizeof(int32); j++) {
        inMem[j] = j;
    }

    std::cout << "Graph PL run start" << std::endl;
    gmioInPL.pl_gm(inMem, 2048);
    gmioOutPL.pl_gm(outMem, 2048);

    // Update the asynchronous RTPs: 128 x 128-bit words = 2048 bytes
    gr_pl.update(gr_pl.k_mm2s.in[1], 128);
    gr_pl.update(gr_pl.k_s2mm.in[1], 128);
    std::cout << "GMIO::pl_gm enqueuing completed" << std::endl;

    gr_pl.run(ITERATION);
    gr_pl.wait();
    std::cout << "Graph PL run end" << std::endl;

    // Post-processing: check the output buffer
    for (int j = 0; j < ITERATION * 256 / sizeof(int32); j++) {
        if (outMem[j] != j + 1) {
            std::cout << "ERROR:dout[" << j << "]=" << outMem[j] << std::endl;
            error++;
        }
    }

    GMIO::free(inMem);
    GMIO::free(outMem);
    std::cout << "GMIO::free completed" << std::endl;

    if (error == 0) {
        std::cout << "PASS!" << std::endl;
    } else {
        std::cout << "ERROR!" << std::endl;
    }
    return error;
}
In the preceding host code, the asynchronous size parameters of the MM2S and S2MM kernels are updated with gr_pl.update() after the pl_gm APIs; because the ports are asynchronous, a single update before graph::run() is sufficient.
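For reference, a hedged sketch of the compile and AI Engine simulator commands for this flow follows; the platform path is a placeholder, and a real project typically needs additional options.
aiecompiler --pl-axi-lite=true --platform=<platform>.xpfm graph.cpp
aiesimulator --pkg-dir=./Work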