The reload process requires the host application to use the AI Engine-only XCLBIN and PL-only XCLBIN
generated in the previous steps. Each XCLBIN file has a universally unique
identifier (UUID), which you can retrieve using the XRT
device.register_xclbin API.
XRT introduces a hardware context to represent an AI Engine partition or PL region in hardware. You can create the hardware context using the device and UUID. When creating buffers, PL kernel handles, or AI Engine graph handles, you can associate them with their corresponding hardware context to ensure proper isolation across hardware partitions.
The host application code includes the following operations:
- Open the device.
- Load the PL-only XCLBIN and retrieve the PL UUID. Then, create a hardware context for the PL region.
- Load the AI Engine-only XCLBIN and retrieve the AI Engine UUID. Create hardware contexts for AI Engine partitions as needed—multiple contexts can be created for different partitions.
- Create buffer objects, PL kernel handles, and graph handles. You can associate each object or handle with a specific hardware context.
- Pre-process the user input data.
- Operate on the created objects or handles during execution.
- Post-process the output data after execution.
Example host code is as follows:
// Include XRT headers
#include "xrt/xrt_kernel.h"
#include "xrt/xrt_graph.h"
#include "xrt/xrt_aie.h"
......
// Open the device; the XCLBINs are registered on this device handle below
auto device = xrt::device(0); //device index=0
// Open PL only xclbin and register it with the device to obtain its UUID
auto xclbin_pl = xrt::xclbin(xclbinFilename_pl);
auto uuid_pl = device.register_xclbin(xclbin_pl);
// Create hardware context for PL region
xrt::hw_context hwctx_pl{device, uuid_pl};
// Create hardware context for an AI Engine partition:
// open the AI Engine only xclbin, register it, and build the context from its UUID
auto xclbin_aie_pr2 = xrt::xclbin(xclbinFilename_aie_pr2);
auto uuid_aie_pr2 = device.register_xclbin(xclbin_aie_pr2);
xrt::hw_context hwctx_aie_pr2{device, uuid_aie_pr2};
std::cout<<"Creating context done"<<std::endl;
// s2mm & mm2s PL kernel handles, both created on the PL hardware context
// NOTE(review): "s2mm:{s2mm_2}" presumably selects compute unit s2mm_2 of kernel s2mm - confirm against the platform
auto s2mm = xrt::kernel(hwctx_pl, "s2mm:{s2mm_2}");
auto mm2s = xrt::kernel(hwctx_pl, "mm2s");
std::cout<<"PL kernel creation done"<<std::endl;
// output memory for PL kernel, allocated on the PL hardware context in the
// memory group that s2mm reports for its argument 0; host_out is the host mapping
auto out_bo = xrt::bo(hwctx_pl, output_size_in_bytes,static_cast<xrt::bo::flags>(0), s2mm.group_id(0));
auto host_out=out_bo.map<int*>();
// input memory for PL kernel, in the memory group that mm2s reports for its argument 0
// NOTE(review): the input buffer is sized with output_size_in_bytes; presumably
// input and output have the same size in this example - confirm
auto in_bo = xrt::bo(hwctx_pl, output_size_in_bytes,static_cast<xrt::bo::flags>(0), mm2s.group_id(0));
auto host_in=in_bo.map<int*>();
std::cout<<"In & Out buffer creation done"<<std::endl;
// Pre-process ......
//kernel run: invoking the kernel handle returns a run handle that is waited on below
auto s2mm_run = s2mm(out_bo, nullptr, OUTPUT_SIZE);//1st run for s2mm has started
auto mm2s_run = mm2s(in_bo, nullptr, OUTPUT_SIZE);
std::cout<<"PL kernel launching done "<<std::endl;
// Launch graph in the AI Engine partition (handle is created on the AI Engine hardware context)
auto ghdl=xrt::graph(hwctx_aie_pr2,"gr"); //"gr" is graph object name
ghdl.run(iterations);
std::cout<<"Graph run enqueue done"<<std::endl;
// Wait for PL kernels and graphs to be done
ghdl.wait();
std::cout<<"Graph done"<<std::endl;
// Both PL runs are waited on before the message below is printed,
// so "s2mm done" is reported only after mm2s has completed as well
s2mm_run.wait();
mm2s_run.wait();
std::cout<<"s2mm done"<<std::endl;
//sync output buffer from the device so that host_out can be read by the host
out_bo.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
//Post-process ......
When reloading an AI Engine partition, you can use either of the following:
- The same XCLBIN (same graph), or
- A different XCLBIN (different graph with the same AI Engine-PL and AI Engine-NoC interfaces).
Before reloading the AI Engine partition, ensure that the PL kernels connected to the partition are quiesced. Also ensure the kernels are in an idle state (that is, no data is transferring to the AI Engine partition).
Following are the two supported approaches for reloading AI Engine partitions:
- Exit and restart the application. This resets and reloads all AI Engine partitions.
- Reload the AI Engine partition without exiting the application. You can do this using C++
new and delete to explicitly allocate and destroy the hardware context (and any objects created on it) before each reload. Alternatively, use C++ local scoping to ensure that the AI Engine hardware context and its associated operations are automatically destroyed between reloads.
The following code is an example of using C++ new and delete to explicitly allocate
and destroy the hardware context between reloads:
auto device = xrt::device(0); //device index=0
std::cout<<"Open device successfully"<<std::endl;
std::string xclbinFilename = argv[1];
//xclbin related to partition 0
auto xclbin_1 = xrt::xclbin(xclbinFilename);
auto uuid1 = device.register_xclbin(xclbin_1);
xrt::hw_context *hwctx_1=new xrt::hw_context(device, uuid1);
std::cout<<"Load XCLBIN successfully"<<std::endl;
auto din_buffer = new xrt::aie::bo (*hwctx_1, BLOCK_SIZE_in_Bytes,xrt::bo::flags::normal, /*memory group*/0); //Only non-cacheable buffer is supported
int* dinArray= din_buffer->map<int*>();
auto dout_buffer = new xrt::aie::bo (*hwctx_1, BLOCK_SIZE_in_Bytes,xrt::bo::flags::normal, /*memory group*/0); //Only non-cacheable buffer is supported
int* doutArray= dout_buffer->map<int*>();
int* doutRef=(int*)malloc(BLOCK_SIZE_in_Bytes);
std::cout<<"Allocate buffer completed"<<std::endl;
//Pre-processing on input data
......
auto ghdl=new xrt::graph(*hwctx_1,"gr");
std::cout<<"Open partition 0 with graph1 successfully"<<std::endl;
xrt::aie::buffer *bufIn=new xrt::aie::buffer(*hwctx_1, "gr.gmioIn");
//memory group is 0, depending on the platform
bufIn->async(*din_buffer, XCL_BO_SYNC_BO_GMIO_TO_AIE, BLOCK_SIZE_in_Bytes, 0);
ghdl->run(ITERATION);
xrt::aie::buffer *bufOut=new xrt::aie::buffer(*hwctx_1, "gr.gmioOut");
//memory group is 0, depending on the platform
bufOut->async(*dout_buffer, XCL_BO_SYNC_BO_AIE_TO_GMIO, BLOCK_SIZE_in_Bytes, 0);
//PS can do other tasks here when data is transferring
std::cout<<"Waiting for graph to be completed"<<std::endl;
bufOut->wait();
std::cout<<"GMIO transactions finished"<<std::endl;
ghdl->end();
//Post-processing on output data
......
//delete the objects that is associated to the hw context, and then delete the hw context.
delete bufIn;
delete bufOut;
delete din_buffer;
delete dout_buffer;
delete ghdl;
delete hwctx_1;
std::cout<<"Make sure the context is deleted successfully before starting again."<<std::endl;
std::cout<<"reloading partition 0 with graph2"<<std::endl;
//xclbin related to partition 0
std::string xclbinFilename2 = argv[2];
std::cout<<"xclbin file name:"<<xclbinFilename2<<std::endl;
auto xclbin_2 = xrt::xclbin(xclbinFilename2);
auto uuid2 = device.register_xclbin(xclbin_2);
xrt::hw_context *hwctx_2=new xrt::hw_context(device, uuid2);
std::cout<<"Load XCLBIN2 successfully"<<std::endl;
//memory group is 0, depending on the platform
auto din_buffer2 = new xrt::aie::bo (*hwctx_2, BLOCK_SIZE_in_Bytes,xrt::bo::flags::normal, 0); //Only non-cacheable buffer is supported
int* dinArray2 = din_buffer2->map<int*>();
//memory group is 0, depending on the platform
auto dout_buffer2 = new xrt::aie::bo (*hwctx_2, BLOCK_SIZE_in_Bytes,xrt::bo::flags::normal, 0); //Only non-cacheable buffer is supported
int* doutArray2= dout_buffer2->map<int*>();
std::cout<<"Allocate buffer2 completed"<<std::endl;
//Pre-processing
......
auto ghdl2=new xrt::graph(*hwctx_2,"gr");
std::cout<<"Open pr0 graph2 successfully"<<std::endl;
xrt::aie::buffer *bufIn2=new xrt::aie::buffer(*hwctx_2, "gr.gmioIn");
bufIn2->async(*din_buffer2, XCL_BO_SYNC_BO_GMIO_TO_AIE, BLOCK_SIZE_in_Bytes, 0);
ghdl2->run(ITERATION);
xrt::aie::buffer *bufOut2=new xrt::aie::buffer(*hwctx_2, "gr.gmioOut");
bufOut2->async(*dout_buffer2, XCL_BO_SYNC_BO_AIE_TO_GMIO, BLOCK_SIZE_in_Bytes, 0);
std::cout<<"Waiting for graph to be completed"<<std::endl;
bufOut2->wait();
std::cout<<"GMIO transactions finished"<<std::endl;
ghdl2->end();
//Post-processing
......
delete bufIn2;
delete bufOut2;
delete din_buffer2;
delete dout_buffer2;
delete ghdl2;
delete hwctx_2;
The following code provides an example of leveraging C++ local scoping (where the sub-function serves as the local scope) between reloads:
#include <stdlib.h>
#include <fstream>
#include <iostream>
#include <unistd.h>
#include <math.h>
#include "xrt/xrt_kernel.h"
#include "xrt/xrt_graph.h"
// Number of graph iterations executed per run() call.
constexpr int ITERATION = 4;
// Number of 32-bit elements transferred per graph iteration
// (run() sizes its output buffer as ELEM_per_iter*4*ITERATION bytes).
constexpr int ELEM_per_iter = 256;
/**
 * Runs one complete load / execute / verify cycle.
 *
 * Registers the PL-only and the AI Engine-only XCLBIN on device 0, creates one
 * hardware context for each, starts the s2mm and datagen PL kernels, runs graph
 * "gr" for ITERATION iterations, updates the run-time parameter "gr.k.in[1]",
 * and compares the data captured by s2mm with the expected values.
 *
 * Every XRT object (device, hardware contexts, kernels, buffer, graph) is a
 * local variable, so all of them are destroyed when this function returns.
 * This function is the local scope that separates two reloads.
 *
 * @param xclbinFilename_pl   path of the PL-only XCLBIN
 * @param xclbinFilename_aie  path of the AI Engine-only XCLBIN
 * @param rtp_type            0: write the scalar 10 to "gr.k.in[1]";
 *                            1: write a 16-element array {1..16};
 *                            any other value: no update and no data check
 * @return 0 when every checked element matches, 1 on any mismatch
 * @note Errors raised by the XRT calls are not handled here; the caller's
 *       try/catch is expected to report them.
 */
int run(char* xclbinFilename_pl, char* xclbinFilename_aie, int rtp_type)
{
    // 4 bytes per element, ELEM_per_iter elements per iteration
    size_t output_size_in_bytes = ELEM_per_iter*4*ITERATION;

    // Open the device and register both XCLBINs; each gets its own hardware context
    auto dhdl = xrt::device(0); //device index=0
    auto xclbin_pl = xrt::xclbin(std::string(xclbinFilename_pl));
    auto uuid_pl = dhdl.register_xclbin(xclbin_pl);
    xrt::hw_context hwctx_pl{dhdl, uuid_pl};
    auto xclbin_aie = xrt::xclbin(std::string(xclbinFilename_aie));
    auto uuid_aie = dhdl.register_xclbin(xclbin_aie);
    xrt::hw_context hwctx_aie{dhdl, uuid_aie};

    // s2mm & datagen kernel handle
    auto s2mm = xrt::kernel(hwctx_pl, "s2mm:{s2mm_1}");
    auto datagen = xrt::kernel(hwctx_pl, "datagen");

    // output memory, placed in the memory group s2mm reports for its argument 0
    auto out_bo = xrt::bo(hwctx_pl, output_size_in_bytes,static_cast<xrt::bo::flags>(0),s2mm.group_id(0));
    auto host_out=out_bo.map<int*>();

    //kernel run
    auto s2mm_run = s2mm(out_bo, nullptr, ELEM_per_iter*ITERATION);//1st run for s2mm has started
    // NOTE(review): datagen_run is never waited on; presumably datagen has finished
    // once s2mm has received all elements - confirm before relying on it being idle
    auto datagen_run = datagen(nullptr, ELEM_per_iter*ITERATION,0);

    auto ghdl=xrt::graph(hwctx_aie,"gr");
    ghdl.run(ITERATION);

    // The run-time parameter is written after the graph has been started
    int value[16]={1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
    if(rtp_type==0)
        ghdl.update("gr.k.in[1]",10);
    else if(rtp_type==1)
        ghdl.update("gr.k.in[1]",value);
    ghdl.end();

    s2mm_run.wait();
    out_bo.sync(XCL_BO_SYNC_BO_FROM_DEVICE);

    // Verify: match stays 0 while all elements agree and becomes 1 on the first mismatch
    int match = 0;
    int base=10;
    for (int i = 0; i < ITERATION; i++)
    {
        for(int j=0;j<ELEM_per_iter;j++)
        {
            const int idx = ELEM_per_iter*i+j; // flat element index
            if(rtp_type==0)
            {
                // scalar RTP: expected value is base + element index
                if(host_out[idx]!=base+idx)
                    match=1;
            }else if(rtp_type==1)
            {
                // array RTP: expected value is value[j%16] + element index
                if(host_out[idx]!=value[j%16]+idx)
                    match=1;
            }
        }
    }
    return match;
}
/**
 * Test driver for the local-scoping reload example.
 *
 * Usage: <prog> <pl.xclbin> <aie_only.xclbin> [aie_only_second.xclbin]
 *
 * Always runs the first AI Engine XCLBIN with rtp_type 0. When a third file
 * argument is given, run() is called again with that XCLBIN and rtp_type 1;
 * all XRT objects of the first call have been destroyed by then because they
 * are local to run().
 *
 * @return EXIT_SUCCESS only when every executed run passed; EXIT_FAILURE on a
 *         data mismatch in any run, on wrong usage, or on an exception.
 */
int main(int argc, char* argv[])
{
    try
    {
        if(argc != 3 && argc != 4)
        {
            // The second AI Engine XCLBIN is optional, hence the square brackets
            std::cout << "Usage: " << argv[0] <<" <pl.xclbin> <aie_only.xclbin> [aie_only_second.xclbin]" << std::endl;
            return EXIT_FAILURE;
        }
        char* xclbinFilename_pl = argv[1];
        char* xclbinFilename_aie = argv[2];

        auto match = run(xclbinFilename_pl, xclbinFilename_aie,0);
        std::cout << "TEST PR1 Graph0 " << (match ? "FAILED" : "PASSED") << "\n";
        // Remember the first result so that a passing reload cannot mask it
        bool failed = (match != 0);

        if(argc==4)
        { //Do second xclbin reload
            xclbinFilename_aie = argv[3];
            match = run(xclbinFilename_pl, xclbinFilename_aie,1);
            std::cout << "TEST PR1 SECOND GRAPH " << (match ? "FAILED" : "PASSED") << "\n";
            failed = failed || (match != 0);
        }
        return (failed ? EXIT_FAILURE : EXIT_SUCCESS);
    }
    catch (std::exception const& e)
    {
        std::cout << "Exception: " << e.what() << "\n";
        std::cout << "FAILED TEST\n";
        return EXIT_FAILURE;
    }
}