Example Usage - 2024.2 English - XD160

Vitis Libraries

Document ID: XD160
Release Date: 2024-11-29
Version: 2024.2 English

The L2 APIs can be found at Vitis_Libraries/graph/L2/include. Typical code for calling an L2 API might look like this:

extern "C" void shortestPath_top(ap_uint<32>* config,
                                 ap_uint<512>* offset,
                                 ap_uint<512>* column,
                                 ap_uint<512>* weight,

                                 ap_uint<512>* ddrQue512,
                                 ap_uint<32>* ddrQue,

                                 ap_uint<512>* result512,
                                 ap_uint<32>* result,
                                 ap_uint<512>* pred512,
                                 ap_uint<32>* pred,
                                 ap_uint<8>* info) {
   const int depth_E = E; // E: compile-time constant for the number of edges in the testcase
   const int depth_V = V; // V: compile-time constant for the number of vertices in the testcase

#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \
   32 max_write_burst_length = 2 max_read_burst_length = 8 bundle = gmem0 port = config depth = 4
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \
   32 max_write_burst_length = 2 max_read_burst_length = 8 bundle = gmem0 port = offset depth = depth_V
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \
   32 max_write_burst_length = 2 max_read_burst_length = 32 bundle = gmem1 port = column depth = depth_E
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \
   32 max_write_burst_length = 2 max_read_burst_length = 32 bundle = gmem2 port = weight depth = depth_E
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   1 max_write_burst_length = 2 max_read_burst_length = 2 bundle = gmem3 port = ddrQue depth = depth_E*16
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   1 max_write_burst_length = 2 max_read_burst_length = 2 bundle = gmem3 port = ddrQue512 depth = depth_E
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   32 max_write_burst_length = 64 max_read_burst_length = 2 bundle = gmem4 port = result512 depth = depth_V
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   32 max_write_burst_length = 64 max_read_burst_length = 2 bundle = gmem4 port = info depth = 8
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   32 max_write_burst_length = 64 max_read_burst_length = 2 bundle = gmem4 port = result depth = depth_V*16
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = gmem5 port = pred512 depth = depth_V
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = gmem5 port = pred depth = depth_V*16

   xf::graph::singleSourceShortestPath<32, MAXOUTDEGREE>(config, offset, column, weight, ddrQue512, ddrQue, result512,
                                                         result, pred512, pred, info);
}

This kernel wraps the L2 API and is what the host-side APIs in Vitis_Libraries/graph/L3/lib eventually invoke. The HLS interface pragmas used in the kernel are shown again below:

#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \
   32 max_write_burst_length = 2 max_read_burst_length = 8 bundle = gmem0 port = config depth = 4
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \
   32 max_write_burst_length = 2 max_read_burst_length = 8 bundle = gmem0 port = offset depth = depth_V
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \
   32 max_write_burst_length = 2 max_read_burst_length = 32 bundle = gmem1 port = column depth = depth_E
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 1 num_read_outstanding = \
   32 max_write_burst_length = 2 max_read_burst_length = 32 bundle = gmem2 port = weight depth = depth_E
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   1 max_write_burst_length = 2 max_read_burst_length = 2 bundle = gmem3 port = ddrQue depth = depth_E*16
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   1 max_write_burst_length = 2 max_read_burst_length = 2 bundle = gmem3 port = ddrQue512 depth = depth_E
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   32 max_write_burst_length = 64 max_read_burst_length = 2 bundle = gmem4 port = result512 depth = depth_V
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   32 max_write_burst_length = 64 max_read_burst_length = 2 bundle = gmem4 port = info depth = 8
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   32 max_write_burst_length = 64 max_read_burst_length = 2 bundle = gmem4 port = result depth = depth_V*16
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = gmem5 port = pred512 depth = depth_V
#pragma HLS INTERFACE m_axi offset = slave latency = 32 num_write_outstanding = 32 num_read_outstanding = \
   1 max_write_burst_length = 64 max_read_burst_length = 2 bundle = gmem5 port = pred depth = depth_V*16

These are the HLS interface pragmas. They configure the memory interfaces of the FPGA binary and might vary with the target Alveo board. For more information about these pragmas, refer to the Vitis HLS interface pragma documentation.
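As a reading aid, the first of these pragmas can be unpacked option by option (the values shown are the ones used for the config port above):

// m_axi                       : map the port to an AXI4 master interface
// offset = slave              : the buffer's base address is passed in through the s_axilite control interface
// latency = 32                : expected memory latency used by the scheduler
// num_write_outstanding = 1   : at most one write request in flight
// num_read_outstanding = 32   : up to 32 read requests in flight
// max_write_burst_length = 2  : write bursts of at most 2 beats
// max_read_burst_length = 8   : read bursts of at most 8 beats
// bundle = gmem0              : share one AXI interface (gmem0) with the other ports in the same bundle
// depth = 4                   : number of elements assumed for C simulation and co-simulation
#pragma HLS INTERFACE m_axi port = config offset = slave latency = 32 num_write_outstanding = 1 \
    num_read_outstanding = 32 max_write_burst_length = 2 max_read_burst_length = 8 bundle = gmem0 depth = 4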

The steps to compile the C/C++ code into FPGA binaries are captured in the Makefile of each testcase. There are generally two steps (example commands are sketched after this list):

  1. v++ --compile compiles the C/C++ code into RTL; a .xo file is generated in this step.
  2. v++ --link links the .xo file into the FPGA binary; a .xclbin file is generated in this step.
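As a minimal sketch (the file names and the PLATFORM variable are placeholders, not taken from the testcase; the actual values are set by the Makefile), the two steps look like this:

# Step 1: compile the HLS C/C++ kernel into a .xo object
v++ --compile --target hw --platform $(PLATFORM) --kernel shortestPath_top \
    -o shortestPath_top.xo shortestPath_top.cpp

# Step 2: link the .xo object into a .xclbin FPGA binary
v++ --link --target hw --platform $(PLATFORM) -o shortestPath_top.xclbin shortestPath_top.xo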

For more information about compiling HLS code, refer to the Vitis documentation.

The host code that uses the FPGA binaries is usually C/C++ with OpenCL APIs and typically contains the following steps:

  1. Create the OpenCL platform and kernels
cl_int fail; // error code returned by the OpenCL calls below
std::vector<cl::Device> devices = xcl::get_xil_devices();
cl::Device device = devices[0];
cl::Context context(device, NULL, NULL, NULL, &fail);
cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &fail);
cl::Program::Binaries xclBins = xcl::import_binary_file(xclbin_path);
devices.resize(1);
cl::Program program(context, devices, xclBins, NULL, &fail);
cl::Kernel shortestPath;
shortestPath = cl::Kernel(program, "shortestPath_top", &fail);
  2. Create cl::Buffer objects and decide which data needs to be transferred to the FPGA device and back to the host machine (the mext_o extension pointers used here are sketched after the code).
std::vector<cl::Memory> ob_in;
cl::Buffer offset_buf = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE,
                        sizeof(ap_uint<32>) * (numVertices + 1), &mext_o[0]);
ob_in.push_back(offset_buf);

std::vector<cl::Memory> ob_out;
cl::Buffer result_buf = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE,
                        sizeof(float) * ((numVertices + 1023) / 1024) * 1024, &mext_o[6]);
ob_out.push_back(result_buf);
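The mext_o entries used above are Xilinx OpenCL extension pointers (cl_mem_ext_ptr_t, declared in CL/cl_ext_xilinx.h) that attach each cl::Buffer to its host pointer and, optionally, to a specific memory bank. A minimal sketch, assuming host arrays named offset32 and result32 (illustrative names, not taken from the testcase):

cl_mem_ext_ptr_t mext_o[7];
// mext_o[0] backs offset_buf (the 'offset' kernel port).
mext_o[0].flags = 0;      // or, e.g., XCL_MEM_DDR_BANK0 to pin the buffer to a DDR bank
mext_o[0].obj = offset32; // host pointer used together with CL_MEM_USE_HOST_PTR
mext_o[0].param = 0;
// mext_o[6] backs result_buf (the 'result' kernel port).
mext_o[6].flags = 0;
mext_o[6].obj = result32;
mext_o[6].param = 0;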
  3. Set the arguments of the FPGA OpenCL kernel
int j = 0; // index of the next kernel argument
shortestPath.setArg(j++, config_buf);
shortestPath.setArg(j++, offset_buf);
shortestPath.setArg(j++, column_buf);
shortestPath.setArg(j++, weight_buf);
shortestPath.setArg(j++, ddrQue_buf); // the same buffer is bound to both the 512-bit
shortestPath.setArg(j++, ddrQue_buf); // and the 32-bit view of the kernel port
shortestPath.setArg(j++, result_buf);
shortestPath.setArg(j++, result_buf);
shortestPath.setArg(j++, pred_buf);
shortestPath.setArg(j++, pred_buf);
shortestPath.setArg(j++, info_buf);
  4. Set up event dependencies
std::vector<cl::Event> events_write(1);
std::vector<cl::Event> events_kernel(1);
std::vector<cl::Event> events_read(1);

q.enqueueMigrateMemObjects(ob_in, 0, nullptr, &events_write[0]);  // Transfer Host data to Device
q.enqueueTask(shortestPath, &events_write, &events_kernel[0]); // execution of the OpenCL kernels (FPGA binaries)
q.enqueueMigrateMemObjects(ob_out, CL_MIGRATE_MEM_OBJECT_HOST, &events_kernel, &events_read[0]); // Transfer Device data to Host
  5. Run the OpenCL tasks and execute the FPGA binaries
q.finish(); // block until all enqueued transfers and kernel executions have completed