As described in Synthesis Summary, Vitis HLS issues a report summarizing burst activities and also identifying burst failures. If bursts of variable lengths are done, then the report will mention that bursts of variable lengths were inferred. The compiler also provides burst messages that can be found in the compiler log, vitis_hls.log. These messages are issued before the scheduling step.
Simple Read/Write Burst Inference
INFO: [HLS 214-115] Burst read of variable length and bit width 32 has been inferred on port 'gmem'
INFO: [HLS 214-115] Burst write of variable length and bit width 32 has been inferred on port 'gmem' (./src/vadd.cpp:75:9).
The code for this example follows:
/****** BEGIN EXAMPLE *******/
#define DATA_SIZE 2048
// Define internal buffer max size
#define BURSTBUFFERSIZE 256
//TRIPCOUNT identifiers
const unsigned int c_min = 1;
const unsigned int c__max = BURSTBUFFERSIZE;
const unsigned int c_chunk_sz = DATA_SIZE;
extern "C" {
void vadd(int *a, int size, int inc_value) {
// Map pointer a to AXI4-master interface for global memory access
#pragma HLS INTERFACE mode=m_axi port=a offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256
// We also need to map a and return to a bundled axilite slave interface
#pragma HLS INTERFACE mode=s_axilite port=a bundle=control
#pragma HLS INTERFACE mode=s_axilite port=size bundle=control
#pragma HLS INTERFACE mode=s_axilite port=inc_value bundle=control
#pragma HLS INTERFACE mode=s_axilite port=return bundle=control
int burstbuffer[BURSTBUFFERSIZE];
// Per iteration of this loop perform BURSTBUFFERSIZE vector addition
for (int i = 0; i < size; i += BURSTBUFFERSIZE) {
#pragma HLS LOOP_TRIPCOUNT min=c_min*c_min max=c_chunk_sz*c_chunk_sz/(c_max*c_max)
int chunk_size = BURSTBUFFERSIZE;
//boundary checks
if ((i + BURSTBUFFERSIZE) > size)
chunk_size = size - i;
// memcpy creates a burst access to memory
// multiple calls of memcpy cannot be pipelined and will be scheduled sequentially
// memcpy requires a local buffer to store the results of the memory transaction
memcpy(burstbuffer, &a[i], chunk_size * sizeof(int));
// Calculate and write results to global memory, the sequential write in a for loop can be
// inferred as a memory burst access
calc_write:
for (int j = 0; j < chunk_size; j++) {
#pragma HLS LOOP_TRIPCOUNT min=c_size_max max=c_chunk_sz
#pragma HLS PIPELINE II=1
burstbuffer[j] = burstbuffer[j] + inc_value;
a[i + j] = burstbuffer[j];
}
}
}
Pipelining Between Bursts
for(int x=0; x < k; ++x) {
int off = f(x);
for(int i = 0; i < N; ++i) {
#pragma HLS PIPELINE II=1
... = gmem[off + i];
}
}
But notice that the outer loop is not pipelined. This means that while there is pipelining inside bursts, there won't be any pipelining between bursts.
for(int x=0; x < k; ++x) {
#pragma HLS PIPELINE II=N
int off = f(x);
for(int i = 0; i < N; ++i) {
#pragma HLS UNROLL
... = gmem[off + i];
}
}
Accessing Row Data from a Two-Dimensional Array
INFO: [HLS 214-115] Burst read of length 256 and bit width 512 has been inferred on port 'gmem' (./src/row_array_2d.cpp:43:5)
INFO: [HLS 214-115] Burst write of length 256 and bit width 512 has been inferred on port 'gmem' (./src/row_array_2d.cpp:56:5)
Notice that a bit width of 512 is achieved in this example. This is more efficient than the 32 bit width achieved in the simple example above. Bursting wider bit widths is another way bursts can be optimized as discussed in Automatic Port Width Resizing.
The code for this example follows:
/****** BEGIN EXAMPLE *******/
// Parameters Description:
// NUM_ROWS: matrix height
// WORD_PER_ROW: number of words in a row
// BLOCK_SIZE: number of words in an array
#define NUM_ROWS 64
#define WORD_PER_ROW 64
#define BLOCK_SIZE (WORD_PER_ROW*NUM_ROWS)
// Default datatype is integer
typedef int DTYPE;
typedef hls::stream<DTYPE> my_data_fifo;
// Read data function: reads data from global memory
void read_data(DTYPE *inx, my_data_fifo &inFifo) {
read_loop_i:
for (int i = 0; i < NUM_ROWS; ++i) {
read_loop_jj:
for (int jj = 0; jj < WORD_PER_ROW; ++jj) {
#pragma HLS PIPELINE II=1
inFifo << inx[WORD_PER_ROW * i + jj];
;
}
}
}
// Write data function - writes results to global memory
void write_data(DTYPE *outx, my_data_fifo &outFifo) {
write_loop_i:
for (int i = 0; i < NUM_ROWS; ++i) {
write_loop_jj:
for (int jj = 0; jj < WORD_PER_ROW; ++jj) {
#pragma HLS PIPELINE II=1
outFifo >> outx[WORD_PER_ROW * i + jj];
}
}
}
// Compute function is pretty simple because this example is focused on efficient
// memory access pattern.
void compute(my_data_fifo &inFifo, my_data_fifo &outFifo, int alpha) {
compute_loop_i:
for (int i = 0; i < NUM_ROWS; ++i) {
compute_loop_jj:
for (int jj = 0; jj < WORD_PER_ROW; ++jj) {
#pragma HLS PIPELINE II=1
DTYPE inTmp;
inFifo >> inTmp;
DTYPE outTmp = inTmp * alpha;
outFifo << outTmp;
}
}
}
extern "C" {
void row_array_2d(DTYPE *inx, DTYPE *outx, int alpha) {
// AXI master interface
#pragma HLS INTERFACE mode=m_axi port = inx offset = slave bundle = gmem
#pragma HLS INTERFACE mode=m_axi port = outx offset = slave bundle = gmem
// AXI slave interface
#pragma HLS INTERFACE mode=s_axilite port = inx bundle = control
#pragma HLS INTERFACE mode=s_axilite port = outx bundle = control
#pragma HLS INTERFACE mode=s_axilite port = alpha bundle = control
#pragma HLS INTERFACE mode=s_axilite port = return bundle = control
my_data_fifo inFifo;
// By default the FIFO depth is 2, user can change the depth by using
// #pragma HLS stream variable=inFifo depth=256
my_data_fifo outFifo;
// Dataflow enables task level pipelining, allowing functions and loops to execute
// concurrently. For more details please refer to UG902.
#pragma HLS DATAFLOW
// Read data from each row of 2D array
read_data(inx, inFifo);
// Do computation with the acquired data
compute(inFifo, outFifo, alpha);
// Write data to each row of 2D array
write_data(outx, outFifo);
return;
}
}