Use the DATAFLOW pragma to schedule the three functions inp_A, inp_B, and out_C concurrently.
// dma_hls: top-level HLS kernel that forwards its stream ports and size
// arguments to three sub-functions (inp_A, inp_B, out_C) inside a DATAFLOW
// region and returns an error counter.
//
// Parameters:
//   strmOut_to_A0..A7    eight ap_axiu<128, 0, 0, 0> streams passed to inp_A
//   strmOut_to_B0..B23   twenty-four ap_axiu<128, 0, 0, 0> streams passed to inp_B
//   strmInp_from_C0..C2  three ap_axiu<128, 0, 0, 0> streams passed to out_C
//   matSz_A, matSz_B, matSz_C
//                        32-bit values forwarded unchanged to inp_A, inp_B and
//                        out_C respectively
//
// Returns:
//   errCnt (21-bit unsigned, initialised to 0 here) converted to int.
//
// NOTE(review): the port names suggest A/B streams are written and C streams
// are read, but the actual direction is decided inside inp_A / inp_B / out_C,
// which are not visible here — confirm against their definitions.
int dma_hls(
// Streams handed to inp_A.
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_A0,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_A1,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_A2,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_A3,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_A4,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_A5,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_A6,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_A7,
// Streams handed to inp_B.
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_B0,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_B1,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_B2,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_B3,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_B4,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_B5,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_B6,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_B7,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_B8,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_B9,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_B10,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_B11,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_B12,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_B13,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_B14,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_B15,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_B16,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_B17,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_B18,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_B19,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_B20,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_B21,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_B22,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmOut_to_B23,
// Streams handed to out_C.
hls::stream<ap_axiu<128, 0, 0, 0>> &strmInp_from_C0,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmInp_from_C1,
hls::stream<ap_axiu<128, 0, 0, 0>> &strmInp_from_C2,
// Scalar size arguments, one per sub-function.
ap_int<32> matSz_A, ap_int<32> matSz_B, ap_int<32> matSz_C
)
{
// Every stream argument is given the "axis" interface mode.
#pragma HLS INTERFACE axis port=strmOut_to_A0
#pragma HLS INTERFACE axis port=strmOut_to_A1
#pragma HLS INTERFACE axis port=strmOut_to_A2
#pragma HLS INTERFACE axis port=strmOut_to_A3
#pragma HLS INTERFACE axis port=strmOut_to_A4
#pragma HLS INTERFACE axis port=strmOut_to_A5
#pragma HLS INTERFACE axis port=strmOut_to_A6
#pragma HLS INTERFACE axis port=strmOut_to_A7
#pragma HLS INTERFACE axis port=strmOut_to_B0
#pragma HLS INTERFACE axis port=strmOut_to_B1
#pragma HLS INTERFACE axis port=strmOut_to_B2
#pragma HLS INTERFACE axis port=strmOut_to_B3
#pragma HLS INTERFACE axis port=strmOut_to_B4
#pragma HLS INTERFACE axis port=strmOut_to_B5
#pragma HLS INTERFACE axis port=strmOut_to_B6
#pragma HLS INTERFACE axis port=strmOut_to_B7
#pragma HLS INTERFACE axis port=strmOut_to_B8
#pragma HLS INTERFACE axis port=strmOut_to_B9
#pragma HLS INTERFACE axis port=strmOut_to_B10
#pragma HLS INTERFACE axis port=strmOut_to_B11
#pragma HLS INTERFACE axis port=strmOut_to_B12
#pragma HLS INTERFACE axis port=strmOut_to_B13
#pragma HLS INTERFACE axis port=strmOut_to_B14
#pragma HLS INTERFACE axis port=strmOut_to_B15
#pragma HLS INTERFACE axis port=strmOut_to_B16
#pragma HLS INTERFACE axis port=strmOut_to_B17
#pragma HLS INTERFACE axis port=strmOut_to_B18
#pragma HLS INTERFACE axis port=strmOut_to_B19
#pragma HLS INTERFACE axis port=strmOut_to_B20
#pragma HLS INTERFACE axis port=strmOut_to_B21
#pragma HLS INTERFACE axis port=strmOut_to_B22
#pragma HLS INTERFACE axis port=strmOut_to_B23
#pragma HLS INTERFACE axis port=strmInp_from_C0
#pragma HLS INTERFACE axis port=strmInp_from_C1
#pragma HLS INTERFACE axis port=strmInp_from_C2
// The three size scalars and the return value share one s_axilite bundle
// named "control".
#pragma HLS INTERFACE s_axilite port=matSz_A bundle=control
#pragma HLS INTERFACE s_axilite port=matSz_B bundle=control
#pragma HLS INTERFACE s_axilite port=matSz_C bundle=control
// Disabled: this function has no iterCnt parameter.
//#pragma HLS INTERFACE s_axilite port=iterCnt bundle=control
#pragma HLS INTERFACE s_axilite port=return bundle=control
// Task-level pipelining so that the inp_A, inp_B and out_C calls below can
// overlap in execution.
#pragma HLS DATAFLOW
// Error counter passed to out_C and returned to the caller.
// NOTE(review): presumably out_C takes this by reference and updates it —
// confirm against out_C's signature.
ap_uint<21> errCnt = 0;
// All-zero 128-bit reference word, passed to out_C as its last argument.
ap_uint<128> goldenVal_ZP = ap_uint<128> \
("0x00000000000000000000000000000000", 16);
// Compile-time selection of the 128-bit reference word by GEMM_SIZE.
// Each literal repeats a single 16-bit pattern eight times, and that pattern
// equals 2 * GEMM_SIZE (0x0040 for 32, 0x0080 for 64, ... 0x0800 for 1024).
// NOTE(review): presumably the expected value of each 16-bit output element —
// confirm against out_C.
// There is no #else branch: for any other GEMM_SIZE (or if it is undefined)
// goldenVal is not declared and the out_C call below does not compile.
#if GEMM_SIZE == 32
ap_uint<128> goldenVal = ap_uint<128> \
("0x00400040004000400040004000400040", 16);
#elif GEMM_SIZE == 64
ap_uint<128> goldenVal = ap_uint<128> \
("0x00800080008000800080008000800080", 16);
#elif GEMM_SIZE == 128
ap_uint<128> goldenVal = ap_uint<128> \
("0x01000100010001000100010001000100", 16);
#elif GEMM_SIZE == 256
ap_uint<128> goldenVal = ap_uint<128> \
("0x02000200020002000200020002000200", 16);
#elif GEMM_SIZE == 512
ap_uint<128> goldenVal = ap_uint<128> \
("0x04000400040004000400040004000400", 16);
#elif GEMM_SIZE == 1024
ap_uint<128> goldenVal = ap_uint<128> \
("0x08000800080008000800080008000800", 16);
#endif
// Task 1: receives the eight A streams and matSz_A.
inp_A(strmOut_to_A0, strmOut_to_A1, strmOut_to_A2, strmOut_to_A3, strmOut_to_A4, strmOut_to_A5, strmOut_to_A6, strmOut_to_A7,
matSz_A);
// Task 2: receives the twenty-four B streams and matSz_B.
inp_B(strmOut_to_B0, strmOut_to_B1, strmOut_to_B2, strmOut_to_B3,
strmOut_to_B4, strmOut_to_B5, strmOut_to_B6, strmOut_to_B7,
strmOut_to_B8, strmOut_to_B9, strmOut_to_B10, strmOut_to_B11,
strmOut_to_B12, strmOut_to_B13, strmOut_to_B14, strmOut_to_B15,
strmOut_to_B16, strmOut_to_B17, strmOut_to_B18, strmOut_to_B19,
strmOut_to_B20, strmOut_to_B21, strmOut_to_B22, strmOut_to_B23,
matSz_B);
// Task 3: receives the three C streams, matSz_C, the error counter and both
// reference words.
out_C(strmInp_from_C0, strmInp_from_C1, strmInp_from_C2, matSz_C, errCnt, goldenVal, goldenVal_ZP);
// ap_uint<21> is implicitly converted to the int return type.
return errCnt;
}
The dma_hls kernel also specifies HLS pragmas to help optimize the kernel code and adhere to interface protocols. Refer to this page for detailed documentation of all HLS pragmas. The following table provides a summary of the HLS pragmas used in the kernel.
| Switch | Description |
|---|---|
| `#pragma HLS INTERFACE` | In C/C++ code, all input and output operations happen in zero time through formal function arguments. In an RTL design, these same input and output operations must happen through a port in the design interface. These operations typically use a specific input/output (I/O) protocol. For more information, refer to this page. |
| `#pragma HLS PIPELINE II=1` | Reduces the initiation interval (II) for a function or loop by allowing the concurrent execution of operations. |
| `#pragma HLS dataflow` | The DATAFLOW pragma enables task-level pipelining. It allows functions and loops to overlap in their operation. This increases the concurrency of the RTL implementation and increases the overall throughput of the design. Refer to this page for more information. |
| `#pragma HLS loop_tripcount` | When manually applied to a loop, specifies the total number of iterations performed by a loop. The