For streaming input and output interfaces, when performance is limited by the number of streams, the AI Engine can use two streaming inputs or two streaming outputs in parallel instead of a single streaming input or output.
To use two parallel streams, the following macros can be used, where idx1 and idx2 are the two streams.
Add the __restrict keyword to the stream ports to ensure they are optimized for parallel processing.
// Signatures of the stream macros. The first argument names a stream resource
// (SS_rsrc1 / SS_rsrc2 on reads, MS_rsrc1 / MS_rsrc2 on writes); idx1 and idx2
// are the two streams being accessed.
int READINCR(SS_rsrc1, input_stream<T> *idx1)
int READINCR(SS_rsrc2, input_stream<T> *idx2)
void WRITEINCR(MS_rsrc1, output_stream<T> *idx1, int val)
void WRITEINCR(MS_rsrc2, output_stream<T> *idx2, int val)
// The *T variants take an additional tlast argument: a bool reference on
// reads, an int value on writes.
int READINCRT(SS_rsrc1, input_stream<T> *idx1, bool &tlast)
int READINCRT(SS_rsrc2, input_stream<T> *idx2, bool &tlast)
void WRITEINCRT(MS_rsrc1, output_stream<T> *idx1, int val, int tlast)
void WRITEINCRT(MS_rsrc2, output_stream<T> *idx2, int val, int tlast)
Alternatively, the following wide variants of the macros transfer 128 bits every four cycles:
// Signatures of the wide stream macros. Reads return a four-lane int32 vector;
// the first argument names a wide stream resource (WSS_rsrc1 / WSS_rsrc2 on
// reads, WMS_rsrc1 / WMS_rsrc2 on writes).
aie::vector<int32,4> READINCRW(WSS_rsrc1, input_stream<T> *idx1)
aie::vector<int32,4> READINCRW(WSS_rsrc2, input_stream<T> *idx2)
void WRITEINCRW(WMS_rsrc1, output_stream<T> *idx1, v4int32 val)
void WRITEINCRW(WMS_rsrc2, output_stream<T> *idx2, v4int32 val)
// The *T variants take an additional tlast argument: a bool reference on
// reads, an int value on writes.
aie::vector<int32,4> READINCRWT(WSS_rsrc1, input_stream<T> *idx1, bool &tlast)
aie::vector<int32,4> READINCRWT(WSS_rsrc2, input_stream<T> *idx2, bool &tlast)
void WRITEINCRWT(WMS_rsrc1, output_stream<T> *idx1, v4int32 val, int tlast)
void WRITEINCRWT(WMS_rsrc2, output_stream<T> *idx2, v4int32 val, int tlast)
The following sample code uses two parallel input streams to achieve pipelining with an interval of 1, meaning that two reads, one write, and one add are performed in every cycle.
// Reads one int32 word from each of the two input streams per iteration and
// writes their sum to the output stream, for 1024 iterations.
void simple(input_stream_int32* __restrict data0,
            input_stream_int32* __restrict data1,
            output_stream_int32* __restrict out)
{
    for (int iter = 0; iter < 1024; ++iter)
        chess_prepare_for_pipelining
    {
        // The two reads name different stream resources (SS_rsrc1 / SS_rsrc2).
        int32_t lhs = READINCR(SS_rsrc1, data0);
        int32_t rhs = READINCR(SS_rsrc2, data1);
        WRITEINCR(MS_rsrc1, out, lhs + rhs);
    }
}
The following vectorized version reads the two streams in parallel to achieve the best performance of the loop:
// Multiplies two int8 input streams lane by lane. Each iteration performs one
// wide read per input, each through a different wide stream resource.
void vect_mul(input_stream<int8>* __restrict data1,
              input_stream<int8>* __restrict data2,
              output_stream<int8>* __restrict out)
{
    // 2^20 loop iterations.
    for (int iter = 0; iter < (1 << 20); ++iter)
        chess_prepare_for_pipelining
    {
        // Each wide read delivers four int32 lanes.
        aie::vector<int32,4> raw_a = READINCRW(WSS_rsrc1, data1);
        aie::vector<int32,4> raw_b = READINCRW(WSS_rsrc2, data2);

        // View the same data as sixteen int8 lanes before multiplying.
        aie::vector<int8,16> lanes_a = raw_a.cast_to<int8>();
        aie::vector<int8,16> lanes_b = raw_b.cast_to<int8>();

        auto product = aie::mul(lanes_a, lanes_b);
        writeincr(out, product.to_vector<int8>(0));
    }
}
Another way to guide the tool to use parallel streams is to use the aie_stream_resource_in and aie_stream_resource_out annotations with different enumeration values, such as aie_stream_resource_in::a and aie_stream_resource_in::b for input streams. For
example:
// Multiplies two int8 input streams lane by lane. The two reads are annotated
// with different aie_stream_resource_in values (a and b); every write is
// annotated with aie_stream_resource_out::a.
void vect_mul(input_stream<int8>* __restrict data1,
              input_stream<int8>* __restrict data2,
              output_stream<int8>* __restrict out)
{
    while (true)
        chess_prepare_for_pipelining
        chess_loop_range(8,)
    {
        // The loop body handles two 16-lane vectors per iteration.
        aie::vector<int8,16> va_int8 = readincr_v<16, aie_stream_resource_in::a>(data1);
        aie::vector<int8,16> vb_int8 = readincr_v<16, aie_stream_resource_in::b>(data2);
        auto vc = aie::mul(va_int8, vb_int8);
        writeincr<aie_stream_resource_out::a>(out, vc.to_vector<int8>(0));

        va_int8 = readincr_v<16, aie_stream_resource_in::a>(data1);
        vb_int8 = readincr_v<16, aie_stream_resource_in::b>(data2);
        vc = aie::mul(va_int8, vb_int8);
        // Same output resource annotation as the first write.
        writeincr<aie_stream_resource_out::a>(out, vc.to_vector<int8>(0));
    }
}
Similarly, aie_stream_resource_out::a and aie_stream_resource_out::b can be used to denote two parallel output streams.