The first processing loop of the kernel is shown below. Note that iterators are defined for work buffers specified in data memory to hold intermediate results. The processing loop reads input values for softmax computation and stores them to memory while searching for the maximum input value.
// work buffers in data memory
auto pWbufA = aie::begin_restrict_vector<8>(wbufA);
auto pWbufB = aie::begin_restrict_vector<8>(wbufB);
auto pWbufC = aie::begin_restrict_vector<8>(wbufC);
auto pWbufD = aie::begin_restrict_vector<8>(wbufD);
// read and store data while searching for maximum value
float max_val = -2 ^ 127;
for (unsigned i=0; i < BUFFSZ/8; i++)
chess_prepare_for_pipelining
chess_loop_count(BUFFSZ/8)
{
aie::vector<float,8> vin = aie::vector_cast<float>(readincr_v<8>(in));
float vmax = aie::reduce_max(vin);
if (vmax > max_val) {
max_val = vmax;
}
*pWbufB++ = vin;
}
pWbufB -= (BUFFSZ/8);
chess_separator();
The next segment of kernel code, as shown below, is comprised of the first three computational loops used to evaluate the exponential function of all inputs. The first loop subtracts the maximum value from all inputs then multiplies the result by a scale factor of $\log_{2}e\(, which is equivalent to \)\frac{1}{\log(2)}$. The scale factor is defined in the kernel header file. The second loop computes floor() of the scaled values. Since conversion to fixed-point utilizes a rounding function, a value of 0.5 is subtracted first, to make the output equivalent to floor(). One caveat is that this does not compute floor() for very small, negative values. The third loop computes the fractional value $x - \lfloor x \rfloor$. Iterators for data memory are reset after each loop to prepare them for follow-on processing.
/****** Start of computation of exponential function of all input values ******/
// subtract maximum value from all input values and scale result by log2(e)
aie::accum<accfloat,8> acc_init;
acc_init.from_vector(aie::broadcast<float,8>(-log2e*max_val));
for (unsigned i=0; i < BUFFSZ/8; i++)
chess_prepare_for_pipelining
chess_loop_count(BUFFSZ/8)
{
*pWbufA++ = aie::mac(acc_init, *pWbufB++, log2e);
}
pWbufA -= (BUFFSZ/8);
pWbufB -= (BUFFSZ/8);
chess_separator();
// compute integer part of scaled inputs, equivalent to floor()
// Note: Not strictly a floor(), fails for negative numbers with very small magnitudes
for (unsigned i=0; i < BUFFSZ/8; i++)
chess_prepare_for_pipelining
chess_loop_count(BUFFSZ/8)
{
aie::vector<float,8> vecA = aie::vector_cast<float>(*pWbufA++);
aie::vector<float,8> xshft = aie::sub(vecA, 0.5f);
aie::vector<int32,8> xfloor = aie::to_fixed(xshft,0);
*pWbufC++ = aie::to_float(xfloor);
}
pWbufA -= (BUFFSZ/8);
pWbufC -= (BUFFSZ/8);
chess_separator();
// compute the fractional part of scaled input
for (unsigned i=0; i < BUFFSZ/8; i++)
chess_prepare_for_pipelining
chess_loop_count(BUFFSZ/8)
{
*pWbufB++ = aie::sub(*pWbufA++, *pWbufC++);
}
pWbufA -= (BUFFSZ/8);
pWbufB -= (BUFFSZ/8);
pWbufC -= (BUFFSZ/8);
chess_separator();
The next four loops are used to evaluate the correction polynomial using Horner’s method. Polynomial coefficients are specified in the kernel header file.
/****** Start of correction polynomial computation ******/
// using Horner's method, break polynomial evaluation into separate loops for improved pipelining
// polynomial loop 1
aie::accum<accfloat,8> p_acc;
p_acc.from_vector(aie::broadcast<float,8>(p3));
for (unsigned i=0; i < BUFFSZ/8; i++)
chess_prepare_for_pipelining
chess_loop_count(BUFFSZ/8)
{
*pWbufC++ = aie::mac(p_acc, *pWbufB++, p4);
}
pWbufB -= (BUFFSZ/8);
pWbufC -= (BUFFSZ/8);
chess_separator();
// polynomial loop 2
p_acc.from_vector(aie::broadcast<float,8>(p2));
for (unsigned i=0; i < BUFFSZ/8; i++)
chess_prepare_for_pipelining
chess_loop_count(BUFFSZ/8)
{
*pWbufD++ = aie::mac(p_acc, *pWbufB++, *pWbufC++);
}
pWbufB -= (BUFFSZ/8);
pWbufC -= (BUFFSZ/8);
pWbufD -= (BUFFSZ/8);
chess_separator();
// polynomial loop 3
p_acc.from_vector(aie::broadcast<float,8>(p1));
for (unsigned i=0; i < BUFFSZ/8; i++)
chess_prepare_for_pipelining
chess_loop_count(BUFFSZ/8)
{
*pWbufC++ = aie::mac(p_acc, *pWbufB++, *pWbufD++);
}
pWbufB -= (BUFFSZ/8);
pWbufC -= (BUFFSZ/8);
pWbufD -= (BUFFSZ/8);
chess_separator();
// polynomial loop 4
p_acc.from_vector(aie::broadcast<float,8>(p0));
for (unsigned i=0; i < BUFFSZ/8; i++)
chess_prepare_for_pipelining
chess_loop_count(BUFFSZ/8)
{
*pWbufD++ = aie::mac(p_acc, *pWbufB++, *pWbufC++);
}
pWbufB -= (BUFFSZ/8);
pWbufC -= (BUFFSZ/8);
pWbufD -= (BUFFSZ/8);
/****** End of correction polynomial computation ******/
chess_separator();
Three more loops are used to complete evaluation of the exponential function of the inputs, as shown below. The first loop subtracts the correction polynomial result from the scaled input. The next loop scales and translates the result so that the exponent is aligned with IEEE 754 format and has the proper offset. Parameters used in computation are specified in the kernel header file. Finally, the integer part of the result is extracted and reinterpreted as a floating-point number. Before reinterpreting the number as floating-point, some checking is performed to ensure the int32 value is reasonable to represent a float in the range $[0, 1]$, and any outliers are set to zero.
// apply correction term to scaled input
for (unsigned i=0; i < BUFFSZ/8; i++)
chess_prepare_for_pipelining
chess_loop_count(BUFFSZ/8)
{
*pWbufB++ = aie::sub(*pWbufA++, *pWbufD++);
}
pWbufA -= (BUFFSZ/8);
pWbufB -= (BUFFSZ/8);
pWbufD -= (BUFFSZ/8);
chess_separator();
// convert results to IEEE 754 format - use 2 loops
aie::accum<accfloat,8> b_acc;
b_acc.from_vector(aie::broadcast<float,8>(exp_B));
for (unsigned i=0; i < BUFFSZ/8; i++)
chess_prepare_for_pipelining
chess_loop_count(BUFFSZ/8)
{
*pWbufC++ = aie::mac(b_acc,*pWbufB++,exp_S);
}
pWbufB -= (BUFFSZ/8);
pWbufC -= (BUFFSZ/8);
chess_separator();
for (unsigned i=0; i < BUFFSZ/8; i++)
chess_prepare_for_pipelining
chess_loop_count(BUFFSZ/8)
{
aie::vector<int32,8> exp_i = aie::to_fixed(*pWbufC++,0);
// integer values should be in the range [0, 1,065,353,216], find outliers and set to zero
aie::mask<8> msk_neg = aie::lt(exp_i,0);
aie::vector<int32,8> exp_bnd = aie::select(exp_i, aie::zeros<int32,8>(), msk_neg);
aie::mask<8> msk_pos = aie::gt(exp_bnd,1065353216);
exp_bnd = aie::select(exp_bnd, aie::zeros<int32,8>(), msk_pos);
*pWbufA++ = exp_bnd.cast_to<float>();
}
pWbufA -= (BUFFSZ/8);
pWbufC -= (BUFFSZ/8);
/****** End of computation of exponential functions of all input values ******/
chess_separator();
With the exponential function of all inputs computed, the softmax function is evaluated by the kernel code shown below. The first loop sums exponential values in individual vector lanes. Next, individual vector lanes are summed, and the scalar processor is invoked to compute a scale factor, which is the inverse of the sum. The final loop multiples all the exponential values by the scale factor and sends the result to output.
// accumulate all vectors to determine scale factor
auto vsum = aie::zeros<float,8>();
for (unsigned i=0; i < BUFFSZ/8; i++)
chess_prepare_for_pipelining
chess_loop_count(BUFFSZ/8)
{
vsum = aie::add(vsum, *pWbufA++);
}
pWbufA -= (BUFFSZ/8);
chess_separator();
// compute inverse
float scl_fctr = aie::inv(aie::reduce_add(vsum));
// scale values and write to output
for (unsigned i=0; i < BUFFSZ/8; i++)
chess_prepare_for_pipelining
chess_loop_count(BUFFSZ/8)
{
aie::vector<float,8> vout = aie::mul(*pWbufA++, scl_fctr);
writeincr(out, aie::vector_cast<int32>(vout));
}