The following sample application demonstrates usage of the LPGEMM downscale API with multiple scale post-ops and int4-to-int8 B matrix reordering:
/*
$gcc test_lpgemm.c -o ./test_lpgemm.x -I/aocl-blis_install_directory/include/amdzen/
-L/aocl-blis_install_directory/lib/amdzen/ -lblis-mt -lm
Note: Export blis library path to LD_LIBRARY_PATH before running the
executable ./test_lpgemm.x
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "blis.h"
// aocl_gemm_u8s8s32os8 (A:uint8_t, B:int8_t, C:int8_t) used here.
// 3 post-ops - scale + matrix_add + scale used here.
int main()
{
dim_t m = 1024;
dim_t n = 1024;
dim_t k = 1024;
// Leading dimensions for row major matrices.
dim_t lda = k;
dim_t ldb = n;
dim_t ldc = n;
err_t err = BLIS_SUCCESS;
uint8_t *a = (uint8_t *)bli_malloc_user(sizeof(uint8_t) * m * k, &err);
if (err != BLIS_SUCCESS) { goto bailout; }
// int4_t B matrix represented using int8_t, but with half the int8_t size.
int8_t *b = (int8_t *)bli_malloc_user((sizeof(int8_t) * n * k) / 2, &err);
if (err != BLIS_SUCCESS) { goto bailout; }
int8_t *c = (int8_t *)bli_malloc_user(sizeof(int8_t) * m * n, &err);
if (err != BLIS_SUCCESS) { goto bailout; }
// Functions to fill the matrices with data can be added here.
int32_t alpha = 2;
int32_t beta = 9;
char storage = 'r'; // Row major. Use 'c' for column major.
char transa = 'n'; // No transpose. Transpose not supported.
char transb = 'n';
char reordera = 'n';
char reorderb = 'r';
// Initialize post-ops struct.
aocl_post_op *post_ops = NULL;
post_ops = (aocl_post_op *)bli_malloc_user(sizeof(aocl_post_op), &err);
if (err != BLIS_SUCCESS) { goto bailout; }
// Downscale parameters need to be passed as a post-op, even
// if a downscale specific api is invoked.
dim_t max_post_ops_seq_length = 3; // scale + matrix_add + scale
post_ops->seq_vector =
(AOCL_POST_OP_TYPE *) bli_malloc_user(
max_post_ops_seq_length * sizeof(AOCL_POST_OP_TYPE),
&err);
if (err != BLIS_SUCCESS) { goto bailout; }
// 2 scaling post-ops, first for normal scaling and second one for
// downscaling, need to allocate scale struct dynamically.
post_ops->sum =
bli_malloc_user(2 * sizeof(aocl_post_op_sum), &err);
if (err != BLIS_SUCCESS) { goto bailout; }
// For first scale, using scalar zero point and scale factor.
post_ops->seq_vector[0] = SCALE;
(post_ops->sum + 0)->is_power_of_2 = FALSE;
(post_ops->sum + 0)->buff = NULL;
(post_ops->sum + 0)->zero_point =
bli_malloc_user(1 * sizeof(int8_t), &err);
if (err != BLIS_SUCCESS) { goto bailout; }
*((int8_t*)((post_ops->sum + 0)->zero_point)) = 3;
(post_ops->sum + 0)->zero_point_len = 1;
(post_ops->sum + 0)->zp_stor_type = AOCL_GEMM_INT8;
(post_ops->sum + 0)->scale_factor =
bli_malloc_user(1 * sizeof(float), &err);
if (err != BLIS_SUCCESS) { goto bailout; }
*((float*)((post_ops->sum + 0)->scale_factor)) = 3.9;
(post_ops->sum + 0)->scale_factor_len = 1;
// Matrix add post-op.
post_ops->matrix_add =
bli_malloc_user(1 * sizeof(aocl_post_op_matrix_add), &err);
if (err != BLIS_SUCCESS) { goto bailout; }
post_ops->seq_vector[1] = MATRIX_ADD;
(post_ops->matrix_add + 0)->matrix =
bli_malloc_user(sizeof(int8_t) * m * n, &err);
if (err != BLIS_SUCCESS) { goto bailout; }
(post_ops->matrix_add + 0)->ldm = n;
(post_ops->matrix_add + 0)->stor_type = AOCL_GEMM_INT8;
(post_ops->matrix_add + 0)->scale_factor =
bli_malloc_user(1 * sizeof(float), &err);
if (err != BLIS_SUCCESS) { goto bailout; }
*((float*)((post_ops->matrix_add + 0)->scale_factor)) = 1.9;
(post_ops->matrix_add + 0)->scale_factor_len = 1;
// Add function to fill matrix_add matrix here.
// For second scale, using vector zero point and scale factor.
// This scale post-op is purely for downscaling/quantization.
post_ops->seq_vector[2] = SCALE;
(post_ops->sum + 1)->is_power_of_2 = FALSE;
(post_ops->sum + 1)->buff = NULL;
(post_ops->sum + 1)->zero_point =
bli_malloc_user(n * sizeof(int8_t), &err);
if (err != BLIS_SUCCESS) { goto bailout; }
(post_ops->sum + 1)->zero_point_len = n;
(post_ops->sum + 1)->zp_stor_type = AOCL_GEMM_INT8;
(post_ops->sum + 1)->scale_factor =
bli_malloc_user(n * sizeof(float), &err);
if (err != BLIS_SUCCESS) { goto bailout; }
(post_ops->sum + 1)->scale_factor_len = n;
// Add function to fill zero point and scale factor here.
post_ops->seq_length = 3;
// Reorder B matrix, this is pre-packing the B matrix so that packing
// costs are not incurred when executing GEMM. Here the int4 B matrix
// is reordered along with conversion to each element to int8 type.
siz_t b_reorder_buffer_size =
aocl_get_reorder_buf_size_u8s4s32os32(storage, transb, 'B', k, n );
int8_t* b_reorder = (int8_t*)bli_malloc_user(b_reorder_buffer_size, &err);
aocl_reorder_u8s4s32os32(storage, transb, 'B',
b, b_reorder,
k, n, ldb);
aocl_gemm_u8s8s32os8(storage, transa, transb,
m, n, k,
alpha,
a, lda, reordera,
b_reorder, ldb, reorderb,
beta,
c, ldc,
post_ops);
bailout:
if (post_ops->sum != NULL)
{
if ((post_ops->sum + 0)->zero_point != NULL)
{
bli_free_user((post_ops->sum + 0)->zero_point);
}
if ((post_ops->sum + 0)->scale_factor != NULL)
{
bli_free_user((post_ops->sum + 0)->scale_factor);
}
if ((post_ops->sum + 1)->zero_point != NULL)
{
bli_free_user((post_ops->sum + 1)->zero_point);
}
if ((post_ops->sum + 1)->scale_factor != NULL)
{
bli_free_user((post_ops->sum + 1)->scale_factor);
}
bli_free_user(post_ops->sum);
}
if (post_ops->matrix_add != NULL)
{
if ((post_ops->matrix_add + 0)->matrix != NULL)
{
bli_free_user((post_ops->matrix_add + 0)->matrix);
}
if ((post_ops->matrix_add + 0)->scale_factor != NULL)
{
bli_free_user((post_ops->matrix_add + 0)->scale_factor);
}
bli_free_user(post_ops->matrix_add);
}
if (post_ops->seq_vector != NULL)
{
bli_free_user(post_ops->seq_vector);
}
if (post_ops != NULL)
{
bli_free_user(post_ops);
}
if (b_reorder != NULL)
{
bli_free_user(b_reorder);
}
if (a != NULL)
{
bli_free_user(a);
}
if (b != NULL)
{
bli_free_user(b);
}
if (c != NULL)
{
bli_free_user(c);
}
return 0;
}