The following sample application demonstrates the usage of the LPGEMM API with a reordered B matrix and post-ops:
/*
$gcc test_lpgemm.c -o ./test_lpgemm.x -I/aocl-blis_install_directory/include/amdzen/
-L/aocl-blis_install_directory/lib/amdzen/ -lblis-mt -lm
Note: Add the BLIS library path to LD_LIBRARY_PATH before running the
executable ./test_lpgemm.x
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "blis.h"
// aocl_gemm_bf16bf16f32of32 (A:bfloat16, B:bfloat16, C:float) used here.
// 3 post-ops - bias + gelu_tanh + clip used here.
int main()
{
dim_t m = 1024;
dim_t n = 1024;
dim_t k = 1024;
// Leading dimensions for row major matrices.
dim_t lda = k;
dim_t ldb = n;
dim_t ldc = n;
err_t err = BLIS_SUCCESS;
bfloat16 *a = (bfloat16 *)bli_malloc_user(sizeof(bfloat16) * m * k, &err);
if (err != BLIS_SUCCESS) { goto bailout; }
bfloat16 *b = (bfloat16 *)bli_malloc_user(sizeof(bfloat16) * n * k, &err);
if (err != BLIS_SUCCESS) { goto bailout; }
float *c = (float *)bli_malloc_user(sizeof(float) * m * n, &err);
if (err != BLIS_SUCCESS) { goto bailout; }
// Functions to fill the matrices with data can be added here.
float alpha = 2.95;
float beta = 3.5;
char storage = 'r'; // Row major. Use 'c' for column major.
char transa = 'n'; // No transpose. Transpose not supported.
char transb = 'n';
char reordera = 'n';
char reorderb = 'r'; // B matrix will be reordered.
// Initialize post-ops struct.
aocl_post_op *post_ops = NULL;
post_ops = (aocl_post_op *)bli_malloc_user(sizeof(aocl_post_op), &err);
if (err != BLIS_SUCCESS) { goto bailout; }
dim_t max_post_ops_seq_length = 3; // bias + gelu_tanh + clip
post_ops->seq_vector =
(AOCL_POST_OP_TYPE *) bli_malloc_user(
max_post_ops_seq_length * sizeof(AOCL_POST_OP_TYPE),
&err);
if (err != BLIS_SUCCESS) { goto bailout; }
// 1 bias instance, need to allocate dynamically.
post_ops->seq_vector[0] = BIAS;
post_ops->bias =
bli_malloc_user(1 * sizeof(aocl_post_op_bias), &err);
if (err != BLIS_SUCCESS) { goto bailout; }
// Need to output accumulation (float) type for bias.
(post_ops->bias + 0)->bias = bli_malloc_user(n * sizeof(float), &err);
if (err != BLIS_SUCCESS) { goto bailout; }
// Add function to fill bias array here.
// 2 element wise post-ops, need to allocate dynamically.
post_ops->seq_vector[1] = ELTWISE; // For gelu_tanh
post_ops->seq_vector[2] = ELTWISE; // For clip
post_ops->eltwise =
bli_malloc_user(2 * sizeof(aocl_post_op_eltwise), &err);
if (err != BLIS_SUCCESS) { goto bailout; }
// Gelu tanh.
(post_ops->eltwise + 0)->is_power_of_2 = FALSE;
(post_ops->eltwise + 0)->scale_factor = NULL;
(post_ops->eltwise + 0)->algo.alpha = NULL;
(post_ops->eltwise + 0)->algo.beta = NULL;
(post_ops->eltwise + 0)->algo.algo_type = GELU_TANH;
// Clip.
(post_ops->eltwise + 1)->is_power_of_2 = FALSE;
(post_ops->eltwise + 1)->scale_factor = NULL;
// Min bound is represented by alpha.
(post_ops->eltwise + 1)->algo.alpha =
bli_malloc_user(sizeof(float), &err);
if (err != BLIS_SUCCESS) { goto bailout; }
// Max bound is represented by beta.
(post_ops->eltwise + 1)->algo.beta =
bli_malloc_user(sizeof(float), &err);
if (err != BLIS_SUCCESS) { goto bailout; }
// Set some min/max bounds.
*((float*)(post_ops->eltwise + 1)->algo.alpha) = -64.5;
*((float*)(post_ops->eltwise + 1)->algo.beta) = 3.9;
(post_ops->eltwise + 1)->algo.algo_type = CLIP;
post_ops->seq_length = 3;
// Reorder B matrix, this is pre-packing the B matrix so that packing
// costs are not incurred when executing GEMM.
siz_t b_reorder_buffer_size =
aocl_get_reorder_buf_size_bf16bf16f32of32(storage, transb, 'B', k, n );
bfloat16* b_reorder =
(bfloat16*)bli_malloc_user(b_reorder_buffer_size, &err);
if (err != BLIS_SUCCESS) { goto bailout; }
aocl_reorder_bf16bf16f32of32(storage, transb, 'B',
b, b_reorder,
k, n, ldb);
aocl_gemm_bf16bf16f32of32(storage, transa, transb,
m, n, k,
alpha,
a, lda, reordera,
b_reorder, ldb, reorderb,
beta,
c, ldc,
post_ops);
bailout:
if ((post_ops->eltwise + 1)->algo.alpha != NULL)
{
bli_free_user((post_ops->eltwise + 1)->algo.alpha);
}
if ((post_ops->eltwise + 1)->algo.beta != NULL)
{
bli_free_user((post_ops->eltwise + 1)->algo.beta);
}
if (post_ops->eltwise != NULL)
{
bli_free_user(post_ops->eltwise);
}
if (post_ops->bias != NULL)
{
if ((post_ops->bias + 0)->bias != NULL)
{
bli_free_user((post_ops->bias + 0)->bias);
}
bli_free_user(post_ops->bias);
}
if (post_ops->seq_vector != NULL)
{
bli_free_user(post_ops->seq_vector);
}
if (post_ops != NULL)
{
bli_free_user(post_ops);
}
if (b_reorder != NULL)
{
bli_free_user(b_reorder);
}
if (a != NULL)
{
bli_free_user(a);
}
if (b != NULL)
{
bli_free_user(b);
}
if (c != NULL)
{
bli_free_user(c);
}
return 0;
}