4.7.10. Sample Application 2 - 5.2 English - 57404

AOCL User Guide (57404)

Document ID
57404
Release Date
2025-12-29
Version
5.2 English

The following sample application demonstrates how to use the LPGEMM API with a reordered B matrix and post-ops:

/*
$gcc test_lpgemm.c -o ./test_lpgemm.x -I/aocl-blis_install_directory/include/amdzen/
-L/aocl-blis_install_directory/lib/amdzen/ -lblis-mt -lm

Note: Export the BLIS library path to LD_LIBRARY_PATH before running the
executable ./test_lpgemm.x
*/


#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "blis.h"

// aocl_gemm_bf16bf16f32of32 (A:bfloat16, B:bfloat16, C:float) used here.
// 3 post-ops - bias + gelu_tanh + clip used here.
int main()
{
    dim_t m = 1024;
    dim_t n = 1024;
    dim_t k = 1024;

    // Leading dimensions for row major matrices.
    dim_t lda = k;
    dim_t ldb = n;
    dim_t ldc = n;

    err_t err = BLIS_SUCCESS;
    bfloat16 *a = (bfloat16 *)bli_malloc_user(sizeof(bfloat16) * m * k, &err);
    if (err != BLIS_SUCCESS) { goto bailout; }

    bfloat16 *b = (bfloat16 *)bli_malloc_user(sizeof(bfloat16) * n * k, &err);
    if (err != BLIS_SUCCESS) { goto bailout; }

    float *c = (float *)bli_malloc_user(sizeof(float) * m * n, &err);
    if (err != BLIS_SUCCESS) { goto bailout; }

    // Functions to fill the matrices with data can be added here.
    float alpha = 2.95;
    float beta = 3.5;
    char storage = 'r'; // Row major. Use 'c' for column major.
    char transa = 'n'; // No transpose. Transpose not supported.
    char transb = 'n';
    char reordera = 'n';
    char reorderb = 'r'; // B matrix will be reordered.

    // Initialize post-ops struct.
    aocl_post_op *post_ops = NULL;
    post_ops = (aocl_post_op *)bli_malloc_user(sizeof(aocl_post_op), &err);
    if (err != BLIS_SUCCESS) { goto bailout; }


    dim_t max_post_ops_seq_length = 3; // bias + gelu_tanh + clip
    post_ops->seq_vector =
        (AOCL_POST_OP_TYPE *) bli_malloc_user(
                        max_post_ops_seq_length * sizeof(AOCL_POST_OP_TYPE),
                        &err);
    if (err != BLIS_SUCCESS) { goto bailout; }

    // 1 bias instance, need to allocate dynamically.
    post_ops->seq_vector[0] = BIAS;
    post_ops->bias =
            bli_malloc_user(1 * sizeof(aocl_post_op_bias), &err);
    if (err != BLIS_SUCCESS) { goto bailout; }
    // Need to output accumulation (float) type for bias.
    (post_ops->bias + 0)->bias = bli_malloc_user(n * sizeof(float), &err);
    if (err != BLIS_SUCCESS) { goto bailout; }
    // Add function to fill bias array here.

    // 2 element wise post-ops, need to allocate dynamically.
    post_ops->seq_vector[1] = ELTWISE; // For gelu_tanh
    post_ops->seq_vector[2] = ELTWISE; // For clip

    post_ops->eltwise =
            bli_malloc_user(2 * sizeof(aocl_post_op_eltwise), &err);
    if (err != BLIS_SUCCESS) { goto bailout; }

    // Gelu tanh.
    (post_ops->eltwise + 0)->is_power_of_2 = FALSE;
    (post_ops->eltwise + 0)->scale_factor = NULL;
    (post_ops->eltwise + 0)->algo.alpha = NULL;
    (post_ops->eltwise + 0)->algo.beta = NULL;
    (post_ops->eltwise + 0)->algo.algo_type = GELU_TANH;

    // Clip.
    (post_ops->eltwise + 1)->is_power_of_2 = FALSE;
    (post_ops->eltwise + 1)->scale_factor = NULL;
    // Min bound is represented by alpha.
    (post_ops->eltwise + 1)->algo.alpha =
                            bli_malloc_user(sizeof(float), &err);
    if (err != BLIS_SUCCESS) { goto bailout; }
    // Max bound is represented by beta.
    (post_ops->eltwise + 1)->algo.beta =
                            bli_malloc_user(sizeof(float), &err);
    if (err != BLIS_SUCCESS) { goto bailout; }
    // Set some min/max bounds.
    *((float*)(post_ops->eltwise + 1)->algo.alpha) = -64.5;
    *((float*)(post_ops->eltwise + 1)->algo.beta) = 3.9;
    (post_ops->eltwise + 1)->algo.algo_type = CLIP;

    post_ops->seq_length = 3;

    // Reorder B matrix, this is pre-packing the B matrix so that packing
    // costs are not incurred when executing GEMM.
    siz_t b_reorder_buffer_size =
        aocl_get_reorder_buf_size_bf16bf16f32of32(storage, transb, 'B', k, n );
    bfloat16* b_reorder =
        (bfloat16*)bli_malloc_user(b_reorder_buffer_size, &err);
    if (err != BLIS_SUCCESS) { goto bailout; }

    aocl_reorder_bf16bf16f32of32(storage, transb, 'B',
                                 b, b_reorder,
                                 k, n, ldb);

    aocl_gemm_bf16bf16f32of32(storage, transa, transb,
                              m, n, k,
                              alpha,
                              a, lda, reordera,
                              b_reorder, ldb, reorderb,
                              beta,
                              c, ldc,
                              post_ops);

bailout:
    if ((post_ops->eltwise + 1)->algo.alpha != NULL)
    {
        bli_free_user((post_ops->eltwise + 1)->algo.alpha);
    }
    if ((post_ops->eltwise + 1)->algo.beta != NULL)
    {
        bli_free_user((post_ops->eltwise + 1)->algo.beta);
    }
    if (post_ops->eltwise != NULL)
    {
        bli_free_user(post_ops->eltwise);
    }
    if (post_ops->bias != NULL)
    {
        if ((post_ops->bias + 0)->bias != NULL)
        {
            bli_free_user((post_ops->bias + 0)->bias);
        }
        bli_free_user(post_ops->bias);
    }
    if (post_ops->seq_vector != NULL)
    {
        bli_free_user(post_ops->seq_vector);
    }
    if (post_ops != NULL)
    {
        bli_free_user(post_ops);
    }
    if (b_reorder != NULL)
    {
        bli_free_user(b_reorder);
    }
    if (a != NULL)
    {
        bli_free_user(a);
    }
    if (b != NULL)
    {
        bli_free_user(b);
    }
    if (c != NULL)
    {
        bli_free_user(c);
    }

    return 0;
}