4.7.11. Sample Application 3 - 5.2 English - 57404

AOCL User Guide (57404)

Document ID
57404
Release Date
2025-12-29
Version
5.2 English

The following sample application demonstrates usage of the LPGEMM downscale API with multiple scale post-ops and int4-to-int8 B matrix reordering:

/*
$gcc test_lpgemm.c -o ./test_lpgemm.x -I/aocl-blis_install_directory/include/amdzen/
-L/aocl-blis_install_directory/lib/amdzen/ -lblis-mt -lm

Note: Export the blis library path to LD_LIBRARY_PATH before running the
executable ./test_lpgemm.x
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "blis.h"

// aocl_gemm_u8s8s32os8 (A:uint8_t, B:int8_t, C:int8_t) used here.
// 3 post-ops - scale + matrix_add + scale used here.
/*
 * Sample driver for the LPGEMM downscale API aocl_gemm_u8s8s32os8.
 *
 * Steps performed:
 *   1. Allocate A (uint8_t), B (int4 values packed two per int8_t byte) and
 *      C (int8_t) for a 1024 x 1024 x 1024 row-major problem.
 *   2. Build a post-op chain of length 3: SCALE -> MATRIX_ADD -> SCALE.
 *   3. Reorder (pre-pack) the int4 B matrix with aocl_reorder_u8s4s32os32.
 *   4. Invoke aocl_gemm_u8s8s32os8 with the reordered B and the post-ops.
 *   5. Release every allocation on both the success and the failure path.
 *
 * Returns 0 on success and 1 if any allocation failed.
 */
int main()
{
    dim_t m = 1024;
    dim_t n = 1024;
    dim_t k = 1024;

    // Leading dimensions for row major matrices.
    dim_t lda = k;
    dim_t ldb = n;
    dim_t ldc = n;

    // Process exit status; flipped to 0 only after the GEMM call is reached.
    int status = 1;

    err_t err = BLIS_SUCCESS;

    // Every owned pointer is declared and set to NULL before the first
    // `goto bailout`, so the cleanup code never reads an indeterminate
    // pointer regardless of which allocation failed.
    uint8_t *a = NULL;
    int8_t *b = NULL;
    int8_t *c = NULL;
    int8_t *b_reorder = NULL;
    aocl_post_op *post_ops = NULL;

    a = (uint8_t *)bli_malloc_user(sizeof(uint8_t) * m * k, &err);
    if (err != BLIS_SUCCESS) { goto bailout; }

    // int4_t B matrix represented using int8_t, but with half the int8_t size.
    b = (int8_t *)bli_malloc_user((sizeof(int8_t) * n * k) / 2, &err);
    if (err != BLIS_SUCCESS) { goto bailout; }

    c = (int8_t *)bli_malloc_user(sizeof(int8_t) * m * n, &err);
    if (err != BLIS_SUCCESS) { goto bailout; }

    // Functions to fill the matrices with data can be added here.
    int32_t alpha = 2;
    int32_t beta = 9;
    char storage = 'r'; // Row major. Use 'c' for column major.
    char transa = 'n'; // No transpose. Transpose not supported.
    char transb = 'n';
    char reordera = 'n';
    char reorderb = 'r';

    // Initialize post-ops struct.
    post_ops = (aocl_post_op *)bli_malloc_user(sizeof(aocl_post_op), &err);
    if (err != BLIS_SUCCESS) { goto bailout; }
    // Zero the struct so that every member this sample does not set
    // explicitly (including the pointers inspected in the cleanup code)
    // starts out as 0/NULL instead of heap garbage.
    memset(post_ops, 0, sizeof(aocl_post_op));

    // Downscale parameters need to be passed as a post-op, even
    // if a downscale specific api is invoked.
    dim_t max_post_ops_seq_length = 3; // scale + matrix_add + scale

    post_ops->seq_vector =
        (AOCL_POST_OP_TYPE *) bli_malloc_user(
                        max_post_ops_seq_length * sizeof(AOCL_POST_OP_TYPE),
                        &err);
    if (err != BLIS_SUCCESS) { goto bailout; }

    // 2 scaling post-ops, first for normal scaling and second one for
    // downscaling, need to allocate scale struct dynamically.
    post_ops->sum =
            bli_malloc_user(2 * sizeof(aocl_post_op_sum), &err);
    if (err != BLIS_SUCCESS) { goto bailout; }
    // Zero both entries so the cleanup code can safely test the nested
    // zero_point / scale_factor pointers of an entry not yet populated.
    memset(post_ops->sum, 0, 2 * sizeof(aocl_post_op_sum));

    // For first scale, using scalar zero point and scale factor.
    post_ops->seq_vector[0] = SCALE;
    (post_ops->sum + 0)->is_power_of_2 = FALSE;
    (post_ops->sum + 0)->buff = NULL;
    (post_ops->sum + 0)->zero_point =
         bli_malloc_user(1 * sizeof(int8_t), &err);
    if (err != BLIS_SUCCESS) { goto bailout; }
    *((int8_t*)((post_ops->sum + 0)->zero_point)) = 3;
    (post_ops->sum + 0)->zero_point_len = 1;
    (post_ops->sum + 0)->zp_stor_type = AOCL_GEMM_INT8;
    (post_ops->sum + 0)->scale_factor =
         bli_malloc_user(1 * sizeof(float), &err);
    if (err != BLIS_SUCCESS) { goto bailout; }
    *((float*)((post_ops->sum + 0)->scale_factor)) = 3.9f;
    (post_ops->sum + 0)->scale_factor_len = 1;

    // Matrix add post-op.
    post_ops->matrix_add =
            bli_malloc_user(1 * sizeof(aocl_post_op_matrix_add), &err);
    if (err != BLIS_SUCCESS) { goto bailout; }
    // Zero the entry for the same reason as the scale entries above.
    memset(post_ops->matrix_add, 0, 1 * sizeof(aocl_post_op_matrix_add));
    post_ops->seq_vector[1] = MATRIX_ADD;
    (post_ops->matrix_add + 0)->matrix =
         bli_malloc_user(sizeof(int8_t) * m * n, &err);
    if (err != BLIS_SUCCESS) { goto bailout; }
    (post_ops->matrix_add + 0)->ldm = n;
    (post_ops->matrix_add + 0)->stor_type = AOCL_GEMM_INT8;
    (post_ops->matrix_add + 0)->scale_factor =
            bli_malloc_user(1 * sizeof(float), &err);
    if (err != BLIS_SUCCESS) { goto bailout; }
    *((float*)((post_ops->matrix_add + 0)->scale_factor)) = 1.9f;
    (post_ops->matrix_add + 0)->scale_factor_len = 1;

    // Add function to fill matrix_add matrix here.

    // For second scale, using vector zero point and scale factor.
    // This scale post-op is purely for downscaling/quantization.
    post_ops->seq_vector[2] = SCALE;
    (post_ops->sum + 1)->is_power_of_2 = FALSE;
    (post_ops->sum + 1)->buff = NULL;
    (post_ops->sum + 1)->zero_point =
         bli_malloc_user(n * sizeof(int8_t), &err);
    if (err != BLIS_SUCCESS) { goto bailout; }
    (post_ops->sum + 1)->zero_point_len = n;
    (post_ops->sum + 1)->zp_stor_type = AOCL_GEMM_INT8;
    (post_ops->sum + 1)->scale_factor =
         bli_malloc_user(n * sizeof(float), &err);
    if (err != BLIS_SUCCESS) { goto bailout; }
    (post_ops->sum + 1)->scale_factor_len = n;
    // Add function to fill zero point and scale factor here.

    post_ops->seq_length = max_post_ops_seq_length;

    // Reorder B matrix, this is pre-packing the B matrix so that packing
    // costs are not incurred when executing GEMM. Here the int4 B matrix
    // is reordered along with conversion of each element to int8 type.
    siz_t b_reorder_buffer_size =
      aocl_get_reorder_buf_size_u8s4s32os32(storage, transb, 'B', k, n );

    b_reorder = (int8_t*)bli_malloc_user(b_reorder_buffer_size, &err);
    // The reorder call below writes into this buffer, so a failed
    // allocation must stop here.
    if (err != BLIS_SUCCESS) { goto bailout; }

    aocl_reorder_u8s4s32os32(storage, transb, 'B',
                             b, b_reorder,
                             k, n, ldb);

    aocl_gemm_u8s8s32os8(storage, transa, transb,
                         m, n, k,
                         alpha,
                         a, lda, reordera,
                         b_reorder, ldb, reorderb,
                         beta,
                         c, ldc,
                         post_ops);

    status = 0;

bailout:
    // post_ops itself must be checked before any of its members are read:
    // it is still NULL if one of the A/B/C allocations failed.
    if (post_ops != NULL)
    {
        if (post_ops->sum != NULL)
        {
            if ((post_ops->sum + 0)->zero_point != NULL)
            {
                bli_free_user((post_ops->sum + 0)->zero_point);
            }
            if ((post_ops->sum + 0)->scale_factor != NULL)
            {
                bli_free_user((post_ops->sum + 0)->scale_factor);
            }
            if ((post_ops->sum + 1)->zero_point != NULL)
            {
                bli_free_user((post_ops->sum + 1)->zero_point);
            }
            if ((post_ops->sum + 1)->scale_factor != NULL)
            {
                bli_free_user((post_ops->sum + 1)->scale_factor);
            }

            bli_free_user(post_ops->sum);
        }
        if (post_ops->matrix_add != NULL)
        {
            if ((post_ops->matrix_add + 0)->matrix != NULL)
            {
                bli_free_user((post_ops->matrix_add + 0)->matrix);
            }
            if ((post_ops->matrix_add + 0)->scale_factor != NULL)
            {
                bli_free_user((post_ops->matrix_add + 0)->scale_factor);
            }

            bli_free_user(post_ops->matrix_add);
        }
        if (post_ops->seq_vector != NULL)
        {
            bli_free_user(post_ops->seq_vector);
        }

        bli_free_user(post_ops);
    }
    if (b_reorder != NULL)
    {
        bli_free_user(b_reorder);
    }
    if (a != NULL)
    {
        bli_free_user(a);
    }
    if (b != NULL)
    {
        bli_free_user(b);
    }
    if (c != NULL)
    {
        bli_free_user(c);
    }

    return status;
}