Tiny GEMM path for F32 LPGEMM API.

-Currently the F32 API uses the 5 loop algorithm inside the OMP loop
to compute the results, irrespective of the input sizes. However, it
was observed that for very tiny sizes (n <= 128, m <= 36), the OMP
loop and the NC, MC, KC loops turned out to be pure overhead.
-In order to address this, a new path without the OMP loop, using just
the NR loop over the micro-kernel, is introduced for tiny inputs. This
path is only taken when the number of threads set for GEMM is 1.

AMD-Internal: [SWLCSG-3380]

Change-Id: Ia712a0df19206b57efe4c97e9764d4b37ad7e275
This commit is contained in:
Deepak Negi
2025-02-05 05:13:11 +00:00
committed by Nallani Bhaskar
parent 8abb37a0ad
commit 86e52783e4
5 changed files with 408 additions and 8 deletions

View File

@@ -39,6 +39,32 @@
#include "lpgemm_post_ops.h"
#include "aocl_bf16_type.h"
#define LPGEMM_TINY(A_type,B_type,C_type,LP_SFX) \
void lpgemm_rowvar_tiny_ ## LP_SFX \
( \
const dim_t m, \
const dim_t n, \
const dim_t k, \
const A_type* a, \
const dim_t rs_a, \
const dim_t cs_a, \
const AOCL_MEMORY_TAG mtag_a, \
const B_type* b, \
const dim_t rs_b, \
const dim_t cs_b, \
const AOCL_MEMORY_TAG mtag_b, \
C_type* c, \
const dim_t rs_c, \
const dim_t cs_c, \
const C_type alpha, \
const C_type beta, \
lpgemm_cntx_t* lcntx, \
lpgemm_post_op* post_op_list, \
AOCL_STORAGE_TYPE c_downscale \
) \
LPGEMM_TINY(float,float,float,f32f32f32of32);
#define LPGEMM_5LOOP(A_type,B_type,C_type,LP_SFX) \
void lpgemm_rowvar_ ## LP_SFX \
( \
@@ -101,6 +127,32 @@ void lpgemm_rowvar_ ## LP_SFX \
LPGEMM_5LOOP1(bfloat16,int8_t,float,bf16s4f32of32);
#define LPGEMV_TINY(A_type, B_type, C_type, LP_SFX) \
void lpgemv_rowvar_tiny_ ## LP_SFX \
( \
const dim_t m, \
const dim_t n, \
const dim_t k, \
const A_type *a, \
const dim_t rs_a, \
const dim_t cs_a, \
const AOCL_MEMORY_TAG mtag_a, \
const B_type *b, \
const dim_t rs_b, \
const dim_t cs_b, \
const AOCL_MEMORY_TAG mtag_b, \
C_type *c, \
const dim_t rs_c, \
const dim_t cs_c, \
const C_type alpha, \
const C_type beta, \
lpgemm_cntx_t *lcntx, \
lpgemm_post_op *post_op_list, \
AOCL_STORAGE_TYPE c_downscale \
) \
LPGEMV_TINY(float, float, float, f32f32f32of32);
#define LPGEMV(A_type, B_type, C_type, LP_SFX) \
void lpgemv_rowvar_ ## LP_SFX \
( \