Tiny GEMM path for F32 LPGEMM API.

-Currently the F32 API uses the 5 loop algorithm inside the OMP loop to compute the results, irrespective of the input sizes. However, it was observed that for very tiny sizes (n <= 128, m <= 36), this OMP loop and the NC, MC and KC loops turned out to be overhead. -In order to address this, a new path without the OMP loop, using just the NR loop over the micro-kernel, is introduced for tiny inputs. This path is only taken when the number of threads set for GEMM is 1. AMD-Internal: [SWLCSG-3380] Change-Id: Ia712a0df19206b57efe4c97e9764d4b37ad7e275
2026-05-05 15:01:13 +00:00 · 2025-02-05 05:13:11 +00:00
parent 8abb37a0ad
commit 86e52783e4
5 changed files with 408 additions and 8 deletions
--- a/addon/aocl_gemm/frame/lpgemm_5loop_interface_apis.h
+++ b/addon/aocl_gemm/frame/lpgemm_5loop_interface_apis.h
@@ -39,6 +39,32 @@
 #include "lpgemm_post_ops.h"
 #include "aocl_bf16_type.h"

+#define LPGEMM_TINY(A_type,B_type,C_type,LP_SFX) \
+void lpgemm_rowvar_tiny_ ## LP_SFX \
+     ( \
+       const dim_t           m, \
+       const dim_t           n, \
+       const dim_t           k, \
+       const A_type*         a, \
+       const dim_t           rs_a, \
+       const dim_t           cs_a, \
+       const AOCL_MEMORY_TAG mtag_a, \
+       const B_type*         b, \
+       const dim_t           rs_b, \
+       const dim_t           cs_b, \
+       const AOCL_MEMORY_TAG mtag_b, \
+       C_type*               c, \
+       const dim_t           rs_c, \
+       const dim_t           cs_c, \
+       const C_type          alpha, \
+       const C_type          beta, \
+       lpgemm_cntx_t*        lcntx, \
+       lpgemm_post_op*       post_op_list, \
+       AOCL_STORAGE_TYPE     c_downscale \
+     ) \
+
+LPGEMM_TINY(float,float,float,f32f32f32of32);
+
 #define LPGEMM_5LOOP(A_type,B_type,C_type,LP_SFX) \
 void lpgemm_rowvar_ ## LP_SFX \
     ( \
@@ -101,6 +127,32 @@ void lpgemm_rowvar_ ## LP_SFX \

 LPGEMM_5LOOP1(bfloat16,int8_t,float,bf16s4f32of32);

+#define LPGEMV_TINY(A_type, B_type, C_type, LP_SFX) \
+void lpgemv_rowvar_tiny_ ## LP_SFX \
+    ( \
+      const dim_t           m, \
+      const dim_t           n, \
+      const dim_t           k, \
+      const A_type          *a, \
+      const dim_t           rs_a, \
+      const dim_t           cs_a, \
+      const AOCL_MEMORY_TAG mtag_a, \
+      const B_type          *b, \
+      const dim_t           rs_b, \
+      const dim_t           cs_b, \
+      const AOCL_MEMORY_TAG mtag_b, \
+      C_type                *c, \
+      const dim_t           rs_c, \
+      const dim_t           cs_c, \
+      const C_type          alpha, \
+      const C_type          beta, \
+      lpgemm_cntx_t         *lcntx, \
+      lpgemm_post_op        *post_op_list, \
+      AOCL_STORAGE_TYPE      c_downscale \
+    ) \
+
+LPGEMV_TINY(float, float, float, f32f32f32of32);
+
 #define LPGEMV(A_type, B_type, C_type, LP_SFX) \
 void lpgemv_rowvar_ ## LP_SFX \
    ( \