mirror of
https://github.com/amd/blis.git
synced 2026-05-05 15:01:13 +00:00
Tiny GEMM path for F32 LPGEMM API.
- Currently the F32 API uses the 5-loop algorithm inside the OMP loop to compute the results, irrespective of the input sizes. However, it was observed that for very tiny sizes (n <= 128, m <= 36), this OMP loop and the NC, MC, KC loops were turning out to be overhead. - To address this, a new path without the OMP loop and with just the NR loop over the micro-kernel is introduced for tiny inputs. This is only applied when the number of threads set for GEMM is 1. AMD-Internal: [SWLCSG-3380] Change-Id: Ia712a0df19206b57efe4c97e9764d4b37ad7e275
This commit is contained in:
committed by
Nallani Bhaskar
parent
8abb37a0ad
commit
86e52783e4
@@ -39,6 +39,32 @@
|
||||
#include "lpgemm_post_ops.h"
|
||||
#include "aocl_bf16_type.h"
|
||||
|
||||
/*
 * LPGEMM_TINY(A_type, B_type, C_type, LP_SFX)
 *
 * Expands to the signature of `void lpgemm_rowvar_tiny_<LP_SFX>(...)`; LP_SFX
 * is token-pasted onto the function name. The macro itself carries no
 * trailing semicolon, so the instantiation that follows it supplies one and
 * thereby forms a function declaration.
 *
 * Parameters of the generated function:
 *   m, n, k             - problem dimensions (dim_t).
 *   a, rs_a, cs_a       - read-only A operand of A_type with two dim_t
 *                         stride values (presumably row/column strides -
 *                         confirm against the definition).
 *   mtag_a              - AOCL_MEMORY_TAG associated with A.
 *   b, rs_b, cs_b       - read-only B operand of B_type with its strides.
 *   mtag_b              - AOCL_MEMORY_TAG associated with B.
 *   c, rs_c, cs_c       - writable C operand of C_type with its strides.
 *   alpha, beta         - scalars of C_type.
 *   lcntx               - lpgemm context pointer.
 *   post_op_list        - pointer to the post-op list.
 *   c_downscale         - AOCL_STORAGE_TYPE value for C.
 *
 * NOTE(review): the function body is not visible here. Going by the name,
 * this is the entry point for a reduced-overhead path for very small
 * problem sizes - confirm against the definition.
 */
#define LPGEMM_TINY(A_type,B_type,C_type,LP_SFX) \
|
||||
void lpgemm_rowvar_tiny_ ## LP_SFX \
|
||||
( \
|
||||
const dim_t m, \
|
||||
const dim_t n, \
|
||||
const dim_t k, \
|
||||
const A_type* a, \
|
||||
const dim_t rs_a, \
|
||||
const dim_t cs_a, \
|
||||
const AOCL_MEMORY_TAG mtag_a, \
|
||||
const B_type* b, \
|
||||
const dim_t rs_b, \
|
||||
const dim_t cs_b, \
|
||||
const AOCL_MEMORY_TAG mtag_b, \
|
||||
C_type* c, \
|
||||
const dim_t rs_c, \
|
||||
const dim_t cs_c, \
|
||||
const C_type alpha, \
|
||||
const C_type beta, \
|
||||
lpgemm_cntx_t* lcntx, \
|
||||
lpgemm_post_op* post_op_list, \
|
||||
AOCL_STORAGE_TYPE c_downscale \
|
||||
) \
|
||||
|
||||
LPGEMM_TINY(float,float,float,f32f32f32of32);
|
||||
|
||||
#define LPGEMM_5LOOP(A_type,B_type,C_type,LP_SFX) \
|
||||
void lpgemm_rowvar_ ## LP_SFX \
|
||||
( \
|
||||
@@ -101,6 +127,32 @@ void lpgemm_rowvar_ ## LP_SFX \
|
||||
|
||||
LPGEMM_5LOOP1(bfloat16,int8_t,float,bf16s4f32of32);
|
||||
|
||||
/*
 * LPGEMV_TINY(A_type, B_type, C_type, LP_SFX)
 *
 * Expands to the signature of `void lpgemv_rowvar_tiny_<LP_SFX>(...)`; LP_SFX
 * is token-pasted onto the function name. The macro itself carries no
 * trailing semicolon, so the instantiation that follows it supplies one and
 * thereby forms a function declaration.
 *
 * Parameters of the generated function:
 *   m, n, k             - problem dimensions (dim_t).
 *   a, rs_a, cs_a       - read-only A operand of A_type with two dim_t
 *                         stride values (presumably row/column strides -
 *                         confirm against the definition).
 *   mtag_a              - AOCL_MEMORY_TAG associated with A.
 *   b, rs_b, cs_b       - read-only B operand of B_type with its strides.
 *   mtag_b              - AOCL_MEMORY_TAG associated with B.
 *   c, rs_c, cs_c       - writable C operand of C_type with its strides.
 *   alpha, beta         - scalars of C_type.
 *   lcntx               - lpgemm context pointer.
 *   post_op_list        - pointer to the post-op list.
 *   c_downscale         - AOCL_STORAGE_TYPE value for C.
 *
 * NOTE(review): the function body is not visible here. The name suggests a
 * matrix-vector counterpart of the tiny GEMM entry point - confirm against
 * the definition.
 */
#define LPGEMV_TINY(A_type, B_type, C_type, LP_SFX) \
|
||||
void lpgemv_rowvar_tiny_ ## LP_SFX \
|
||||
( \
|
||||
const dim_t m, \
|
||||
const dim_t n, \
|
||||
const dim_t k, \
|
||||
const A_type *a, \
|
||||
const dim_t rs_a, \
|
||||
const dim_t cs_a, \
|
||||
const AOCL_MEMORY_TAG mtag_a, \
|
||||
const B_type *b, \
|
||||
const dim_t rs_b, \
|
||||
const dim_t cs_b, \
|
||||
const AOCL_MEMORY_TAG mtag_b, \
|
||||
C_type *c, \
|
||||
const dim_t rs_c, \
|
||||
const dim_t cs_c, \
|
||||
const C_type alpha, \
|
||||
const C_type beta, \
|
||||
lpgemm_cntx_t *lcntx, \
|
||||
lpgemm_post_op *post_op_list, \
|
||||
AOCL_STORAGE_TYPE c_downscale \
|
||||
) \
|
||||
|
||||
LPGEMV_TINY(float, float, float, f32f32f32of32);
|
||||
|
||||
#define LPGEMV(A_type, B_type, C_type, LP_SFX) \
|
||||
void lpgemv_rowvar_ ## LP_SFX \
|
||||
( \
|
||||
|
||||
Reference in New Issue
Block a user