mirror of
https://github.com/amd/blis.git
synced 2026-04-20 15:48:50 +00:00
Tiny GEMM path for F32 LPGEMM API.
-Currently the F32 API uses the 5-loop algorithm inside the OMP loop to compute the results, irrespective of the input sizes. However, it was observed that for very tiny sizes (n <= 128, m <= 36), this OMP loop and the NC, MC, KC loops were turning out to be overhead. -In order to address this, a new path without the OMP loop and with just the NR loop over the micro-kernel is introduced for tiny inputs. This is only applied when the number of threads set for GEMM is 1. AMD-Internal: [SWLCSG-3380] Change-Id: Ia712a0df19206b57efe4c97e9764d4b37ad7e275
This commit is contained in:
committed by
Nallani Bhaskar
parent
8abb37a0ad
commit
86e52783e4
@@ -1,3 +1,7 @@
|
||||
r n n n r 121 1 1601 1601 1 1 f32f32f32of32:bias=na,matrix_mul=na
|
||||
r n n n r 13 1 16 16 1 1 f32f32f32of32:bias=na,matrix_mul=na
|
||||
r n n n r 36 64 16 16 64 64 f32f32f32of32:none
|
||||
r n n n r 1 48 16 16 48 48 f32f32f32of32:bias=na,matrix_mul=na
|
||||
r t n n r 1 128 64 1 128 128 *:none
|
||||
c n t n n 32 128 2 32 128 32 bf16bf16f32of32:bias=na,swish
|
||||
r n n n r 6 1 4 4 16 16 bf16s4f32of32:pre_op_scale=scalar,pre_op_scale_type=bf16,group_size=2
|
||||
@@ -6,16 +10,16 @@ r n n n r 6 1 4 4 16 16 bf16s4f32of32:pre_op_zp=scalar,pre_op_scale=scalar,pre_o
|
||||
r n t n r 288 12 6460 6460 6460 12 bf16s4f32of32:none
|
||||
r n t n r 150 2048 6460 6460 6460 2048 bf16s4f32of32:none
|
||||
r n n n r 1 10 2050 2050 20 20 bf16bf16f32obf16:none
|
||||
r n n n r 482 690 2050 2050 690 690 f32f32f32of32:bias=na,matrix_mul
|
||||
r n n n r 253 2048 660 660 2048 2048 bf16bf16f32of32:matrix_mul,clip
|
||||
c n n n p 100 200 300 100 300 100 f32f32f32of32:matrix_mul,gelu_tanh
|
||||
c t n n n 16 256 512 512 512 256 bf16bf16f32of32:matrix_mul
|
||||
r n n n r 482 690 2050 2050 690 690 f32f32f32of32:bias=na,matrix_mul=na
|
||||
r n n n r 253 2048 660 660 2048 2048 bf16bf16f32of32:matrix_mul=na,clip
|
||||
c n n n p 100 200 300 100 300 100 f32f32f32of32:matrix_mul=na,gelu_tanh
|
||||
c t n n n 16 256 512 512 512 256 bf16bf16f32of32:matrix_mul=na
|
||||
r n n n n 160 6424 2051 2051 6424 6424 *:bias=na,swish
|
||||
r n n n r 74 512 515 515 512 512 *:none
|
||||
r n n n r 253 2048 660 660 2048 2048 *:matrix_add
|
||||
r n n n r 253 2048 660 660 2048 2048 *:matrix_add=na
|
||||
r n n n p 81 128 3 3 128 128 u8s8s32os32:bias=na,relu,clip
|
||||
r n n n p 81 128 3 3 128 128 u8s8s32os8:bias=na,relu,clip
|
||||
r n n n p 181 1280 3000 3000 1280 1280 *:bias=na,relu,clip,matrix_add
|
||||
r n n n p 181 1280 3000 3000 1280 1280 *:bias=na,relu,clip,matrix_add=na
|
||||
r n n n r 482 690 2050 2050 690 690 *:scale=scalar,zp=scalar,gelu_tanh,clip
|
||||
r n n n r 482 690 2050 2050 690 690 *:scale=vector,zp=vector,bias=na,gelu_erf,clip
|
||||
c n n n p 100 200 300 100 300 100 f32f32f32of32:bias=na,gelu_tanh,clip
|
||||
@@ -26,8 +30,8 @@ r n n n r 128 128 128 128 128 128 *:bias=na,relu,clip
|
||||
r n n n r 100 200 300 300 200 200 u8s8s16ou8:none
|
||||
c t n n n 16 256 512 512 512 256 bf16bf16f32of32:none
|
||||
r n n n r 144 6424 2090 2090 6424 6424 *:bias=na,swish
|
||||
c n n n n 160 6400 2051 160 2051 160 bf16bf16f32obf16:bias=f32,matrix_mul
|
||||
c n n n n 160 6400 2051 160 2051 160 bf16bf16f32of32:bias=bf16,matrix_add
|
||||
c n n n n 160 6400 2051 160 2051 160 bf16bf16f32obf16:bias=f32,matrix_mul=na
|
||||
c n n n n 160 6400 2051 160 2051 160 bf16bf16f32of32:bias=bf16,matrix_add=na
|
||||
r n n n n 160 6424 2051 2051 6424 6424 *:bias=na,swish
|
||||
r n n n r 74 512 515 515 512 512 *:none
|
||||
c n t n n 1 10 10 10 10 1 f32f32f32of32:none
|
||||
|
||||
Reference in New Issue
Block a user