mirror of
https://github.com/amd/blis.git
synced 2026-04-20 15:48:50 +00:00
Additional optimizations to ZGEMM SUP and Tiny codepaths(ZEN4 and ZEN5)
- Added a set of AVX512 fringe kernels(using masked loads and stores) in order to avoid rerouting to the GEMV typed API interface(when m = 1). This ensures uniformity in performance across the main and fringe cases, when the calls are multithreaded. - Further tuned the thresholds to decide between ZGEMM Tiny, Small SUP and Native paths for ZEN4 and ZEN5 architectures(in case of parallel execution). This would account for additional combinations of the input dimensions. - Moved the call to Tiny-ZGEMM before the BLIS object creation, since this code-path operates on raw buffers. - Added the necessary test-cases for functional and memory testing of the newly added kernels. AMD-Internal: [CPUPL-6378][CPUPL-6661] Change-Id: I9af73d1b6ef82b26503d4fc373111132aee3afd6
This commit is contained in:
committed by
Vignesh Balasubramanian
parent
87c9230cac
commit
b4b0887ca4
File diff suppressed because it is too large
Load Diff
@@ -86,8 +86,10 @@ bool bli_cntx_gemmsup_thresh_is_met_zen4( obj_t* a, obj_t* b, obj_t* c, cntx_t*
|
||||
// For skinny sizes where m and/or n is small
|
||||
// The threshold for m is a single value, but for n, it is
|
||||
// also based on the packing size of A, since the kernels are
|
||||
// column preferential
|
||||
if( ( m <= 84 ) || ( ( n <= 84 ) && ( m < 4000 ) ) ) return TRUE;
|
||||
// column preferential
|
||||
if( ( ( m <= 120 ) && ( n <= 7515 ) && ( k <= 128 ) ) ||
|
||||
// ( ( m <= 96 ) && ( n <= 7515 ) && ( k <= 128 ) ) ||
|
||||
( ( m <= 1200 ) && ( n <= 1200 ) && ( k <= 64 ) ) ) return TRUE;
|
||||
|
||||
// For all combinations in small sizes
|
||||
if( ( m <= 216 ) && ( n <= 216 ) && ( k <= 216 ) ) return TRUE;
|
||||
|
||||
@@ -361,6 +361,11 @@ GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_8x3 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_8x2 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_8x1 )
|
||||
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_fx4 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_fx3 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_fx2 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_fx1 )
|
||||
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_4x4 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_4x3 )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_4x2 )
|
||||
|
||||
@@ -87,7 +87,7 @@ bool bli_cntx_gemmsup_thresh_is_met_zen5( obj_t* a, obj_t* b, obj_t* c, cntx_t*
|
||||
// The threshold for m is a single value, but for n, it is
|
||||
// also based on the packing size of A, since the kernels are
|
||||
// column preferential
|
||||
if( ( m <= 84 ) || ( ( n <= 84 ) && ( ( m * k ) <= 983040 ) ) ) return TRUE;
|
||||
if( ( m <= 60 ) || ( ( n <= 60 ) && ( m <= 960 ) && ( k <= 16384 ) ) || ( k <= 8 ) ) return TRUE;
|
||||
|
||||
// For all combinations in small sizes
|
||||
if( ( m <= 216 ) && ( n <= 216 ) && ( k <= 216 ) ) return TRUE;
|
||||
|
||||
Reference in New Issue
Block a user