mirror of
https://github.com/amd/blis.git
synced 2026-04-20 15:48:50 +00:00
AVX512 optimizations for CGEMM(SUP)
- Implemented the following AVX512 SUP
column-preferential kernels (m-variant) for CGEMM:
Main kernel : 24x4m
Fringe kernels : 24x3m, 24x2m, 24x1m,
16x4, 16x3, 16x2, 16x1,
8x4, 8x3, 8x2, 8x1,
fx4, fx3, fx2, fx1 (where 0 < f < 8).
- Utilized the packing kernel to pack A when
handling inputs with CRC storage scheme. This
would in turn handle RRC with operation transpose
in the framework layer.
- Further added C prefetching to the main kernel,
and updated the cache-blocking parameters for
ZEN4 and ZEN5 contexts.
- Added decision logic to choose between
SUP and Native AVX512 code-paths for ZEN4 and ZEN5
architectures.
- Updated the testing interface for complex GEMMSUP
to accept the kernel dimension (MR) as a parameter, in
order to set the appropriate panel stride for functional
and memory testing. Also updated the existing instantiators
to send their kernel dimensions as a parameter.
- Added unit tests for functional and memory testing of these
newly added kernels.
AMD-Internal: [CPUPL-6498]
Change-Id: Ie79d3d0dc7eed7edf30d8d4f74b888135f31d6b4
This commit is contained in:
committed by
Vignesh Balasubramanian
parent
8998839c71
commit
07df9f471e
7808
kernels/zen4/3/sup/bli_gemmsup_cv_zen4_asm_c24x4m.c
Normal file
7808
kernels/zen4/3/sup/bli_gemmsup_cv_zen4_asm_c24x4m.c
Normal file
File diff suppressed because it is too large
Load Diff
@@ -93,6 +93,48 @@ bool bli_cntx_gemmsup_thresh_is_met_zen4( obj_t* a, obj_t* b, obj_t* c, cntx_t*
|
||||
if( ( m <= 216 ) && ( n <= 216 ) && ( k <= 216 ) ) return TRUE;
|
||||
return FALSE;
|
||||
}
|
||||
else if( dt == BLIS_SCOMPLEX )
|
||||
{
|
||||
dim_t k = bli_obj_width_after_trans( a );
|
||||
dim_t m, n;
|
||||
|
||||
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
|
||||
|
||||
if ( bli_cntx_l3_sup_ker_dislikes_storage_of( c, stor_id, cntx ) )
|
||||
{
|
||||
m = bli_obj_width(c);
|
||||
n = bli_obj_length(c);
|
||||
}
|
||||
else
|
||||
{
|
||||
m = bli_obj_length( c );
|
||||
n = bli_obj_width( c );
|
||||
}
|
||||
|
||||
// The threshold conditionals are as follows:
|
||||
if( n <= 540 )
|
||||
{
|
||||
if( n <= 420 ) return TRUE;
|
||||
else if( m <= 1260 ) return TRUE;
|
||||
}
|
||||
else
|
||||
{
|
||||
if( m <= 420 )
|
||||
{
|
||||
if( m <= 180 ) return TRUE;
|
||||
else if( n <= 2100 ) return TRUE;
|
||||
}
|
||||
else
|
||||
{
|
||||
if( k <= 540 )
|
||||
{
|
||||
if( n <= 1260 ) return TRUE;
|
||||
else if( m <= 900 ) return TRUE;
|
||||
}
|
||||
}
|
||||
}
|
||||
return FALSE;
|
||||
}
|
||||
else
|
||||
return bli_cntx_l3_sup_thresh_is_met( a, b, c, cntx );
|
||||
}
|
||||
|
||||
@@ -332,6 +332,24 @@ GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x1)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x1)
|
||||
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x1)
|
||||
|
||||
// Cgemm sup CV kernels
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_24x4m )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_24x3m )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_24x2m )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_24x1m )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_16x4 )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_16x3 )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_16x2 )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_16x1 )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_8x4 )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_8x3 )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_8x2 )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_8x1 )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_fx4 )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_fx3 )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_fx2 )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_fx1 )
|
||||
|
||||
// Zgemm sup CV kernels
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_12x4m )
|
||||
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_12x3m )
|
||||
|
||||
@@ -93,6 +93,48 @@ bool bli_cntx_gemmsup_thresh_is_met_zen5( obj_t* a, obj_t* b, obj_t* c, cntx_t*
|
||||
if( ( m <= 216 ) && ( n <= 216 ) && ( k <= 216 ) ) return TRUE;
|
||||
return FALSE;
|
||||
}
|
||||
else if( dt == BLIS_SCOMPLEX )
|
||||
{
|
||||
dim_t k = bli_obj_width_after_trans( a );
|
||||
dim_t m, n;
|
||||
|
||||
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
|
||||
|
||||
if ( bli_cntx_l3_sup_ker_dislikes_storage_of( c, stor_id, cntx ) )
|
||||
{
|
||||
m = bli_obj_width(c);
|
||||
n = bli_obj_length(c);
|
||||
}
|
||||
else
|
||||
{
|
||||
m = bli_obj_length( c );
|
||||
n = bli_obj_width( c );
|
||||
}
|
||||
|
||||
// The threshold conditionals are as follows:
|
||||
if( n <= 540 )
|
||||
{
|
||||
if( n <= 420 ) return TRUE;
|
||||
else if( m <= 1260 ) return TRUE;
|
||||
}
|
||||
else
|
||||
{
|
||||
if( m <= 420 )
|
||||
{
|
||||
if( m <= 180 ) return TRUE;
|
||||
else if( n <= 2100 ) return TRUE;
|
||||
}
|
||||
else
|
||||
{
|
||||
if( k <= 540 )
|
||||
{
|
||||
if( n <= 1260 ) return TRUE;
|
||||
else if( m <= 900 ) return TRUE;
|
||||
}
|
||||
}
|
||||
}
|
||||
return FALSE;
|
||||
}
|
||||
else
|
||||
return bli_cntx_l3_sup_thresh_is_met( a, b, c, cntx );
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user