AVX512 optimizations for CGEMM(SUP)

- Implemented the following AVX512 SUP
  column-preferential kernels (m-variant) for CGEMM:
  Main kernel    : 24x4m
  Fringe kernels : 24x3m, 24x2m, 24x1m,
                   16x4, 16x3, 16x2, 16x1,
                   8x4, 8x3, 8x2, 8x1,
                   fx4, fx3, fx2, fx1 (where 0 < f < 8).

- Utilized the packing kernel to pack A when
  handling inputs with the CRC storage scheme. This
  in turn also handles RRC, through the operation
  transpose done in the framework layer.

- Further added C prefetching to the main kernel,
  and updated the cache-blocking parameters for the
  ZEN4 and ZEN5 contexts.

- Added decision logic to choose between the
  SUP and native AVX512 code paths for the ZEN4 and
  ZEN5 architectures.

- Updated the testing interface for complex GEMMSUP
  to accept the kernel dimension (MR) as a parameter, in
  order to set the appropriate panel stride for functional
  and memory testing. Also updated the existing instantiators
  to pass their kernel dimension as a parameter.

- Added unit tests for functional and memory testing of these
  newly added kernels.

AMD-Internal: [CPUPL-6498]

Change-Id: Ie79d3d0dc7eed7edf30d8d4f74b888135f31d6b4
This commit is contained in:
Vignesh Balasubramanian
2025-03-03 19:04:03 +05:30
committed by Vignesh Balasubramanian
parent 8998839c71
commit 07df9f471e
11 changed files with 9046 additions and 321 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -93,6 +93,48 @@ bool bli_cntx_gemmsup_thresh_is_met_zen4( obj_t* a, obj_t* b, obj_t* c, cntx_t*
if( ( m <= 216 ) && ( n <= 216 ) && ( k <= 216 ) ) return TRUE;
return FALSE;
}
else if( dt == BLIS_SCOMPLEX )
{
dim_t k = bli_obj_width_after_trans( a );
dim_t m, n;
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
if ( bli_cntx_l3_sup_ker_dislikes_storage_of( c, stor_id, cntx ) )
{
m = bli_obj_width(c);
n = bli_obj_length(c);
}
else
{
m = bli_obj_length( c );
n = bli_obj_width( c );
}
// The threshold conditionals are as follows:
if( n <= 540 )
{
if( n <= 420 ) return TRUE;
else if( m <= 1260 ) return TRUE;
}
else
{
if( m <= 420 )
{
if( m <= 180 ) return TRUE;
else if( n <= 2100 ) return TRUE;
}
else
{
if( k <= 540 )
{
if( n <= 1260 ) return TRUE;
else if( m <= 900 ) return TRUE;
}
}
}
return FALSE;
}
else
return bli_cntx_l3_sup_thresh_is_met( a, b, c, cntx );
}

View File

@@ -332,6 +332,24 @@ GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_24x1)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_16x1)
GEMMSUP_KER_PROT( double, d, gemmsup_rv_zen4_asm_8x1)
// Cgemm sup CV kernels
// One GEMMSUP_KER_PROT entry per scomplex ('c') gemmsup_cv_zen4_asm kernel,
// ordered by the size suffix in the kernel name: the 24xN "m" kernels first
// (24x4m, 24x3m, 24x2m, 24x1m), then the 16xN, 8xN and fxN kernels, each for
// N = 4, 3, 2, 1.
// NOTE(review): GEMMSUP_KER_PROT is defined outside this view; it presumably
// expands to a function prototype for the named kernel - confirm.
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_24x4m )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_24x3m )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_24x2m )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_24x1m )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_16x4 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_16x3 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_16x2 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_16x1 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_8x4 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_8x3 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_8x2 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_8x1 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_fx4 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_fx3 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_fx2 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_cv_zen4_asm_fx1 )
// Zgemm sup CV kernels
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_12x4m )
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_cv_zen4_asm_12x3m )

View File

@@ -93,6 +93,48 @@ bool bli_cntx_gemmsup_thresh_is_met_zen5( obj_t* a, obj_t* b, obj_t* c, cntx_t*
if( ( m <= 216 ) && ( n <= 216 ) && ( k <= 216 ) ) return TRUE;
return FALSE;
}
else if( dt == BLIS_SCOMPLEX )
{
dim_t k = bli_obj_width_after_trans( a );
dim_t m, n;
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
if ( bli_cntx_l3_sup_ker_dislikes_storage_of( c, stor_id, cntx ) )
{
m = bli_obj_width(c);
n = bli_obj_length(c);
}
else
{
m = bli_obj_length( c );
n = bli_obj_width( c );
}
// The threshold conditionals are as follows:
if( n <= 540 )
{
if( n <= 420 ) return TRUE;
else if( m <= 1260 ) return TRUE;
}
else
{
if( m <= 420 )
{
if( m <= 180 ) return TRUE;
else if( n <= 2100 ) return TRUE;
}
else
{
if( k <= 540 )
{
if( n <= 1260 ) return TRUE;
else if( m <= 900 ) return TRUE;
}
}
}
return FALSE;
}
else
return bli_cntx_l3_sup_thresh_is_met( a, b, c, cntx );
}