CPUPL-709: Improve Complex GEMM performance - Level 1 Optimization

Details
Added SUP support for cgemm in M direction
SUP kernels 3x8m, 3x4m, and 3x2m are implemented
Sub-kernels are implemented to support various dimensions
SUP CGEMM supports row- or column-major storage for matrices C and A; matrix B must be row-major

Change-Id: Ia6854b929d3b5741a4900422d05df1257f5d014d
This commit is contained in:
managalv
2020-05-16 06:05:21 +05:30
parent b3a308b689
commit 310dda928f
6 changed files with 4192 additions and 25 deletions

View File

@@ -194,9 +194,9 @@ void bli_cntx_init_zen( cntx_t* cntx )
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 256, 128 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 256, 256, 128 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, 220, 110 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
@@ -219,7 +219,7 @@ void bli_cntx_init_zen( cntx_t* cntx )
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
14,
17,
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
@@ -235,18 +235,21 @@ void bli_cntx_init_zen( cntx_t* cntx )
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1,
9, 9, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, -1, -1 );
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3,
9, 9, 3, 3 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 36, 18 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 );
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.

View File

@@ -171,9 +171,9 @@ void bli_cntx_init_zen2( cntx_t* cntx )
);
// Initialize sup thresholds with architecture-appropriate values. s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 220, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 256, 128 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 256, 256, 128 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 220, 220, 110 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
@@ -196,7 +196,7 @@ void bli_cntx_init_zen2( cntx_t* cntx )
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
14,
17,
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
@@ -211,19 +211,22 @@ void bli_cntx_init_zen2( cntx_t* cntx )
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1,
9, 9, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, -1, -1 );
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3,
9, 9, 3, 3 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 36, 18 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 );
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.

View File

@@ -67,6 +67,32 @@ err_t bli_gemmsup
if(BLIS_XXX==stor_id)
return BLIS_FAILURE;
const dim_t m = bli_obj_length( c );
const dim_t n = bli_obj_width( c );
trans_t transa = bli_obj_conjtrans_status( a );
trans_t transb = bli_obj_conjtrans_status( b );
if(bli_obj_is_scomplex(c) && (((m/3) < (n/8)))){
//printf(" gemmsup: Returning with for un-supported dimension in cgemmsup \n");
return BLIS_FAILURE;
}
//Don't use sup for currently unsupported storage types in cgemmsup
if(bli_obj_is_scomplex(c) && (!((stor_id == BLIS_RRR) || (stor_id == BLIS_CRR) || (stor_id == BLIS_CCR) || (stor_id == BLIS_RCR)))){
printf(" gemmsup: Returning with for un-supported storage types in cgemmsup \n");
return BLIS_FAILURE;
}
if(bli_obj_is_scomplex(c) && (!((transa == BLIS_NO_TRANSPOSE)&&(transb == BLIS_NO_TRANSPOSE)))){
//printf(" gemmsup: Returning with for un-supported matrix property in cgemmsup \n");
return BLIS_FAILURE;
}
if(bli_obj_is_dcomplex(c)){
//printf(" gemmsup: Returning with for zgemmsup \n");
return BLIS_FAILURE;
}
// Obtain a valid (native) context from the gks if necessary.
// NOTE: This must be done before calling the _check() function, since
// that function assumes the context pointer is valid.
@@ -78,8 +104,6 @@ err_t bli_gemmsup
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( c, BLIS_GEMM_UKR, cntx ) )
{
const num_t dt = bli_obj_dt( c );
const dim_t m = bli_obj_length( c );
const dim_t n = bli_obj_width( c );
const dim_t k = bli_obj_width_after_trans( a );
// Pass in m and n reversed, which simulates a transposition of the
@@ -90,8 +114,6 @@ err_t bli_gemmsup
else // ukr_prefers_storage_of( c, ... )
{
const num_t dt = bli_obj_dt( c );
const dim_t m = bli_obj_length( c );
const dim_t n = bli_obj_width( c );
const dim_t k = bli_obj_width_after_trans( a );
if ( !bli_cntx_l3_sup_thresh_is_met( dt, m, n, k, cntx ) )

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -133,6 +133,15 @@ GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16m )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8m )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4m )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2m )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8m )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4m )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2m )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x4 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x4 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x2 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x2 )
// gemmsup_rv (mkernel in n dim)