mirror of
https://github.com/amd/blis.git
synced 2026-05-11 17:50:00 +00:00
CPUPL-709: Improve Complex GEMM performance - Level 1 Optimization
Details: Added SUP support for cgemm in the M direction. SUP kernels 3x8m, 3x4m, and 3x2m are implemented. Sub-kernels are implemented to support various dimensions. SUP CGEMM supports matrices C and A in row- or column-major storage, with matrix B in row-major storage. Change-Id: Ia6854b929d3b5741a4900422d05df1257f5d014d
This commit is contained in:
@@ -194,9 +194,9 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
|
||||
// Initialize sup thresholds with architecture-appropriate values.
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 256, 128 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 256, 256, 128 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, 220, 110 );
|
||||
|
||||
// Initialize the context with the sup thresholds.
|
||||
bli_cntx_set_l3_sup_thresh
|
||||
@@ -219,7 +219,7 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
// Update the context with optimized small/unpacked gemm kernels.
|
||||
bli_cntx_set_l3_sup_kers
|
||||
(
|
||||
14,
|
||||
17,
|
||||
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
|
||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
||||
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
|
||||
@@ -235,18 +235,21 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
|
||||
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
||||
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
||||
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
||||
BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
||||
BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Initialize level-3 sup blocksize objects with architecture-specific
|
||||
// values.
|
||||
// s d c z
|
||||
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1,
|
||||
9, 9, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, -1, -1 );
|
||||
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3,
|
||||
9, 9, 3, 3 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 36, 18 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 );
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes for small/unpacked level-3 problems.
|
||||
|
||||
@@ -171,9 +171,9 @@ void bli_cntx_init_zen2( cntx_t* cntx )
|
||||
);
|
||||
|
||||
// Initialize sup thresholds with architecture-appropriate values. s d c z
|
||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 220, -1, -1 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 256, 128 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 256, 256, 128 );
|
||||
bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 220, 220, 110 );
|
||||
|
||||
// Initialize the context with the sup thresholds.
|
||||
bli_cntx_set_l3_sup_thresh
|
||||
@@ -196,7 +196,7 @@ void bli_cntx_init_zen2( cntx_t* cntx )
|
||||
// Update the context with optimized small/unpacked gemm kernels.
|
||||
bli_cntx_set_l3_sup_kers
|
||||
(
|
||||
14,
|
||||
17,
|
||||
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
|
||||
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
|
||||
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
|
||||
@@ -211,19 +211,22 @@ void bli_cntx_init_zen2( cntx_t* cntx )
|
||||
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
||||
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
|
||||
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
||||
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
||||
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
|
||||
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
||||
BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
||||
BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Initialize level-3 sup blocksize objects with architecture-specific
|
||||
// values.
|
||||
// s d c z
|
||||
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1,
|
||||
9, 9, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, -1, -1 );
|
||||
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3,
|
||||
9, 9, 3, 3 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 36, 18 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 );
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes for small/unpacked level-3 problems.
|
||||
|
||||
@@ -67,6 +67,32 @@ err_t bli_gemmsup
|
||||
if(BLIS_XXX==stor_id)
|
||||
return BLIS_FAILURE;
|
||||
|
||||
const dim_t m = bli_obj_length( c );
|
||||
const dim_t n = bli_obj_width( c );
|
||||
trans_t transa = bli_obj_conjtrans_status( a );
|
||||
trans_t transb = bli_obj_conjtrans_status( b );
|
||||
|
||||
if(bli_obj_is_scomplex(c) && (((m/3) < (n/8)))){
|
||||
//printf(" gemmsup: Returning with for un-supported dimension in cgemmsup \n");
|
||||
return BLIS_FAILURE;
|
||||
}
|
||||
|
||||
//Don't use sup for currently unsupported storage types in cgemmsup
|
||||
if(bli_obj_is_scomplex(c) && (!((stor_id == BLIS_RRR) || (stor_id == BLIS_CRR) || (stor_id == BLIS_CCR) || (stor_id == BLIS_RCR)))){
|
||||
printf(" gemmsup: Returning with for un-supported storage types in cgemmsup \n");
|
||||
return BLIS_FAILURE;
|
||||
}
|
||||
|
||||
if(bli_obj_is_scomplex(c) && (!((transa == BLIS_NO_TRANSPOSE)&&(transb == BLIS_NO_TRANSPOSE)))){
|
||||
//printf(" gemmsup: Returning with for un-supported matrix property in cgemmsup \n");
|
||||
return BLIS_FAILURE;
|
||||
}
|
||||
|
||||
if(bli_obj_is_dcomplex(c)){
|
||||
//printf(" gemmsup: Returning with for zgemmsup \n");
|
||||
return BLIS_FAILURE;
|
||||
}
|
||||
|
||||
// Obtain a valid (native) context from the gks if necessary.
|
||||
// NOTE: This must be done before calling the _check() function, since
|
||||
// that function assumes the context pointer is valid.
|
||||
@@ -78,8 +104,6 @@ err_t bli_gemmsup
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( c, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
const dim_t m = bli_obj_length( c );
|
||||
const dim_t n = bli_obj_width( c );
|
||||
const dim_t k = bli_obj_width_after_trans( a );
|
||||
|
||||
// Pass in m and n reversed, which simulates a transposition of the
|
||||
@@ -90,8 +114,6 @@ err_t bli_gemmsup
|
||||
else // ukr_prefers_storage_of( c, ... )
|
||||
{
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
const dim_t m = bli_obj_length( c );
|
||||
const dim_t n = bli_obj_width( c );
|
||||
const dim_t k = bli_obj_width_after_trans( a );
|
||||
|
||||
if ( !bli_cntx_l3_sup_thresh_is_met( dt, m, n, k, cntx ) )
|
||||
|
||||
2339
kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_c3x8.c
Normal file
2339
kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_c3x8.c
Normal file
File diff suppressed because it is too large
Load Diff
1791
kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_c3x8m.c
Normal file
1791
kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_c3x8m.c
Normal file
File diff suppressed because it is too large
Load Diff
@@ -133,6 +133,15 @@ GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16m )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8m )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4m )
|
||||
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2m )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8m )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4m )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2m )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8 )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8 )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x4 )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x4 )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x2 )
|
||||
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x2 )
|
||||
|
||||
// gemmsup_rv (mkernel in n dim)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user