From 11570dbc1442c8b10ca51f8bc98fda91a761b6ee Mon Sep 17 00:00:00 2001 From: managalv Date: Fri, 22 May 2020 10:47:28 +0530 Subject: [PATCH] CPUPL-929: Improve Complex GEMM performance Updated the BLIS_MC values for the complex datatypes and added SUP kernel entries for the CCC storage format Change-Id: I5032b29834ea545d7b5f7a9469bc5655c71b7fe5 --- config/zen/bli_cntx_init_zen.c | 8 +++++--- config/zen2/bli_cntx_init_zen2.c | 24 +++++++++++++----------- frame/3/bli_l3_sup.c | 6 ++++-- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c index 90431e5c3..672bb4a1e 100644 --- a/config/zen/bli_cntx_init_zen.c +++ b/config/zen/bli_cntx_init_zen.c @@ -131,7 +131,7 @@ void bli_cntx_init_zen( cntx_t* cntx ) a) If BLIS is run in a multi-instance mode with CPU freq 2.6/2.2 Ghz DDR4 clock frequency 2400Mhz - mc = 240, kc = 512, and nc = 2040 + mc = 240, kc = 512, and nc = 2040 has better performance on EPYC server, over the default block sizes. b) If BLIS is run in Single Instance mode @@ -219,7 +219,7 @@ void bli_cntx_init_zen( cntx_t* cntx ) // Update the context with optimized small/unpacked gemm kernels. 
bli_cntx_set_l3_sup_kers ( - 20, + 22, //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, @@ -238,9 +238,11 @@ void bli_cntx_init_zen( cntx_t* cntx ) BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, + BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, cntx ); @@ -250,7 +252,7 @@ void bli_cntx_init_zen( cntx_t* cntx ) bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3, 9, 9, 3, 3 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 36, 18 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 ); diff --git a/config/zen2/bli_cntx_init_zen2.c b/config/zen2/bli_cntx_init_zen2.c index 78e573f7e..83e2958c4 100644 --- a/config/zen2/bli_cntx_init_zen2.c +++ b/config/zen2/bli_cntx_init_zen2.c @@ -92,7 +92,7 @@ void bli_cntx_init_zen2( cntx_t* cntx ) BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int10, BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int10, - // dotxv + // dotxv BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, @@ -109,9 +109,9 @@ void bli_cntx_init_zen2( cntx_t* cntx ) BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, //set - BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, - BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, - cntx + BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, + BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, + cntx 
); // Initialize level-3 blocksize objects with architecture-specific values. @@ -149,13 +149,13 @@ void bli_cntx_init_zen2( cntx_t* cntx ) ); // ------------------------------------------------------------------------- - //Initialize TRSM blocksize objects with architecture-specific values. + //Initialize TRSM blocksize objects with architecture-specific values. //Using different cache block sizes for TRSM instead of common level-3 block sizes. //Tuning is done for double-precision only. - // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 492, 256, 256 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 1600, 4080, 4080 ); + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 492, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 1600, 4080, 4080 ); // Update the context with the current architecture's register and cache // blocksizes for level-3 TRSM problems. @@ -196,7 +196,7 @@ void bli_cntx_init_zen2( cntx_t* cntx ) // Update the context with optimized small/unpacked gemm kernels. 
bli_cntx_set_l3_sup_kers ( - 20, + 22, //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, @@ -215,9 +215,11 @@ void bli_cntx_init_zen2( cntx_t* cntx ) BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, + BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, cntx ); @@ -227,7 +229,7 @@ void bli_cntx_init_zen2( cntx_t* cntx ) bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3, 9, 9, 3, 3 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 36, 18 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 ); diff --git a/frame/3/bli_l3_sup.c b/frame/3/bli_l3_sup.c index 758734a5a..885472409 100644 --- a/frame/3/bli_l3_sup.c +++ b/frame/3/bli_l3_sup.c @@ -75,7 +75,8 @@ err_t bli_gemmsup //Don't use sup for currently unsupported storage types and dimension in cgemmsup if(bli_obj_is_scomplex(c) && ((!((stor_id == BLIS_RRR) || (stor_id == BLIS_CRR) - ||(stor_id == BLIS_CCR) || (stor_id == BLIS_RCR))) + ||(stor_id == BLIS_CCR) || (stor_id == BLIS_RCR) + ||(stor_id == BLIS_CCC))) || ((m/3) < (n/8)) || (!((transa == BLIS_NO_TRANSPOSE)&&(transb == BLIS_NO_TRANSPOSE))) )){ @@ -86,7 +87,8 @@ err_t bli_gemmsup //Don't use sup for currently unsupported storage types and dimension in zgemmsup if(bli_obj_is_dcomplex(c) && ((!((stor_id == BLIS_RRR) || (stor_id == BLIS_CRR) - ||(stor_id == 
BLIS_CCR) || (stor_id == BLIS_RCR))) + ||(stor_id == BLIS_CCR) || (stor_id == BLIS_RCR) + ||(stor_id == BLIS_CCC))) || ((m/3) < (n/4)) || (!((transa == BLIS_NO_TRANSPOSE)&&(transb == BLIS_NO_TRANSPOSE))) )){