diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c index 5e2b83b8a..26f9c62ad 100644 --- a/config/zen/bli_cntx_init_zen.c +++ b/config/zen/bli_cntx_init_zen.c @@ -39,245 +39,248 @@ void bli_cntx_init_zen( cntx_t* cntx ) { - blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + blksz_t thresh[ BLIS_NUM_THRESH ]; - // Set default kernel blocksizes and functions. - bli_cntx_init_zen_ref( cntx ); + // Set default kernel blocksizes and functions. + bli_cntx_init_zen_ref( cntx ); - // ------------------------------------------------------------------------- + // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs - ( - 8, - // gemm - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, - // gemmtrsm_l - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, - // gemmtrsm_u - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, - cntx - ); + // Update the context with optimized native gemm micro-kernels and + // their storage preferences. + bli_cntx_set_l3_nat_ukrs + ( + 8, + // gemm + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, + // gemmtrsm_l + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, + // gemmtrsm_u + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, + cntx + ); - // Update the context with optimized level-1f kernels. - bli_cntx_set_l1f_kers - ( - 4, - // axpyf - BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, - BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, - // dotxf - BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, - BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - cntx - ); + // Update the context with optimized level-1f kernels. + bli_cntx_set_l1f_kers + ( + 4, + // axpyf + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, + // dotxf + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, + cntx + ); - // Update the context with optimized level-1v kernels. - bli_cntx_set_l1v_kers - ( - 18, + // Update the context with optimized level-1v kernels. 
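
[Review note] bli_cntx_set_l1v_kers() is variadic: the leading integer counts the
(kernel id, datatype, function) triples that follow, and the list is terminated by
the cntx pointer. The count in the call below grows from 18 to 20 because this
patch registers two new complex axpyv kernels. Shown in isolation (illustration
only, not patch content), the two new registrations would read:

    bli_cntx_set_l1v_kers
    (
      2,
      // axpyv (complex) -- new in this patch
      BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int5,
      BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int5,
      cntx
    );

In the patch itself they are simply appended to the existing axpyv entries inside
the one call.
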
+    bli_cntx_set_l1v_kers
+    (
+      20,
 #if 1
-      // amaxv
-      BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
-      BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
+      // amaxv
+      BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
+      BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
 #endif
-      // axpyv
+      // axpyv
 #if 0
-      BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
-      BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
+      BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
+      BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
 #else
-      BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
-      BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
-#endif
-      // dotv
-      BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
-      BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
-      BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5,
-      BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int5,
+      BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
+      BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
+      BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int5,
+      BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int5,
 
-      // dotxv
-      BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
-      BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
-      // scalv
+#endif
+      // dotv
+      BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
+      BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
+      BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5,
+      BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int5,
+
+      // dotxv
+      BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
+      BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
+      // scalv
 #if 0
-      BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
-      BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
+      BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
+      BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
 #else
-      BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
-      BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
+      BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
+      BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
 #endif
-      BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
-      BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
+      BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
+      BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
 
-      BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
-      BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,
-      //set
-      BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int,
-      BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int,
-      cntx
-    );
+      BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
+      BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,
+      //set
+      BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int,
+      BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int,
+      cntx
+    );
 
-    // Initialize level-3 blocksize objects with architecture-specific values.
-    //                                        s     d     c     z
-    bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 );
-    bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
+    // Initialize level-3 blocksize objects with architecture-specific values.
+    //                                        s     d     c     z
+    bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 );
+    bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
 
 /*
-    Multi Instance performance improvement of DGEMM when binded to a CCX
-    In Multi instance each thread runs a sequential DGEMM.
+    Multi-instance performance improvement of DGEMM when bound to a CCX.
+    In multi-instance mode, each thread runs a sequential DGEMM.
 
-    a) If BLIS is run in a multi-instance mode with
-       CPU freq 2.6/2.2 Ghz
-       DDR4 clock frequency 2400Mhz
-       mc = 240, kc = 512, and nc = 2040
-       has better performance on EPYC server, over the default block sizes.
+    a) If BLIS is run in multi-instance mode with
+       CPU freq 2.6/2.2 GHz and
+       DDR4 clock frequency 2400 MHz,
+       mc = 240, kc = 512, and nc = 2040
+       give better performance on an EPYC server than the default block sizes.
 
-    b) If BLIS is run in Single Instance mode
-       mc = 510, kc = 1024 and nc = 4080
+    b) If BLIS is run in single-instance mode, use
+       mc = 510, kc = 1024, and nc = 4080.
 */
 #ifdef BLIS_ENABLE_ZEN_BLOCK_SIZES
-  #if BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES
+  #if BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES
 
-    bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 510, 144, 72 );
-    bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 1024, 256, 256 );
-    bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
+    bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 510, 144, 72 );
+    bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 1024, 256, 256 );
+    bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
 
-  #else
-    bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 240, 144, 72 );
-    bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 );
-    bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 4080, 4080 );
+  #else
+    bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 240, 144, 72 );
+    bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 );
+    bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 4080, 4080 );
 
-  #endif
+  #endif
 #else
-    bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 );
-    bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
-    bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 4080, 3056 );
+    bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 );
+    bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
+    bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 4080, 3056 );
 #endif
-    bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
-    bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
+    bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
+    bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
 
-    // Update the context with the current architecture's register and cache
-    // blocksizes (and multiples) for native execution.
-    bli_cntx_set_blkszs
-    (
-      BLIS_NAT, 7,
-      // level-3
-      BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
-      BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
-      BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
-      BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
-      BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
-      // level-1f
-      BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
-      BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
-      cntx
-    );
+    // Update the context with the current architecture's register and cache
+    // blocksizes (and multiples) for native execution.
+    bli_cntx_set_blkszs
+    (
+      BLIS_NAT, 7,
+      // level-3
+      BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
+      BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
+      BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
+      BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
+      BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
+      // level-1f
+      BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
+      BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
+      cntx
+    );
 
-    // Update the context with the current architecture's register and cache
-    // blocksizes for level-3 TRSM execution.
-    bli_cntx_set_trsm_blkszs
-    (
-      5,
-      // level-3
-      BLIS_NC, &blkszs[ BLIS_NC ],
-      BLIS_KC, &blkszs[ BLIS_KC ],
-      BLIS_MC, &blkszs[ BLIS_MC ],
-      BLIS_NR, &blkszs[ BLIS_NR ],
-      BLIS_MR, &blkszs[ BLIS_MR ],
-      cntx
-    );
+    // Update the context with the current architecture's register and cache
+    // blocksizes for level-3 TRSM execution.
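
[Review note] The BLIS_MC/KC/NC and BLIS_MR/NR objects initialized above
parameterize BLIS's five-loop GEMM blocking, which the TRSM registration below
reuses unchanged on this configuration. A schematic of that loop nest for
column-major operands, with packing and edge handling stripped out (illustration
only, not BLIS source):

    // C (m x n) += A (m x k) * B (k x n); column-major, with leading
    // dimensions equal to the row counts for brevity.
    void gemm_blocked( int m, int n, int k,
                       const double* A, const double* B, double* C,
                       int NC, int KC, int MC, int NR, int MR )
    {
        for ( int jc = 0; jc < n; jc += NC )  // NC-wide column panels of B, C
        for ( int pc = 0; pc < k; pc += KC )  // KC-deep rank-k updates
        for ( int ic = 0; ic < m; ic += MC )  // MC-tall row panels of A, C
        for ( int jr = jc; jr < jc + NC && jr < n; jr += NR )  // NR micro-cols
        for ( int ir = ic; ir < ic + MC && ir < m; ir += MR )  // MR micro-rows
        {
            // Stand-in for the MR x NR assembly micro-kernel registered
            // earlier (bli_dgemm_haswell_asm_6x8 for MR = 6, NR = 8).
            for ( int p = pc; p < pc + KC && p < k; ++p )
            for ( int j = jr; j < jr + NR && j < n; ++j )
            for ( int i = ir; i < ir + MR && i < m; ++i )
                C[ i + j*m ] += A[ i + p*m ] * B[ p + j*k ];
        }
    }

The MC x KC block of A is sized to stay resident in the L2 cache and the
KC x NC panel of B in the L3, which is why the multi-instance comment above
tunes exactly mc, kc, and nc.
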
+ bli_cntx_set_trsm_blkszs + ( + 5, + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], + cntx + ); - // ------------------------------------------------------------------------- + // ------------------------------------------------------------------------- - // Initialize sup thresholds with architecture-appropriate values. - // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 380, 110 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 256, 256, 128 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, 220, 110 ); + // Initialize sup thresholds with architecture-appropriate values. + // s d c z + bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 380, 110 ); + bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 256, 256, 128 ); + bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, 220, 110 ); - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); + // Initialize the context with the sup thresholds. + bli_cntx_set_l3_sup_thresh + ( + 3, + BLIS_MT, &thresh[ BLIS_MT ], + BLIS_NT, &thresh[ BLIS_NT ], + BLIS_KT, &thresh[ BLIS_KT ], + cntx + ); - // Initialize the context with the sup handlers. - bli_cntx_set_l3_sup_handlers - ( - 1, - BLIS_GEMM, bli_gemmsup_ref, - cntx - ); + // Initialize the context with the sup handlers. + bli_cntx_set_l3_sup_handlers + ( + 1, + BLIS_GEMM, bli_gemmsup_ref, + cntx + ); - // Update the context with optimized small/unpacked gemm kernels. - bli_cntx_set_l3_sup_kers - ( - 28, - //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - cntx - ); + // Update the context with 
optimized small/unpacked gemm kernels. + bli_cntx_set_l3_sup_kers + ( + 28, + //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, + BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, + BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, + BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, + BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, + BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, + BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, + BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, + BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, + BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, + BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, + BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, + BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, + BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, + BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE, + BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, + BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, + BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, + BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, + BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, + BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, + BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, + BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, + cntx + ); - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3, - 9, 9, 3, 3 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 ); + // Initialize level-3 sup blocksize objects with architecture-specific + // values. + // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3, + 9, 9, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 ); - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); + // Update the context with the current architecture's register and cache + // blocksizes for small/unpacked level-3 problems. 
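
[Review note] Unlike the bli_blksz_init_easy() calls used elsewhere in this
file, the BLIS_MR line above uses bli_blksz_init(), whose second group of four
values stores a separate maximum blocksize per datatype (9 versus a default of
6 for s and d); the sup framework can draw on that headroom when carving up
edge cases. The _easy variant simply reuses the defaults as the maxima, so the
following two calls are equivalent (illustration only):

    bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
    bli_blksz_init     ( &blkszs[ BLIS_NR ], 16, 8, 8, 4,
                                             16, 8, 8, 4 );
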
+ bli_cntx_set_l3_sup_blkszs + ( + 5, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], + cntx + ); } diff --git a/config/zen2/bli_cntx_init_zen2.c b/config/zen2/bli_cntx_init_zen2.c index f2236f3c1..129a69f00 100644 --- a/config/zen2/bli_cntx_init_zen2.c +++ b/config/zen2/bli_cntx_init_zen2.c @@ -37,234 +37,236 @@ void bli_cntx_init_zen2( cntx_t* cntx ) { - blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; - // Set default kernel blocksizes and functions. - bli_cntx_init_zen2_ref( cntx ); + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + blksz_t thresh[ BLIS_NUM_THRESH ]; + // Set default kernel blocksizes and functions. + bli_cntx_init_zen2_ref( cntx ); - // ------------------------------------------------------------------------- + // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs - ( - 8, - // gemm - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, - // gemmtrsm_l - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, - // gemmtrsm_u - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, - cntx - ); + // Update the context with optimized native gemm micro-kernels and + // their storage preferences. + bli_cntx_set_l3_nat_ukrs + ( + 8, + // gemm + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, + // gemmtrsm_l + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, + // gemmtrsm_u + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, + cntx + ); - // packm kernels - bli_cntx_set_packm_kers - ( - 2, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen, - cntx - ); + // packm kernels + bli_cntx_set_packm_kers + ( + 2, + BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen, + BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen, + cntx + ); - // Update the context with optimized level-1f kernels. - bli_cntx_set_l1f_kers - ( - 4, - // axpyf - BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, - BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, - // dotxf - BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, - BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - cntx - ); + // Update the context with optimized level-1f kernels. + bli_cntx_set_l1f_kers + ( + 4, + // axpyf + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, + // dotxf + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, + cntx + ); - // Update the context with optimized level-1v kernels. 
- bli_cntx_set_l1v_kers - ( - 18, + // Update the context with optimized level-1v kernels. + bli_cntx_set_l1v_kers + ( + 20, #if 1 - // amaxv - BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, - BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, + // amaxv + BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, + BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, #endif - // axpyv + // axpyv - // axpyv - BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, - BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, + // axpyv + BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, + BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, + BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int5, + BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int5, - // dotv - BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int10, - BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int10, - BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5, - BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int5, + // dotv + BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int10, + BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int10, + BLIS_DOTV_KER, BLIS_SCOMPLEX, bli_cdotv_zen_int5, + BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_zen_int5, - // dotxv - BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, - BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, + // dotxv + BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, + BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, - // scalv - BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, - BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, + // scalv + BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, + BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, - //swap - BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, - BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, + //swap + BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, + BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, - //copy - BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, - BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, + //copy + BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, + BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, - //set - BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, - BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, - cntx - ); + //set + BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, + BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, + cntx + ); - // Initialize level-3 blocksize objects with architecture-specific values. - // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); + // Initialize level-3 blocksize objects with architecture-specific values. 
+ // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); #if AOCL_BLIS_MULTIINSTANCE - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 240, 144, 72 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 4080, 4080 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 240, 144, 72 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 4080, 4080 ); #else - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 ); #endif - bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); - // Update the context with the current architecture's register and cache - // blocksizes (and multiples) for native execution. - bli_cntx_set_blkszs - ( - BLIS_NAT, 7, - // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - // level-1f - BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, - BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx - ); + // Update the context with the current architecture's register and cache + // blocksizes (and multiples) for native execution. + bli_cntx_set_blkszs + ( + BLIS_NAT, 7, + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f + BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, + BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, + cntx + ); // ------------------------------------------------------------------------- - //Initialize TRSM blocksize objects with architecture-specific values. - //Using different cache block sizes for TRSM instead of common level-3 block sizes. - //Tuning is done for double-precision only. - // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 492, 256, 256 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 1600, 4080, 4080 ); + //Initialize TRSM blocksize objects with architecture-specific values. + //Using different cache block sizes for TRSM instead of common level-3 block sizes. + //Tuning is done for double-precision only. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 492, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 1600, 4080, 4080 ); - // Update the context with the current architecture's register and cache - // blocksizes for level-3 TRSM problems. - bli_cntx_set_trsm_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); + // Update the context with the current architecture's register and cache + // blocksizes for level-3 TRSM problems. 
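
[Review note] A back-of-envelope check of the double-precision TRSM tuning
above, assuming Zen2's 512 KiB per-core L2 and 16 MiB per-CCX L3 (editor's
arithmetic, not part of the patch):

    #include <stdio.h>

    int main( void )
    {
        const int mc = 72, kc = 492, nc = 1600;   // d-column values above
        const int sz = sizeof( double );          // 8 bytes per element

        // Packed MC x KC block of A: 72 * 492 * 8 = 276.75 KiB, i.e. just
        // over half of a 512 KiB L2, leaving room to stream B and C.
        printf( "A block: %.2f KiB\n", mc * kc * sz / 1024.0 );

        // Packed KC x NC panel of B: 492 * 1600 * 8 = 6150 KiB, roughly
        // 6 MiB of a 16 MiB L3 slice.
        printf( "B panel: %.2f MiB\n", kc * nc * sz / ( 1024.0 * 1024.0 ) );

        return 0;
    }
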
+ bli_cntx_set_trsm_blkszs + ( + 5, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], + cntx + ); - // Initialize sup thresholds with architecture-appropriate values. s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 380, 110 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 256, 256, 128 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 220, 220, 110 ); + // Initialize sup thresholds with architecture-appropriate values. s d c z + bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 380, 110 ); + bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 256, 256, 128 ); + bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 220, 220, 110 ); - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); + // Initialize the context with the sup thresholds. + bli_cntx_set_l3_sup_thresh + ( + 3, + BLIS_MT, &thresh[ BLIS_MT ], + BLIS_NT, &thresh[ BLIS_NT ], + BLIS_KT, &thresh[ BLIS_KT ], + cntx + ); - // Initialize the context with the sup handlers. - bli_cntx_set_l3_sup_handlers - ( - 2, - BLIS_GEMM, bli_gemmsup_ref, - BLIS_GEMMT, bli_gemmtsup_ref, - cntx - ); + // Initialize the context with the sup handlers. + bli_cntx_set_l3_sup_handlers + ( + 2, + BLIS_GEMM, bli_gemmsup_ref, + BLIS_GEMMT, bli_gemmtsup_ref, + cntx + ); - // Update the context with optimized small/unpacked gemm kernels. - bli_cntx_set_l3_sup_kers - ( - 28, - //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - cntx - ); + // Update the context with optimized small/unpacked gemm kernels. 
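
[Review note] The MT/NT/KT thresholds registered above gate entry to the sup
("small/unpacked") path whose kernels are registered next. The real predicate
lives in the framework (bli_cntx_l3_sup_thresh_is_met()); as this editor reads
it, a problem qualifies when any one dimension falls below its threshold, i.e.
when it is skinny in at least one direction. Sketch only, not a BLIS API:

    bool use_sup_path( dim_t m, dim_t n, dim_t k,
                       dim_t mt, dim_t nt, dim_t kt )
    {
        return ( m < mt ) || ( n < nt ) || ( k < kt );
    }

Under the zen2 dgemm values above (MT = NT = 256, KT = 220), a 4096 x 4096 x 64
rank-k update would take the sup path, while a 4096-cubed problem would not.
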
+ bli_cntx_set_l3_sup_kers + ( + 28, + //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, + BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, + BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, + BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, + BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, + BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, + BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, + BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, + BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, + BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, + BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, + BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, + BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, + BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, + BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE, + BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, + BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, + BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, + BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, + BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, + BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, + BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, + BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, + BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, + BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, + cntx + ); - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3, - 9, 9, 3, 3 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 ); + // Initialize level-3 sup blocksize objects with architecture-specific + // values. + // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3, + 9, 9, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 ); - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); + // Update the context with the current architecture's register and cache + // blocksizes for small/unpacked level-3 problems. 
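
[Review note] In the 28-entry table above, the three letters of each storage id
give the layout of C, A, and B in that order (R = row-stored, C =
column-stored), so BLIS_CRC means a column-stored C with a row-stored A and a
column-stored B. The kernel suffixes pair an access style ("rv" for row-vector
loads, "rd" for row-wise dot products) with the dimension the outer kernel loop
walks (trailing m or n). A compact sketch of the id encoding (hypothetical
helper, not the BLIS definition):

    // With R = 0 and C = 1 per operand, ids enumerate as c*4 + a*2 + b,
    // i.e. RRR = 0 ... CCC = 7, matching the ordering of the
    // double-precision rows above.
    static int stor3_id( int c_is_col, int a_is_col, int b_is_col )
    {
        return c_is_col * 4 + a_is_col * 2 + b_is_col;
    }
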
+ bli_cntx_set_l3_sup_blkszs + ( + 5, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], + cntx + ); } diff --git a/config/zen3/bli_cntx_init_zen3.c b/config/zen3/bli_cntx_init_zen3.c index 75c7e5834..3f9f2989d 100644 --- a/config/zen3/bli_cntx_init_zen3.c +++ b/config/zen3/bli_cntx_init_zen3.c @@ -88,7 +88,7 @@ void bli_cntx_init_zen3( cntx_t* cntx ) // Update the context with optimized level-1v kernels. bli_cntx_set_l1v_kers ( - 18, + 20, #if 1 // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, @@ -99,6 +99,8 @@ void bli_cntx_init_zen3( cntx_t* cntx ) // axpyv BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, + BLIS_AXPYV_KER, BLIS_SCOMPLEX, bli_caxpyv_zen_int5, + BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_zen_int5, // dotv BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int10, diff --git a/frame/compat/bla_axpy.c b/frame/compat/bla_axpy.c index 123341b2e..80793bda0 100644 --- a/frame/compat/bla_axpy.c +++ b/frame/compat/bla_axpy.c @@ -50,37 +50,37 @@ void PASTEF77(ch,blasname) \ ftype* y, const f77_int* incy \ ) \ { \ - dim_t n0; \ - ftype* x0; \ - ftype* y0; \ - inc_t incx0; \ - inc_t incy0; \ + dim_t n0; \ + ftype* x0; \ + ftype* y0; \ + inc_t incx0; \ + inc_t incy0; \ \ - /* Initialize BLIS. */ \ - bli_init_auto(); \ + /* Initialize BLIS. */ \ + bli_init_auto(); \ \ - /* Convert/typecast negative values of n to zero. */ \ - bli_convert_blas_dim1( *n, n0 ); \ + /* Convert/typecast negative values of n to zero. */ \ + bli_convert_blas_dim1( *n, n0 ); \ \ - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ \ - bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \ - bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \ + /* If the input increments are negative, adjust the pointers so we can + use positive increments instead. */ \ + bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \ + bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \ \ - /* Call BLIS interface. */ \ - PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - n0, \ - (ftype*)alpha, \ - x0, incx0, \ - y0, incy0, \ - NULL, \ - NULL \ - ); \ + /* Call BLIS interface. */ \ + PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + n0, \ + (ftype*)alpha, \ + x0, incx0, \ + y0, incy0, \ + NULL, \ + NULL \ + ); \ \ - /* Finalize BLIS. */ \ - bli_finalize_auto(); \ + /* Finalize BLIS. */ \ + bli_finalize_auto(); \ } #ifdef BLIS_ENABLE_BLAS @@ -94,65 +94,65 @@ void saxpy_ float* y, const f77_int* incy ) { - dim_t n0; - float* x0; - float* y0; - inc_t incx0; - inc_t incy0; + dim_t n0; + float* x0; + float* y0; + inc_t incx0; + inc_t incy0; - /* Initialize BLIS. */ -// bli_init_auto(); + /* Initialize BLIS. */ + // bli_init_auto(); - /* Convert/typecast negative values of n to zero. */ - if ( *n < 0 ) n0 = ( dim_t )0; - else n0 = ( dim_t )(*n); + /* Convert/typecast negative values of n to zero. */ + if ( *n < 0 ) n0 = ( dim_t )0; + else n0 = ( dim_t )(*n); - /* If the input increments are negative, adjust the pointers so we can - use positive increments instead. */ - if ( *incx < 0 ) - { - /* The semantics of negative stride in BLAS are that the vector - operand be traversed in reverse order. (Another way to think - of this is that negative strides effectively reverse the order - of the vector, but without any explicit data movements.) 
This
-           is also how BLIS interprets negative strides. The differences
-           is that with BLAS, the caller *always* passes in the 0th (i.e.,
-           top-most or left-most) element of the vector, even when the
-           stride is negative. By contrast, in BLIS, negative strides are
-           used *relative* to the vector address as it is given. Thus, in
-           BLIS, if this backwards traversal is desired, the caller *must*
-           pass in the address to the (n-1)th (i.e., the bottom-most or
-           right-most) element along with a negative stride. */
-       x0    = ((float*)x) + (n0-1)*(-*incx);
-       incx0 = ( inc_t )(*incx);
-   }
-   else
-   {
-       x0    = ((float*)x);
-       incx0 = ( inc_t )(*incx);
-   }
-   if ( *incy < 0 )
-   {
-       y0    = ((float*)y) + (n0-1)*(-*incy);
-       incy0 = ( inc_t )(*incy);
-   }
-   else
-   {
-       y0    = ((float*)y);
-       incy0 = ( inc_t )(*incy);
-   }
+    /* If the input increments are negative, adjust the pointers so we can
+       use positive increments instead. */
+    if ( *incx < 0 )
+    {
+        /* The semantics of negative stride in BLAS are that the vector
+           operand be traversed in reverse order. (Another way to think
+           of this is that negative strides effectively reverse the order
+           of the vector, but without any explicit data movements.) This
+           is also how BLIS interprets negative strides. The difference
+           is that with BLAS, the caller *always* passes in the 0th (i.e.,
+           top-most or left-most) element of the vector, even when the
+           stride is negative. By contrast, in BLIS, negative strides are
+           used *relative* to the vector address as it is given. Thus, in
+           BLIS, if this backwards traversal is desired, the caller *must*
+           pass in the address to the (n-1)th (i.e., the bottom-most or
+           right-most) element along with a negative stride. */
+        x0    = ((float*)x) + (n0-1)*(-*incx);
+        incx0 = ( inc_t )(*incx);
+    }
+    else
+    {
+        x0    = ((float*)x);
+        incx0 = ( inc_t )(*incx);
+    }
+    if ( *incy < 0 )
+    {
+        y0    = ((float*)y) + (n0-1)*(-*incy);
+        incy0 = ( inc_t )(*incy);
+    }
+    else
+    {
+        y0    = ((float*)y);
+        incy0 = ( inc_t )(*incy);
+    }
 
-   bli_saxpyv_zen_int10(
-           BLIS_NO_CONJUGATE,
-           n0,
-           (float*)alpha,
-           x0, incx0,
-           y0, incy0,
-           NULL
-           );
+    bli_saxpyv_zen_int10(
+            BLIS_NO_CONJUGATE,
+            n0,
+            (float*)alpha,
+            x0, incx0,
+            y0, incy0,
+            NULL
+            );
 
-   /* Finalize BLIS. */
-// bli_finalize_auto();
+    /* Finalize BLIS. */
+    // bli_finalize_auto();
 }
 
 void daxpy_
@@ -163,68 +163,203 @@ void daxpy_
   (
     const f77_int* n,
     const double* alpha,
     const double* x, const f77_int* incx,
           double* y, const f77_int* incy
   )
 {
-   dim_t n0;
-   double* x0;
-   double* y0;
-   inc_t incx0;
-   inc_t incy0;
+    dim_t n0;
+    double* x0;
+    double* y0;
+    inc_t incx0;
+    inc_t incy0;
 
-   /* Initialize BLIS. */
-// bli_init_auto();
+    /* Initialize BLIS. */
+    // bli_init_auto();
 
-   /* Convert/typecast negative values of n to zero. */
-   if ( *n < 0 ) n0 = ( dim_t )0;
-   else          n0 = ( dim_t )(*n);
+    /* Convert/typecast negative values of n to zero. */
+    if ( *n < 0 ) n0 = ( dim_t )0;
+    else          n0 = ( dim_t )(*n);
 
-   /* If the input increments are negative, adjust the pointers so we can
-      use positive increments instead. */
-   if ( *incx < 0 )
-   {
-       /* The semantics of negative stride in BLAS are that the vector
-          operand be traversed in reverse order. (Another way to think
-          of this is that negative strides effectively reverse the order
-          of the vector, but without any explicit data movements.) This
-          is also how BLIS interprets negative strides. The differences
-          is that with BLAS, the caller *always* passes in the 0th (i.e.,
-          top-most or left-most) element of the vector, even when the
-          stride is negative. By contrast, in BLIS, negative strides are
-          used *relative* to the vector address as it is given. Thus, in
-          BLIS, if this backwards traversal is desired, the caller *must*
-          pass in the address to the (n-1)th (i.e., the bottom-most or
-          right-most) element along with a negative stride. */
-       x0    = ((double*)x) + (n0-1)*(-*incx);
-       incx0 = ( inc_t )(*incx);
-   }
-   else
-   {
-       x0    = ((double*)x);
-       incx0 = ( inc_t )(*incx);
-   }
-   if ( *incy < 0 )
-   {
-       y0    = ((double*)y) + (n0-1)*(-*incy);
-       incy0 = ( inc_t )(*incy);
-   }
-   else
-   {
-       y0    = ((double*)y);
-       incy0 = ( inc_t )(*incy);
-   }
+    /* If the input increments are negative, adjust the pointers so we can
+       use positive increments instead. */
+    if ( *incx < 0 )
+    {
+        /* The semantics of negative stride in BLAS are that the vector
+           operand be traversed in reverse order. (Another way to think
+           of this is that negative strides effectively reverse the order
+           of the vector, but without any explicit data movements.) This
+           is also how BLIS interprets negative strides. The difference
+           is that with BLAS, the caller *always* passes in the 0th (i.e.,
+           top-most or left-most) element of the vector, even when the
+           stride is negative. By contrast, in BLIS, negative strides are
+           used *relative* to the vector address as it is given. Thus, in
+           BLIS, if this backwards traversal is desired, the caller *must*
+           pass in the address to the (n-1)th (i.e., the bottom-most or
+           right-most) element along with a negative stride. */
+        x0    = ((double*)x) + (n0-1)*(-*incx);
+        incx0 = ( inc_t )(*incx);
+    }
+    else
+    {
+        x0    = ((double*)x);
+        incx0 = ( inc_t )(*incx);
+    }
+    if ( *incy < 0 )
+    {
+        y0    = ((double*)y) + (n0-1)*(-*incy);
+        incy0 = ( inc_t )(*incy);
+    }
+    else
+    {
+        y0    = ((double*)y);
+        incy0 = ( inc_t )(*incy);
+    }
 
-   bli_daxpyv_zen_int10(
-           BLIS_NO_CONJUGATE,
-           n0,
-           (double*)alpha,
-           x0, incx0,
-           y0, incy0,
-           NULL
-           );
+    bli_daxpyv_zen_int10(
+            BLIS_NO_CONJUGATE,
+            n0,
+            (double*)alpha,
+            x0, incx0,
+            y0, incy0,
+            NULL
+            );
 
-   /* Finalize BLIS. */
-// bli_finalize_auto();
+    /* Finalize BLIS. */
+    // bli_finalize_auto();
 }
 
-INSERT_GENTFUNC_BLAS_CZ( axpy, axpyv )
+void caxpy_
+    (
+      const f77_int* n,
+      const scomplex* alpha,
+      const scomplex* x, const f77_int* incx,
+      scomplex* y, const f77_int* incy
+    )
+{
+    dim_t n0;
+    scomplex* x0;
+    scomplex* y0;
+    inc_t incx0;
+    inc_t incy0;
+
+    /* Initialize BLIS. */
+    // bli_init_auto();
+
+    /* Convert/typecast negative values of n to zero. */
+    if ( *n < 0 ) n0 = ( dim_t )0;
+    else          n0 = ( dim_t )(*n);
+
+    /* If the input increments are negative, adjust the pointers so we can
+       use positive increments instead. */
+    if ( *incx < 0 )
+    {
+        /* The semantics of negative stride in BLAS are that the vector
+           operand be traversed in reverse order. (Another way to think
+           of this is that negative strides effectively reverse the order
+           of the vector, but without any explicit data movements.) This
+           is also how BLIS interprets negative strides. The difference
+           is that with BLAS, the caller *always* passes in the 0th (i.e.,
+           top-most or left-most) element of the vector, even when the
+           stride is negative. By contrast, in BLIS, negative strides are
+           used *relative* to the vector address as it is given. Thus, in
+           BLIS, if this backwards traversal is desired, the caller *must*
+           pass in the address to the (n-1)th (i.e., the bottom-most or
+           right-most) element along with a negative stride.
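+
+           A concrete instance of the adjustment below (example added in
+           review): with n = 4 and incx = -1, BLAS passes the address of
+           x[0] while the logical vector is ( x[3], x[2], x[1], x[0] ).
+           The line below computes x0 = x + (n0-1)*(-*incx) = x + 3, so
+           stepping from x0 with the original stride incx0 = -1 visits
+           exactly those elements in BLAS order.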
+        */
+        x0    = ((scomplex*)x) + (n0-1)*(-*incx);
+        incx0 = ( inc_t )(*incx);
+    }
+    else
+    {
+        x0    = ((scomplex*)x);
+        incx0 = ( inc_t )(*incx);
+    }
+    if ( *incy < 0 )
+    {
+        y0    = ((scomplex*)y) + (n0-1)*(-*incy);
+        incy0 = ( inc_t )(*incy);
+    }
+    else
+    {
+        y0    = ((scomplex*)y);
+        incy0 = ( inc_t )(*incy);
+    }
+
+    bli_caxpyv_zen_int5(
+            BLIS_NO_CONJUGATE,
+            n0,
+            (scomplex*)alpha,
+            x0, incx0,
+            y0, incy0,
+            NULL
+            );
+
+    /* Finalize BLIS. */
+    // bli_finalize_auto();
+}
+
+void zaxpy_
+    (
+      const f77_int* n,
+      const dcomplex* alpha,
+      const dcomplex* x, const f77_int* incx,
+      dcomplex* y, const f77_int* incy
+    )
+{
+    dim_t n0;
+    dcomplex* x0;
+    dcomplex* y0;
+    inc_t incx0;
+    inc_t incy0;
+
+    /* Initialize BLIS. */
+    // bli_init_auto();
+
+    /* Convert/typecast negative values of n to zero. */
+    if ( *n < 0 ) n0 = ( dim_t )0;
+    else          n0 = ( dim_t )(*n);
+
+    /* If the input increments are negative, adjust the pointers so we can
+       use positive increments instead. */
+    if ( *incx < 0 )
+    {
+        /* The semantics of negative stride in BLAS are that the vector
+           operand be traversed in reverse order. (Another way to think
+           of this is that negative strides effectively reverse the order
+           of the vector, but without any explicit data movements.) This
+           is also how BLIS interprets negative strides. The difference
+           is that with BLAS, the caller *always* passes in the 0th (i.e.,
+           top-most or left-most) element of the vector, even when the
+           stride is negative. By contrast, in BLIS, negative strides are
+           used *relative* to the vector address as it is given. Thus, in
+           BLIS, if this backwards traversal is desired, the caller *must*
+           pass in the address to the (n-1)th (i.e., the bottom-most or
+           right-most) element along with a negative stride. */
+        x0    = ((dcomplex*)x) + (n0-1)*(-*incx);
+        incx0 = ( inc_t )(*incx);
+    }
+    else
+    {
+        x0    = ((dcomplex*)x);
+        incx0 = ( inc_t )(*incx);
+    }
+    if ( *incy < 0 )
+    {
+        y0    = ((dcomplex*)y) + (n0-1)*(-*incy);
+        incy0 = ( inc_t )(*incy);
+    }
+    else
+    {
+        y0    = ((dcomplex*)y);
+        incy0 = ( inc_t )(*incy);
+    }
+
+    bli_zaxpyv_zen_int5(
+            BLIS_NO_CONJUGATE,
+            n0,
+            (dcomplex*)alpha,
+            x0, incx0,
+            y0, incy0,
+            NULL
+            );
+
+    /* Finalize BLIS. */
+    // bli_finalize_auto();
+}
 
 #else
 INSERT_GENTFUNC_BLAS( axpy, axpyv )
diff --git a/kernels/zen/1/bli_axpyv_zen_int10.c b/kernels/zen/1/bli_axpyv_zen_int10.c
index 873b7da53..936921c6b 100644
--- a/kernels/zen/1/bli_axpyv_zen_int10.c
+++ b/kernels/zen/1/bli_axpyv_zen_int10.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2016 - 2019, Advanced Micro Devices, Inc.
-   Copyright (C) 2018, The University of Texas at Austin
+   Copyright (C) 2018 - 2020, The University of Texas at Austin. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -41,16 +41,16 @@
    One 256-bit AVX register holds 8 SP elements. */
 typedef union
 {
-   __m256 v;
-   float  f[8] __attribute__((aligned(64)));
+    __m256 v;
+    float  f[8] __attribute__((aligned(64)));
 } v8sf_t;
 
 /* Union data structure to access AVX registers
*  One 256-bit AVX register holds 4 DP elements.
*/ typedef union { - __m256d v; - double d[4] __attribute__((aligned(64))); + __m256d v; + double d[4] __attribute__((aligned(64))); } v4df_t; // ----------------------------------------------------------------------------- @@ -65,198 +65,198 @@ void bli_saxpyv_zen_int10 cntx_t* restrict cntx ) { - const dim_t n_elem_per_reg = 8; + const dim_t n_elem_per_reg = 8; - dim_t i; + dim_t i; - float* restrict x0; - float* restrict y0; + float* restrict x0; + float* restrict y0; - __m256 alphav; - __m256 xv[10]; - __m256 yv[10]; - __m256 zv[10]; + __m256 alphav; + __m256 xv[10]; + __m256 yv[10]; + __m256 zv[10]; - // If the vector dimension is zero, or if alpha is zero, return early. - if ( bli_zero_dim1( n ) || PASTEMAC(s,eq0)( *alpha ) ) return; + // If the vector dimension is zero, or if alpha is zero, return early. + if ( bli_zero_dim1( n ) || PASTEMAC(s,eq0)( *alpha ) ) return; - // Initialize local pointers. - x0 = x; - y0 = y; + // Initialize local pointers. + x0 = x; + y0 = y; - if ( incx == 1 && incy == 1 ) - { - // Broadcast the alpha scalar to all elements of a vector register. - alphav = _mm256_broadcast_ss( alpha ); + if ( incx == 1 && incy == 1 ) + { + // Broadcast the alpha scalar to all elements of a vector register. + alphav = _mm256_broadcast_ss( alpha ); - for ( i = 0; (i + 79) < n; i += 80 ) - { - // 80 elements will be processed per loop; 10 FMAs will run per loop. - xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); - xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); - xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg ); - xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg ); - xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg ); - xv[5] = _mm256_loadu_ps( x0 + 5*n_elem_per_reg ); - xv[6] = _mm256_loadu_ps( x0 + 6*n_elem_per_reg ); - xv[7] = _mm256_loadu_ps( x0 + 7*n_elem_per_reg ); - xv[8] = _mm256_loadu_ps( x0 + 8*n_elem_per_reg ); - xv[9] = _mm256_loadu_ps( x0 + 9*n_elem_per_reg ); + for ( i = 0; (i + 79) < n; i += 80 ) + { + // 80 elements will be processed per loop; 10 FMAs will run per loop. 
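+            // (Added in review: ten ymm registers x 8 floats = 80 elements
+            // per iteration; the guard (i + 79) < n admits only full blocks,
+            // and any remainder falls through to the narrower loops below.)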
+ xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); + xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); + xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg ); + xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg ); + xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg ); + xv[5] = _mm256_loadu_ps( x0 + 5*n_elem_per_reg ); + xv[6] = _mm256_loadu_ps( x0 + 6*n_elem_per_reg ); + xv[7] = _mm256_loadu_ps( x0 + 7*n_elem_per_reg ); + xv[8] = _mm256_loadu_ps( x0 + 8*n_elem_per_reg ); + xv[9] = _mm256_loadu_ps( x0 + 9*n_elem_per_reg ); - yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); - yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); - yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg ); - yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg ); - yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg ); - yv[5] = _mm256_loadu_ps( y0 + 5*n_elem_per_reg ); - yv[6] = _mm256_loadu_ps( y0 + 6*n_elem_per_reg ); - yv[7] = _mm256_loadu_ps( y0 + 7*n_elem_per_reg ); - yv[8] = _mm256_loadu_ps( y0 + 8*n_elem_per_reg ); - yv[9] = _mm256_loadu_ps( y0 + 9*n_elem_per_reg ); + yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); + yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); + yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg ); + yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg ); + yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg ); + yv[5] = _mm256_loadu_ps( y0 + 5*n_elem_per_reg ); + yv[6] = _mm256_loadu_ps( y0 + 6*n_elem_per_reg ); + yv[7] = _mm256_loadu_ps( y0 + 7*n_elem_per_reg ); + yv[8] = _mm256_loadu_ps( y0 + 8*n_elem_per_reg ); + yv[9] = _mm256_loadu_ps( y0 + 9*n_elem_per_reg ); - zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] ); - zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] ); - zv[2] = _mm256_fmadd_ps( xv[2], alphav, yv[2] ); - zv[3] = _mm256_fmadd_ps( xv[3], alphav, yv[3] ); - zv[4] = _mm256_fmadd_ps( xv[4], alphav, yv[4] ); - zv[5] = _mm256_fmadd_ps( xv[5], alphav, yv[5] ); - zv[6] = _mm256_fmadd_ps( xv[6], alphav, yv[6] ); - zv[7] = _mm256_fmadd_ps( xv[7], alphav, yv[7] ); - zv[8] = _mm256_fmadd_ps( xv[8], alphav, yv[8] ); - zv[9] = _mm256_fmadd_ps( xv[9], alphav, yv[9] ); + zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] ); + zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] ); + zv[2] = _mm256_fmadd_ps( xv[2], alphav, yv[2] ); + zv[3] = _mm256_fmadd_ps( xv[3], alphav, yv[3] ); + zv[4] = _mm256_fmadd_ps( xv[4], alphav, yv[4] ); + zv[5] = _mm256_fmadd_ps( xv[5], alphav, yv[5] ); + zv[6] = _mm256_fmadd_ps( xv[6], alphav, yv[6] ); + zv[7] = _mm256_fmadd_ps( xv[7], alphav, yv[7] ); + zv[8] = _mm256_fmadd_ps( xv[8], alphav, yv[8] ); + zv[9] = _mm256_fmadd_ps( xv[9], alphav, yv[9] ); - _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] ); - _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] ); - _mm256_storeu_ps( (y0 + 2*n_elem_per_reg), zv[2] ); - _mm256_storeu_ps( (y0 + 3*n_elem_per_reg), zv[3] ); - _mm256_storeu_ps( (y0 + 4*n_elem_per_reg), zv[4] ); - _mm256_storeu_ps( (y0 + 5*n_elem_per_reg), zv[5] ); - _mm256_storeu_ps( (y0 + 6*n_elem_per_reg), zv[6] ); - _mm256_storeu_ps( (y0 + 7*n_elem_per_reg), zv[7] ); - _mm256_storeu_ps( (y0 + 8*n_elem_per_reg), zv[8] ); - _mm256_storeu_ps( (y0 + 9*n_elem_per_reg), zv[9] ); + _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] ); + _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] ); + _mm256_storeu_ps( (y0 + 2*n_elem_per_reg), zv[2] ); + _mm256_storeu_ps( (y0 + 3*n_elem_per_reg), zv[3] ); + _mm256_storeu_ps( (y0 + 4*n_elem_per_reg), zv[4] ); + _mm256_storeu_ps( (y0 + 5*n_elem_per_reg), zv[5] ); + _mm256_storeu_ps( (y0 + 6*n_elem_per_reg), zv[6] ); + _mm256_storeu_ps( (y0 + 7*n_elem_per_reg), zv[7] ); + 
_mm256_storeu_ps( (y0 + 8*n_elem_per_reg), zv[8] ); + _mm256_storeu_ps( (y0 + 9*n_elem_per_reg), zv[9] ); - x0 += 10*n_elem_per_reg; - y0 += 10*n_elem_per_reg; - } + x0 += 10*n_elem_per_reg; + y0 += 10*n_elem_per_reg; + } - for ( ; (i + 39) < n; i += 40 ) - { - xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); - xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); - xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg ); - xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg ); - xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg ); + for ( ; (i + 39) < n; i += 40 ) + { + xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); + xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); + xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg ); + xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg ); + xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg ); - yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); - yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); - yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg ); - yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg ); - yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg ); + yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); + yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); + yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg ); + yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg ); + yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg ); - zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] ); - zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] ); - zv[2] = _mm256_fmadd_ps( xv[2], alphav, yv[2] ); - zv[3] = _mm256_fmadd_ps( xv[3], alphav, yv[3] ); - zv[4] = _mm256_fmadd_ps( xv[4], alphav, yv[4] ); + zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] ); + zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] ); + zv[2] = _mm256_fmadd_ps( xv[2], alphav, yv[2] ); + zv[3] = _mm256_fmadd_ps( xv[3], alphav, yv[3] ); + zv[4] = _mm256_fmadd_ps( xv[4], alphav, yv[4] ); - _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] ); - _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] ); - _mm256_storeu_ps( (y0 + 2*n_elem_per_reg), zv[2] ); - _mm256_storeu_ps( (y0 + 3*n_elem_per_reg), zv[3] ); - _mm256_storeu_ps( (y0 + 4*n_elem_per_reg), zv[4] ); + _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] ); + _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] ); + _mm256_storeu_ps( (y0 + 2*n_elem_per_reg), zv[2] ); + _mm256_storeu_ps( (y0 + 3*n_elem_per_reg), zv[3] ); + _mm256_storeu_ps( (y0 + 4*n_elem_per_reg), zv[4] ); - x0 += 5*n_elem_per_reg; - y0 += 5*n_elem_per_reg; - } + x0 += 5*n_elem_per_reg; + y0 += 5*n_elem_per_reg; + } - for ( ; (i + 31) < n; i += 32 ) - { - xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); - xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); - xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg ); - xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg ); + for ( ; (i + 31) < n; i += 32 ) + { + xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); + xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); + xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg ); + xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg ); - yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); - yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); - yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg ); - yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg ); + yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); + yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); + yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg ); + yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg ); - zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] ); - zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] ); - zv[2] = _mm256_fmadd_ps( xv[2], alphav, 
yv[2] );
-            zv[3] = _mm256_fmadd_ps( xv[3], alphav, yv[3] );
+            zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
+            zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] );
+            zv[2] = _mm256_fmadd_ps( xv[2], alphav, yv[2] );
+            zv[3] = _mm256_fmadd_ps( xv[3], alphav, yv[3] );
 
-            _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
-            _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] );
-            _mm256_storeu_ps( (y0 + 2*n_elem_per_reg), zv[2] );
-            _mm256_storeu_ps( (y0 + 3*n_elem_per_reg), zv[3] );
+            _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
+            _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] );
+            _mm256_storeu_ps( (y0 + 2*n_elem_per_reg), zv[2] );
+            _mm256_storeu_ps( (y0 + 3*n_elem_per_reg), zv[3] );
 
-            x0 += 4*n_elem_per_reg;
-            y0 += 4*n_elem_per_reg;
-        }
+            x0 += 4*n_elem_per_reg;
+            y0 += 4*n_elem_per_reg;
+        }
 
-        for ( ; (i + 15) < n; i += 16 )
-        {
-            xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
-            xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
+        for ( ; (i + 15) < n; i += 16 )
+        {
+            xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+            xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg );
 
-            yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
-            yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
+            yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+            yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg );
 
-            zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
-            zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] );
+            zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
+            zv[1] = _mm256_fmadd_ps( xv[1], alphav, yv[1] );
 
-            _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
-            _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] );
+            _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
+            _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), zv[1] );
 
-            x0 += 2*n_elem_per_reg;
-            y0 += 2*n_elem_per_reg;
-        }
+            x0 += 2*n_elem_per_reg;
+            y0 += 2*n_elem_per_reg;
+        }
 
-        for ( ; (i + 7) < n; i += 8 )
-        {
-            xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+        for ( ; (i + 7) < n; i += 8 )
+        {
+            xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
 
-            yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+            yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
 
-            zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
+            zv[0] = _mm256_fmadd_ps( xv[0], alphav, yv[0] );
 
-            _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
+            _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), zv[0] );
 
-            x0 += 1*n_elem_per_reg;
-            y0 += 1*n_elem_per_reg;
-        }
+            x0 += 1*n_elem_per_reg;
+            y0 += 1*n_elem_per_reg;
+        }
 
-        // Issue vzeroupper instruction to clear upper lanes of ymm registers.
-        // This avoids a performance penalty caused by false dependencies when
-        // transitioning from from AVX to SSE instructions (which may occur
-        // as soon as the n_left cleanup loop below if BLIS is compiled with
-        // -mfpmath=sse).
-        _mm256_zeroupper();
+        // Issue vzeroupper instruction to clear upper lanes of ymm registers.
+        // This avoids a performance penalty caused by false dependencies when
+        // transitioning from AVX to SSE instructions (which may occur
+        // as soon as the n_left cleanup loop below if BLIS is compiled with
+        // -mfpmath=sse).
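+        // (Added in review: _mm256_zeroupper() compiles to the VZEROUPPER
+        // instruction, which zeroes bits 255:128 of every ymm register so
+        // that the scalar cleanup loop below starts from a clean upper
+        // state.)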
+ _mm256_zeroupper(); - for ( ; (i + 0) < n; i += 1 ) - { - *y0 += (*alpha) * (*x0); + for ( ; (i + 0) < n; i += 1 ) + { + *y0 += (*alpha) * (*x0); - x0 += 1; - y0 += 1; - } - } - else - { - const float alphac = *alpha; + x0 += 1; + y0 += 1; + } + } + else + { + const float alphac = *alpha; - for ( i = 0; i < n; ++i ) - { - const float x0c = *x0; + for ( i = 0; i < n; ++i ) + { + const float x0c = *x0; - *y0 += alphac * x0c; + *y0 += alphac * x0c; - x0 += incx; - y0 += incy; - } - } + x0 += incx; + y0 += incy; + } + } } // ----------------------------------------------------------------------------- @@ -271,197 +271,790 @@ void bli_daxpyv_zen_int10 cntx_t* restrict cntx ) { - const dim_t n_elem_per_reg = 4; + const dim_t n_elem_per_reg = 4; - dim_t i; + dim_t i; - double* restrict x0 = x; - double* restrict y0 = y; + double* restrict x0 = x; + double* restrict y0 = y; - __m256d alphav; - __m256d xv[10]; - __m256d yv[10]; - __m256d zv[10]; + __m256d alphav; + __m256d xv[10]; + __m256d yv[10]; + __m256d zv[10]; - // If the vector dimension is zero, or if alpha is zero, return early. - if ( bli_zero_dim1( n ) || PASTEMAC(d,eq0)( *alpha ) ) return; + // If the vector dimension is zero, or if alpha is zero, return early. + if ( bli_zero_dim1( n ) || PASTEMAC(d,eq0)( *alpha ) ) return; - // Initialize local pointers. - x0 = x; - y0 = y; + // Initialize local pointers. + x0 = x; + y0 = y; - if ( incx == 1 && incy == 1 ) - { - // Broadcast the alpha scalar to all elements of a vector register. - alphav = _mm256_broadcast_sd( alpha ); + if ( incx == 1 && incy == 1 ) + { + // Broadcast the alpha scalar to all elements of a vector register. + alphav = _mm256_broadcast_sd( alpha ); - for ( i = 0; (i + 39) < n; i += 40 ) - { - // 40 elements will be processed per loop; 10 FMAs will run per loop. - xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); - xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); - xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg ); - xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg ); - xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg ); - xv[5] = _mm256_loadu_pd( x0 + 5*n_elem_per_reg ); - xv[6] = _mm256_loadu_pd( x0 + 6*n_elem_per_reg ); - xv[7] = _mm256_loadu_pd( x0 + 7*n_elem_per_reg ); - xv[8] = _mm256_loadu_pd( x0 + 8*n_elem_per_reg ); - xv[9] = _mm256_loadu_pd( x0 + 9*n_elem_per_reg ); + for ( i = 0; (i + 39) < n; i += 40 ) + { + // 40 elements will be processed per loop; 10 FMAs will run per loop. 
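+ // (Each __m256d register holds four doubles, so ten registers cover the 40 elements.)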
+ xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); + xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); + xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg ); + xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg ); + xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg ); + xv[5] = _mm256_loadu_pd( x0 + 5*n_elem_per_reg ); + xv[6] = _mm256_loadu_pd( x0 + 6*n_elem_per_reg ); + xv[7] = _mm256_loadu_pd( x0 + 7*n_elem_per_reg ); + xv[8] = _mm256_loadu_pd( x0 + 8*n_elem_per_reg ); + xv[9] = _mm256_loadu_pd( x0 + 9*n_elem_per_reg ); - yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); - yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); - yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); - yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg ); - yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg ); - yv[5] = _mm256_loadu_pd( y0 + 5*n_elem_per_reg ); - yv[6] = _mm256_loadu_pd( y0 + 6*n_elem_per_reg ); - yv[7] = _mm256_loadu_pd( y0 + 7*n_elem_per_reg ); - yv[8] = _mm256_loadu_pd( y0 + 8*n_elem_per_reg ); - yv[9] = _mm256_loadu_pd( y0 + 9*n_elem_per_reg ); + yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); + yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); + yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg ); + yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg ); + yv[5] = _mm256_loadu_pd( y0 + 5*n_elem_per_reg ); + yv[6] = _mm256_loadu_pd( y0 + 6*n_elem_per_reg ); + yv[7] = _mm256_loadu_pd( y0 + 7*n_elem_per_reg ); + yv[8] = _mm256_loadu_pd( y0 + 8*n_elem_per_reg ); + yv[9] = _mm256_loadu_pd( y0 + 9*n_elem_per_reg ); - zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] ); - zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] ); - zv[2] = _mm256_fmadd_pd( xv[2], alphav, yv[2] ); - zv[3] = _mm256_fmadd_pd( xv[3], alphav, yv[3] ); - zv[4] = _mm256_fmadd_pd( xv[4], alphav, yv[4] ); - zv[5] = _mm256_fmadd_pd( xv[5], alphav, yv[5] ); - zv[6] = _mm256_fmadd_pd( xv[6], alphav, yv[6] ); - zv[7] = _mm256_fmadd_pd( xv[7], alphav, yv[7] ); - zv[8] = _mm256_fmadd_pd( xv[8], alphav, yv[8] ); - zv[9] = _mm256_fmadd_pd( xv[9], alphav, yv[9] ); + zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] ); + zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] ); + zv[2] = _mm256_fmadd_pd( xv[2], alphav, yv[2] ); + zv[3] = _mm256_fmadd_pd( xv[3], alphav, yv[3] ); + zv[4] = _mm256_fmadd_pd( xv[4], alphav, yv[4] ); + zv[5] = _mm256_fmadd_pd( xv[5], alphav, yv[5] ); + zv[6] = _mm256_fmadd_pd( xv[6], alphav, yv[6] ); + zv[7] = _mm256_fmadd_pd( xv[7], alphav, yv[7] ); + zv[8] = _mm256_fmadd_pd( xv[8], alphav, yv[8] ); + zv[9] = _mm256_fmadd_pd( xv[9], alphav, yv[9] ); - _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] ); - _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] ); - _mm256_storeu_pd( (y0 + 2*n_elem_per_reg), zv[2] ); - _mm256_storeu_pd( (y0 + 3*n_elem_per_reg), zv[3] ); - _mm256_storeu_pd( (y0 + 4*n_elem_per_reg), zv[4] ); - _mm256_storeu_pd( (y0 + 5*n_elem_per_reg), zv[5] ); - _mm256_storeu_pd( (y0 + 6*n_elem_per_reg), zv[6] ); - _mm256_storeu_pd( (y0 + 7*n_elem_per_reg), zv[7] ); - _mm256_storeu_pd( (y0 + 8*n_elem_per_reg), zv[8] ); - _mm256_storeu_pd( (y0 + 9*n_elem_per_reg), zv[9] ); + _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] ); + _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] ); + _mm256_storeu_pd( (y0 + 2*n_elem_per_reg), zv[2] ); + _mm256_storeu_pd( (y0 + 3*n_elem_per_reg), zv[3] ); + _mm256_storeu_pd( (y0 + 4*n_elem_per_reg), zv[4] ); + _mm256_storeu_pd( (y0 + 5*n_elem_per_reg), zv[5] ); + _mm256_storeu_pd( (y0 + 6*n_elem_per_reg), zv[6] ); + _mm256_storeu_pd( (y0 + 7*n_elem_per_reg), zv[7] ); + 
_mm256_storeu_pd( (y0 + 8*n_elem_per_reg), zv[8] ); + _mm256_storeu_pd( (y0 + 9*n_elem_per_reg), zv[9] ); - x0 += 10*n_elem_per_reg; - y0 += 10*n_elem_per_reg; - } + x0 += 10*n_elem_per_reg; + y0 += 10*n_elem_per_reg; + } - for ( ; (i + 19) < n; i += 20 ) - { - xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); - xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); - xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg ); - xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg ); - xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg ); + for ( ; (i + 19) < n; i += 20 ) + { + xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); + xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); + xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg ); + xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg ); + xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg ); - yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); - yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); - yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); - yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg ); - yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg ); + yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); + yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); + yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg ); + yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg ); - zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] ); - zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] ); - zv[2] = _mm256_fmadd_pd( xv[2], alphav, yv[2] ); - zv[3] = _mm256_fmadd_pd( xv[3], alphav, yv[3] ); - zv[4] = _mm256_fmadd_pd( xv[4], alphav, yv[4] ); + zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] ); + zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] ); + zv[2] = _mm256_fmadd_pd( xv[2], alphav, yv[2] ); + zv[3] = _mm256_fmadd_pd( xv[3], alphav, yv[3] ); + zv[4] = _mm256_fmadd_pd( xv[4], alphav, yv[4] ); - _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] ); - _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] ); - _mm256_storeu_pd( (y0 + 2*n_elem_per_reg), zv[2] ); - _mm256_storeu_pd( (y0 + 3*n_elem_per_reg), zv[3] ); - _mm256_storeu_pd( (y0 + 4*n_elem_per_reg), zv[4] ); + _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] ); + _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] ); + _mm256_storeu_pd( (y0 + 2*n_elem_per_reg), zv[2] ); + _mm256_storeu_pd( (y0 + 3*n_elem_per_reg), zv[3] ); + _mm256_storeu_pd( (y0 + 4*n_elem_per_reg), zv[4] ); - x0 += 5*n_elem_per_reg; - y0 += 5*n_elem_per_reg; - } + x0 += 5*n_elem_per_reg; + y0 += 5*n_elem_per_reg; + } - for ( ; (i + 15) < n; i += 16 ) - { - xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); - xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); - xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg ); - xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg ); + for ( ; (i + 15) < n; i += 16 ) + { + xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); + xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); + xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg ); + xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg ); - yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); - yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); - yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); - yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg ); + yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); + yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); + yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg ); - zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] ); - zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] ); - zv[2] = _mm256_fmadd_pd( xv[2], alphav, 
yv[2] );
- zv[3] = _mm256_fmadd_pd( xv[3], alphav, yv[3] );
+ zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
+ zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] );
+ zv[2] = _mm256_fmadd_pd( xv[2], alphav, yv[2] );
+ zv[3] = _mm256_fmadd_pd( xv[3], alphav, yv[3] );

- _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
- _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );
- _mm256_storeu_pd( (y0 + 2*n_elem_per_reg), zv[2] );
- _mm256_storeu_pd( (y0 + 3*n_elem_per_reg), zv[3] );
+ _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
+ _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );
+ _mm256_storeu_pd( (y0 + 2*n_elem_per_reg), zv[2] );
+ _mm256_storeu_pd( (y0 + 3*n_elem_per_reg), zv[3] );

- x0 += 4*n_elem_per_reg;
- y0 += 4*n_elem_per_reg;
- }
+ x0 += 4*n_elem_per_reg;
+ y0 += 4*n_elem_per_reg;
+ }

- for ( ; i + 7 < n; i += 8 )
- {
- xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
- xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+ for ( ; i + 7 < n; i += 8 )
+ {
+ xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+ xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );

- yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
- yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+ yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+ yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );

- zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
- zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] );
+ zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
+ zv[1] = _mm256_fmadd_pd( xv[1], alphav, yv[1] );

- _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
- _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );
+ _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
+ _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), zv[1] );

- x0 += 2*n_elem_per_reg;
- y0 += 2*n_elem_per_reg;
- }
+ x0 += 2*n_elem_per_reg;
+ y0 += 2*n_elem_per_reg;
+ }

- for ( ; i + 3 < n; i += 4 )
- {
- xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+ for ( ; i + 3 < n; i += 4 )
+ {
+ xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );

- yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+ yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );

- zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );
+ zv[0] = _mm256_fmadd_pd( xv[0], alphav, yv[0] );

- _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );
+ _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), zv[0] );

- x0 += 1*n_elem_per_reg;
- y0 += 1*n_elem_per_reg;
- }
+ x0 += 1*n_elem_per_reg;
+ y0 += 1*n_elem_per_reg;
+ }

- // Issue vzeroupper instruction to clear upper lanes of ymm registers.
- // This avoids a performance penalty caused by false dependencies when
- // transitioning from from AVX to SSE instructions (which may occur
- // as soon as the n_left cleanup loop below if BLIS is compiled with
- // -mfpmath=sse).
- _mm256_zeroupper();
+ // Issue vzeroupper instruction to clear upper lanes of ymm registers.
+ // This avoids a performance penalty caused by false dependencies when
+ // transitioning from AVX to SSE instructions (which may occur
+ // as soon as the n_left cleanup loop below if BLIS is compiled with
+ // -mfpmath=sse).
+ _mm256_zeroupper();

- for ( ; i < n; i += 1 )
- {
- *y0 += (*alpha) * (*x0);
+ for ( ; i < n; i += 1 )
+ {
+ *y0 += (*alpha) * (*x0);

- y0 += 1;
- x0 += 1;
- }
- }
- else
- {
- const double alphac = *alpha;
+ y0 += 1;
+ x0 += 1;
+ }
+ }
+ else
+ {
+ const double alphac = *alpha;

- for ( i = 0; i < n; ++i )
- {
- const double x0c = *x0;
+ for ( i = 0; i < n; ++i )
+ {
+ const double x0c = *x0;

- *y0 += alphac * x0c;
+ *y0 += alphac * x0c;

- x0 += incx;
- y0 += incy;
- }
- }
+ x0 += incx;
+ y0 += incy;
+ }
+ }
 }

+// -----------------------------------------------------------------------------
+
+void bli_caxpyv_zen_int5
+ (
+ conj_t conjx,
+ dim_t n,
+ scomplex* restrict alpha,
+ scomplex* restrict x, inc_t incx,
+ scomplex* restrict y, inc_t incy,
+ cntx_t* restrict cntx
+ )
+{
+ const dim_t n_elem_per_reg = 8;
+
+ dim_t i;
+
+ float* restrict x0;
+ float* restrict y0;
+ float* restrict alpha0;
+
+ float alphaR, alphaI;
+
+ //scomplex alpha => aR + aI i
+ __m256 alphaRv; // for broadcast vector aR (real part of alpha)
+ __m256 alphaIv; // for broadcast vector aI (imaginary part of alpha)
+ __m256 xv[10];
+ __m256 xShufv[10];
+ __m256 yv[10];
+
+ conj_t conjx_use = conjx;
+
+ // If the vector dimension is zero, or if alpha is zero, return early.
+ if ( bli_zero_dim1( n ) || PASTEMAC(c,eq0)( *alpha ) ) return;
+
+ // Initialize local pointers.
+ x0 = (float*)x;
+ y0 = (float*)y;
+ alpha0 = (float*)alpha;
+
+ alphaR = alpha->real;
+ alphaI = alpha->imag;
+
+ if ( incx == 1 && incy == 1 )
+ {
+ // Broadcast the alpha scalar to all elements of a vector register.
+ if ( !bli_is_conj( conjx ) ) // If BLIS_NO_CONJUGATE
+ {
+ alphaRv = _mm256_broadcast_ss( &alphaR );
+
+ alphaIv = _mm256_set_ps(alphaI, -alphaI, alphaI, -alphaI, alphaI, -alphaI, alphaI, -alphaI);
+
+ }
+ else
+ {
+ alphaIv = _mm256_broadcast_ss( &alphaI );
+
+ alphaRv = _mm256_set_ps(-alphaR, alphaR, -alphaR, alphaR, -alphaR, alphaR, -alphaR, alphaR);
+ }
+
+ //----------Scalar algorithm BLIS_NO_CONJUGATE arg-------------
+ // y = alpha*x + y
+ // y = (aR + aIi) * (xR + xIi) + (yR + yIi)
+ // y = aR.xR + aR.xIi + aIi.xR - aIxI + (yR + yIi)
+ // y = aR.xR - aIxI + yR + aR.xIi + xR.aIi + yIi
+ // y = ( aR.xR - aIxI + yR ) + ( aR.xI + aI.xR + yI )i
+
+ // SIMD algorithm
+ // xv = xR1 xI1 xR2 xI2 xR3 xI3 xR4 xI4
+ // xv' = xI1 xR1 xI2 xR2 xI3 xR3 xI4 xR4 (shuffle xv)
+ // arv = aR aR aR aR aR aR aR aR
+ // aiv = aI -aI aI -aI aI -aI aI -aI
+ // yv = yR1 yI1 yR2 yI2 yR3 yI3 yR4 yI4
+
+
+ //----------Scalar algorithm for BLIS_CONJUGATE arg-------------
+ // y = alpha*conj(x) + y
+ // y = (aR + aIi) * (xR - xIi) + (yR + yIi)
+ // y = aR.xR - aR.xIi + aIi.xR + aIxI + (yR + yIi)
+ // y = aR.xR + aIxI + yR - aR.xIi + xR.aIi + yIi
+ // y = ( aR.xR + aIxI + yR ) + ( -aR.xI + aI.xR + yI )i
+ // y = ( aR.xR + aIxI + yR ) + (aI.xR - aR.xI + yI)i
+
+ // SIMD algorithm
+ // xv = xR1 xI1 xR2 xI2 xR3 xI3 xR4 xI4
+ // xv' = xI1 xR1 xI2 xR2 xI3 xR3 xI4 xR4
+ // arv = aR -aR aR -aR aR -aR aR -aR
+ // aiv = aI aI aI aI aI aI aI aI
+ // yv = yR1 yI1 yR2 yI2 yR3 yI3 yR4 yI4
+
+ // step 1 : Shuffle xv vector -> xv'
+ // step 2 : fma yv = arv*xv + yv
+ // step 3 : fma yv = aiv*xv' + yv (old)
+ // yv = aiv*xv' + arv*xv + yv
+
+ for ( i = 0; (i + 19) < n; i += 20 )
+ {
+ // 20 elements will be processed per loop; 10 FMAs will run per loop.
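+ // (Each __m256 register holds eight floats, i.e. four scomplex elements,
+ // so five registers cover the 20 complex elements.)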
+ + // alphaRv = aR aR aR aR aR aR aR aR + // xv = xR1 xI1 xR2 xI2 xR3 xI3 xR4 xI4 + xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); + xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); + xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg ); + xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg ); + xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg ); + + // yv = yR1 yI1 yR2 yI2 yR3 yI3 yR4 yI4 + yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); + yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); + yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg ); + yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg ); + yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg ); + + // xv' = xI1 xR1 xI2 xR2 xI3 xR3 xI4 xR4 + xShufv[0] = _mm256_permute_ps( xv[0], 0xB1); + xShufv[1] = _mm256_permute_ps( xv[1], 0xB1); + xShufv[2] = _mm256_permute_ps( xv[2], 0xB1); + xShufv[3] = _mm256_permute_ps( xv[3], 0xB1); + xShufv[4] = _mm256_permute_ps( xv[4], 0xB1); + + // alphaIv = -aI aI -aI aI -aI aI -aI aI + + // yv = ar*xv + yv + // = aR.xR1 + yR1, aR.xI1 + yI1, aR.xR2 + yR2, aR.xI2 + yI2, ... + yv[0] = _mm256_fmadd_ps( xv[0], alphaRv ,yv[0]); + yv[1] = _mm256_fmadd_ps( xv[1], alphaRv ,yv[1]); + yv[2] = _mm256_fmadd_ps( xv[2], alphaRv ,yv[2]); + yv[3] = _mm256_fmadd_ps( xv[3], alphaRv ,yv[3]); + yv[4] = _mm256_fmadd_ps( xv[4], alphaRv ,yv[4]); + + // yv = ai*xv' + yv (old) + // yv = ai*xv' + ar*xv + yv + // = -aI*xI1 + aR.xR1 + yR1, aI.xR1 + aR.xI1 + yI1, ......... + yv[0] = _mm256_fmadd_ps( xShufv[0], alphaIv, yv[0]); + yv[1] = _mm256_fmadd_ps( xShufv[1], alphaIv, yv[1]); + yv[2] = _mm256_fmadd_ps( xShufv[2], alphaIv, yv[2]); + yv[3] = _mm256_fmadd_ps( xShufv[3], alphaIv, yv[3]); + yv[4] = _mm256_fmadd_ps( xShufv[4], alphaIv, yv[4]); + + // Store back the results + _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), yv[0] ); + _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), yv[1] ); + _mm256_storeu_ps( (y0 + 2*n_elem_per_reg), yv[2] ); + _mm256_storeu_ps( (y0 + 3*n_elem_per_reg), yv[3] ); + _mm256_storeu_ps( (y0 + 4*n_elem_per_reg), yv[4] ); + + x0 += 5*n_elem_per_reg; + y0 += 5*n_elem_per_reg; + } + + for ( ; (i + 7) < n; i += 8 ) + { + // alphaRv = aR aR aR aR aR aR aR aR + // xv = xR1 xI1 xR2 xI2 xR3 xI3 xR4 xI4 + xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); + xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); + + // yv = yR1 yI1 yR2 yI2 yR3 yI3 yR4 yI4 + yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); + yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); + + // xv' = xI1 xR1 xI2 xR2 xI3 xR3 xI4 xR4 + xShufv[0] = _mm256_permute_ps( xv[0], 0xB1); + xShufv[1] = _mm256_permute_ps( xv[1], 0xB1); + + // alphaIv = -aI aI -aI aI -aI aI -aI aI + + // yv = ar*xv + yv + // = aR.xR1 + yR1, aR.xI1 + yI1, aR.xR2 + yR2, aR.xI2 + yI2, ... + yv[0] = _mm256_fmadd_ps( xv[0], alphaRv ,yv[0]); + yv[1] = _mm256_fmadd_ps( xv[1], alphaRv ,yv[1]); + + // yv = ai*xv' + yv (old) + // yv = ai*xv' + ar*xv + yv + // = -aI*xI1 + aR.xR1 + yR1, aI.xR1 + aR.xI1 + yI1, ......... 
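+ // (Sign placement shown is for BLIS_NO_CONJUGATE; in the BLIS_CONJUGATE
+ // case the alternating signs are carried in alphaRv instead of alphaIv.)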
+ yv[0] = _mm256_fmadd_ps( xShufv[0], alphaIv, yv[0]);
+ yv[1] = _mm256_fmadd_ps( xShufv[1], alphaIv, yv[1]);
+
+ // Store back the result
+ _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), yv[0] );
+ _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), yv[1] );
+
+ x0 += 2*n_elem_per_reg;
+ y0 += 2*n_elem_per_reg;
+ }
+
+ for ( ; (i + 3) < n; i += 4 )
+ {
+ // alphaRv = aR aR aR aR aR aR aR aR
+ // xv = xR1 xI1 xR2 xI2 xR3 xI3 xR4 xI4
+ xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg );
+
+ // yv = yR1 yI1 yR2 yI2 yR3 yI3 yR4 yI4
+ yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+
+ // xv' = xI1 xR1 xI2 xR2 xI3 xR3 xI4 xR4
+ xShufv[0] = _mm256_permute_ps( xv[0], 0xB1);
+
+ // alphaIv = -aI aI -aI aI -aI aI -aI aI
+
+ // yv = ar*xv + yv
+ // = aR.xR1 + yR1, aR.xI1 + yI1, aR.xR2 + yR2, aR.xI2 + yI2, ...
+ yv[0] = _mm256_fmadd_ps( xv[0], alphaRv ,yv[0]);
+
+ // yv = ai*xv' + yv (old)
+ // yv = ai*xv' + ar*xv + yv
+ // = aR.xR1 - aI*xI1 + yR1, aR.xI1 + aI.xR1 + yI1
+ yv[0] = _mm256_fmadd_ps( xShufv[0], alphaIv, yv[0]);
+
+ // Store back the result
+ _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), yv[0] );
+
+ x0 += 1*n_elem_per_reg;
+ y0 += 1*n_elem_per_reg;
+ }
+
+ // Issue vzeroupper instruction to clear upper lanes of ymm registers.
+ // This avoids a performance penalty caused by false dependencies when
+ // transitioning from AVX to SSE instructions (which may occur
+ // as soon as the n_left cleanup loop below if BLIS is compiled with
+ // -mfpmath=sse).
+ _mm256_zeroupper();
+
+ /* Residual values are calculated here
+ y0 += (alpha) * (x0); --> BLIS_NO_CONJUGATE
+ y0 += ( aR.xR - aIxI + yR ) + ( aR.xI + aI.xR + yI )i
+
+ y0 += (alpha) * conjx(x0); --> BLIS_CONJUGATE
+ y0 = ( aR.xR + aIxI + yR ) + (aI.xR - aR.xI + yI)i */
+
+ if ( !bli_is_conj(conjx_use) ) // BLIS_NO_CONJUGATE
+ {
+ for ( ; (i + 0) < n; i += 1 )
+ {
+ // real part: ( aR.xR - aIxI + yR )
+ *y0 += *alpha0 * (*x0) - (*(alpha0 + 1)) * (*(x0+1));
+ // img part: ( aR.xI + aI.xR + yI )
+ *(y0 + 1) += *alpha0 * (*(x0+1)) + (*(alpha0 + 1)) * (*x0);
+ x0 += 2;
+ y0 += 2;
+ }
+ }
+ else // BLIS_CONJUGATE
+ {
+ for ( ; (i + 0) < n; i += 1 )
+ {
+ // real part: ( aR.xR + aIxI + yR )
+ *y0 += *alpha0 * (*x0) + (*(alpha0 + 1)) * (*(x0+1));
+ // img part: ( aI.xR - aR.xI + yI )
+ *(y0 + 1) += (*(alpha0 + 1)) * (*x0) - (*alpha0) * (*(x0+1));
+ x0 += 2;
+ y0 += 2;
+ }
+ }
+
+ }
+ else
+ {
+ const float alphar = *alpha0;
+ const float alphai = *(alpha0 + 1);
+
+ if ( !bli_is_conj(conjx_use) )
+ {
+ for ( i = 0; i < n; ++i )
+ {
+ const float x0c = *x0;
+ const float x1c = *( x0+1 );
+
+ *y0 += alphar * x0c - alphai * x1c;
+ *(y0 + 1) += alphar * x1c + alphai * x0c;
+
+ x0 += incx * 2;
+ y0 += incy * 2;
+ }
+ }
+ else
+ {
+ for ( i = 0; i < n; ++i )
+ {
+ const float x0c = *x0;
+ const float x1c = *( x0+1 );
+
+ *y0 += alphar * x0c + alphai * x1c;
+ *(y0 + 1) += alphai * x0c - alphar * x1c;
+
+ x0 += incx * 2;
+ y0 += incy * 2;
+ }
+ }
+
+ }
+}
+
+// -----------------------------------------------------------------------------
+
+void bli_zaxpyv_zen_int5
+ (
+ conj_t conjx,
+ dim_t n,
+ dcomplex* restrict alpha,
+ dcomplex* restrict x, inc_t incx,
+ dcomplex* restrict y, inc_t incy,
+ cntx_t* restrict cntx
+ )
+{
+ const dim_t n_elem_per_reg = 4;
+
+ dim_t i;
+
+ double* restrict x0;
+ double* restrict y0;
+ double* restrict alpha0;
+
+ double alphaR, alphaI;
+
+ __m256d alphaRv; // for broadcast vector aR (real part of alpha)
+ __m256d alphaIv; // for broadcast vector aI (imaginary part of alpha)
+ __m256d xv[5];
+ __m256d xShufv[5];
+ __m256d yv[5];
+
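+ // Note: each __m256d register above holds two dcomplex elements (four doubles).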
+ conj_t conjx_use = conjx;
+
+ // If the vector dimension is zero, or if alpha is zero, return early.
+ if ( bli_zero_dim1( n ) || PASTEMAC(z,eq0)( *alpha ) ) return;
+
+ // Initialize local pointers.
+ x0 = (double*)x;
+ y0 = (double*)y;
+ alpha0 = (double*)alpha;
+
+ alphaR = alpha->real;
+ alphaI = alpha->imag;
+
+ if ( incx == 1 && incy == 1 )
+ {
+ // Broadcast the alpha scalar to all elements of a vector register.
+ if ( !bli_is_conj( conjx ) ) // If BLIS_NO_CONJUGATE
+ {
+ alphaRv = _mm256_broadcast_sd( &alphaR );
+
+ alphaIv = _mm256_set_pd( alphaI, -alphaI, alphaI, -alphaI );
+ }
+ else
+ {
+ alphaIv = _mm256_broadcast_sd( &alphaI );
+
+ alphaRv = _mm256_set_pd( -alphaR, alphaR, -alphaR, alphaR );
+ }
+
+ // --------Scalar algorithm BLIS_NO_CONJUGATE arg-------------
+ // y = alpha*x + y
+ // y = (aR + aIi) * (xR + xIi) + (yR + yIi)
+ // y = aR.xR + aR.xIi + aIi.xR - aIxI + (yR + yIi)
+ // y = aR.xR - aIxI + yR + aR.xIi + xR.aIi + yIi
+ // y = ( aR.xR - aIxI + yR ) + ( aR.xI + aI.xR + yI )i
+
+ // SIMD algorithm
+ // xv = xR1 xI1 xR2 xI2
+ // xv' = xI1 xR1 xI2 xR2
+ // arv = aR aR aR aR
+ // aiv = -aI aI -aI aI
+ // yv = yR1 yI1 yR2 yI2
+
+ // S1 : xv' = xI1 xR1 xI2 xR2 (Shuffle)
+ // S2 : reg0 = (aR.xR1 aR.xI1 aR.xR2 aR.xI2) + yv
+ // S3 : reg1 = (-aI.xI1 aI.xR1 -aI.xI2 aI.xR2) + reg0
+ //----------------------------------------------------------------
+ // Ans : aR.xR1 -aI.xI1 + yR1, aR.xI1 + aI.xR1 + yI1, aR.xR2 -aI.xI2 + yR2, aR.xI2 + aI.xR2 + yI2
+
+ //----------Scalar algorithm for BLIS_CONJUGATE arg-------------
+ // y = alpha*conj(x) + y
+ // y = (aR + aIi) * (xR - xIi) + (yR + yIi)
+ // y = aR.xR - aR.xIi + aIi.xR + aIxI + (yR + yIi)
+ // y = aR.xR + aIxI + yR - aR.xIi + xR.aIi + yIi
+ // y = ( aR.xR + aIxI + yR ) + ( -aR.xI + aI.xR + yI )i
+ // y = ( aR.xR + aIxI + yR ) + (aI.xR - aR.xI + yI)i
+
+ // SIMD algorithm
+ // xv = xR1 xI1 xR2 xI2
+ // xv' = xI1 xR1 xI2 xR2
+ // arv = aR -aR aR -aR
+ // aiv = aI aI aI aI
+ // yv = yR1 yI1 yR2 yI2
+
+ // step 1 : Shuffle xv vector
+ // reg xv : xv' = xI1 xR1 xI2 xR2
+ // step 2 : fma : yv = ar*xv + yv
+ // step 3 : fma : yv = ai*xv' + yv (old)
+ // yv = ai*xv' + ar*xv + yv
+
+ for ( i = 0; (i + 9) < n; i += 10 )
+ {
+ // 10 elements will be processed per loop; 10 FMAs will run per loop.
+
+ // alphaRv = aR aR aR aR
+ // xv = xR1 xI1 xR2 xI2
+ xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+ xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+ xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg );
+ xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg );
+ xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg );
+
+ // yv = yR1 yI1 yR2 yI2
+ yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+ yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+ yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg );
+ yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg );
+ yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg );
+
+ // xv' = xI1 xR1 xI2 xR2
+ xShufv[0] = _mm256_permute_pd( xv[0], 5);
+ xShufv[1] = _mm256_permute_pd( xv[1], 5);
+ xShufv[2] = _mm256_permute_pd( xv[2], 5);
+ xShufv[3] = _mm256_permute_pd( xv[3], 5);
+ xShufv[4] = _mm256_permute_pd( xv[4], 5);
+
+ // alphaIv = -aI aI -aI aI
+
+ // yv = ar*xv + yv
+ // = aR.xR1 + yR1, aR.xI1 + yI1, aR.xR2 + yR2, aR.xI2 + yI2, ...
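+ // (The alphaRv FMAs run first; the alphaIv FMAs on the shuffled copies
+ // then complete the complex multiply-accumulate.)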
+ yv[0] = _mm256_fmadd_pd( xv[0], alphaRv ,yv[0]);
+ yv[1] = _mm256_fmadd_pd( xv[1], alphaRv ,yv[1]);
+ yv[2] = _mm256_fmadd_pd( xv[2], alphaRv ,yv[2]);
+ yv[3] = _mm256_fmadd_pd( xv[3], alphaRv ,yv[3]);
+ yv[4] = _mm256_fmadd_pd( xv[4], alphaRv ,yv[4]);
+
+ // yv = ai*xv' + yv (old)
+ // yv = ai*xv' + ar*xv + yv
+ // = -aI*xI1 + aR.xR1 + yR1, aI.xR1 + aR.xI1 + yI1, .........
+ yv[0] = _mm256_fmadd_pd( xShufv[0], alphaIv, yv[0]);
+ yv[1] = _mm256_fmadd_pd( xShufv[1], alphaIv, yv[1]);
+ yv[2] = _mm256_fmadd_pd( xShufv[2], alphaIv, yv[2]);
+ yv[3] = _mm256_fmadd_pd( xShufv[3], alphaIv, yv[3]);
+ yv[4] = _mm256_fmadd_pd( xShufv[4], alphaIv, yv[4]);
+
+ // Store back the result
+ _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), yv[0] );
+ _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), yv[1] );
+ _mm256_storeu_pd( (y0 + 2*n_elem_per_reg), yv[2] );
+ _mm256_storeu_pd( (y0 + 3*n_elem_per_reg), yv[3] );
+ _mm256_storeu_pd( (y0 + 4*n_elem_per_reg), yv[4] );
+
+ x0 += 5*n_elem_per_reg;
+ y0 += 5*n_elem_per_reg;
+ }
+
+ for ( ; (i + 3) < n; i += 4 )
+ {
+ // alphaRv = aR aR aR aR
+ // xv = xR1 xI1 xR2 xI2
+ xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+ xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg );
+
+ // yv = yR1 yI1 yR2 yI2
+ yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+ yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg );
+
+ // xv' = xI1 xR1 xI2 xR2
+ xShufv[0] = _mm256_permute_pd( xv[0], 5);
+ xShufv[1] = _mm256_permute_pd( xv[1], 5);
+
+ // alphaIv = -aI aI -aI aI
+
+ // yv = ar*xv + yv
+ // = aR.xR1 + yR1, aR.xI1 + yI1, aR.xR2 + yR2, aR.xI2 + yI2, ...
+ yv[0] = _mm256_fmadd_pd( xv[0], alphaRv ,yv[0]);
+ yv[1] = _mm256_fmadd_pd( xv[1], alphaRv ,yv[1]);
+
+ // yv = ai*xv' + yv (old)
+ // yv = ai*xv' + ar*xv + yv
+ // = -aI*xI1 + aR.xR1 + yR1, aI.xR1 + aR.xI1 + yI1, .........
+ yv[0] = _mm256_fmadd_pd( xShufv[0], alphaIv, yv[0]);
+ yv[1] = _mm256_fmadd_pd( xShufv[1], alphaIv, yv[1]);
+
+ // Store back the result
+ _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), yv[0] );
+ _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), yv[1] );
+
+ x0 += 2*n_elem_per_reg;
+ y0 += 2*n_elem_per_reg;
+ }
+
+ for ( ; (i + 1) < n; i += 2 )
+ {
+ // alphaRv = aR aR aR aR
+ // xv = xR1 xI1 xR2 xI2
+ xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg );
+
+ // yv = yR1 yI1 yR2 yI2
+ yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg );
+
+ // xv' = xI1 xR1 xI2 xR2
+ xShufv[0] = _mm256_permute_pd( xv[0], 5);
+
+ // alphaIv = -aI aI -aI aI
+
+ // yv = ar*xv + yv
+ // = aR.xR1 + yR1, aR.xI1 + yI1, aR.xR2 + yR2, aR.xI2 + yI2, ...
+ yv[0] = _mm256_fmadd_pd( xv[0], alphaRv ,yv[0]);
+
+ // yv = ai*xv' + yv (old)
+ // yv = ai*xv' + ar*xv + yv
+ // = -aI*xI1 + aR.xR1 + yR1, aI.xR1 + aR.xI1 + yI1, .........
+ yv[0] = _mm256_fmadd_pd( xShufv[0], alphaIv, yv[0]);
+
+ // Store back the result
+ _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), yv[0] );
+
+ x0 += 1*n_elem_per_reg;
+ y0 += 1*n_elem_per_reg;
+ }
+
+ // Issue vzeroupper instruction to clear upper lanes of ymm registers.
+ // This avoids a performance penalty caused by false dependencies when
+ // transitioning from AVX to SSE instructions (which may occur
+ // as soon as the n_left cleanup loop below if BLIS is compiled with
+ // -mfpmath=sse).
+ _mm256_zeroupper(); + + /* Residual values are calculated here + y0 += (alpha) * (x0); --> BLIS_NO_CONJUGATE + y0 += ( aR.xR - aIxI + yR ) + ( aR.xI + aI.xR + yI )i + + y0 += (alpha) * conjx(x0); --> BLIS_CONJUGATE + y0 = ( aR.xR + aIxI + yR ) + (aI.xR - aR.xI + yI)i */ + + if ( !bli_is_conj(conjx_use) ) // BLIS_NO_CONJUGATE + { + for ( ; (i + 0) < n; i += 1 ) + { + // real part: ( aR.xR - aIxI + yR ) + *y0 += *alpha0 * (*x0) - (*(alpha0 + 1)) * (*(x0+1)); + // img part: ( aR.xI + aI.xR + yI ) + *(y0 + 1) += *alpha0 * (*(x0+1)) + (*(alpha0 + 1)) * (*x0); + x0 += 2; + y0 += 2; + } + } + else // BLIS_CONJUGATE + { + for ( ; (i + 0) < n; i += 1 ) + { + // real part: ( aR.xR + aIxI + yR ) + *y0 += *alpha0 * (*x0) + (*(alpha0 + 1)) * (*(x0+1)); + // img part: ( aI.xR - aR.xI + yI ) + *(y0 + 1) += (*(alpha0 + 1)) * (*x0) - (*alpha0) * (*(x0+1)); + x0 += 2; + y0 += 2; + } + } + } + else + { + const double alphar = *alpha0; + const double alphai = *(alpha0 + 1); + + if ( !bli_is_conj(conjx_use) ) // BLIS_NO_CONJUGATE + { + for ( i = 0; i < n; ++i ) + { + const double x0c = *x0; + const double x1c = *( x0+1 ); + + *y0 += alphar * x0c - alphai * x1c; + *(y0 + 1) += alphar * x1c + alphai * x0c; + + x0 += incx * 2; + y0 += incy * 2; + } + } + else // BLIS_CONJUGATE + { + for ( i = 0; i < n; ++i ) + { + const double x0c = *x0; + const double x1c = *( x0+1 ); + + *y0 += alphar * x0c + alphai * x1c; + *(y0 + 1) += alphai * x0c - alphar * x1c; + + x0 += incx * 2; + y0 += incy * 2; + } + } + } +} \ No newline at end of file diff --git a/kernels/zen/bli_kernels_zen.h b/kernels/zen/bli_kernels_zen.h index 8dbd1398e..c3453e85e 100644 --- a/kernels/zen/bli_kernels_zen.h +++ b/kernels/zen/bli_kernels_zen.h @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, Advanced Micro Devices, Inc. + Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -50,9 +50,11 @@ AMAXV_KER_PROT( double, d, amaxv_zen_int ) AXPYV_KER_PROT( float, s, axpyv_zen_int ) AXPYV_KER_PROT( double, d, axpyv_zen_int ) - // axpyv (intrinsics unrolled x10) - AXPYV_KER_PROT( float, s, axpyv_zen_int10 ) - AXPYV_KER_PROT( double, d, axpyv_zen_int10 ) +// axpyv (intrinsics unrolled x10) +AXPYV_KER_PROT( float, s, axpyv_zen_int10 ) +AXPYV_KER_PROT( double, d, axpyv_zen_int10 ) +AXPYV_KER_PROT( scomplex, c, axpyv_zen_int5 ) +AXPYV_KER_PROT( dcomplex, z, axpyv_zen_int5 ) // dotv (intrinsics) DOTV_KER_PROT( float, s, dotv_zen_int ) @@ -72,9 +74,9 @@ DOTXV_KER_PROT( double, d, dotxv_zen_int ) SCALV_KER_PROT( float, s, scalv_zen_int ) SCALV_KER_PROT( double, d, scalv_zen_int ) - // scalv (intrinsics unrolled x10) - SCALV_KER_PROT( float, s, scalv_zen_int10 ) - SCALV_KER_PROT( double, d, scalv_zen_int10 ) +// scalv (intrinsics unrolled x10) +SCALV_KER_PROT( float, s, scalv_zen_int10 ) +SCALV_KER_PROT( double, d, scalv_zen_int10 ) // swapv (intrinsics) SWAPV_KER_PROT(float, s, swapv_zen_int8 ) @@ -124,7 +126,7 @@ GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4 ) -GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x4 ) +GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x4 ) diff --git a/test/test_axpyv.c b/test/test_axpyv.c index 44a0d2d74..61ddb9c59 100644 --- a/test/test_axpyv.c +++ b/test/test_axpyv.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -42,168 +42,194 @@ // n alpha x incx y incy //void daxpyv_( int*, double*, double*, int*, double*, int* ); -//#define PRINT +// #define PRINT int main( int argc, char** argv ) { - obj_t x, y; - obj_t y_save; - obj_t alpha; - dim_t n; - dim_t p; - dim_t p_begin, p_end, p_inc; - int n_input; - num_t dt_x, dt_y; - num_t dt_alpha; - int r, n_repeats; - num_t dt; + obj_t x, y; + obj_t y_save; + obj_t alpha; + dim_t n; + dim_t p; + dim_t p_begin, p_end, p_inc; + int n_input; + num_t dt_x, dt_y; + num_t dt_alpha; + int r, n_repeats; + num_t dt; - double dtime; - double dtime_save; - double gflops; + double dtime; + double dtime_save; + double gflops; - bli_init(); + bli_init(); - n_repeats = 3; + n_repeats = 1; #ifndef PRINT - p_begin = 40; - p_end = 4000; - p_inc = 40; + p_begin = 10; + p_end = 100; + p_inc = 10; - n_input = -1; + n_input = -1; #else - p_begin = 16; - p_end = 16; - p_inc = 1; + p_begin = 16; + p_end = 16; + p_inc = 1; - n_input = 15; + n_input = 15; #endif #if 1 - dt = BLIS_FLOAT; - //dt = BLIS_DOUBLE; + dt = BLIS_FLOAT; + //dt = BLIS_DOUBLE; #else - //dt = BLIS_SCOMPLEX; - dt = BLIS_DCOMPLEX; + // dt = BLIS_SCOMPLEX; + // dt = BLIS_DCOMPLEX; #endif + dt_x = dt_y = dt_alpha = dt; - dt_x = dt_y = dt_alpha = dt; - - // Begin with initializing the last entry to zero so that - // matlab allocates space for the entire array once up-front. 
- for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ; + // Begin with initializing the last entry to zero so that + // matlab allocates space for the entire array once up-front. + for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ; #ifdef BLIS - printf( "data_axpyv_blis" ); + printf( "data_axpyv_blis" ); #else - printf( "data_axpyv_%s", BLAS ); + printf( "data_axpyv_%s", BLAS ); #endif - printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin)/p_inc + 1, - ( unsigned long )0, 0.0 ); + printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin)/p_inc + 1, + ( unsigned long )0, 0.0 ); - //for ( p = p_begin; p <= p_end; p += p_inc ) - for ( p = p_end; p_begin <= p; p -= p_inc ) - { + //for ( p = p_begin; p <= p_end; p += p_inc ) + for ( p = p_end; p_begin <= p; p -= p_inc ) + { - if ( n_input < 0 ) n = p * ( dim_t )abs(n_input); - else n = ( dim_t ) n_input; + if ( n_input < 0 ) n = p * ( dim_t )abs(n_input); + else n = ( dim_t ) n_input; - bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha ); + bli_obj_create( dt_alpha, 1, 1, 0, 0, &alpha ); - bli_obj_create( dt_x, n, 1, 0, 0, &x ); - bli_obj_create( dt_y, n, 1, 0, 0, &y ); - bli_obj_create( dt_y, n, 1, 0, 0, &y_save ); + bli_obj_create( dt_x, n, 1, 0, 0, &x ); + bli_obj_create( dt_y, n, 1, 0, 0, &y ); + bli_obj_create( dt_y, n, 1, 0, 0, &y_save ); - bli_randm( &x ); - bli_randm( &y ); + bli_randm( &x ); + bli_randm( &y ); - bli_setsc( (2.0/1.0), 0.0, &alpha ); + bli_setsc( (2.0/1.0), 0.0, &alpha ); - bli_copym( &y, &y_save ); + bli_copym( &y, &y_save ); - dtime_save = 1.0e9; + dtime_save = 1.0e9; - for ( r = 0; r < n_repeats; ++r ) - { - bli_copym( &y_save, &y ); + for ( r = 0; r < n_repeats; ++r ) + { + bli_copym( &y_save, &y ); - dtime = bli_clock(); + dtime = bli_clock(); #ifdef PRINT - bli_printm( "alpha", &alpha, "%4.1f", "" ); - bli_printm( "x", &x, "%4.1f", "" ); - bli_printm( "y", &y, "%4.1f", "" ); + bli_printm( "alpha", &alpha, "%4.1f", "" ); + bli_printm( "x", &x, "%4.1f", "" ); + bli_printm( "y", &y, "%4.1f", "" ); #endif #ifdef BLIS - bli_axpyv( &alpha, - &x, - &y ); + bli_axpyv( &alpha, + &x, + &y ); #else - if ( bli_is_float( dt ) ) - { - f77_int nn = bli_obj_length( &x ); - f77_int incx = bli_obj_vector_inc( &x ); - f77_int incy = bli_obj_vector_inc( &y ); - float* alphap = bli_obj_buffer( &alpha ); - float* xp = bli_obj_buffer( &x ); - float* yp = bli_obj_buffer( &y ); + if ( bli_is_float( dt ) ) + { + f77_int nn = bli_obj_length( &x ); + f77_int incx = bli_obj_vector_inc( &x ); + f77_int incy = bli_obj_vector_inc( &y ); + float* alphap = bli_obj_buffer( &alpha ); + float* xp = bli_obj_buffer( &x ); + float* yp = bli_obj_buffer( &y ); - saxpy_( &nn, - alphap, - xp, &incx, - yp, &incy ); + saxpy_( &nn, + alphap, + xp, &incx, + yp, &incy ); + } + else if ( bli_is_double( dt ) ) + { - } - else if ( bli_is_double( dt ) ) - { + f77_int nn = bli_obj_length( &x ); + f77_int incx = bli_obj_vector_inc( &x ); + f77_int incy = bli_obj_vector_inc( &y ); + double* alphap = bli_obj_buffer( &alpha ); + double* xp = bli_obj_buffer( &x ); + double* yp = bli_obj_buffer( &y ); - f77_int nn = bli_obj_length( &x ); - f77_int incx = bli_obj_vector_inc( &x ); - f77_int incy = bli_obj_vector_inc( &y ); - double* alphap = bli_obj_buffer( &alpha ); - double* xp = bli_obj_buffer( &x ); - double* yp = bli_obj_buffer( &y ); + daxpy_( &nn, + alphap, + xp, &incx, + yp, &incy ); + } + else if ( bli_is_scomplex( dt ) ) + { + f77_int nn = bli_obj_length( &x ); + f77_int incx = bli_obj_vector_inc( &x ); + f77_int incy = 
bli_obj_vector_inc( &y ); + void* alphap = bli_obj_buffer( &alpha ); + void* xp = bli_obj_buffer( &x ); + void* yp = bli_obj_buffer( &y ); - daxpy_( &nn, - alphap, - xp, &incx, - yp, &incy ); - } + caxpy_( &nn, + (scomplex*)alphap, + (scomplex*)xp, &incx, + (scomplex*)yp, &incy ); + } + else if ( bli_is_dcomplex( dt )) + { + f77_int nn = bli_obj_length( &x ); + f77_int incx = bli_obj_vector_inc( &x ); + f77_int incy = bli_obj_vector_inc( &y ); + void* alphap = bli_obj_buffer( &alpha ); + void* xp = bli_obj_buffer( &x ); + void* yp = bli_obj_buffer( &y ); + + zaxpy_( &nn, + (dcomplex*)alphap, + (dcomplex*)xp, &incx, + (dcomplex*)yp, &incy ); + } #endif #ifdef PRINT - bli_printm( "y after", &y, "%4.1f", "" ); - exit(1); + bli_printm( "y after", &y, "%4.1f", "" ); + exit(1); #endif + dtime_save = bli_clock_min_diff( dtime_save, dtime ); + } - dtime_save = bli_clock_min_diff( dtime_save, dtime ); - } - - gflops = ( 2.0 * n ) / ( dtime_save * 1.0e9 ); + gflops = ( 2.0 * n ) / ( dtime_save * 1.0e9 ); + if ( bli_obj_is_complex( &x ) ) gflops *= 4.0; #ifdef BLIS - printf( "data_axpyv_blis" ); + printf( "data_axpyv_blis" ); #else - printf( "data_axpyv_%s", BLAS ); + printf( "data_axpyv_%s", BLAS ); #endif - printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", - ( unsigned long )(p - p_begin)/p_inc + 1, - ( unsigned long )n, gflops ); + printf( "( %2lu, 1:2 ) = [ %4lu %7.2f ];\n", + ( unsigned long )(p - p_begin)/p_inc + 1, + ( unsigned long )n, gflops ); - bli_obj_free( &alpha ); + bli_obj_free( &alpha ); - bli_obj_free( &x ); - bli_obj_free( &y ); - bli_obj_free( &y_save ); - } + bli_obj_free( &x ); + bli_obj_free( &y ); + bli_obj_free( &y_save ); + } - bli_finalize(); + bli_finalize(); - return 0; + return 0; }
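
Reviewer note (illustrative, not part of the patch): the shuffle + two-FMA scheme that the new complex kernels rely on can be sanity-checked outside of BLIS with a small standalone program. The sketch below mirrors the BLIS_NO_CONJUGATE path of bli_caxpyv_zen_int5 on a single 256-bit register and compares the result against plain scalar complex arithmetic. It assumes only the interleaved real/imaginary layout of scomplex and an AVX/FMA-capable toolchain (e.g. gcc -O2 -mavx -mfma); all names in it are local to the sketch.

/* axpyv_sketch.c: verify y := alpha*x + y for four scomplex elements
   using the permute + two-FMA scheme from bli_caxpyv_zen_int5. */
#include <immintrin.h>
#include <stdio.h>

int main( void )
{
    float x[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };   /* xR1 xI1 ... xR4 xI4 */
    float y[8] = { 8, 7, 6, 5, 4, 3, 2, 1 };   /* yR1 yI1 ... yR4 yI4 */
    float yref[8];
    float yout[8];
    const float aR = 0.5f, aI = -2.0f;         /* alpha = aR + aI*i */

    /* Scalar reference: y += alpha * x (no conjugation). */
    for ( int k = 0; k < 8; k += 2 )
    {
        yref[k+0] = y[k+0] + aR*x[k+0] - aI*x[k+1];
        yref[k+1] = y[k+1] + aR*x[k+1] + aI*x[k+0];
    }

    /* Vector version, mirroring the kernel. */
    __m256 alphaRv = _mm256_broadcast_ss( &aR );
    __m256 alphaIv = _mm256_set_ps( aI, -aI, aI, -aI, aI, -aI, aI, -aI );

    __m256 xv  = _mm256_loadu_ps( x );
    __m256 yv  = _mm256_loadu_ps( y );
    __m256 xsh = _mm256_permute_ps( xv, 0xB1 ); /* swap R/I within each pair */

    yv = _mm256_fmadd_ps( xv,  alphaRv, yv );   /* y += aR*x                   */
    yv = _mm256_fmadd_ps( xsh, alphaIv, yv );   /* y += (-aI*xI, +aI*xR) pairs */

    _mm256_storeu_ps( yout, yv );

    for ( int k = 0; k < 8; ++k )
        printf( "%d: simd % .3f  scalar % .3f\n", k, yout[k], yref[k] );

    return 0;
}

The dcomplex kernel follows the same pattern, with _mm256_permute_pd( xv, 5 ) swapping the real/imaginary halves of each dcomplex and a sign vector of ( -aI, aI, -aI, aI ).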