diff --git a/config/amdzen/bli_family_amdzen.h b/config/amdzen/bli_family_amdzen.h index c73409673..1a8c1234a 100644 --- a/config/amdzen/bli_family_amdzen.h +++ b/config/amdzen/bli_family_amdzen.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -61,5 +61,29 @@ //#define BLIS_ENABLE_FAST_MATH +/* + * Override the block sizes in the context to the block sizes used + * by AVX2 GEMM+TRSM kernels, this is needed in Zen4 context as default + * GEMM kernels are AVX512 based and uses different block sizes. + * + * This function should be called in TRSM path before performing + * any packing operations. + * + * Also the context must be restored to default values by calling + * bli_zen4_restore_default_blkszs() before exiting TRSM Path + */ +BLIS_EXPORT_BLIS void bli_zen4_override_trsm_blkszs (cntx_t* cntx); + +/* + * Restore the block sizes to default values needed for zen4 context. + * + * This function should be called to restore the block sizes to their + * default values if they were overridden by calling + * bli_zen4_override_trsm_blkszs() to enable AVX2 GEMM kernels in the + * TRSM path. + * + */ +BLIS_EXPORT_BLIS void bli_zen4_restore_default_blkszs (cntx_t* cntx); + #endif diff --git a/config/zen4/bli_cntx_init_zen4.c b/config/zen4/bli_cntx_init_zen4.c index b3a6d1030..fc900a1e9 100644 --- a/config/zen4/bli_cntx_init_zen4.c +++ b/config/zen4/bli_cntx_init_zen4.c @@ -34,6 +34,24 @@ #include "blis.h" +/* + * List of default block sizes for zen4. + * Converted it to macro as this list is used at multiple places in this file. 
+ */ + +#define BLI_CNTX_DEFAULT_BLKSZ_LIST(blkszs) \ + /* s d c z */ \ + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, 3, 3 ); \ + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 14, 8, 4 ); \ + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 480, 240, 144, 18 ); \ + bli_blksz_init ( &blkszs[ BLIS_KC ], 384, 512, 256, 566, \ + 480, 320, 256, 566 ); \ + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 4004, 4080, 256 ); \ + \ + bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 ); \ + bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); \ + + void bli_cntx_init_zen4( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; @@ -47,20 +65,23 @@ void bli_cntx_init_zen4( cntx_t* cntx ) // their storage preferences. bli_cntx_set_l3_nat_ukrs ( - 4, + 10, // gemm BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x14, FALSE, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, -#if 0 // GENOA TODO: TRSM AVX-512 implementation + + BLIS_GEMM_AVX2_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, + BLIS_GEMM_AVX2_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, + // gemmtrsm_l BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, // gemmtrsm_u BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, -#endif + cntx ); @@ -115,7 +136,7 @@ void bli_cntx_init_zen4( cntx_t* cntx ) 24, // amaxv - BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int_avx512, + BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int_avx512, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int_avx512, // axpbyv @@ -162,17 +183,8 @@ void bli_cntx_init_zen4( cntx_t* cntx ) // // These are reference block sizes and may be overridden based on // number of threads used at runtime. 
- - // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, 3, 3 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 14, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 480, 240, 144, 18 ); - bli_blksz_init ( &blkszs[ BLIS_KC ], 384, 512, 256, 566, - 480, 320, 256, 566 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 4004, 4080, 256 ); - - bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); + + BLI_CNTX_DEFAULT_BLKSZ_LIST(blkszs); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. @@ -192,11 +204,14 @@ void bli_cntx_init_zen4( cntx_t* cntx ) ); // ------------------------------------------------------------------------- -#if 0 // GENOA TODO: TRSM AVX-512 implementation +#if 0 // Replaced with runtime blocksize override + //Initialize TRSM blocksize objects with architecture-specific values. //Using different cache block sizes for TRSM instead of common level-3 block sizes. //Tuning is done for double-precision only. // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 492, 256, 256 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 1600, 4080, 4080 ); @@ -298,3 +313,72 @@ void bli_cntx_init_zen4( cntx_t* cntx ) cntx ); } + +/* + * Override the block sizes in the context to the block sizes used + * by AVX2 GEMM+TRSM kernels, this is needed in Zen4 context as default + * GEMM kernels are AVX512 based and uses different block sizes. + * + * This function should be called in TRSM path before performing + * any packing operations. 
+ * + * Also the context must be restored to default values by calling + * bli_zen4_restore_default_blkszs() before exiting TRSM Path + */ +void bli_zen4_override_trsm_blkszs (cntx_t* cntx) +{ + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 492, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 1600, 4080, 4080 ); + + + // Update the context with the current architecture's register and cache + // blocksizes (and multiples) for native execution. + bli_cntx_set_blkszs + ( + BLIS_NAT, 5, + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + cntx + ); +} + +/* + * Restore the block sizes to default values needed for zen4 context. + * + * This function should be called to restore the block sizes to their + * default values if they were overridden by calling + * bli_zen4_override_trsm_blkszs() to enable AVX2 GEMM kernels in the + * TRSM path. + * + */ +void bli_zen4_restore_default_blkszs (cntx_t* cntx) +{ + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + + BLI_CNTX_DEFAULT_BLKSZ_LIST(blkszs); + + // Update the context with the current architecture's register and cache + // blocksizes (and multiples) for native execution. 
+ bli_cntx_set_blkszs + ( + BLIS_NAT, 7, + // level-3 + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f + BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, + BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, + cntx + ); +} \ No newline at end of file diff --git a/config/zen4/bli_family_zen4.h b/config/zen4/bli_family_zen4.h index 71929cdac..fad5f1698 100644 --- a/config/zen4/bli_family_zen4.h +++ b/config/zen4/bli_family_zen4.h @@ -62,4 +62,28 @@ #define BLIS_SIMD_SIZE 64 #define BLIS_SIMD_NUM_REGISTERS 32 +/* + * Override the block sizes in the context to the block sizes used + * by AVX2 GEMM+TRSM kernels, this is needed in Zen4 context as default + * GEMM kernels are AVX512 based and uses different block sizes. + * + * This function should be called in TRSM path before performing + * any packing operations. + * + * Also the context must be restored to default values by calling + * bli_zen4_restore_default_blkszs() before exiting TRSM Path + */ +BLIS_EXPORT_BLIS void bli_zen4_override_trsm_blkszs (cntx_t* cntx); + +/* + * Restore the block sizes to default values needed for zen4 context. + * + * This function should be called to restore the block sizes to their + * default values if they were overridden by calling + * bli_zen4_override_trsm_blkszs() to enable AVX2 GEMM kernels in the + * TRSM path. + * + */ +BLIS_EXPORT_BLIS void bli_zen4_restore_default_blkszs (cntx_t* cntx); + #endif diff --git a/config/zen4/make_defs.mk b/config/zen4/make_defs.mk index c6a3c545f..75bec7018 100644 --- a/config/zen4/make_defs.mk +++ b/config/zen4/make_defs.mk @@ -73,11 +73,11 @@ GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. 
-f1)) # gcc or clang version must be atleast 4.0 # gcc 9.0 or later: ifeq ($(shell test $(GCC_VERSION) -ge 11; echo $$?),0) -CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse +CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mfpmath=sse CRVECFLAGS += -march=znver3 else ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0) -CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse +CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mfpmath=sse CRVECFLAGS += -march=znver2 else # If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1 @@ -107,12 +107,12 @@ CRVECFLAGS += -march=znver4 else # for version 3x we will enable znver3 ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_3')),1) -CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse +CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mfpmath=sse CRVECFLAGS += -march=znver3 else # for version 2x we will enable znver2 ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC.LLVM.2\|AOCC_2')),1) -CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse +CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mfpmath=sse CRVECFLAGS += -march=znver2 else #if compiling with clang diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index f964faf0d..9eddd5c42 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -36,6 +36,7 @@ #include "blis.h" //#define PRINT_SMALL_TRSM_INFO + void bli_trsm_front ( side_t side, @@ -151,6 +152,24 @@ void bli_trsm_front // in bli_packm_init(). if ( bli_cntx_method( cntx ) == BLIS_NAT ) { +#if defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_ZEN4) + /* Zen4 TRSM Fixme: + * + * On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels + * for TRSM (Till we implement TRSM AVX-512 kernels) + * + * The AVX2 kernels use different block sizes than AVX512 kernels + * Here we override the default block sizes in the context with AVX2 + * specific block size used in GEMMTRSM kernels. + * NOTE(review): this mutates cntx in place; presumably unsafe if cntx is shared with concurrent callers - confirm. + * We need to revisit this when TRSM AVX-512 kernels are implemented. + */ + if ( (bli_arch_query_id() == BLIS_ARCH_ZEN4) && + (bli_obj_dt(a) == BLIS_FLOAT || bli_obj_dt(a) == BLIS_DOUBLE) ) + { + bli_zen4_override_trsm_blkszs(cntx); + } +#endif bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local ); } @@ -177,6 +196,20 @@ void bli_trsm_front rntm, cntl ); + +#if defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_ZEN4) + /* Zen4 TRSM Fixme: + * + * We have overridden the block sizes at the start of this function + * Since the context is created only once we need to ensure that the + * default block sizes are restored for the subsequent operations. + */ + if ( (bli_arch_query_id() == BLIS_ARCH_ZEN4) && + (bli_obj_dt(a) == BLIS_FLOAT || bli_obj_dt(a) == BLIS_DOUBLE) ) + { + bli_zen4_restore_default_blkszs(cntx); + } +#endif AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); } diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index 5426348c8..fe39e6f47 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -5,7 +5,7 @@ libraries. 
Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -174,6 +174,25 @@ void PASTEMAC(ch,varname) \ gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Zen4 TRSM Fixme: + * + * On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels + * for TRSM (Till we implement TRSM AVX-512 kernels) + * + * The AVX2 kernels for TRSM are enabled in the context, but they + * are compatible with only AVX2 version of GEMM kernels. + * + * Here we force the GEMM kernels to the AVX2 variants for float and double. + * For scomplex and dcomplex reference path is retained as is. + * + * We need to revisit this when TRSM AVX-512 kernels are implemented. + */ \ + if ((bli_arch_query_id() == BLIS_ARCH_ZEN4) && \ + (dt == BLIS_FLOAT || dt == BLIS_DOUBLE)) \ + { \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_AVX2_UKR, cntx ); \ + } \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 0d4e2e0ba..e55b75dff 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -174,6 +174,25 @@ void PASTEMAC(ch,varname) \ gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Zen4 TRSM Fixme: + * + * On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels + * for TRSM (Till we implement TRSM AVX-512 kernels) + * + * The AVX2 kernels for TRSM are enabled in the context, but they + * are compatible with only AVX2 version of GEMM kernels. + * + * Here we force the GEMM kernels to the AVX2 variants for float and double. + * For scomplex and dcomplex reference path is retained as is. + * + * We need to revisit this when TRSM AVX-512 kernels are implemented. + */ \ + if ((bli_arch_query_id() == BLIS_ARCH_ZEN4) && \ + (dt == BLIS_FLOAT || dt == BLIS_DOUBLE)) \ + { \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_AVX2_UKR, cntx ); \ + } \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 396fb4af1..5e070a760 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -180,6 +180,25 @@ void PASTEMAC(ch,varname) \ gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Zen4 TRSM Fixme: + * + * On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels + * for TRSM (Till we implement TRSM AVX-512 kernels) + * + * The AVX2 kernels for TRSM are enabled in the context, but they + * are compatible with only AVX2 version of GEMM kernels. + * + * Here we force the GEMM kernels to the AVX2 variants for float and double. + * For scomplex and dcomplex reference path is retained as is. + * + * We need to revisit this when TRSM AVX-512 kernels are implemented. + */ \ + if ((bli_arch_query_id() == BLIS_ARCH_ZEN4) && \ + (dt == BLIS_FLOAT || dt == BLIS_DOUBLE)) \ + { \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_AVX2_UKR, cntx ); \ + } \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index 8b73b702f..b592c2427 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -179,6 +179,25 @@ void PASTEMAC(ch,varname) \ gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Zen4 TRSM Fixme: + * + * On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels + * for TRSM (Till we implement TRSM AVX-512 kernels) + * + * The AVX2 kernels for TRSM are enabled in the context, but they + * are compatible with only AVX2 version of GEMM kernels. + * + * Here we force the GEMM kernels to the AVX2 variants for float and double. + * For scomplex and dcomplex reference path is retained as is. + * + * We need to revisit this when TRSM AVX-512 kernels are implemented. + */ \ + if ((bli_arch_query_id() == BLIS_ARCH_ZEN4) && \ + (dt == BLIS_FLOAT || dt == BLIS_DOUBLE)) \ + { \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_AVX2_UKR, cntx ); \ + } \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the diff --git a/frame/base/bli_blksz.c b/frame/base/bli_blksz.c index f3891dbbb..a4d937bc2 100644 --- a/frame/base/bli_blksz.c +++ b/frame/base/bli_blksz.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020 - 2021, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2020 - 2022, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -272,7 +272,7 @@ dim_t bli_determine_blocksize_f b_alg = bli_blksz_get_def( dt, bsize ); b_max = bli_blksz_get_max( dt, bsize ); - // If b_use != 0, this means that trsm blocksizes are set + // If b_alg != 0, this means that trsm blocksizes are set // and we continue with trsm-specific blocksizes. // Else, we query L3 blocksizes and use them for TRSM execution. if( b_alg > 0 ) return bli_determine_blocksize_f_sub( i, dim, b_alg, b_max); @@ -313,10 +313,10 @@ dim_t bli_determine_blocksize_b b_alg = bli_blksz_get_def( dt, bsize ); b_max = bli_blksz_get_max( dt, bsize ); - // If b_use != 0, this means that trsm blocksizes are set + // If b_alg != 0, this means that trsm blocksizes are set // and we continue with trsm-specific blocksizes. // Else, we query L3 blocksizes and use them for TRSM execution. - if( b_alg > 0 ) bli_determine_blocksize_b_sub( i, dim, b_alg, b_max ); + if( b_alg > 0 ) return bli_determine_blocksize_b_sub( i, dim, b_alg, b_max ); } diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 584c221ba..4e28d8b46 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -802,10 +802,11 @@ typedef enum BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, - BLIS_TRSM_U_UKR + BLIS_TRSM_U_UKR, + BLIS_GEMM_AVX2_UKR } l3ukr_t; -#define BLIS_NUM_LEVEL3_UKRS 5 +#define BLIS_NUM_LEVEL3_UKRS 6 typedef enum diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index b3916db6a..a0cec45b9 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -209,13 +209,32 @@ void libblis_test_gemmtrsm_ukr_experiment // Query a context. cntx = bli_gks_query_cntx(); +#if defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_ZEN4) + /* Zen4 TRSM Fixme: + * + * TRSM and GEMM use different values of MR and NR, we need to ensure that + * values used for packing are as per the MR and NR values expected by the kernels + * For now this issue exists only for zen4 hence override the values here if + * the architecture is zen4 + * + * We need to override the values here as well as the packing and compute + * kernels are invoked directly from here (instead of BLIS/BLAS call.) + * + * We need to revisit this when TRSM AVX-512 kernels are implemented. + */ + if (bli_arch_query_id() == BLIS_ARCH_ZEN4) + { + bli_zen4_override_trsm_blkszs(cntx); + } +#endif + // Use the datatype of the first char in the datatype combination string. bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); // Map the dimension specifier to actual dimensions. k = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur ); - // Fix m and n to MR and NR, respectively. + m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, cntx ); n = bli_cntx_get_blksz_def_dt( datatype, BLIS_NR, cntx ); @@ -224,6 +243,7 @@ void libblis_test_gemmtrsm_ukr_experiment ldap = bli_cntx_get_blksz_max_dt( datatype, BLIS_MR, cntx ); ldbp = bli_cntx_get_blksz_max_dt( datatype, BLIS_NR, cntx ); + // Store the register blocksizes so that the driver can retrieve the // values later when printing results. op->dim_aux[0] = m; @@ -433,6 +453,7 @@ bli_printm( "ap", &ap, "%5.2f", "" ); bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED ); #endif + // Free the packed objects. 
bli_obj_free( &ap ); bli_obj_free( &bp ); @@ -442,6 +463,20 @@ bli_printm( "ap", &ap, "%5.2f", "" ); bli_obj_free( &b ); bli_obj_free( &c11 ); bli_obj_free( &c11_save ); + +#if defined(BLIS_FAMILY_AMDZEN) || defined(BLIS_FAMILY_ZEN4) + /* Zen4 TRSM Fixme: + * + * We have overridden the block sizes at the start of this function + * Since the context is created only once we need to ensure that the + * default block sizes are restored for the subsequent operations. + */ + if (bli_arch_query_id() == BLIS_ARCH_ZEN4) + { + bli_zen4_restore_default_blkszs(cntx); + } +#endif + }