From 8cc15107ede971e432718f58ebc8d323a9a6b4e3 Mon Sep 17 00:00:00 2001 From: Dipal M Zambare Date: Wed, 18 May 2022 11:01:41 +0530 Subject: [PATCH] Enabled AVX-512 kernels for Zen4 config - Enabled AVX-512 skylake kernels in zen4 configuration. AVX-512 kernels are added for GEMM float and double types. - Enabled reference kernel for TRSM native path AMD-Internal: [CPUPL-2108] Change-Id: I66f3468346085c17183cbcbf4f2c8cfe07579b6f --- config/skx/bli_cntx_init_skx.c | 4 ++-- config/skx/bli_family_skx.h | 2 -- config/zen4/bli_cntx_init_zen4.c | 26 ++++++++++++++++---------- config/zen4/bli_family_zen4.h | 10 ++++++++-- config/zen4/make_defs.mk | 25 ++++++++++++++++++------- config_registry | 2 +- frame/include/bli_arch_config.h | 5 ++++- 7 files changed, 49 insertions(+), 25 deletions(-) diff --git a/config/skx/bli_cntx_init_skx.c b/config/skx/bli_cntx_init_skx.c index c14311bf2..f18503a7a 100644 --- a/config/skx/bli_cntx_init_skx.c +++ b/config/skx/bli_cntx_init_skx.c @@ -73,8 +73,8 @@ void bli_cntx_init_skx( cntx_t* cntx ) 10, #if 1 // amaxv - BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int_avx512, - BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int_avx512, + BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, + BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, #endif // axpyv #if 0 diff --git a/config/skx/bli_family_skx.h b/config/skx/bli_family_skx.h index cbba06358..ac9478f8b 100644 --- a/config/skx/bli_family_skx.h +++ b/config/skx/bli_family_skx.h @@ -50,8 +50,6 @@ #define BLIS_SIMD_SIZE 64 #define BLIS_SIMD_NUM_REGISTERS 32 -#define AVX512 - //#include //#define BLIS_MALLOC_POOL malloc diff --git a/config/zen4/bli_cntx_init_zen4.c b/config/zen4/bli_cntx_init_zen4.c index c340fa908..e25ceabc8 100644 --- a/config/zen4/bli_cntx_init_zen4.c +++ b/config/zen4/bli_cntx_init_zen4.c @@ -47,18 +47,20 @@ void bli_cntx_init_zen4( cntx_t* cntx ) // their storage preferences. 
bli_cntx_set_l3_nat_ukrs ( - 8, + 4, // gemm - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, + BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x14, FALSE, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, +#if 0 // GENOA TODO: TRSM AVX-512 implementation // gemmtrsm_l BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, // gemmtrsm_u BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, +#endif cntx ); @@ -160,14 +162,16 @@ void bli_cntx_init_zen4( cntx_t* cntx ) // // These are reference block sizes and may be overridden based on // number of threads used at runtime. - // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 18 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 566 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 256 ); - bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 ); + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 14, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 480, 240, 144, 18 ); + bli_blksz_init ( &blkszs[ BLIS_KC ], 384, 256, 256, 566, + 480, 320, 256, 566 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 3752, 4080, 256 ); + + bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); // Update the context with the current architecture's register and cache @@ -188,6 +192,7 @@ void bli_cntx_init_zen4( cntx_t* cntx ) ); // 
------------------------------------------------------------------------- +#if 0 // GENOA TODO: TRSM AVX-512 implementation //Initialize TRSM blocksize objects with architecture-specific values. //Using different cache block sizes for TRSM instead of common level-3 block sizes. //Tuning is done for double-precision only. @@ -208,6 +213,7 @@ void bli_cntx_init_zen4( cntx_t* cntx ) BLIS_MR, &blkszs[ BLIS_MR ], cntx ); +#endif // Initialize sup thresholds with architecture-appropriate values. s d c z bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 380, 110 ); diff --git a/config/zen4/bli_family_zen4.h b/config/zen4/bli_family_zen4.h index 9c70fcef8..71929cdac 100644 --- a/config/zen4/bli_family_zen4.h +++ b/config/zen4/bli_family_zen4.h @@ -4,7 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -39,7 +39,6 @@ // Setting these macros to 1 will force JR and IR inner loops // to be not paralleized. // - #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 @@ -56,4 +55,11 @@ //#define BLIS_ENABLE_FAST_MATH +// -- SIMD config -------------------------------------------------------- + +#define BLIS_SIMD_ALIGN_SIZE 64 + +#define BLIS_SIMD_SIZE 64 +#define BLIS_SIMD_NUM_REGISTERS 32 + #endif diff --git a/config/zen4/make_defs.mk b/config/zen4/make_defs.mk index 44e96bb0c..85a8a39f6 100644 --- a/config/zen4/make_defs.mk +++ b/config/zen4/make_defs.mk @@ -32,7 +32,7 @@ # # -# FLAGS that are specific to the 'zen3' architecture are added here. +# FLAGS that are specific to the 'zen4' architecture are added here. # FLAGS that are common for all the AMD architectures are present in # config/zen/amd_config.mk. 
@@ -73,15 +73,17 @@ GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. -f1)) # gcc or clang version must be atleast 4.0 # gcc 9.0 or later: ifeq ($(shell test $(GCC_VERSION) -ge 11; echo $$?),0) -CKVECFLAGS += -march=znver3 +CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse +CRVECFLAGS += -march=znver3 else ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0) -CKVECFLAGS += -march=znver2 +CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse +CRVECFLAGS += -march=znver2 else # If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1 # as the fallback option. -CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store +CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store endif # GCC 9 endif # GCC 11 else @@ -99,11 +101,13 @@ ifeq ($(CC_VENDOR),clang) # for version 3x we will enable znver3 ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_3')),1) -CKVECFLAGS += -march=znver3 +CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse +CRVECFLAGS += -march=znver3 else # for version 2x we will enable znver2 ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC.LLVM.2\|AOCC_2')),1) -CKVECFLAGS += -march=znver2 +CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse +CRVECFLAGS += -march=znver2 else #if compiling with clang VENDOR_STRING := $(strip $(shell ${CC_VENDOR} --version | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*')) @@ -111,8 +115,10 @@ CC_MAJOR := $(shell (echo ${VENDOR_STRING} | cut -d. -f1)) #clang 9.0 or later: ifeq ($(shell test $(CC_MAJOR) -ge 9; echo $$?),0) CKVECFLAGS += -march=znver2 +CRVECFLAGS += -march=znver2 else CKVECFLAGS += -march=znver1 +CRVECFLAGS += -march=znver1 endif # ge 9 endif # aocc 2 endif # aocc 3 @@ -121,7 +127,12 @@ endif # gcc # Flags specific to reference kernels. 
CROPTFLAGS := $(CKOPTFLAGS) -CRVECFLAGS := $(CKVECFLAGS) + +# Vectorization flags specific to reference kernels. +# Note: We use AVX2 for reference kernels because, as Jeff Hammond says, +# reference kernel code "is not going to achieve high enough SIMD utilization +# to overcome the AVX-512 frequency drop". (Issue #187) +CRVECFLAGS += -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd -funsafe-math-optimizations -ffp-contract=fast # Store all of the variables here to new variables containing the # configuration name. diff --git a/config_registry b/config_registry index 822b133f5..4e6716dfa 100644 --- a/config_registry +++ b/config_registry @@ -26,7 +26,7 @@ sandybridge: sandybridge penryn: penryn # AMD architectures. -zen4: zen4/zen4/zen3/zen2/zen/haswell +zen4: zen4/zen4/skx/zen3/zen2/zen/haswell zen3: zen3/zen3/zen2/zen/haswell zen2: zen2/zen2/zen/haswell zen: zen/zen/haswell diff --git a/frame/include/bli_arch_config.h b/frame/include/bli_arch_config.h index 3e2e0b022..6343c6ba8 100644 --- a/frame/include/bli_arch_config.h +++ b/frame/include/bli_arch_config.h @@ -6,7 +6,7 @@ Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP - Copyright (C) 2019 - 2021, Advanced Micro Devices, Inc. All rights reserved. + Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -171,6 +171,9 @@ CNTX_INIT_PROTS( generic ) // -- AMD64 architectures -- +#ifdef BLIS_FAMILY_ZEN4 +#include "bli_family_zen4.h" +#endif #ifdef BLIS_FAMILY_ZEN3 #include "bli_family_zen3.h" #endif