Enabled AVX-512 kernels for Zen4 config

- Enabled AVX-512 skylake kernels in zen4 configuration.
    AVX-512 kernels are added for GEMM float and double types.

  - Enabled reference kernel for TRSM native path

AMD-Internal: [CPUPL-2108]
Change-Id: I66f3468346085c17183cbcbf4f2c8cfe07579b6f
This commit is contained in:
Dipal M Zambare
2022-05-18 11:01:41 +05:30
committed by Dipal M. Zambare
parent e61ec820f9
commit 8cc15107ed
7 changed files with 49 additions and 25 deletions

View File

@@ -73,8 +73,8 @@ void bli_cntx_init_skx( cntx_t* cntx )
10,
#if 1
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int_avx512,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int_avx512,
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
#endif
// axpyv
#if 0

View File

@@ -50,8 +50,6 @@
#define BLIS_SIMD_SIZE 64
#define BLIS_SIMD_NUM_REGISTERS 32
#define AVX512
//#include <stdlib.h>
//#define BLIS_MALLOC_POOL malloc

View File

@@ -47,18 +47,20 @@ void bli_cntx_init_zen4( cntx_t* cntx )
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
(
8,
4,
// gemm
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE,
BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x14, FALSE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
#if 0 // GENOA TODO: TRSM AVX-512 implementation
// gemmtrsm_l
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
// gemmtrsm_u
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
#endif
cntx
);
@@ -160,14 +162,16 @@ void bli_cntx_init_zen4( cntx_t* cntx )
//
// These are reference block sizes and may be overridden based on
// number of threads used at runtime.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 18 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 566 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 );
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, 3, 3 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 14, 8, 4 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 480, 240, 144, 18 );
bli_blksz_init ( &blkszs[ BLIS_KC ], 384, 256, 256, 566,
480, 320, 256, 566 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 3752, 4080, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
// Update the context with the current architecture's register and cache
@@ -188,6 +192,7 @@ void bli_cntx_init_zen4( cntx_t* cntx )
);
// -------------------------------------------------------------------------
#if 0 // GENOA TODO: TRSM AVX-512 implementation
//Initialize TRSM blocksize objects with architecture-specific values.
//Using different cache block sizes for TRSM instead of common level-3 block sizes.
//Tuning is done for double-precision only.
@@ -208,6 +213,7 @@ void bli_cntx_init_zen4( cntx_t* cntx )
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
#endif
// Initialize sup thresholds with architecture-appropriate values. s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 380, 110 );

View File

@@ -4,7 +4,7 @@
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -39,7 +39,6 @@
// Setting these macros to 1 will force JR and IR inner loops
// to not be parallelized.
//
#define BLIS_THREAD_MAX_IR 1
#define BLIS_THREAD_MAX_JR 1
@@ -56,4 +55,11 @@
//#define BLIS_ENABLE_FAST_MATH
// -- SIMD config --------------------------------------------------------
#define BLIS_SIMD_ALIGN_SIZE 64
#define BLIS_SIMD_SIZE 64
#define BLIS_SIMD_NUM_REGISTERS 32
#endif

View File

@@ -32,7 +32,7 @@
#
#
# FLAGS that are specific to the 'zen3' architecture are added here.
# FLAGS that are specific to the 'zen4' architecture are added here.
# FLAGS that are common for all the AMD architectures are present in
# config/zen/amd_config.mk.
@@ -73,15 +73,17 @@ GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. -f1))
# gcc or clang version must be at least 4.0
# gcc 11.0 or later:
ifeq ($(shell test $(GCC_VERSION) -ge 11; echo $$?),0)
CKVECFLAGS += -march=znver3
CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse
CRVECFLAGS += -march=znver3
else
ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0)
CKVECFLAGS += -march=znver2
CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse
CRVECFLAGS += -march=znver2
else
# If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1
# as the fallback option.
CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
endif # GCC 9
endif # GCC 11
else
@@ -99,11 +101,13 @@ ifeq ($(CC_VENDOR),clang)
# for version 3x we will enable znver3
ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_3')),1)
CKVECFLAGS += -march=znver3
CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse
CRVECFLAGS += -march=znver3
else
# for version 2x we will enable znver2
ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC.LLVM.2\|AOCC_2')),1)
CKVECFLAGS += -march=znver2
CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse
CRVECFLAGS += -march=znver2
else
#if compiling with clang
VENDOR_STRING := $(strip $(shell ${CC_VENDOR} --version | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*'))
@@ -111,8 +115,10 @@ CC_MAJOR := $(shell (echo ${VENDOR_STRING} | cut -d. -f1))
#clang 9.0 or later:
ifeq ($(shell test $(CC_MAJOR) -ge 9; echo $$?),0)
CKVECFLAGS += -march=znver2
CRVECFLAGS += -march=znver2
else
CKVECFLAGS += -march=znver1
CRVECFLAGS += -march=znver1
endif # ge 9
endif # aocc 2
endif # aocc 3
@@ -121,7 +127,12 @@ endif # gcc
# Flags specific to reference kernels.
CROPTFLAGS := $(CKOPTFLAGS)
CRVECFLAGS := $(CKVECFLAGS)
# Flags specific to reference kernels.
# Note: We use AVX2 for reference kernels because, as Jeff Hammond says,
# reference kernel code "is not going to achieve high enough SIMD utilization
# to overcome the AVX-512 frequency drop". (Issue #187)
CRVECFLAGS += -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd -funsafe-math-optimizations -ffp-contract=fast
# Store all of the variables here to new variables containing the
# configuration name.

View File

@@ -26,7 +26,7 @@ sandybridge: sandybridge
penryn: penryn
# AMD architectures.
zen4: zen4/zen4/zen3/zen2/zen/haswell
zen4: zen4/zen4/skx/zen3/zen2/zen/haswell
zen3: zen3/zen3/zen2/zen/haswell
zen2: zen2/zen2/zen/haswell
zen: zen/zen/haswell

View File

@@ -6,7 +6,7 @@
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2019 - 2021, Advanced Micro Devices, Inc. All rights reserved.
Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -171,6 +171,9 @@ CNTX_INIT_PROTS( generic )
// -- AMD64 architectures --
#ifdef BLIS_FAMILY_ZEN4
#include "bli_family_zen4.h"
#endif
#ifdef BLIS_FAMILY_ZEN3
#include "bli_family_zen3.h"
#endif