mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Enabled AVX-512 kernels for Zen4 config
- Enabled AVX-512 skylake kernels in zen4 configuration.
AVX-512 kernels are added for GEMM float and double types.
- Enabled reference kernel for TRSM native path
AMD-Internal: [CPUPL-2108]
Change-Id: I66f3468346085c17183cbcbf4f2c8cfe07579b6f
This commit is contained in:
committed by
Dipal M. Zambare
parent
e61ec820f9
commit
8cc15107ed
@@ -73,8 +73,8 @@ void bli_cntx_init_skx( cntx_t* cntx )
|
||||
10,
|
||||
#if 1
|
||||
// amaxv
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int_avx512,
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int_avx512,
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
||||
#endif
|
||||
// axpyv
|
||||
#if 0
|
||||
|
||||
@@ -50,8 +50,6 @@
|
||||
#define BLIS_SIMD_SIZE 64
|
||||
#define BLIS_SIMD_NUM_REGISTERS 32
|
||||
|
||||
#define AVX512
|
||||
|
||||
//#include <stdlib.h>
|
||||
|
||||
//#define BLIS_MALLOC_POOL malloc
|
||||
|
||||
@@ -47,18 +47,20 @@ void bli_cntx_init_zen4( cntx_t* cntx )
|
||||
// their storage preferences.
|
||||
bli_cntx_set_l3_nat_ukrs
|
||||
(
|
||||
8,
|
||||
4,
|
||||
// gemm
|
||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
|
||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE,
|
||||
BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE,
|
||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x14, FALSE,
|
||||
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
|
||||
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
|
||||
#if 0 // GENOA TODO: TRSM AVX-512 implementation
|
||||
// gemmtrsm_l
|
||||
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
|
||||
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
|
||||
// gemmtrsm_u
|
||||
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
|
||||
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
|
||||
#endif
|
||||
cntx
|
||||
);
|
||||
|
||||
@@ -160,14 +162,16 @@ void bli_cntx_init_zen4( cntx_t* cntx )
|
||||
//
|
||||
// These are reference block sizes and may be overridden based on
|
||||
// number of threads used at runtime.
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 18 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 566 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 256 );
|
||||
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 );
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, 3, 3 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 14, 8, 4 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 480, 240, 144, 18 );
|
||||
bli_blksz_init ( &blkszs[ BLIS_KC ], 384, 256, 256, 566,
|
||||
480, 320, 256, 566 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 3752, 4080, 256 );
|
||||
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
@@ -188,6 +192,7 @@ void bli_cntx_init_zen4( cntx_t* cntx )
|
||||
);
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
#if 0 // GENOA TODO: TRSM AVX-512 implementation
|
||||
//Initialize TRSM blocksize objects with architecture-specific values.
|
||||
//Using different cache block sizes for TRSM instead of common level-3 block sizes.
|
||||
//Tuning is done for double-precision only.
|
||||
@@ -208,6 +213,7 @@ void bli_cntx_init_zen4( cntx_t* cntx )
|
||||
BLIS_MR, &blkszs[ BLIS_MR ],
|
||||
cntx
|
||||
);
|
||||
#endif
|
||||
|
||||
// Initialize sup thresholds with architecture-appropriate values. s d c z
|
||||
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, 380, 110 );
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -39,7 +39,6 @@
|
||||
// Setting these macros to 1 will force JR and IR inner loops
|
||||
// to be not parallelized.
|
||||
//
|
||||
|
||||
#define BLIS_THREAD_MAX_IR 1
|
||||
#define BLIS_THREAD_MAX_JR 1
|
||||
|
||||
@@ -56,4 +55,11 @@
|
||||
|
||||
//#define BLIS_ENABLE_FAST_MATH
|
||||
|
||||
// -- SIMD config --------------------------------------------------------
|
||||
|
||||
#define BLIS_SIMD_ALIGN_SIZE 64
|
||||
|
||||
#define BLIS_SIMD_SIZE 64
|
||||
#define BLIS_SIMD_NUM_REGISTERS 32
|
||||
|
||||
#endif
|
||||
|
||||
@@ -32,7 +32,7 @@
|
||||
#
|
||||
#
|
||||
|
||||
# FLAGS that are specific to the 'zen3' architecture are added here.
|
||||
# FLAGS that are specific to the 'zen4' architecture are added here.
|
||||
# FLAGS that are common for all the AMD architectures are present in
|
||||
# config/zen/amd_config.mk.
|
||||
|
||||
@@ -73,15 +73,17 @@ GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. -f1))
|
||||
# gcc or clang version must be at least 4.0
|
||||
# gcc 9.0 or later:
|
||||
ifeq ($(shell test $(GCC_VERSION) -ge 11; echo $$?),0)
|
||||
CKVECFLAGS += -march=znver3
|
||||
CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse
|
||||
CRVECFLAGS += -march=znver3
|
||||
else
|
||||
ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0)
|
||||
CKVECFLAGS += -march=znver2
|
||||
CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse
|
||||
CRVECFLAGS += -march=znver2
|
||||
else
|
||||
# If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1
|
||||
# as the fallback option.
|
||||
CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
|
||||
CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
|
||||
CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
|
||||
endif # GCC 9
|
||||
endif # GCC 11
|
||||
else
|
||||
@@ -99,11 +101,13 @@ ifeq ($(CC_VENDOR),clang)
|
||||
|
||||
# for version 3x we will enable znver3
|
||||
ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_3')),1)
|
||||
CKVECFLAGS += -march=znver3
|
||||
CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse
|
||||
CRVECFLAGS += -march=znver3
|
||||
else
|
||||
# for version 2x we will enable znver2
|
||||
ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC.LLVM.2\|AOCC_2')),1)
|
||||
CKVECFLAGS += -march=znver2
|
||||
CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse
|
||||
CRVECFLAGS += -march=znver2
|
||||
else
|
||||
#if compiling with clang
|
||||
VENDOR_STRING := $(strip $(shell ${CC_VENDOR} --version | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*'))
|
||||
@@ -111,8 +115,10 @@ CC_MAJOR := $(shell (echo ${VENDOR_STRING} | cut -d. -f1))
|
||||
#clang 9.0 or later:
|
||||
ifeq ($(shell test $(CC_MAJOR) -ge 9; echo $$?),0)
|
||||
CKVECFLAGS += -march=znver2
|
||||
CRVECFLAGS += -march=znver2
|
||||
else
|
||||
CKVECFLAGS += -march=znver1
|
||||
CRVECFLAGS += -march=znver1
|
||||
endif # ge 9
|
||||
endif # aocc 2
|
||||
endif # aocc 3
|
||||
@@ -121,7 +127,12 @@ endif # gcc
|
||||
|
||||
# Flags specific to reference kernels.
|
||||
CROPTFLAGS := $(CKOPTFLAGS)
|
||||
CRVECFLAGS := $(CKVECFLAGS)
|
||||
|
||||
# Flags specific to reference kernels.
|
||||
# Note: We use AVX2 for reference kernels because, as Jeff Hammond says,
|
||||
# reference kernel code "is not going to achieve high enough SIMD utilization
|
||||
# to overcome the AVX-512 frequency drop". (Issue #187)
|
||||
CRVECFLAGS += -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd -funsafe-math-optimizations -ffp-contract=fast
|
||||
|
||||
# Store all of the variables here to new variables containing the
|
||||
# configuration name.
|
||||
|
||||
@@ -26,7 +26,7 @@ sandybridge: sandybridge
|
||||
penryn: penryn
|
||||
|
||||
# AMD architectures.
|
||||
zen4: zen4/zen4/zen3/zen2/zen/haswell
|
||||
zen4: zen4/zen4/skx/zen3/zen2/zen/haswell
|
||||
zen3: zen3/zen3/zen2/zen/haswell
|
||||
zen2: zen2/zen2/zen/haswell
|
||||
zen: zen/zen/haswell
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
|
||||
Copyright (C) 2019 - 2021, Advanced Micro Devices, Inc. All rights reserved.
|
||||
Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -171,6 +171,9 @@ CNTX_INIT_PROTS( generic )
|
||||
|
||||
// -- AMD64 architectures --
|
||||
|
||||
#ifdef BLIS_FAMILY_ZEN4
|
||||
#include "bli_family_zen4.h"
|
||||
#endif
|
||||
#ifdef BLIS_FAMILY_ZEN3
|
||||
#include "bli_family_zen3.h"
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user