mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
GCC compiler optimization flag (kernel) update for zen3 and zen4 config.
- Inefficient assembly is generated for the s16 gemm micro-kernel (intrinsics code) when compiled using gcc. When -fschedule-insns, -fschedule-insns2 and -ftree-pre are all present in the O2 compiler optimization flags, the code is optimized to reduce data stalls, which results in the stack being used to store intermediate C register output. Disabling -ftree-pre in gcc fixes the issue, even in the presence of the other two flags. AMD-Internal: [CPUPL-2971] Change-Id: Ibf0dcde20b5a18708a05faad34e684eb0a9a5463
This commit is contained in:
committed by
MithunMohan KadavilMadanaMohanan
parent
672544bc04
commit
d2713d3dc0
@@ -75,9 +75,16 @@ GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. -f1))
|
||||
# gcc 11.0 or later:
|
||||
ifeq ($(shell test $(GCC_VERSION) -ge 11; echo $$?),0)
|
||||
CKVECFLAGS += -march=znver3
|
||||
# Update CKOPTFLAGS for gcc to use O3 optimization without
|
||||
# -ftree-pre and -ftree-partial-pre flags. These flags result
|
||||
# in suboptimal code gen for intrinsic-based kernels.
|
||||
# The -ftree-loop-vectorize flag results in inefficient code gen
|
||||
# for amd optimized l1 kernels based on intrinsics.
|
||||
CKOPTFLAGS += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize
|
||||
else
|
||||
ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0)
|
||||
CKVECFLAGS += -march=znver2
|
||||
CKOPTFLAGS += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize
|
||||
else
|
||||
# If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1
|
||||
# as the fallback option.
|
||||
|
||||
@@ -74,20 +74,20 @@ GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. -f1))
|
||||
|
||||
# gcc 11.0 or later:
|
||||
ifeq ($(shell test $(GCC_VERSION) -ge 11; echo $$?),0)
|
||||
# Update CKOPTFLAGS for gcc 11+ to use O3 optimization without
|
||||
# -ftree-partial-pre flag. This flag results in suboptimal code
|
||||
# generation for intrinsics-based kernels.
|
||||
ifneq ($(DEBUG_TYPE),noopt)
|
||||
CKOPTFLAGS := -O2 -fgcse-after-reload -fipa-cp-clone -floop-interchange -floop-unroll-and-jam -fpeel-loops -fpredictive-commoning -fsplit-loops -fsplit-paths -ftree-loop-distribution -funswitch-loops -fvect-cost-model=dynamic -fversion-loops-for-strides -fomit-frame-pointer
|
||||
endif
|
||||
|
||||
CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mavx512bf16 -mfpmath=sse
|
||||
CRVECFLAGS += -march=znver3
|
||||
# Update CKOPTFLAGS for gcc to use O3 optimization without
|
||||
# -ftree-pre and -ftree-partial-pre flags. These flags result
|
||||
# in suboptimal code generation for intrinsic-based kernels.
|
||||
# The -ftree-loop-vectorize flag results in inefficient code gen
|
||||
# for amd optimized l1 kernels based on intrinsics.
|
||||
CKOPTFLAGS += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize
|
||||
else
|
||||
# gcc 9.0 or later:
|
||||
ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0)
|
||||
CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mfpmath=sse
|
||||
CRVECFLAGS += -march=znver2
|
||||
CKOPTFLAGS += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize
|
||||
else
|
||||
ifeq ($(shell test $(GCC_VERSION) -ge 8; echo $$?),0)
|
||||
CKVECFLAGS += -march=znver1 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mfpmath=sse
|
||||
|
||||
Reference in New Issue
Block a user