GCC compiler optimization flag (kernel) update for zen3 and zen4 config.

-Inefficient assembly is generated for the s16 gemm micro-kernel (intrinsics code) when compiled using gcc. The presence of -fschedule-insns + -fschedule-insns2 + -ftree-pre in the O2 compiler optimization flags results in the code being optimized to reduce data stalls, which in turn results in the stack being used to store intermediate C register output. Disabling -ftree-pre in gcc fixes the issue, even in the presence of the other two flags. AMD-Internal: [CPUPL-2971] Change-Id: Ibf0dcde20b5a18708a05faad34e684eb0a9a5463
2026-05-11 09:39:59 +00:00 · 2023-01-30 19:10:36 +05:30
parent 672544bc04
commit d2713d3dc0
2 changed files with 14 additions and 7 deletions
--- a/config/zen3/make_defs.mk
+++ b/config/zen3/make_defs.mk
@@ -75,9 +75,16 @@ GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. -f1))
 # gcc 9.0 or later:
 ifeq ($(shell test $(GCC_VERSION) -ge 11; echo $$?),0)
 CKVECFLAGS     += -march=znver3
+# Update CKOPTFLAGS for gcc to use O3 optimization without the
+# -ftree-pre and -ftree-partial-pre flags. These flags result
+# in suboptimal code gen for intrinsics-based kernels.
+# The -ftree-loop-vectorize flag results in inefficient code gen
+# for AMD-optimized l1 kernels based on intrinsics.
+CKOPTFLAGS     += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize
 else
 ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0)
 CKVECFLAGS     += -march=znver2
+CKOPTFLAGS     += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize
 else
 # If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1
 # as the fallback option.
--- a/config/zen4/make_defs.mk
+++ b/config/zen4/make_defs.mk
@@ -74,20 +74,20 @@ GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. -f1))

 # gcc 11.0 or later:
 ifeq ($(shell test $(GCC_VERSION) -ge 11; echo $$?),0)
-# Update CKOPTFLAGS for gcc 11+ to use O3 optimization without
-# -ftree-partial-pre flag. This flag results in suboptimal code
-# generation for instrinsics based kernels.
-ifneq ($(DEBUG_TYPE),noopt)
-CKOPTFLAGS     := -O2 -fgcse-after-reload -fipa-cp-clone -floop-interchange -floop-unroll-and-jam -fpeel-loops -fpredictive-commoning -fsplit-loops -fsplit-paths -ftree-loop-distribution -funswitch-loops -fvect-cost-model=dynamic -fversion-loops-for-strides -fomit-frame-pointer
-endif
-
 CKVECFLAGS     +=  -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mavx512bf16 -mfpmath=sse
 CRVECFLAGS     +=  -march=znver3
+# Update CKOPTFLAGS for gcc to use O3 optimization without the
+# -ftree-pre and -ftree-partial-pre flags. These flags result
+# in suboptimal code generation for intrinsics-based kernels.
+# The -ftree-loop-vectorize flag results in inefficient code gen
+# for AMD-optimized l1 kernels based on intrinsics.
+CKOPTFLAGS     += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize
 else
 # gcc 9.0 or later:
 ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0)
 CKVECFLAGS     +=  -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mfpmath=sse
 CRVECFLAGS     +=  -march=znver2
+CKOPTFLAGS     += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize
 else
 ifeq ($(shell test $(GCC_VERSION) -ge 8; echo $$?),0)
 CKVECFLAGS     +=  -march=znver1 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mfpmath=sse