From c2df5eac1c6a9fe4d750129cf3fbe5fc5b332ee1 Mon Sep 17 00:00:00 2001 From: Nallani Bhaskar Date: Wed, 15 Dec 2021 15:11:08 +0530 Subject: [PATCH] Reduced number of threads in dgemm for small dimensions - The number of threads is reduced to 1 when the dimensions are very small. - Removed the uninitialized xmm compilation warning in trsm small. Change-Id: I23262fb82729af5b98ded5d36f5eed45d5255d5b --- frame/base/bli_rntm.c | 4 ++++ kernels/zen/3/bli_trsm_small.c | 3 +++ 2 files changed, 7 insertions(+) diff --git a/frame/base/bli_rntm.c b/frame/base/bli_rntm.c index 6a100bbe8..dc0acf6bf 100644 --- a/frame/base/bli_rntm.c +++ b/frame/base/bli_rntm.c @@ -574,6 +574,10 @@ void bli_nthreads_optimum( if(n < 15) n_threads_ideal = 1; else n_threads_ideal = 4; } + else if( ( m < 34) && (k < 68) && ( m < 34)) + { + n_threads_ideal = 1; + } else { if(n < 20) n_threads_ideal = 1; diff --git a/kernels/zen/3/bli_trsm_small.c b/kernels/zen/3/bli_trsm_small.c index c782a08a4..0fa8f66d5 100644 --- a/kernels/zen/3/bli_trsm_small.c +++ b/kernels/zen/3/bli_trsm_small.c @@ -2847,6 +2847,7 @@ BLIS_INLINE err_t dtrsm_XAltB_ref #define BLIS_PRE_STRSM_SMALL_3N_2M(AlphaVal,b11,cs_b)\ ymm15 = _mm256_broadcast_ss((float const *)&AlphaVal); /*register to hold alpha*/\ \ + xmm5 = _mm_setzero_ps();\ xmm5 = _mm_loadl_pi(xmm5,(__m64*)(b11));\ ymm6 = _mm256_insertf128_ps(ymm0, xmm5, 0);\ ymm3 = _mm256_fmsub_ps(ymm6, ymm15, ymm3);\ @@ -3009,6 +3010,7 @@ BLIS_INLINE err_t dtrsm_XAltB_ref #define BLIS_PRE_STRSM_SMALL_2N_2M(AlphaVal,b11,cs_b)\ ymm15 = _mm256_broadcast_ss((float const *)&AlphaVal); /*register to hold alpha*/\ \ + xmm5 = _mm_setzero_ps();\ xmm5 = _mm_loadl_pi(xmm5,(__m64*)(b11));\ ymm6 = _mm256_insertf128_ps(ymm0, xmm5, 0);\ ymm3 = _mm256_fmsub_ps(ymm6, ymm15, ymm3);\ @@ -3116,6 +3118,7 @@ BLIS_INLINE err_t dtrsm_XAltB_ref #define BLIS_PRE_STRSM_SMALL_1N_2M(AlphaVal,b11,cs_b)\ ymm15 = _mm256_broadcast_ss((float const *)&AlphaVal); /*register to hold alpha*/\ \ + xmm5 = _mm_setzero_ps();\ xmm5 = 
_mm_loadl_pi(xmm5,(__m64*)(b11));\ ymm6 = _mm256_insertf128_ps(ymm0, xmm5, 0);\ ymm3 = _mm256_fmsub_ps(ymm6, ymm15, ymm3);