diff --git a/frame/base/bli_rntm.c b/frame/base/bli_rntm.c index 6a100bbe8..dc0acf6bf 100644 --- a/frame/base/bli_rntm.c +++ b/frame/base/bli_rntm.c @@ -574,6 +574,10 @@ void bli_nthreads_optimum( if(n < 15) n_threads_ideal = 1; else n_threads_ideal = 4; } + else if( ( m < 34) && (k < 68) && ( m < 34)) + { + n_threads_ideal = 1; + } else { if(n < 20) n_threads_ideal = 1; diff --git a/kernels/zen/3/bli_trsm_small.c b/kernels/zen/3/bli_trsm_small.c index c782a08a4..0fa8f66d5 100644 --- a/kernels/zen/3/bli_trsm_small.c +++ b/kernels/zen/3/bli_trsm_small.c @@ -2847,6 +2847,7 @@ BLIS_INLINE err_t dtrsm_XAltB_ref #define BLIS_PRE_STRSM_SMALL_3N_2M(AlphaVal,b11,cs_b)\ ymm15 = _mm256_broadcast_ss((float const *)&AlphaVal); /*register to hold alpha*/\ \ + xmm5 = _mm_setzero_ps();\ xmm5 = _mm_loadl_pi(xmm5,(__m64*)(b11));\ ymm6 = _mm256_insertf128_ps(ymm0, xmm5, 0);\ ymm3 = _mm256_fmsub_ps(ymm6, ymm15, ymm3);\ @@ -3009,6 +3010,7 @@ BLIS_INLINE err_t dtrsm_XAltB_ref #define BLIS_PRE_STRSM_SMALL_2N_2M(AlphaVal,b11,cs_b)\ ymm15 = _mm256_broadcast_ss((float const *)&AlphaVal); /*register to hold alpha*/\ \ + xmm5 = _mm_setzero_ps();\ xmm5 = _mm_loadl_pi(xmm5,(__m64*)(b11));\ ymm6 = _mm256_insertf128_ps(ymm0, xmm5, 0);\ ymm3 = _mm256_fmsub_ps(ymm6, ymm15, ymm3);\ @@ -3116,6 +3118,7 @@ BLIS_INLINE err_t dtrsm_XAltB_ref #define BLIS_PRE_STRSM_SMALL_1N_2M(AlphaVal,b11,cs_b)\ ymm15 = _mm256_broadcast_ss((float const *)&AlphaVal); /*register to hold alpha*/\ \ + xmm5 = _mm_setzero_ps();\ xmm5 = _mm_loadl_pi(xmm5,(__m64*)(b11));\ ymm6 = _mm256_insertf128_ps(ymm0, xmm5, 0);\ ymm3 = _mm256_fmsub_ps(ymm6, ymm15, ymm3);