diff --git a/frame/compat/bla_gemm.c b/frame/compat/bla_gemm.c index 557b3f202..044cdf9bb 100644 --- a/frame/compat/bla_gemm.c +++ b/frame/compat/bla_gemm.c @@ -666,8 +666,7 @@ void zgemm_ sqp_on = true; } #endif - if( ( ( blis_transa == BLIS_TRANSPOSE ) || ( blis_transa == BLIS_NO_TRANSPOSE ) ) - && ( blis_transb == BLIS_NO_TRANSPOSE) && (sqp_on == true)) + if( ( blis_transb == BLIS_NO_TRANSPOSE) && ( sqp_on == true ) ) { //sqp algo is found better for n > 40 if(bli_gemm_sqp(&alphao, &ao, &bo, &betao, &co, NULL, NULL)==BLIS_SUCCESS) diff --git a/kernels/zen/3/bli_gemm_sqp.c b/kernels/zen/3/bli_gemm_sqp.c index 1622c551c..ceab622bf 100644 --- a/kernels/zen/3/bli_gemm_sqp.c +++ b/kernels/zen/3/bli_gemm_sqp.c @@ -40,16 +40,31 @@ #define BLIS_LOADFIRST 0 #define MEM_ALLOC 1//malloc performs better than bli_malloc. +#define SET_TRANS(X,Y)\ + Y = BLIS_NO_TRANSPOSE;\ + if(bli_obj_has_trans( a ))\ + {\ + Y = BLIS_TRANSPOSE;\ + if(bli_obj_has_conj(a))\ + {\ + Y = BLIS_CONJ_TRANSPOSE;\ + }\ + }\ + else if(bli_obj_has_conj(a))\ + {\ + Y = BLIS_CONJ_NO_TRANSPOSE;\ + } + //Macro for 3m_sqp n loop #define BLI_SQP_ZGEMM_N(MX)\ int j=0;\ for(; j<=(n-nx); j+= nx)\ {\ - status = bli_sqp_zgemm_m8( m, nx, k, a, lda, b+(j*ldb), ldb, c+(j*ldc), ldc, alpha_real, beta_real, isTransA, MX, p_istart, kx, &mem_3m_sqp);\ + status = bli_sqp_zgemm_m8( m, nx, k, a, lda, b+(j*ldb), ldb, c+(j*ldc), ldc, alpha_real, beta_real, transa, MX, p_istart, kx, &mem_3m_sqp);\ }\ if(j