From fcc10054a11b6fc3976986f57feccf741596cbf6 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 13 Aug 2014 12:32:06 -0500 Subject: [PATCH] Tweaks to gemm4m, gemm3m virtual ukernels. Details: - Fixed a potential, but as-yet unobserved bug in gemm3m that would allow undesirable inf/NaN propagation, since C was being scaled by beta even if beta was equal to zero. - In gemm3m micro-kernel, we now avoid copying C to the temporary micro-tile if beta is zero. - Rearranged computation in gemm4m so that the temporary C micro-tile is accessed less, and C is accessed only after the micro-kernel calls. This improves performance marginally in most situations. - Comment updates to both gemm4m and gemm3m micro-kernels. --- frame/3/gemm/3m/ukernels/bli_gemm3m_ukr_ref.c | 22 +++-- frame/3/gemm/4m/ukernels/bli_gemm4m_ukr_ref.c | 86 +++++++++---------- 2 files changed, 57 insertions(+), 51 deletions(-) diff --git a/frame/3/gemm/3m/ukernels/bli_gemm3m_ukr_ref.c b/frame/3/gemm/3m/ukernels/bli_gemm3m_ukr_ref.c index 14e3d8c5e..6336a32d8 100644 --- a/frame/3/gemm/3m/ukernels/bli_gemm3m_ukr_ref.c +++ b/frame/3/gemm/3m/ukernels/bli_gemm3m_ukr_ref.c @@ -128,7 +128,7 @@ void PASTEMAC(ch,varname)( \ /* Use beta.r == 1.0. */ \ beta_r = *one_r; \ } \ - else \ + else if ( !PASTEMAC(chr,eq0)( beta_r ) ) \ { \ /* Copy c to ct without scaling. */ \ for ( j = 0; j < n; ++j ) \ @@ -138,6 +138,11 @@ void PASTEMAC(ch,varname)( \ *(ct_r + i*rs_ct + j*cs_ct), \ *(ct_i + i*rs_ct + j*cs_ct) ); \ } \ + else \ + { \ + /* Since beta is zero, ct can remain uninitialized since it + will be overwritten by the micro-kernel. 
*/ \ + } \ \ \ /* c.r = beta.r * c.r + a.r * b.r - a.i * b.i; @@ -145,7 +150,7 @@ void PASTEMAC(ch,varname)( \ \ bli_auxinfo_set_next_ab( a_i, b_i, *data ); \ \ - /* ab.r = a.r * b.r; */ \ + /* ab.r = alpha.r * a.r * b.r; */ \ PASTEMAC(chr,gemmukr)( k, \ &alpha_r, \ a_r, \ @@ -156,7 +161,7 @@ void PASTEMAC(ch,varname)( \ \ bli_auxinfo_set_next_ab( a_ri, b_ri, *data ); \ \ - /* ab.i = a.i * b.i; */ \ + /* ab.i = alpha.r * a.i * b.i; */ \ PASTEMAC(chr,gemmukr)( k, \ &alpha_r, \ a_i, \ @@ -167,7 +172,7 @@ void PASTEMAC(ch,varname)( \ \ bli_auxinfo_set_next_ab( a_next, b_next, *data ); \ \ - /* ct.i = a.ri * b.ri; */ \ + /* ct.i = alpha.r * a.ri * b.ri; */ \ PASTEMAC(chr,gemmukr)( k, \ &alpha_r, \ a_ri, \ @@ -189,7 +194,14 @@ void PASTEMAC(ch,varname)( \ ctype_r gammat_r = *(ct_r + i*rs_ct + j*cs_ct); \ ctype_r gammat_i = *(ct_i + i*rs_ct + j*cs_ct); \ \ - PASTEMAC(chr,scals)( beta_r, gammat_r ); \ + if ( PASTEMAC(ch,eq0)( *beta ) ) \ + { \ + PASTEMAC(chr,copys)( *zero_r, gammat_r ); \ + } \ + else \ + { \ + PASTEMAC(chr,scals)( beta_r, gammat_r ); \ + } \ \ PASTEMAC(chr,adds)( alphabeta_r, gammat_r ); \ PASTEMAC(chr,subs)( alphabeta_i, gammat_r ); \ diff --git a/frame/3/gemm/4m/ukernels/bli_gemm4m_ukr_ref.c b/frame/3/gemm/4m/ukernels/bli_gemm4m_ukr_ref.c index f9f8b1477..e4c171c2f 100644 --- a/frame/3/gemm/4m/ukernels/bli_gemm4m_ukr_ref.c +++ b/frame/3/gemm/4m/ukernels/bli_gemm4m_ukr_ref.c @@ -76,12 +76,10 @@ void PASTEMAC(ch,varname)( \ const inc_t cs_c2 = 2 * cs_c; \ \ ctype_r* restrict one_r = PASTEMAC(chr,1); \ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ \ ctype_r alpha_r = PASTEMAC(ch,real)( *alpha ); \ ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \ -\ - ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \ - ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \ \ ctype_r m_alpha_r = -PASTEMAC(ch,real)( *alpha ); \ \ @@ -98,36 +96,6 @@ void PASTEMAC(ch,varname)( \ if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ -\ - /* Copy 
the contents of c to a temporary buffer ct. */ \ - if ( !PASTEMAC(chr,eq0)( beta_i ) ) \ - { \ - /* We can handle a non-zero imaginary component on beta, but to do - so we have to manually scale c and then use beta == 1 for the - micro-kernel calls. */ \ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < m; ++i ) \ - PASTEMAC(ch,scal2ris)( beta_r, \ - beta_i, \ - *(c_r + i*rs_c2 + j*cs_c2), \ - *(c_i + i*rs_c2 + j*cs_c2), \ - *(ct_r + i*rs_ct + j*cs_ct), \ - *(ct_i + i*rs_ct + j*cs_ct) ); \ -\ - /* Use beta.r == 1.0. */ \ - beta_r = *one_r; \ - } \ - else \ - { \ - /* Copy c to ct without scaling. */ \ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < m; ++i ) \ - PASTEMAC(ch,copyris)( *(c_r + i*rs_c2 + j*cs_c2), \ - *(c_i + i*rs_c2 + j*cs_c2), \ - *(ct_r + i*rs_ct + j*cs_ct), \ - *(ct_i + i*rs_ct + j*cs_ct) ); \ - } \ -\ \ /* c.r = beta.r * c.r + alpha.r * a.r * b.r - alpha.r * a.i * b.i; @@ -136,29 +104,29 @@ void PASTEMAC(ch,varname)( \ \ bli_auxinfo_set_next_ab( a_r, b_i, *data ); \ \ - /* c.r = beta * c.r + a.r * b.r; */ \ + /* ct.r = alpha.r * a.r * b.r; */ \ PASTEMAC(chr,gemmukr)( k, \ &alpha_r, \ a_r, \ b_r, \ - &beta_r, \ + zero_r, \ ct_r, rs_ct, cs_ct, \ data ); \ \ bli_auxinfo_set_next_ab( a_i, b_r, *data ); \ \ - /* c.i = beta * c.i + a.r * b.i; */ \ + /* ct.i = alpha.r * a.r * b.i; */ \ PASTEMAC(chr,gemmukr)( k, \ &alpha_r, \ a_r, \ b_i, \ - &beta_r, \ + zero_r, \ ct_i, rs_ct, cs_ct, \ data ); \ \ bli_auxinfo_set_next_ab( a_i, b_i, *data ); \ \ - /* c.i = 1.0 * c.i + a.i * b.r; */ \ + /* ct.i += alpha.r * a.i * b.r; */ \ PASTEMAC(chr,gemmukr)( k, \ &alpha_r, \ a_i, \ @@ -169,7 +137,7 @@ void PASTEMAC(ch,varname)( \ \ bli_auxinfo_set_next_ab( a_next, b_next, *data ); \ \ - /* c.r = 1.0 * c.r - a.i * b.i; */ \ + /* ct.r += -alpha.r * a.i * b.i; */ \ PASTEMAC(chr,gemmukr)( k, \ &m_alpha_r, \ a_i, \ @@ -179,13 +147,39 @@ void PASTEMAC(ch,varname)( \ data ); \ \ \ - /* Copy the final result in ct back to c. 
*/ \ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < m; ++i ) \ - PASTEMAC(ch,copyris)( *(ct_r + i*rs_ct + j*cs_ct), \ - *(ct_i + i*rs_ct + j*cs_ct), \ - *(c_r + i*rs_c2 + j*cs_c2), \ - *(c_i + i*rs_c2 + j*cs_c2) ); \ + /* Accumulate the final result in ct back to c. */ \ + if ( PASTEMAC(ch,eq1)( *beta ) ) \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + PASTEMAC(ch,addris)( *(ct_r + i*rs_ct + j*cs_ct), \ + *(ct_i + i*rs_ct + j*cs_ct), \ + *(c_r + i*rs_c2 + j*cs_c2), \ + *(c_i + i*rs_c2 + j*cs_c2) ); \ + } \ + else if ( PASTEMAC(ch,eq0)( *beta ) ) \ + { \ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + PASTEMAC(ch,copyris)( *(ct_r + i*rs_ct + j*cs_ct), \ + *(ct_i + i*rs_ct + j*cs_ct), \ + *(c_r + i*rs_c2 + j*cs_c2), \ + *(c_i + i*rs_c2 + j*cs_c2) ); \ + } \ + else \ + { \ + ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \ + ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \ +\ + for ( j = 0; j < n; ++j ) \ + for ( i = 0; i < m; ++i ) \ + PASTEMAC(ch,xpbyris)( *(ct_r + i*rs_ct + j*cs_ct), \ + *(ct_i + i*rs_ct + j*cs_ct), \ + beta_r, \ + beta_i, \ + *(c_r + i*rs_c2 + j*cs_c2), \ + *(c_i + i*rs_c2 + j*cs_c2) ); \ + } \ } INSERT_GENTFUNCCO_BASIC( gemm4m_ukr_ref, GEMM_UKERNEL )