Tweaks to gemm4m, gemm3m virtual ukernels.

Details:
- Fixed a potential, but as-yet unobserved bug in gemm3m that would
  allow undesirable inf/NaN propagation, since C was being scaled by
  beta even if it was equal to zero.
- In gemm3m micro-kernel, we now avoid copying C to the temporary
  micro-tile if beta is zero.
- Rearranged computation in gemm4m so that the temporary C micro-tile
  is accessed less, and C is accessed only after the micro-kernel
  calls. This improves performance marginally in most situations.
- Comment updates to both gemm4m and gemm3m micro-kernels.
This commit is contained in:
Field G. Van Zee
2014-08-13 12:32:06 -05:00
parent cdcbacc2fa
commit fcc10054a1
2 changed files with 57 additions and 51 deletions

View File

@@ -128,7 +128,7 @@ void PASTEMAC(ch,varname)( \
/* Use beta.r == 1.0. */ \
beta_r = *one_r; \
} \
else \
else if ( !PASTEMAC(chr,eq0)( beta_r ) ) \
{ \
/* Copy c to ct without scaling. */ \
for ( j = 0; j < n; ++j ) \
@@ -138,6 +138,11 @@ void PASTEMAC(ch,varname)( \
*(ct_r + i*rs_ct + j*cs_ct), \
*(ct_i + i*rs_ct + j*cs_ct) ); \
} \
else \
{ \
/* Since beta is zero, ct can remain uninitialized since it
will be overwritten by the micro-kernel. */ \
} \
\
\
/* c.r = beta.r * c.r + a.r * b.r - a.i * b.i;
@@ -145,7 +150,7 @@ void PASTEMAC(ch,varname)( \
\
bli_auxinfo_set_next_ab( a_i, b_i, *data ); \
\
/* ab.r = a.r * b.r; */ \
/* ab.r = alpha.r * a.r * b.r; */ \
PASTEMAC(chr,gemmukr)( k, \
&alpha_r, \
a_r, \
@@ -156,7 +161,7 @@ void PASTEMAC(ch,varname)( \
\
bli_auxinfo_set_next_ab( a_ri, b_ri, *data ); \
\
/* ab.i = a.i * b.i; */ \
/* ab.i = alpha.r * a.i * b.i; */ \
PASTEMAC(chr,gemmukr)( k, \
&alpha_r, \
a_i, \
@@ -167,7 +172,7 @@ void PASTEMAC(ch,varname)( \
\
bli_auxinfo_set_next_ab( a_next, b_next, *data ); \
\
/* ct.i = a.ri * b.ri; */ \
/* ct.i = alpha.r * a.ri * b.ri; */ \
PASTEMAC(chr,gemmukr)( k, \
&alpha_r, \
a_ri, \
@@ -189,7 +194,14 @@ void PASTEMAC(ch,varname)( \
ctype_r gammat_r = *(ct_r + i*rs_ct + j*cs_ct); \
ctype_r gammat_i = *(ct_i + i*rs_ct + j*cs_ct); \
\
PASTEMAC(chr,scals)( beta_r, gammat_r ); \
if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
PASTEMAC(chr,copys)( *zero_r, gammat_r ); \
} \
else \
{ \
PASTEMAC(chr,scals)( beta_r, gammat_r ); \
} \
\
PASTEMAC(chr,adds)( alphabeta_r, gammat_r ); \
PASTEMAC(chr,subs)( alphabeta_i, gammat_r ); \

View File

@@ -76,12 +76,10 @@ void PASTEMAC(ch,varname)( \
const inc_t cs_c2 = 2 * cs_c; \
\
ctype_r* restrict one_r = PASTEMAC(chr,1); \
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
\
ctype_r alpha_r = PASTEMAC(ch,real)( *alpha ); \
ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \
\
ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \
ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \
\
ctype_r m_alpha_r = -PASTEMAC(ch,real)( *alpha ); \
\
@@ -98,36 +96,6 @@ void PASTEMAC(ch,varname)( \
if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
\
/* Copy the contents of c to a temporary buffer ct. */ \
if ( !PASTEMAC(chr,eq0)( beta_i ) ) \
{ \
/* We can handle a non-zero imaginary component on beta, but to do
so we have to manually scale c and then use beta == 1 for the
micro-kernel calls. */ \
for ( j = 0; j < n; ++j ) \
for ( i = 0; i < m; ++i ) \
PASTEMAC(ch,scal2ris)( beta_r, \
beta_i, \
*(c_r + i*rs_c2 + j*cs_c2), \
*(c_i + i*rs_c2 + j*cs_c2), \
*(ct_r + i*rs_ct + j*cs_ct), \
*(ct_i + i*rs_ct + j*cs_ct) ); \
\
/* Use beta.r == 1.0. */ \
beta_r = *one_r; \
} \
else \
{ \
/* Copy c to ct without scaling. */ \
for ( j = 0; j < n; ++j ) \
for ( i = 0; i < m; ++i ) \
PASTEMAC(ch,copyris)( *(c_r + i*rs_c2 + j*cs_c2), \
*(c_i + i*rs_c2 + j*cs_c2), \
*(ct_r + i*rs_ct + j*cs_ct), \
*(ct_i + i*rs_ct + j*cs_ct) ); \
} \
\
\
/* c.r = beta.r * c.r + alpha.r * a.r * b.r
- alpha.r * a.i * b.i;
@@ -136,29 +104,29 @@ void PASTEMAC(ch,varname)( \
\
bli_auxinfo_set_next_ab( a_r, b_i, *data ); \
\
/* c.r = beta * c.r + a.r * b.r; */ \
/* ct.r = alpha.r * a.r * b.r; */ \
PASTEMAC(chr,gemmukr)( k, \
&alpha_r, \
a_r, \
b_r, \
&beta_r, \
zero_r, \
ct_r, rs_ct, cs_ct, \
data ); \
\
bli_auxinfo_set_next_ab( a_i, b_r, *data ); \
\
/* c.i = beta * c.i + a.r * b.i; */ \
/* ct.i = alpha.r * a.r * b.i; */ \
PASTEMAC(chr,gemmukr)( k, \
&alpha_r, \
a_r, \
b_i, \
&beta_r, \
zero_r, \
ct_i, rs_ct, cs_ct, \
data ); \
\
bli_auxinfo_set_next_ab( a_i, b_i, *data ); \
\
/* c.i = 1.0 * c.i + a.i * b.r; */ \
/* ct.i += alpha.r * a.i * b.r; */ \
PASTEMAC(chr,gemmukr)( k, \
&alpha_r, \
a_i, \
@@ -169,7 +137,7 @@ void PASTEMAC(ch,varname)( \
\
bli_auxinfo_set_next_ab( a_next, b_next, *data ); \
\
/* c.r = 1.0 * c.r - a.i * b.i; */ \
/* ct.r += -alpha.r * a.i * b.i; */ \
PASTEMAC(chr,gemmukr)( k, \
&m_alpha_r, \
a_i, \
@@ -179,13 +147,39 @@ void PASTEMAC(ch,varname)( \
data ); \
\
\
/* Copy the final result in ct back to c. */ \
for ( j = 0; j < n; ++j ) \
for ( i = 0; i < m; ++i ) \
PASTEMAC(ch,copyris)( *(ct_r + i*rs_ct + j*cs_ct), \
*(ct_i + i*rs_ct + j*cs_ct), \
*(c_r + i*rs_c2 + j*cs_c2), \
*(c_i + i*rs_c2 + j*cs_c2) ); \
/* Accumulate the final result in ct back to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
for ( j = 0; j < n; ++j ) \
for ( i = 0; i < m; ++i ) \
PASTEMAC(ch,addris)( *(ct_r + i*rs_ct + j*cs_ct), \
*(ct_i + i*rs_ct + j*cs_ct), \
*(c_r + i*rs_c2 + j*cs_c2), \
*(c_i + i*rs_c2 + j*cs_c2) ); \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
for ( j = 0; j < n; ++j ) \
for ( i = 0; i < m; ++i ) \
PASTEMAC(ch,copyris)( *(ct_r + i*rs_ct + j*cs_ct), \
*(ct_i + i*rs_ct + j*cs_ct), \
*(c_r + i*rs_c2 + j*cs_c2), \
*(c_i + i*rs_c2 + j*cs_c2) ); \
} \
else \
{ \
ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \
ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \
\
for ( j = 0; j < n; ++j ) \
for ( i = 0; i < m; ++i ) \
PASTEMAC(ch,xpbyris)( *(ct_r + i*rs_ct + j*cs_ct), \
*(ct_i + i*rs_ct + j*cs_ct), \
beta_r, \
beta_i, \
*(c_r + i*rs_c2 + j*cs_c2), \
*(c_i + i*rs_c2 + j*cs_c2) ); \
} \
}
INSERT_GENTFUNCCO_BASIC( gemm4m_ukr_ref, GEMM_UKERNEL )