mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Tweaks to gemm4m, gemm3m virtual ukernels.
Details: - Fixed a potential, but as-yet unobserved, bug in gemm3m that would allow undesirable inf/NaN propagation, since C was being scaled by beta even when beta was equal to zero. - In the gemm3m micro-kernel, we now avoid copying C to the temporary micro-tile if beta is zero. - Rearranged computation in gemm4m so that the temporary C micro-tile is accessed less, and C is accessed only after the micro-kernel calls. This improves performance marginally in most situations. - Comment updates to both gemm4m and gemm3m micro-kernels.
This commit is contained in:
@@ -128,7 +128,7 @@ void PASTEMAC(ch,varname)( \
|
||||
/* Use beta.r == 1.0. */ \
|
||||
beta_r = *one_r; \
|
||||
} \
|
||||
else \
|
||||
else if ( !PASTEMAC(chr,eq0)( beta_r ) ) \
|
||||
{ \
|
||||
/* Copy c to ct without scaling. */ \
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
@@ -138,6 +138,11 @@ void PASTEMAC(ch,varname)( \
|
||||
*(ct_r + i*rs_ct + j*cs_ct), \
|
||||
*(ct_i + i*rs_ct + j*cs_ct) ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Since beta is zero, ct can remain uninitialized since it
|
||||
will be overwritten by the micro-kernel. */ \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* c.r = beta.r * c.r + a.r * b.r - a.i * b.i;
|
||||
@@ -145,7 +150,7 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_i, b_i, *data ); \
|
||||
\
|
||||
/* ab.r = a.r * b.r; */ \
|
||||
/* ab.r = alpha.r * a.r * b.r; */ \
|
||||
PASTEMAC(chr,gemmukr)( k, \
|
||||
&alpha_r, \
|
||||
a_r, \
|
||||
@@ -156,7 +161,7 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_ri, b_ri, *data ); \
|
||||
\
|
||||
/* ab.i = a.i * b.i; */ \
|
||||
/* ab.i = alpha.r * a.i * b.i; */ \
|
||||
PASTEMAC(chr,gemmukr)( k, \
|
||||
&alpha_r, \
|
||||
a_i, \
|
||||
@@ -167,7 +172,7 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_next, b_next, *data ); \
|
||||
\
|
||||
/* ct.i = a.ri * b.ri; */ \
|
||||
/* ct.i = alpha.r * a.ri * b.ri; */ \
|
||||
PASTEMAC(chr,gemmukr)( k, \
|
||||
&alpha_r, \
|
||||
a_ri, \
|
||||
@@ -189,7 +194,14 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype_r gammat_r = *(ct_r + i*rs_ct + j*cs_ct); \
|
||||
ctype_r gammat_i = *(ct_i + i*rs_ct + j*cs_ct); \
|
||||
\
|
||||
PASTEMAC(chr,scals)( beta_r, gammat_r ); \
|
||||
if ( PASTEMAC(ch,eq0)( *beta ) ) \
|
||||
{ \
|
||||
PASTEMAC(chr,copys)( *zero_r, gammat_r ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(chr,scals)( beta_r, gammat_r ); \
|
||||
} \
|
||||
\
|
||||
PASTEMAC(chr,adds)( alphabeta_r, gammat_r ); \
|
||||
PASTEMAC(chr,subs)( alphabeta_i, gammat_r ); \
|
||||
|
||||
@@ -76,12 +76,10 @@ void PASTEMAC(ch,varname)( \
|
||||
const inc_t cs_c2 = 2 * cs_c; \
|
||||
\
|
||||
ctype_r* restrict one_r = PASTEMAC(chr,1); \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
\
|
||||
ctype_r alpha_r = PASTEMAC(ch,real)( *alpha ); \
|
||||
ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \
|
||||
\
|
||||
ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \
|
||||
ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \
|
||||
\
|
||||
ctype_r m_alpha_r = -PASTEMAC(ch,real)( *alpha ); \
|
||||
\
|
||||
@@ -98,36 +96,6 @@ void PASTEMAC(ch,varname)( \
|
||||
if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
\
|
||||
/* Copy the contents of c to a temporary buffer ct. */ \
|
||||
if ( !PASTEMAC(chr,eq0)( beta_i ) ) \
|
||||
{ \
|
||||
/* We can handle a non-zero imaginary component on beta, but to do
|
||||
so we have to manually scale c and then use beta == 1 for the
|
||||
micro-kernel calls. */ \
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
for ( i = 0; i < m; ++i ) \
|
||||
PASTEMAC(ch,scal2ris)( beta_r, \
|
||||
beta_i, \
|
||||
*(c_r + i*rs_c2 + j*cs_c2), \
|
||||
*(c_i + i*rs_c2 + j*cs_c2), \
|
||||
*(ct_r + i*rs_ct + j*cs_ct), \
|
||||
*(ct_i + i*rs_ct + j*cs_ct) ); \
|
||||
\
|
||||
/* Use beta.r == 1.0. */ \
|
||||
beta_r = *one_r; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Copy c to ct without scaling. */ \
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
for ( i = 0; i < m; ++i ) \
|
||||
PASTEMAC(ch,copyris)( *(c_r + i*rs_c2 + j*cs_c2), \
|
||||
*(c_i + i*rs_c2 + j*cs_c2), \
|
||||
*(ct_r + i*rs_ct + j*cs_ct), \
|
||||
*(ct_i + i*rs_ct + j*cs_ct) ); \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* c.r = beta.r * c.r + alpha.r * a.r * b.r
|
||||
- alpha.r * a.i * b.i;
|
||||
@@ -136,29 +104,29 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_r, b_i, *data ); \
|
||||
\
|
||||
/* c.r = beta * c.r + a.r * b.r; */ \
|
||||
/* ct.r = alpha.r * a.r * b.r; */ \
|
||||
PASTEMAC(chr,gemmukr)( k, \
|
||||
&alpha_r, \
|
||||
a_r, \
|
||||
b_r, \
|
||||
&beta_r, \
|
||||
zero_r, \
|
||||
ct_r, rs_ct, cs_ct, \
|
||||
data ); \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_i, b_r, *data ); \
|
||||
\
|
||||
/* c.i = beta * c.i + a.r * b.i; */ \
|
||||
/* ct.i = alpha.r * a.r * b.i; */ \
|
||||
PASTEMAC(chr,gemmukr)( k, \
|
||||
&alpha_r, \
|
||||
a_r, \
|
||||
b_i, \
|
||||
&beta_r, \
|
||||
zero_r, \
|
||||
ct_i, rs_ct, cs_ct, \
|
||||
data ); \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_i, b_i, *data ); \
|
||||
\
|
||||
/* c.i = 1.0 * c.i + a.i * b.r; */ \
|
||||
/* ct.i += alpha.r * a.i * b.r; */ \
|
||||
PASTEMAC(chr,gemmukr)( k, \
|
||||
&alpha_r, \
|
||||
a_i, \
|
||||
@@ -169,7 +137,7 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_next, b_next, *data ); \
|
||||
\
|
||||
/* c.r = 1.0 * c.r - a.i * b.i; */ \
|
||||
/* ct.r += -alpha.r * a.i * b.i; */ \
|
||||
PASTEMAC(chr,gemmukr)( k, \
|
||||
&m_alpha_r, \
|
||||
a_i, \
|
||||
@@ -179,13 +147,39 @@ void PASTEMAC(ch,varname)( \
|
||||
data ); \
|
||||
\
|
||||
\
|
||||
/* Copy the final result in ct back to c. */ \
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
for ( i = 0; i < m; ++i ) \
|
||||
PASTEMAC(ch,copyris)( *(ct_r + i*rs_ct + j*cs_ct), \
|
||||
*(ct_i + i*rs_ct + j*cs_ct), \
|
||||
*(c_r + i*rs_c2 + j*cs_c2), \
|
||||
*(c_i + i*rs_c2 + j*cs_c2) ); \
|
||||
/* Accumulate the final result in ct back to c. */ \
|
||||
if ( PASTEMAC(ch,eq1)( *beta ) ) \
|
||||
{ \
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
for ( i = 0; i < m; ++i ) \
|
||||
PASTEMAC(ch,addris)( *(ct_r + i*rs_ct + j*cs_ct), \
|
||||
*(ct_i + i*rs_ct + j*cs_ct), \
|
||||
*(c_r + i*rs_c2 + j*cs_c2), \
|
||||
*(c_i + i*rs_c2 + j*cs_c2) ); \
|
||||
} \
|
||||
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
|
||||
{ \
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
for ( i = 0; i < m; ++i ) \
|
||||
PASTEMAC(ch,copyris)( *(ct_r + i*rs_ct + j*cs_ct), \
|
||||
*(ct_i + i*rs_ct + j*cs_ct), \
|
||||
*(c_r + i*rs_c2 + j*cs_c2), \
|
||||
*(c_i + i*rs_c2 + j*cs_c2) ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \
|
||||
ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \
|
||||
\
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
for ( i = 0; i < m; ++i ) \
|
||||
PASTEMAC(ch,xpbyris)( *(ct_r + i*rs_ct + j*cs_ct), \
|
||||
*(ct_i + i*rs_ct + j*cs_ct), \
|
||||
beta_r, \
|
||||
beta_i, \
|
||||
*(c_r + i*rs_c2 + j*cs_c2), \
|
||||
*(c_i + i*rs_c2 + j*cs_c2) ); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC( gemm4m_ukr_ref, GEMM_UKERNEL )
|
||||
|
||||
Reference in New Issue
Block a user