mirror of
https://github.com/amd/blis.git
synced 2026-04-19 23:28:52 +00:00
Fixed bug in reference gemm ukernels.
Details:
- Fixed a bug in the reference gemm ukernels whereby the matrix product was not correctly accumulated and scaled (by alpha) into the output matrix C. (Thanks to Fran for finding this bug.)
- Whitespace changes to the reference trsm kernels.
This commit is contained in:
@@ -121,15 +121,15 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,scals)( *beta, *c21 ); \
|
||||
PASTEMAC(ch,scals)( *beta, *c31 ); \
|
||||
\
|
||||
PASTEMAC(ch,dots)( *c00, *alpha, ab00 ); \
|
||||
PASTEMAC(ch,dots)( *c10, *alpha, ab10 ); \
|
||||
PASTEMAC(ch,dots)( *c20, *alpha, ab20 ); \
|
||||
PASTEMAC(ch,dots)( *c30, *alpha, ab30 ); \
|
||||
PASTEMAC(ch,dots)( *alpha, ab00, *c00 ); \
|
||||
PASTEMAC(ch,dots)( *alpha, ab10, *c10 ); \
|
||||
PASTEMAC(ch,dots)( *alpha, ab20, *c20 ); \
|
||||
PASTEMAC(ch,dots)( *alpha, ab30, *c30 ); \
|
||||
\
|
||||
PASTEMAC(ch,dots)( *c01, *alpha, ab01 ); \
|
||||
PASTEMAC(ch,dots)( *c11, *alpha, ab11 ); \
|
||||
PASTEMAC(ch,dots)( *c21, *alpha, ab21 ); \
|
||||
PASTEMAC(ch,dots)( *c31, *alpha, ab31 ); \
|
||||
PASTEMAC(ch,dots)( *alpha, ab01, *c01 ); \
|
||||
PASTEMAC(ch,dots)( *alpha, ab11, *c11 ); \
|
||||
PASTEMAC(ch,dots)( *alpha, ab21, *c21 ); \
|
||||
PASTEMAC(ch,dots)( *alpha, ab31, *c31 ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( gemm_ref_4x2, gemm_ref_4x2 )
|
||||
|
||||
@@ -163,25 +163,25 @@ void PASTEMAC(ch,varname)( \
|
||||
PASTEMAC(ch,scals)( *beta, *c23 ); \
|
||||
PASTEMAC(ch,scals)( *beta, *c33 ); \
|
||||
\
|
||||
PASTEMAC(ch,dots)( *c00, *alpha, ab00 ); \
|
||||
PASTEMAC(ch,dots)( *c10, *alpha, ab10 ); \
|
||||
PASTEMAC(ch,dots)( *c20, *alpha, ab20 ); \
|
||||
PASTEMAC(ch,dots)( *c30, *alpha, ab30 ); \
|
||||
PASTEMAC(ch,dots)( *alpha, ab00, *c00 ); \
|
||||
PASTEMAC(ch,dots)( *alpha, ab10, *c10 ); \
|
||||
PASTEMAC(ch,dots)( *alpha, ab20, *c20 ); \
|
||||
PASTEMAC(ch,dots)( *alpha, ab30, *c30 ); \
|
||||
\
|
||||
PASTEMAC(ch,dots)( *c01, *alpha, ab01 ); \
|
||||
PASTEMAC(ch,dots)( *c11, *alpha, ab11 ); \
|
||||
PASTEMAC(ch,dots)( *c21, *alpha, ab21 ); \
|
||||
PASTEMAC(ch,dots)( *c31, *alpha, ab31 ); \
|
||||
PASTEMAC(ch,dots)( *alpha, ab01, *c01 ); \
|
||||
PASTEMAC(ch,dots)( *alpha, ab11, *c11 ); \
|
||||
PASTEMAC(ch,dots)( *alpha, ab21, *c21 ); \
|
||||
PASTEMAC(ch,dots)( *alpha, ab31, *c31 ); \
|
||||
\
|
||||
PASTEMAC(ch,dots)( *c02, *alpha, ab02 ); \
|
||||
PASTEMAC(ch,dots)( *c12, *alpha, ab12 ); \
|
||||
PASTEMAC(ch,dots)( *c22, *alpha, ab22 ); \
|
||||
PASTEMAC(ch,dots)( *c32, *alpha, ab32 ); \
|
||||
PASTEMAC(ch,dots)( *alpha, ab02, *c02 ); \
|
||||
PASTEMAC(ch,dots)( *alpha, ab12, *c12 ); \
|
||||
PASTEMAC(ch,dots)( *alpha, ab22, *c22 ); \
|
||||
PASTEMAC(ch,dots)( *alpha, ab32, *c32 ); \
|
||||
\
|
||||
PASTEMAC(ch,dots)( *c03, *alpha, ab03 ); \
|
||||
PASTEMAC(ch,dots)( *c13, *alpha, ab13 ); \
|
||||
PASTEMAC(ch,dots)( *c23, *alpha, ab23 ); \
|
||||
PASTEMAC(ch,dots)( *c33, *alpha, ab33 ); \
|
||||
PASTEMAC(ch,dots)( *alpha, ab03, *c03 ); \
|
||||
PASTEMAC(ch,dots)( *alpha, ab13, *c13 ); \
|
||||
PASTEMAC(ch,dots)( *alpha, ab23, *c23 ); \
|
||||
PASTEMAC(ch,dots)( *alpha, ab33, *c33 ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( gemm_ref_4x4, gemm_ref_4x4 )
|
||||
|
||||
@@ -68,14 +68,17 @@ void PASTEMAC(ch,varname)( \
|
||||
b01 = *(b + 0*rs_b + 1*cs_b); \
|
||||
b02 = *(b + 0*rs_b + 2*cs_b); \
|
||||
b03 = *(b + 0*rs_b + 3*cs_b); \
|
||||
\
|
||||
b10 = *(b + 1*rs_b + 0*cs_b); \
|
||||
b11 = *(b + 1*rs_b + 1*cs_b); \
|
||||
b12 = *(b + 1*rs_b + 2*cs_b); \
|
||||
b13 = *(b + 1*rs_b + 3*cs_b); \
|
||||
\
|
||||
b20 = *(b + 2*rs_b + 0*cs_b); \
|
||||
b21 = *(b + 2*rs_b + 1*cs_b); \
|
||||
b22 = *(b + 2*rs_b + 2*cs_b); \
|
||||
b23 = *(b + 2*rs_b + 3*cs_b); \
|
||||
\
|
||||
b30 = *(b + 3*rs_b + 0*cs_b); \
|
||||
b31 = *(b + 3*rs_b + 1*cs_b); \
|
||||
b32 = *(b + 3*rs_b + 2*cs_b); \
|
||||
@@ -95,6 +98,7 @@ void PASTEMAC(ch,varname)( \
|
||||
*(b + 0*rs_b + 1*cs_b) = b01; \
|
||||
*(b + 0*rs_b + 2*cs_b) = b02; \
|
||||
*(b + 0*rs_b + 3*cs_b) = b03; \
|
||||
\
|
||||
*(c + 0*rs_c + 0*cs_c) = b00; \
|
||||
*(c + 0*rs_c + 1*cs_c) = b01; \
|
||||
*(c + 0*rs_c + 2*cs_c) = b02; \
|
||||
@@ -120,6 +124,7 @@ void PASTEMAC(ch,varname)( \
|
||||
*(b + 1*rs_b + 1*cs_b) = b11; \
|
||||
*(b + 1*rs_b + 2*cs_b) = b12; \
|
||||
*(b + 1*rs_b + 3*cs_b) = b13; \
|
||||
\
|
||||
*(c + 1*rs_c + 0*cs_c) = b10; \
|
||||
*(c + 1*rs_c + 1*cs_c) = b11; \
|
||||
*(c + 1*rs_c + 2*cs_c) = b12; \
|
||||
@@ -151,6 +156,7 @@ void PASTEMAC(ch,varname)( \
|
||||
*(b + 2*rs_b + 1*cs_b) = b21; \
|
||||
*(b + 2*rs_b + 2*cs_b) = b22; \
|
||||
*(b + 2*rs_b + 3*cs_b) = b23; \
|
||||
\
|
||||
*(c + 2*rs_c + 0*cs_c) = b20; \
|
||||
*(c + 2*rs_c + 1*cs_c) = b21; \
|
||||
*(c + 2*rs_c + 2*cs_c) = b22; \
|
||||
@@ -188,6 +194,7 @@ void PASTEMAC(ch,varname)( \
|
||||
*(b + 3*rs_b + 1*cs_b) = b31; \
|
||||
*(b + 3*rs_b + 2*cs_b) = b32; \
|
||||
*(b + 3*rs_b + 3*cs_b) = b33; \
|
||||
\
|
||||
*(c + 3*rs_c + 0*cs_c) = b30; \
|
||||
*(c + 3*rs_c + 1*cs_c) = b31; \
|
||||
*(c + 3*rs_c + 2*cs_c) = b32; \
|
||||
|
||||
@@ -68,14 +68,17 @@ void PASTEMAC(ch,varname)( \
|
||||
b01 = *(b + 0*rs_b + 1*cs_b); \
|
||||
b02 = *(b + 0*rs_b + 2*cs_b); \
|
||||
b03 = *(b + 0*rs_b + 3*cs_b); \
|
||||
\
|
||||
b10 = *(b + 1*rs_b + 0*cs_b); \
|
||||
b11 = *(b + 1*rs_b + 1*cs_b); \
|
||||
b12 = *(b + 1*rs_b + 2*cs_b); \
|
||||
b13 = *(b + 1*rs_b + 3*cs_b); \
|
||||
\
|
||||
b20 = *(b + 2*rs_b + 0*cs_b); \
|
||||
b21 = *(b + 2*rs_b + 1*cs_b); \
|
||||
b22 = *(b + 2*rs_b + 2*cs_b); \
|
||||
b23 = *(b + 2*rs_b + 3*cs_b); \
|
||||
\
|
||||
b30 = *(b + 3*rs_b + 0*cs_b); \
|
||||
b31 = *(b + 3*rs_b + 1*cs_b); \
|
||||
b32 = *(b + 3*rs_b + 2*cs_b); \
|
||||
@@ -95,6 +98,7 @@ void PASTEMAC(ch,varname)( \
|
||||
*(b + 3*rs_b + 1*cs_b) = b31; \
|
||||
*(b + 3*rs_b + 2*cs_b) = b32; \
|
||||
*(b + 3*rs_b + 3*cs_b) = b33; \
|
||||
\
|
||||
*(c + 3*rs_c + 0*cs_c) = b30; \
|
||||
*(c + 3*rs_c + 1*cs_c) = b31; \
|
||||
*(c + 3*rs_c + 2*cs_c) = b32; \
|
||||
@@ -120,6 +124,7 @@ void PASTEMAC(ch,varname)( \
|
||||
*(b + 2*rs_b + 1*cs_b) = b21; \
|
||||
*(b + 2*rs_b + 2*cs_b) = b22; \
|
||||
*(b + 2*rs_b + 3*cs_b) = b23; \
|
||||
\
|
||||
*(c + 2*rs_c + 0*cs_c) = b20; \
|
||||
*(c + 2*rs_c + 1*cs_c) = b21; \
|
||||
*(c + 2*rs_c + 2*cs_c) = b22; \
|
||||
@@ -151,6 +156,7 @@ void PASTEMAC(ch,varname)( \
|
||||
*(b + 1*rs_b + 1*cs_b) = b11; \
|
||||
*(b + 1*rs_b + 2*cs_b) = b12; \
|
||||
*(b + 1*rs_b + 3*cs_b) = b13; \
|
||||
\
|
||||
*(c + 1*rs_c + 0*cs_c) = b10; \
|
||||
*(c + 1*rs_c + 1*cs_c) = b11; \
|
||||
*(c + 1*rs_c + 2*cs_c) = b12; \
|
||||
@@ -188,6 +194,7 @@ void PASTEMAC(ch,varname)( \
|
||||
*(b + 0*rs_b + 1*cs_b) = b01; \
|
||||
*(b + 0*rs_b + 2*cs_b) = b02; \
|
||||
*(b + 0*rs_b + 3*cs_b) = b03; \
|
||||
\
|
||||
*(c + 0*rs_c + 0*cs_c) = b00; \
|
||||
*(c + 0*rs_c + 1*cs_c) = b01; \
|
||||
*(c + 0*rs_c + 2*cs_c) = b02; \
|
||||
|
||||
Reference in New Issue
Block a user