Fixed bug in reference gemm ukernels.

Details:
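- Fixed a bug in the reference gemm micro-kernels (ukernels) whereby the
  matrix product was not correctly scaled by alpha and accumulated into the
  output matrix C. The fix reorders the arguments of the dots calls so that
  alpha and the product element come first and the element of C is passed
  last. (Thanks to Fran for finding this bug.)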
- Whitespace changes to reference trsm kernels.
This commit is contained in:
Field G. Van Zee
2012-12-14 12:45:26 -06:00
parent e2e7cb2fbe
commit 0670c33cc1
5 changed files with 39 additions and 25 deletions

View File

@@ -121,15 +121,15 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,scals)( *beta, *c21 ); \
PASTEMAC(ch,scals)( *beta, *c31 ); \
\
PASTEMAC(ch,dots)( *c00, *alpha, ab00 ); \
PASTEMAC(ch,dots)( *c10, *alpha, ab10 ); \
PASTEMAC(ch,dots)( *c20, *alpha, ab20 ); \
PASTEMAC(ch,dots)( *c30, *alpha, ab30 ); \
PASTEMAC(ch,dots)( *alpha, ab00, *c00 ); \
PASTEMAC(ch,dots)( *alpha, ab10, *c10 ); \
PASTEMAC(ch,dots)( *alpha, ab20, *c20 ); \
PASTEMAC(ch,dots)( *alpha, ab30, *c30 ); \
\
PASTEMAC(ch,dots)( *c01, *alpha, ab01 ); \
PASTEMAC(ch,dots)( *c11, *alpha, ab11 ); \
PASTEMAC(ch,dots)( *c21, *alpha, ab21 ); \
PASTEMAC(ch,dots)( *c31, *alpha, ab31 ); \
PASTEMAC(ch,dots)( *alpha, ab01, *c01 ); \
PASTEMAC(ch,dots)( *alpha, ab11, *c11 ); \
PASTEMAC(ch,dots)( *alpha, ab21, *c21 ); \
PASTEMAC(ch,dots)( *alpha, ab31, *c31 ); \
}
INSERT_GENTFUNC_BASIC( gemm_ref_4x2, gemm_ref_4x2 )

View File

@@ -163,25 +163,25 @@ void PASTEMAC(ch,varname)( \
PASTEMAC(ch,scals)( *beta, *c23 ); \
PASTEMAC(ch,scals)( *beta, *c33 ); \
\
PASTEMAC(ch,dots)( *c00, *alpha, ab00 ); \
PASTEMAC(ch,dots)( *c10, *alpha, ab10 ); \
PASTEMAC(ch,dots)( *c20, *alpha, ab20 ); \
PASTEMAC(ch,dots)( *c30, *alpha, ab30 ); \
PASTEMAC(ch,dots)( *alpha, ab00, *c00 ); \
PASTEMAC(ch,dots)( *alpha, ab10, *c10 ); \
PASTEMAC(ch,dots)( *alpha, ab20, *c20 ); \
PASTEMAC(ch,dots)( *alpha, ab30, *c30 ); \
\
PASTEMAC(ch,dots)( *c01, *alpha, ab01 ); \
PASTEMAC(ch,dots)( *c11, *alpha, ab11 ); \
PASTEMAC(ch,dots)( *c21, *alpha, ab21 ); \
PASTEMAC(ch,dots)( *c31, *alpha, ab31 ); \
PASTEMAC(ch,dots)( *alpha, ab01, *c01 ); \
PASTEMAC(ch,dots)( *alpha, ab11, *c11 ); \
PASTEMAC(ch,dots)( *alpha, ab21, *c21 ); \
PASTEMAC(ch,dots)( *alpha, ab31, *c31 ); \
\
PASTEMAC(ch,dots)( *c02, *alpha, ab02 ); \
PASTEMAC(ch,dots)( *c12, *alpha, ab12 ); \
PASTEMAC(ch,dots)( *c22, *alpha, ab22 ); \
PASTEMAC(ch,dots)( *c32, *alpha, ab32 ); \
PASTEMAC(ch,dots)( *alpha, ab02, *c02 ); \
PASTEMAC(ch,dots)( *alpha, ab12, *c12 ); \
PASTEMAC(ch,dots)( *alpha, ab22, *c22 ); \
PASTEMAC(ch,dots)( *alpha, ab32, *c32 ); \
\
PASTEMAC(ch,dots)( *c03, *alpha, ab03 ); \
PASTEMAC(ch,dots)( *c13, *alpha, ab13 ); \
PASTEMAC(ch,dots)( *c23, *alpha, ab23 ); \
PASTEMAC(ch,dots)( *c33, *alpha, ab33 ); \
PASTEMAC(ch,dots)( *alpha, ab03, *c03 ); \
PASTEMAC(ch,dots)( *alpha, ab13, *c13 ); \
PASTEMAC(ch,dots)( *alpha, ab23, *c23 ); \
PASTEMAC(ch,dots)( *alpha, ab33, *c33 ); \
}
INSERT_GENTFUNC_BASIC( gemm_ref_4x4, gemm_ref_4x4 )

View File

@@ -68,14 +68,17 @@ void PASTEMAC(ch,varname)( \
b01 = *(b + 0*rs_b + 1*cs_b); \
b02 = *(b + 0*rs_b + 2*cs_b); \
b03 = *(b + 0*rs_b + 3*cs_b); \
\
b10 = *(b + 1*rs_b + 0*cs_b); \
b11 = *(b + 1*rs_b + 1*cs_b); \
b12 = *(b + 1*rs_b + 2*cs_b); \
b13 = *(b + 1*rs_b + 3*cs_b); \
\
b20 = *(b + 2*rs_b + 0*cs_b); \
b21 = *(b + 2*rs_b + 1*cs_b); \
b22 = *(b + 2*rs_b + 2*cs_b); \
b23 = *(b + 2*rs_b + 3*cs_b); \
\
b30 = *(b + 3*rs_b + 0*cs_b); \
b31 = *(b + 3*rs_b + 1*cs_b); \
b32 = *(b + 3*rs_b + 2*cs_b); \
@@ -95,6 +98,7 @@ void PASTEMAC(ch,varname)( \
*(b + 0*rs_b + 1*cs_b) = b01; \
*(b + 0*rs_b + 2*cs_b) = b02; \
*(b + 0*rs_b + 3*cs_b) = b03; \
\
*(c + 0*rs_c + 0*cs_c) = b00; \
*(c + 0*rs_c + 1*cs_c) = b01; \
*(c + 0*rs_c + 2*cs_c) = b02; \
@@ -120,6 +124,7 @@ void PASTEMAC(ch,varname)( \
*(b + 1*rs_b + 1*cs_b) = b11; \
*(b + 1*rs_b + 2*cs_b) = b12; \
*(b + 1*rs_b + 3*cs_b) = b13; \
\
*(c + 1*rs_c + 0*cs_c) = b10; \
*(c + 1*rs_c + 1*cs_c) = b11; \
*(c + 1*rs_c + 2*cs_c) = b12; \
@@ -151,6 +156,7 @@ void PASTEMAC(ch,varname)( \
*(b + 2*rs_b + 1*cs_b) = b21; \
*(b + 2*rs_b + 2*cs_b) = b22; \
*(b + 2*rs_b + 3*cs_b) = b23; \
\
*(c + 2*rs_c + 0*cs_c) = b20; \
*(c + 2*rs_c + 1*cs_c) = b21; \
*(c + 2*rs_c + 2*cs_c) = b22; \
@@ -188,6 +194,7 @@ void PASTEMAC(ch,varname)( \
*(b + 3*rs_b + 1*cs_b) = b31; \
*(b + 3*rs_b + 2*cs_b) = b32; \
*(b + 3*rs_b + 3*cs_b) = b33; \
\
*(c + 3*rs_c + 0*cs_c) = b30; \
*(c + 3*rs_c + 1*cs_c) = b31; \
*(c + 3*rs_c + 2*cs_c) = b32; \

View File

@@ -68,14 +68,17 @@ void PASTEMAC(ch,varname)( \
b01 = *(b + 0*rs_b + 1*cs_b); \
b02 = *(b + 0*rs_b + 2*cs_b); \
b03 = *(b + 0*rs_b + 3*cs_b); \
\
b10 = *(b + 1*rs_b + 0*cs_b); \
b11 = *(b + 1*rs_b + 1*cs_b); \
b12 = *(b + 1*rs_b + 2*cs_b); \
b13 = *(b + 1*rs_b + 3*cs_b); \
\
b20 = *(b + 2*rs_b + 0*cs_b); \
b21 = *(b + 2*rs_b + 1*cs_b); \
b22 = *(b + 2*rs_b + 2*cs_b); \
b23 = *(b + 2*rs_b + 3*cs_b); \
\
b30 = *(b + 3*rs_b + 0*cs_b); \
b31 = *(b + 3*rs_b + 1*cs_b); \
b32 = *(b + 3*rs_b + 2*cs_b); \
@@ -95,6 +98,7 @@ void PASTEMAC(ch,varname)( \
*(b + 3*rs_b + 1*cs_b) = b31; \
*(b + 3*rs_b + 2*cs_b) = b32; \
*(b + 3*rs_b + 3*cs_b) = b33; \
\
*(c + 3*rs_c + 0*cs_c) = b30; \
*(c + 3*rs_c + 1*cs_c) = b31; \
*(c + 3*rs_c + 2*cs_c) = b32; \
@@ -120,6 +124,7 @@ void PASTEMAC(ch,varname)( \
*(b + 2*rs_b + 1*cs_b) = b21; \
*(b + 2*rs_b + 2*cs_b) = b22; \
*(b + 2*rs_b + 3*cs_b) = b23; \
\
*(c + 2*rs_c + 0*cs_c) = b20; \
*(c + 2*rs_c + 1*cs_c) = b21; \
*(c + 2*rs_c + 2*cs_c) = b22; \
@@ -151,6 +156,7 @@ void PASTEMAC(ch,varname)( \
*(b + 1*rs_b + 1*cs_b) = b11; \
*(b + 1*rs_b + 2*cs_b) = b12; \
*(b + 1*rs_b + 3*cs_b) = b13; \
\
*(c + 1*rs_c + 0*cs_c) = b10; \
*(c + 1*rs_c + 1*cs_c) = b11; \
*(c + 1*rs_c + 2*cs_c) = b12; \
@@ -188,6 +194,7 @@ void PASTEMAC(ch,varname)( \
*(b + 0*rs_b + 1*cs_b) = b01; \
*(b + 0*rs_b + 2*cs_b) = b02; \
*(b + 0*rs_b + 3*cs_b) = b03; \
\
*(c + 0*rs_c + 0*cs_c) = b00; \
*(c + 0*rs_c + 1*cs_c) = b01; \
*(c + 0*rs_c + 2*cs_c) = b02; \

View File

@@ -1 +1 @@
0.0.1-1
0.0.1-2