Fixed bugs in cpackm kernels, gemmlike code.

Details:
- Fixed intermittent bugs in bli_packm_haswell_asm_c3xk.c and
  bli_packm_haswell_asm_c8xk.c whereby the imaginary component of the
  kappa scalar was incorrectly loaded at an offset of 8 bytes (instead
  of 4 bytes) from the real component. This was almost certainly a copy-
  paste bug carried over from the corresponding zpackm kernels. Thanks to
  Devin Matthews for bringing this to my attention.
- Added missing code to gemmlike sandbox files bls_gemm_bp_var1.c and
  bls_gemm_bp_var2.c that initializes the elements of the temporary
  microtile to zero. (This bug was never observed in output but rather
  noticed analytically. It probably would have also manifested as
  intermittent failures, this time involving edge cases.)
- Minor commented-out/disabled changes to testsuite/src/test_gemm.c
  relating to debugging.
This commit is contained in:
Field G. Van Zee
2021-05-31 16:50:18 -05:00
parent 213dce32d2
commit 7f7d72610c
6 changed files with 26 additions and 16 deletions

View File

@@ -125,7 +125,7 @@ void bli_cpackm_haswell_asm_3xk
mov(var(kappa), rcx) // load address of kappa
vbroadcastss(mem(rcx, 0), ymm10) // load kappa_r and duplicate
vbroadcastss(mem(rcx, 8), ymm11) // load kappa_i and duplicate
vbroadcastss(mem(rcx, 4), ymm11) // load kappa_i and duplicate
// now branch on kappa == 1.0

View File

@@ -125,7 +125,7 @@ void bli_cpackm_haswell_asm_8xk
mov(var(kappa), rcx) // load address of kappa
vbroadcastss(mem(rcx, 0), ymm10) // load kappa_r and duplicate
vbroadcastss(mem(rcx, 8), ymm11) // load kappa_i and duplicate
vbroadcastss(mem(rcx, 4), ymm11) // load kappa_i and duplicate
// now branch on kappa == 1.0

View File

@@ -230,6 +230,9 @@ void PASTECH2(bls_,ch,varname) \
thrinfo_t* restrict thread_pa = NULL; \
thrinfo_t* restrict thread_jr = NULL; \
thrinfo_t* restrict thread_ir = NULL; \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, ct, rs_ct, cs_ct ); \
\
/* Identify the current thrinfo_t node and then grow the tree. */ \
thread_jc = thread; \

View File

@@ -538,6 +538,12 @@ void PASTECH2(bls_,ch,varname) \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype zero = *PASTEMAC(ch,0); \
\
/* Clear the temporary C buffer in case it has any infs or NaNs.
NOTE: This initialization should really be done statically since
var2 executes this microkernel wrapper many times, and the overhead
of touching the temporary microtile adds up. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, ct, rs_ct, cs_ct ); \
\
/* Handle interior and edge cases separately. */ \
if ( mr_cur == MR && nr_cur == NR ) \

View File

@@ -176,17 +176,17 @@ void PASTECH2(bls_,ch,varname) \
cntx \
); \
} \
\
p_begin += ps_p; \
\
/*
if ( row_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: b packed", panel_len_max, panel_dim_max, \
p_use, rs_p, cs_p, "%5.2f", "" ); \
if ( !row_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: a packed", panel_dim_max, panel_len_max, \
p_use, rs_p, cs_p, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_var1: a packed", panel_dim_max, panel_len_max, \
p_use, rs_p, cs_p, "%5.2f", "" ); \
else \
PASTEMAC(ch,fprintm)( stdout, "packm_var1: b packed", panel_len_max, panel_dim_max, \
p_use, rs_p, cs_p, "%5.2f", "" ); \
*/ \
\
p_begin += ps_p; \
} \
}

View File

@@ -254,18 +254,17 @@ void libblis_test_gemm_experiment
bli_setsc( 0.9, 1.0, &beta );
}
#if 0
//bli_setm( &BLIS_ONE, &a );
bli_setsc( 1.0, 0.0, &alpha );
bli_setsc( 1.0, 0.0, &beta );
#endif
// Randomize A, B, and C, and save C.
libblis_test_mobj_randomize( params, TRUE, &a );
libblis_test_mobj_randomize( params, TRUE, &b );
libblis_test_mobj_randomize( params, TRUE, &c );
bli_copym( &c, &c_save );
//bli_setm( &BLIS_ONE, &a );
//bli_setsc( 1.0, 0.0, &alpha );
//bli_setsc( 0.0, 0.0, &beta );
//bli_setm( &BLIS_ONE, &a );
//bli_setsc( 1.0, 0.0, &alpha );
//bli_setsc( 0.0, 0.0, &beta );
// Apply the parameters.
bli_obj_set_conjtrans( transa, &a );
@@ -456,11 +455,13 @@ bli_printm( "c", c, "%5.2f", "" );
// bli_obj_stor3_from_strides( c, a, b ) == BLIS_CRR )
//bli_printm( "c before", c, "%6.3f", "" );
bli_gemm( alpha, a, b, beta, c );
//bls_gemm( alpha, a, b, beta, c );
#if 0
if ( bli_obj_length( c ) == 12 &&
bli_obj_stor3_from_strides( c, a, b ) == BLIS_RRR )
bli_printm( "c after", c, "%6.3f", "" );
#endif
//bli_printm( "c after", c, "%5.2f", "" );
break;
default: