mirror of
https://github.com/amd/blis.git
synced 2026-03-16 07:17:21 +00:00
Fixed bugs in cpackm kernels, gemmlike code.
Details: - Fixed intermittent bugs in bli_packm_haswell_asm_c3xk.c and bli_packm_haswell_asm_c8xk.c whereby the imaginary component of the kappa scalar was incorrectly loaded at an offset of 8 bytes (instead of 4 bytes) from the real component. This was almost certainly a copy-paste bug carried over from the corresponding zpackm kernels. Thanks to Devin Matthews for bringing this to my attention. - Added missing code to gemmlike sandbox files bls_gemm_bp_var1.c and bls_gemm_bp_var2.c that initializes the elements of the temporary microtile to zero. (This bug was never observed in output but rather noticed analytically. It probably would have also manifested as intermittent failures, this time involving edge cases.) - Minor commented-out/disabled changes to testsuite/src/test_gemm.c relating to debugging.
This commit is contained in:
@@ -125,7 +125,7 @@ void bli_cpackm_haswell_asm_3xk
|
||||
|
||||
mov(var(kappa), rcx) // load address of kappa
|
||||
vbroadcastss(mem(rcx, 0), ymm10) // load kappa_r and duplicate
|
||||
vbroadcastss(mem(rcx, 8), ymm11) // load kappa_i and duplicate
|
||||
vbroadcastss(mem(rcx, 4), ymm11) // load kappa_i and duplicate
|
||||
|
||||
|
||||
// now branch on kappa == 1.0
|
||||
|
||||
@@ -125,7 +125,7 @@ void bli_cpackm_haswell_asm_8xk
|
||||
|
||||
mov(var(kappa), rcx) // load address of kappa
|
||||
vbroadcastss(mem(rcx, 0), ymm10) // load kappa_r and duplicate
|
||||
vbroadcastss(mem(rcx, 8), ymm11) // load kappa_i and duplicate
|
||||
vbroadcastss(mem(rcx, 4), ymm11) // load kappa_i and duplicate
|
||||
|
||||
|
||||
// now branch on kappa == 1.0
|
||||
|
||||
@@ -230,6 +230,9 @@ void PASTECH2(bls_,ch,varname) \
|
||||
thrinfo_t* restrict thread_pa = NULL; \
|
||||
thrinfo_t* restrict thread_jr = NULL; \
|
||||
thrinfo_t* restrict thread_ir = NULL; \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Identify the current thrinfo_t node and then grow the tree. */ \
|
||||
thread_jc = thread; \
|
||||
|
||||
@@ -538,6 +538,12 @@ void PASTECH2(bls_,ch,varname) \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
ctype zero = *PASTEMAC(ch,0); \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs.
|
||||
NOTE: This initialization should really be done statically since
|
||||
var2 executes this microkernel wrapper many times, and the overhead
|
||||
of touching the temporary microtile adds up. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( mr_cur == MR && nr_cur == NR ) \
|
||||
|
||||
@@ -176,17 +176,17 @@ void PASTECH2(bls_,ch,varname) \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
p_begin += ps_p; \
|
||||
\
|
||||
/*
|
||||
if ( row_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: b packed", panel_len_max, panel_dim_max, \
|
||||
p_use, rs_p, cs_p, "%5.2f", "" ); \
|
||||
if ( !row_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: a packed", panel_dim_max, panel_len_max, \
|
||||
p_use, rs_p, cs_p, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var1: a packed", panel_dim_max, panel_len_max, \
|
||||
p_use, rs_p, cs_p, "%5.2f", "" ); \
|
||||
else \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var1: b packed", panel_len_max, panel_dim_max, \
|
||||
p_use, rs_p, cs_p, "%5.2f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
p_begin += ps_p; \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
@@ -254,18 +254,17 @@ void libblis_test_gemm_experiment
|
||||
bli_setsc( 0.9, 1.0, &beta );
|
||||
}
|
||||
|
||||
#if 0
|
||||
//bli_setm( &BLIS_ONE, &a );
|
||||
bli_setsc( 1.0, 0.0, &alpha );
|
||||
bli_setsc( 1.0, 0.0, &beta );
|
||||
#endif
|
||||
|
||||
// Randomize A, B, and C, and save C.
|
||||
libblis_test_mobj_randomize( params, TRUE, &a );
|
||||
libblis_test_mobj_randomize( params, TRUE, &b );
|
||||
libblis_test_mobj_randomize( params, TRUE, &c );
|
||||
bli_copym( &c, &c_save );
|
||||
//bli_setm( &BLIS_ONE, &a );
|
||||
//bli_setsc( 1.0, 0.0, &alpha );
|
||||
//bli_setsc( 0.0, 0.0, &beta );
|
||||
|
||||
//bli_setm( &BLIS_ONE, &a );
|
||||
//bli_setsc( 1.0, 0.0, &alpha );
|
||||
//bli_setsc( 0.0, 0.0, &beta );
|
||||
|
||||
// Apply the parameters.
|
||||
bli_obj_set_conjtrans( transa, &a );
|
||||
@@ -456,11 +455,13 @@ bli_printm( "c", c, "%5.2f", "" );
|
||||
// bli_obj_stor3_from_strides( c, a, b ) == BLIS_CRR )
|
||||
//bli_printm( "c before", c, "%6.3f", "" );
|
||||
bli_gemm( alpha, a, b, beta, c );
|
||||
//bls_gemm( alpha, a, b, beta, c );
|
||||
#if 0
|
||||
if ( bli_obj_length( c ) == 12 &&
|
||||
bli_obj_stor3_from_strides( c, a, b ) == BLIS_RRR )
|
||||
bli_printm( "c after", c, "%6.3f", "" );
|
||||
#endif
|
||||
//bli_printm( "c after", c, "%5.2f", "" );
|
||||
break;
|
||||
|
||||
default:
|
||||
|
||||
Reference in New Issue
Block a user