mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Fixed a race condition exposed by the pthreads implementation.
The lead thread of the inner thread communicator could exit the subproblem, move on to the next iteration of the loop, and modify a1_pack, b1_pack, or c1_pack while other threads were still using them. Barriers were inserted to fix this.
This commit is contained in:
@@ -130,6 +130,8 @@ void bli_gemm_blk_var1f( obj_t* a,
|
||||
c1_pack,
|
||||
cntl_sub_gemm( cntl ),
|
||||
gemm_thread_sub_gemm( thread ) );
|
||||
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
// Currently must be done by 1 thread
|
||||
|
||||
@@ -129,6 +129,8 @@ void bli_gemm_blk_var2f( obj_t* a,
|
||||
c1_pack,
|
||||
cntl_sub_gemm( cntl ),
|
||||
gemm_thread_sub_gemm( thread ) );
|
||||
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
// Currently must be done by 1 thread
|
||||
|
||||
@@ -131,7 +131,7 @@ void bli_gemm_blk_var3f( obj_t* a,
|
||||
// And since c_pack is a local obj_t, we can simply overwrite the
|
||||
// internal beta scalar with BLIS_ONE once it has been used in the
|
||||
// first iteration.
|
||||
if ( i == 0 ) thread_ibarrier( thread );
|
||||
thread_ibarrier( thread );
|
||||
if ( i == 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack );
|
||||
|
||||
}
|
||||
|
||||
@@ -127,6 +127,8 @@ void bli_herk_blk_var1f( obj_t* a,
|
||||
cntl_sub_gemm( cntl ),
|
||||
herk_thread_sub_herk( thread ) );
|
||||
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
cntl_sub_unpackm_c( cntl ),
|
||||
|
||||
@@ -142,6 +142,8 @@ void bli_herk_blk_var2f( obj_t* a,
|
||||
cntl_sub_gemm( cntl ),
|
||||
herk_thread_sub_herk( thread ) );
|
||||
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( c1S_pack, &c1S,
|
||||
cntl_sub_unpackm_c( cntl ),
|
||||
|
||||
@@ -128,7 +128,7 @@ void bli_herk_blk_var3f( obj_t* a,
|
||||
// And since c_pack is a local obj_t, we can simply overwrite the
|
||||
// internal beta scalar with BLIS_ONE once it has been used in the
|
||||
// first iteration.
|
||||
if ( i == 0 ) thread_ibarrier( thread );
|
||||
thread_ibarrier( thread );
|
||||
if ( i == 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack );
|
||||
|
||||
}
|
||||
|
||||
@@ -160,38 +160,56 @@ herk_thrinfo_t** bli_create_herk_thrinfo_paths( )
|
||||
dim_t jc_comm_id = b*kc_nt + kc_comm_id;
|
||||
dim_t global_comm_id = a*jc_nt + jc_comm_id;
|
||||
|
||||
// Macrokernel loops
|
||||
herk_thrinfo_t* ir_info = bli_create_herk_thrinfo_node( jr_comm, jr_comm_id,
|
||||
ir_comm, ir_comm_id,
|
||||
ir_way, e,
|
||||
ir_way, e,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
herk_thrinfo_t* jr_info = bli_create_herk_thrinfo_node( ic_comm, ic_comm_id,
|
||||
jr_comm, jr_comm_id,
|
||||
jr_way, d,
|
||||
jr_way, d,
|
||||
NULL, NULL, ir_info);
|
||||
|
||||
packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
kc_nt, kc_comm_id );
|
||||
|
||||
packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id,
|
||||
//blk_var_1
|
||||
packm_thrinfo_t* pack_ic_in = bli_create_packm_thread_info( ic_comm, ic_comm_id,
|
||||
jr_comm, jr_comm_id,
|
||||
ic_nt, ic_comm_id );
|
||||
|
||||
packm_thrinfo_t* pack_ic_out = bli_create_packm_thread_info( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
kc_nt, kc_comm_id );
|
||||
|
||||
herk_thrinfo_t* ic_info = bli_create_herk_thrinfo_node( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
ic_way, c,
|
||||
packb, packa, jr_info);
|
||||
ic_way, c,
|
||||
pack_ic_out, pack_ic_in, jr_info);
|
||||
//blk_var_3
|
||||
packm_thrinfo_t* pack_kc_in = bli_create_packm_thread_info( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
kc_nt, kc_comm_id );
|
||||
|
||||
packm_thrinfo_t* pack_kc_out = bli_create_packm_thread_info( jc_comm, jc_comm_id,
|
||||
jc_comm, jc_comm_id,
|
||||
jc_nt, jc_comm_id );
|
||||
|
||||
herk_thrinfo_t* kc_info = bli_create_herk_thrinfo_node( jc_comm, jc_comm_id,
|
||||
kc_comm, kc_comm_id,
|
||||
kc_way, b,
|
||||
NULL, NULL, ic_info);
|
||||
pack_kc_out, pack_kc_in, ic_info);
|
||||
|
||||
//blk_var_2
|
||||
packm_thrinfo_t* pack_jc_in = bli_create_packm_thread_info( jc_comm, jc_comm_id,
|
||||
kc_comm, kc_comm_id,
|
||||
jc_nt, jc_comm_id );
|
||||
|
||||
packm_thrinfo_t* pack_jc_out = bli_create_packm_thread_info( global_comm, global_comm_id,
|
||||
jc_comm, jc_comm_id,
|
||||
global_num_threads, global_comm_id );
|
||||
|
||||
herk_thrinfo_t* jc_info = bli_create_herk_thrinfo_node( global_comm, global_comm_id,
|
||||
jc_comm, jc_comm_id,
|
||||
jc_way, a,
|
||||
NULL, NULL, kc_info);
|
||||
jc_way, a,
|
||||
pack_jc_out, pack_jc_in, kc_info);
|
||||
|
||||
paths[global_comm_id] = jc_info;
|
||||
}
|
||||
|
||||
@@ -138,6 +138,7 @@ void bli_trmm_blk_var1f( obj_t* a,
|
||||
c1_pack,
|
||||
cntl_sub_gemm( cntl ),
|
||||
trmm_thread_sub_trmm( thread ) );
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
|
||||
@@ -126,6 +126,7 @@ void bli_trmm_blk_var2b( obj_t* a,
|
||||
c1_pack,
|
||||
cntl_sub_gemm( cntl ),
|
||||
trmm_thread_sub_trmm( thread ) );
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
|
||||
@@ -126,6 +126,7 @@ void bli_trmm_blk_var2f( obj_t* a,
|
||||
c1_pack,
|
||||
cntl_sub_gemm( cntl ),
|
||||
trmm_thread_sub_trmm( thread ) );
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
|
||||
@@ -124,6 +124,7 @@ void bli_trmm_blk_var3b( obj_t* a,
|
||||
c_pack,
|
||||
cntl_sub_gemm( cntl ),
|
||||
trmm_thread_sub_trmm( thread ) );
|
||||
thread_ibarrier( thread );
|
||||
}
|
||||
|
||||
thread_obarrier( thread );
|
||||
|
||||
@@ -124,6 +124,7 @@ void bli_trmm_blk_var3f( obj_t* a,
|
||||
c_pack,
|
||||
cntl_sub_gemm( cntl ),
|
||||
trmm_thread_sub_trmm( thread ) );
|
||||
thread_ibarrier( thread );
|
||||
}
|
||||
|
||||
thread_obarrier( thread );
|
||||
|
||||
@@ -164,7 +164,8 @@ trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( bool_t jc_dependency )
|
||||
dim_t kc_comm_id = c*ic_nt + ic_comm_id;
|
||||
dim_t jc_comm_id = b*kc_nt + kc_comm_id;
|
||||
dim_t global_comm_id = a*jc_nt + jc_comm_id;
|
||||
|
||||
|
||||
// Macrokernel loops
|
||||
trmm_thrinfo_t* ir_info = bli_create_trmm_thrinfo_node( jr_comm, jr_comm_id,
|
||||
ir_comm, ir_comm_id,
|
||||
ir_way, e,
|
||||
@@ -174,29 +175,46 @@ trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( bool_t jc_dependency )
|
||||
jr_comm, jr_comm_id,
|
||||
jr_way, d,
|
||||
NULL, NULL, ir_info);
|
||||
|
||||
packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
kc_nt, kc_comm_id );
|
||||
|
||||
packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id,
|
||||
//blk_var_1
|
||||
packm_thrinfo_t* pack_ic_in = bli_create_packm_thread_info( ic_comm, ic_comm_id,
|
||||
jr_comm, jr_comm_id,
|
||||
ic_nt, ic_comm_id );
|
||||
|
||||
packm_thrinfo_t* pack_ic_out = bli_create_packm_thread_info( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
kc_nt, kc_comm_id );
|
||||
|
||||
trmm_thrinfo_t* ic_info = bli_create_trmm_thrinfo_node( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
ic_way, c,
|
||||
packb, packa, jr_info);
|
||||
pack_ic_out, pack_ic_in, jr_info);
|
||||
//blk_var_3
|
||||
packm_thrinfo_t* pack_kc_in = bli_create_packm_thread_info( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
kc_nt, kc_comm_id );
|
||||
|
||||
packm_thrinfo_t* pack_kc_out = bli_create_packm_thread_info( jc_comm, jc_comm_id,
|
||||
jc_comm, jc_comm_id,
|
||||
jc_nt, jc_comm_id );
|
||||
|
||||
trmm_thrinfo_t* kc_info = bli_create_trmm_thrinfo_node( jc_comm, jc_comm_id,
|
||||
kc_comm, kc_comm_id,
|
||||
kc_way, b,
|
||||
NULL, NULL, ic_info);
|
||||
pack_kc_out, pack_kc_in, ic_info);
|
||||
//blk_var_2
|
||||
packm_thrinfo_t* pack_jc_in = bli_create_packm_thread_info( jc_comm, jc_comm_id,
|
||||
kc_comm, kc_comm_id,
|
||||
jc_nt, jc_comm_id );
|
||||
|
||||
packm_thrinfo_t* pack_jc_out = bli_create_packm_thread_info( global_comm, global_comm_id,
|
||||
jc_comm, jc_comm_id,
|
||||
global_num_threads, global_comm_id );
|
||||
|
||||
trmm_thrinfo_t* jc_info = bli_create_trmm_thrinfo_node( global_comm, global_comm_id,
|
||||
jc_comm, jc_comm_id,
|
||||
jc_way, a,
|
||||
NULL, NULL, kc_info);
|
||||
pack_jc_out, pack_jc_in, kc_info);
|
||||
|
||||
paths[global_comm_id] = jc_info;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -121,6 +121,7 @@ void bli_trsm_blk_var1b( obj_t* a,
|
||||
&c1,
|
||||
cntl_sub_trsm( cntl ),
|
||||
trsm_thread_sub_trsm( thread ) );
|
||||
thread_ibarrier( thread );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
|
||||
@@ -120,6 +120,7 @@ void bli_trsm_blk_var1f( obj_t* a,
|
||||
&c1,
|
||||
cntl_sub_trsm( cntl ),
|
||||
trsm_thread_sub_trsm( thread ) );
|
||||
thread_ibarrier( thread );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
|
||||
@@ -128,6 +128,7 @@ void bli_trsm_blk_var2b( obj_t* a,
|
||||
c1_pack,
|
||||
cntl_sub_trsm( cntl ),
|
||||
trsm_thread_sub_trsm( thread ) );
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
|
||||
@@ -128,6 +128,7 @@ void bli_trsm_blk_var2f( obj_t* a,
|
||||
c1_pack,
|
||||
cntl_sub_trsm( cntl ),
|
||||
trsm_thread_sub_trsm( thread ) );
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
|
||||
@@ -129,7 +129,7 @@ void bli_trsm_blk_var3b( obj_t* a,
|
||||
// This variant executes multiple rank-k updates. Therefore, if the
|
||||
// internal alpha scalars on A/B and C are non-zero, we must ensure
|
||||
// that they are only used in the first iteration.
|
||||
if ( i == 0 ) thread_ibarrier( thread );
|
||||
thread_ibarrier( thread );
|
||||
if ( i == 0 && thread_am_ichief( thread ) ) {
|
||||
bli_obj_scalar_reset( a );
|
||||
bli_obj_scalar_reset( b );
|
||||
|
||||
@@ -129,7 +129,7 @@ void bli_trsm_blk_var3f( obj_t* a,
|
||||
// This variant executes multiple rank-k updates. Therefore, if the
|
||||
// internal alpha scalars on A/B and C are non-zero, we must ensure
|
||||
// that they are only used in the first iteration.
|
||||
if ( i == 0 ) thread_ibarrier( thread );
|
||||
thread_ibarrier( thread );
|
||||
if ( i == 0 && thread_am_ichief( thread ) ) {
|
||||
bli_obj_scalar_reset( a );
|
||||
bli_obj_scalar_reset( b );
|
||||
|
||||
@@ -167,6 +167,8 @@ trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( bool_t right_sided )
|
||||
dim_t jc_comm_id = b*kc_nt + kc_comm_id;
|
||||
dim_t global_comm_id = a*jc_nt + jc_comm_id;
|
||||
|
||||
|
||||
// Macrokernel loops
|
||||
trsm_thrinfo_t* ir_info = bli_create_trsm_thrinfo_node( jr_comm, jr_comm_id,
|
||||
ir_comm, ir_comm_id,
|
||||
ir_way, e,
|
||||
@@ -176,29 +178,46 @@ trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( bool_t right_sided )
|
||||
jr_comm, jr_comm_id,
|
||||
jr_way, d,
|
||||
NULL, NULL, ir_info);
|
||||
//blk_var_1
|
||||
packm_thrinfo_t* pack_ic_in = bli_create_packm_thread_info( ic_comm, ic_comm_id,
|
||||
jr_comm, jr_comm_id,
|
||||
ic_nt, ic_comm_id );
|
||||
|
||||
packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
kc_nt, kc_comm_id );
|
||||
|
||||
packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id,
|
||||
jr_comm, jr_comm_id,
|
||||
ic_nt, ic_comm_id );
|
||||
packm_thrinfo_t* pack_ic_out = bli_create_packm_thread_info( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
kc_nt, kc_comm_id );
|
||||
|
||||
trsm_thrinfo_t* ic_info = bli_create_trsm_thrinfo_node( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
ic_way, c,
|
||||
packb, packa, jr_info);
|
||||
pack_ic_out, pack_ic_in, jr_info);
|
||||
//blk_var_3
|
||||
packm_thrinfo_t* pack_kc_in = bli_create_packm_thread_info( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
kc_nt, kc_comm_id );
|
||||
|
||||
packm_thrinfo_t* pack_kc_out = bli_create_packm_thread_info( jc_comm, jc_comm_id,
|
||||
jc_comm, jc_comm_id,
|
||||
jc_nt, jc_comm_id );
|
||||
|
||||
trsm_thrinfo_t* kc_info = bli_create_trsm_thrinfo_node( jc_comm, jc_comm_id,
|
||||
kc_comm, kc_comm_id,
|
||||
kc_way, b,
|
||||
NULL, NULL, ic_info);
|
||||
pack_kc_out, pack_kc_in, ic_info);
|
||||
//blk_var_2
|
||||
packm_thrinfo_t* pack_jc_in = bli_create_packm_thread_info( jc_comm, jc_comm_id,
|
||||
kc_comm, kc_comm_id,
|
||||
jc_nt, jc_comm_id );
|
||||
|
||||
packm_thrinfo_t* pack_jc_out = bli_create_packm_thread_info( global_comm, global_comm_id,
|
||||
jc_comm, jc_comm_id,
|
||||
global_num_threads, global_comm_id );
|
||||
|
||||
trsm_thrinfo_t* jc_info = bli_create_trsm_thrinfo_node( global_comm, global_comm_id,
|
||||
jc_comm, jc_comm_id,
|
||||
jc_way, a,
|
||||
NULL, NULL, kc_info);
|
||||
pack_jc_out, pack_jc_in, kc_info);
|
||||
|
||||
paths[global_comm_id] = jc_info;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user