Fixed a type of race condition exposed by the pthreads implementation.

The lead thread of the inner thread communicator could exit the subproblem, move on to the next iteration of the loop, and modify a1_pack, b1_pack, or c1_pack while other threads were still using them.

Barriers were inserted to fix this.
This commit is contained in:
Tyler Smith
2014-11-26 18:00:56 -06:00
parent 76bde44411
commit bef24e67e0
20 changed files with 109 additions and 37 deletions

View File

@@ -130,6 +130,8 @@ void bli_gemm_blk_var1f( obj_t* a,
c1_pack,
cntl_sub_gemm( cntl ),
gemm_thread_sub_gemm( thread ) );
thread_ibarrier( thread );
// Unpack C1 (if C1 was packed).
// Currently must be done by 1 thread

View File

@@ -129,6 +129,8 @@ void bli_gemm_blk_var2f( obj_t* a,
c1_pack,
cntl_sub_gemm( cntl ),
gemm_thread_sub_gemm( thread ) );
thread_ibarrier( thread );
// Unpack C1 (if C1 was packed).
// Currently must be done by 1 thread

View File

@@ -131,7 +131,7 @@ void bli_gemm_blk_var3f( obj_t* a,
// And since c_pack is a local obj_t, we can simply overwrite the
// internal beta scalar with BLIS_ONE once it has been used in the
// first iteration.
if ( i == 0 ) thread_ibarrier( thread );
thread_ibarrier( thread );
if ( i == 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack );
}

View File

@@ -127,6 +127,8 @@ void bli_herk_blk_var1f( obj_t* a,
cntl_sub_gemm( cntl ),
herk_thread_sub_herk( thread ) );
thread_ibarrier( thread );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( c1_pack, &c1,
cntl_sub_unpackm_c( cntl ),

View File

@@ -142,6 +142,8 @@ void bli_herk_blk_var2f( obj_t* a,
cntl_sub_gemm( cntl ),
herk_thread_sub_herk( thread ) );
thread_ibarrier( thread );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( c1S_pack, &c1S,
cntl_sub_unpackm_c( cntl ),

View File

@@ -128,7 +128,7 @@ void bli_herk_blk_var3f( obj_t* a,
// And since c_pack is a local obj_t, we can simply overwrite the
// internal beta scalar with BLIS_ONE once it has been used in the
// first iteration.
if ( i == 0 ) thread_ibarrier( thread );
thread_ibarrier( thread );
if ( i == 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack );
}

View File

@@ -160,38 +160,56 @@ herk_thrinfo_t** bli_create_herk_thrinfo_paths( )
dim_t jc_comm_id = b*kc_nt + kc_comm_id;
dim_t global_comm_id = a*jc_nt + jc_comm_id;
// Macrokernel loops
herk_thrinfo_t* ir_info = bli_create_herk_thrinfo_node( jr_comm, jr_comm_id,
ir_comm, ir_comm_id,
ir_way, e,
ir_way, e,
NULL, NULL, NULL);
herk_thrinfo_t* jr_info = bli_create_herk_thrinfo_node( ic_comm, ic_comm_id,
jr_comm, jr_comm_id,
jr_way, d,
jr_way, d,
NULL, NULL, ir_info);
packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id,
ic_comm, ic_comm_id,
kc_nt, kc_comm_id );
packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id,
//blk_var_1
packm_thrinfo_t* pack_ic_in = bli_create_packm_thread_info( ic_comm, ic_comm_id,
jr_comm, jr_comm_id,
ic_nt, ic_comm_id );
packm_thrinfo_t* pack_ic_out = bli_create_packm_thread_info( kc_comm, kc_comm_id,
ic_comm, ic_comm_id,
kc_nt, kc_comm_id );
herk_thrinfo_t* ic_info = bli_create_herk_thrinfo_node( kc_comm, kc_comm_id,
ic_comm, ic_comm_id,
ic_way, c,
packb, packa, jr_info);
ic_way, c,
pack_ic_out, pack_ic_in, jr_info);
//blk_var_3
packm_thrinfo_t* pack_kc_in = bli_create_packm_thread_info( kc_comm, kc_comm_id,
ic_comm, ic_comm_id,
kc_nt, kc_comm_id );
packm_thrinfo_t* pack_kc_out = bli_create_packm_thread_info( jc_comm, jc_comm_id,
jc_comm, jc_comm_id,
jc_nt, jc_comm_id );
herk_thrinfo_t* kc_info = bli_create_herk_thrinfo_node( jc_comm, jc_comm_id,
kc_comm, kc_comm_id,
kc_way, b,
NULL, NULL, ic_info);
pack_kc_out, pack_kc_in, ic_info);
//blk_var_2
packm_thrinfo_t* pack_jc_in = bli_create_packm_thread_info( jc_comm, jc_comm_id,
kc_comm, kc_comm_id,
jc_nt, jc_comm_id );
packm_thrinfo_t* pack_jc_out = bli_create_packm_thread_info( global_comm, global_comm_id,
jc_comm, jc_comm_id,
global_num_threads, global_comm_id );
herk_thrinfo_t* jc_info = bli_create_herk_thrinfo_node( global_comm, global_comm_id,
jc_comm, jc_comm_id,
jc_way, a,
NULL, NULL, kc_info);
jc_way, a,
pack_jc_out, pack_jc_in, kc_info);
paths[global_comm_id] = jc_info;
}

View File

@@ -138,6 +138,7 @@ void bli_trmm_blk_var1f( obj_t* a,
c1_pack,
cntl_sub_gemm( cntl ),
trmm_thread_sub_trmm( thread ) );
thread_ibarrier( thread );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( c1_pack, &c1,

View File

@@ -126,6 +126,7 @@ void bli_trmm_blk_var2b( obj_t* a,
c1_pack,
cntl_sub_gemm( cntl ),
trmm_thread_sub_trmm( thread ) );
thread_ibarrier( thread );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( c1_pack, &c1,

View File

@@ -126,6 +126,7 @@ void bli_trmm_blk_var2f( obj_t* a,
c1_pack,
cntl_sub_gemm( cntl ),
trmm_thread_sub_trmm( thread ) );
thread_ibarrier( thread );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( c1_pack, &c1,

View File

@@ -124,6 +124,7 @@ void bli_trmm_blk_var3b( obj_t* a,
c_pack,
cntl_sub_gemm( cntl ),
trmm_thread_sub_trmm( thread ) );
thread_ibarrier( thread );
}
thread_obarrier( thread );

View File

@@ -124,6 +124,7 @@ void bli_trmm_blk_var3f( obj_t* a,
c_pack,
cntl_sub_gemm( cntl ),
trmm_thread_sub_trmm( thread ) );
thread_ibarrier( thread );
}
thread_obarrier( thread );

View File

@@ -164,7 +164,8 @@ trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( bool_t jc_dependency )
dim_t kc_comm_id = c*ic_nt + ic_comm_id;
dim_t jc_comm_id = b*kc_nt + kc_comm_id;
dim_t global_comm_id = a*jc_nt + jc_comm_id;
// Macrokernel loops
trmm_thrinfo_t* ir_info = bli_create_trmm_thrinfo_node( jr_comm, jr_comm_id,
ir_comm, ir_comm_id,
ir_way, e,
@@ -174,29 +175,46 @@ trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( bool_t jc_dependency )
jr_comm, jr_comm_id,
jr_way, d,
NULL, NULL, ir_info);
packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id,
ic_comm, ic_comm_id,
kc_nt, kc_comm_id );
packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id,
//blk_var_1
packm_thrinfo_t* pack_ic_in = bli_create_packm_thread_info( ic_comm, ic_comm_id,
jr_comm, jr_comm_id,
ic_nt, ic_comm_id );
packm_thrinfo_t* pack_ic_out = bli_create_packm_thread_info( kc_comm, kc_comm_id,
ic_comm, ic_comm_id,
kc_nt, kc_comm_id );
trmm_thrinfo_t* ic_info = bli_create_trmm_thrinfo_node( kc_comm, kc_comm_id,
ic_comm, ic_comm_id,
ic_way, c,
packb, packa, jr_info);
pack_ic_out, pack_ic_in, jr_info);
//blk_var_3
packm_thrinfo_t* pack_kc_in = bli_create_packm_thread_info( kc_comm, kc_comm_id,
ic_comm, ic_comm_id,
kc_nt, kc_comm_id );
packm_thrinfo_t* pack_kc_out = bli_create_packm_thread_info( jc_comm, jc_comm_id,
jc_comm, jc_comm_id,
jc_nt, jc_comm_id );
trmm_thrinfo_t* kc_info = bli_create_trmm_thrinfo_node( jc_comm, jc_comm_id,
kc_comm, kc_comm_id,
kc_way, b,
NULL, NULL, ic_info);
pack_kc_out, pack_kc_in, ic_info);
//blk_var_2
packm_thrinfo_t* pack_jc_in = bli_create_packm_thread_info( jc_comm, jc_comm_id,
kc_comm, kc_comm_id,
jc_nt, jc_comm_id );
packm_thrinfo_t* pack_jc_out = bli_create_packm_thread_info( global_comm, global_comm_id,
jc_comm, jc_comm_id,
global_num_threads, global_comm_id );
trmm_thrinfo_t* jc_info = bli_create_trmm_thrinfo_node( global_comm, global_comm_id,
jc_comm, jc_comm_id,
jc_way, a,
NULL, NULL, kc_info);
pack_jc_out, pack_jc_in, kc_info);
paths[global_comm_id] = jc_info;
}
}

View File

@@ -121,6 +121,7 @@ void bli_trsm_blk_var1b( obj_t* a,
&c1,
cntl_sub_trsm( cntl ),
trsm_thread_sub_trsm( thread ) );
thread_ibarrier( thread );
}
// If any packing buffers were acquired within packm, release them back

View File

@@ -120,6 +120,7 @@ void bli_trsm_blk_var1f( obj_t* a,
&c1,
cntl_sub_trsm( cntl ),
trsm_thread_sub_trsm( thread ) );
thread_ibarrier( thread );
}
// If any packing buffers were acquired within packm, release them back

View File

@@ -128,6 +128,7 @@ void bli_trsm_blk_var2b( obj_t* a,
c1_pack,
cntl_sub_trsm( cntl ),
trsm_thread_sub_trsm( thread ) );
thread_ibarrier( thread );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( c1_pack, &c1,

View File

@@ -128,6 +128,7 @@ void bli_trsm_blk_var2f( obj_t* a,
c1_pack,
cntl_sub_trsm( cntl ),
trsm_thread_sub_trsm( thread ) );
thread_ibarrier( thread );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( c1_pack, &c1,

View File

@@ -129,7 +129,7 @@ void bli_trsm_blk_var3b( obj_t* a,
// This variant executes multiple rank-k updates. Therefore, if the
// internal alpha scalars on A/B and C are non-zero, we must ensure
// that they are only used in the first iteration.
if ( i == 0 ) thread_ibarrier( thread );
thread_ibarrier( thread );
if ( i == 0 && thread_am_ichief( thread ) ) {
bli_obj_scalar_reset( a );
bli_obj_scalar_reset( b );

View File

@@ -129,7 +129,7 @@ void bli_trsm_blk_var3f( obj_t* a,
// This variant executes multiple rank-k updates. Therefore, if the
// internal alpha scalars on A/B and C are non-zero, we must ensure
// that they are only used in the first iteration.
if ( i == 0 ) thread_ibarrier( thread );
thread_ibarrier( thread );
if ( i == 0 && thread_am_ichief( thread ) ) {
bli_obj_scalar_reset( a );
bli_obj_scalar_reset( b );

View File

@@ -167,6 +167,8 @@ trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( bool_t right_sided )
dim_t jc_comm_id = b*kc_nt + kc_comm_id;
dim_t global_comm_id = a*jc_nt + jc_comm_id;
// Macrokernel loops
trsm_thrinfo_t* ir_info = bli_create_trsm_thrinfo_node( jr_comm, jr_comm_id,
ir_comm, ir_comm_id,
ir_way, e,
@@ -176,29 +178,46 @@ trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( bool_t right_sided )
jr_comm, jr_comm_id,
jr_way, d,
NULL, NULL, ir_info);
//blk_var_1
packm_thrinfo_t* pack_ic_in = bli_create_packm_thread_info( ic_comm, ic_comm_id,
jr_comm, jr_comm_id,
ic_nt, ic_comm_id );
packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id,
ic_comm, ic_comm_id,
kc_nt, kc_comm_id );
packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id,
jr_comm, jr_comm_id,
ic_nt, ic_comm_id );
packm_thrinfo_t* pack_ic_out = bli_create_packm_thread_info( kc_comm, kc_comm_id,
ic_comm, ic_comm_id,
kc_nt, kc_comm_id );
trsm_thrinfo_t* ic_info = bli_create_trsm_thrinfo_node( kc_comm, kc_comm_id,
ic_comm, ic_comm_id,
ic_way, c,
packb, packa, jr_info);
pack_ic_out, pack_ic_in, jr_info);
//blk_var_3
packm_thrinfo_t* pack_kc_in = bli_create_packm_thread_info( kc_comm, kc_comm_id,
ic_comm, ic_comm_id,
kc_nt, kc_comm_id );
packm_thrinfo_t* pack_kc_out = bli_create_packm_thread_info( jc_comm, jc_comm_id,
jc_comm, jc_comm_id,
jc_nt, jc_comm_id );
trsm_thrinfo_t* kc_info = bli_create_trsm_thrinfo_node( jc_comm, jc_comm_id,
kc_comm, kc_comm_id,
kc_way, b,
NULL, NULL, ic_info);
pack_kc_out, pack_kc_in, ic_info);
//blk_var_2
packm_thrinfo_t* pack_jc_in = bli_create_packm_thread_info( jc_comm, jc_comm_id,
kc_comm, kc_comm_id,
jc_nt, jc_comm_id );
packm_thrinfo_t* pack_jc_out = bli_create_packm_thread_info( global_comm, global_comm_id,
jc_comm, jc_comm_id,
global_num_threads, global_comm_id );
trsm_thrinfo_t* jc_info = bli_create_trsm_thrinfo_node( global_comm, global_comm_id,
jc_comm, jc_comm_id,
jc_way, a,
NULL, NULL, kc_info);
pack_jc_out, pack_jc_in, kc_info);
paths[global_comm_id] = jc_info;
}
}