From bef24e67e0f93579c2a80315348dc2e227f72a72 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Wed, 26 Nov 2014 18:00:56 -0600 Subject: [PATCH] Fixed a type of race condition exposed by pthreads implementation. Lead thread of the inner thread communicator could exit subproblem, move on the next iteration of the loop and modify a1_pack, b1_pack, or c1_pack while other threads were still using those. Barriers were inserted to fix this. --- frame/3/gemm/bli_gemm_blk_var1f.c | 2 ++ frame/3/gemm/bli_gemm_blk_var2f.c | 2 ++ frame/3/gemm/bli_gemm_blk_var3f.c | 2 +- frame/3/herk/bli_herk_blk_var1f.c | 2 ++ frame/3/herk/bli_herk_blk_var2f.c | 2 ++ frame/3/herk/bli_herk_blk_var3f.c | 2 +- frame/3/herk/bli_herk_threading.c | 44 ++++++++++++++++++++++--------- frame/3/trmm/bli_trmm_blk_var1f.c | 1 + frame/3/trmm/bli_trmm_blk_var2b.c | 1 + frame/3/trmm/bli_trmm_blk_var2f.c | 1 + frame/3/trmm/bli_trmm_blk_var3b.c | 1 + frame/3/trmm/bli_trmm_blk_var3f.c | 1 + frame/3/trmm/bli_trmm_threading.c | 38 +++++++++++++++++++------- frame/3/trsm/bli_trsm_blk_var1b.c | 1 + frame/3/trsm/bli_trsm_blk_var1f.c | 1 + frame/3/trsm/bli_trsm_blk_var2b.c | 1 + frame/3/trsm/bli_trsm_blk_var2f.c | 1 + frame/3/trsm/bli_trsm_blk_var3b.c | 2 +- frame/3/trsm/bli_trsm_blk_var3f.c | 2 +- frame/3/trsm/bli_trsm_threading.c | 39 ++++++++++++++++++++------- 20 files changed, 109 insertions(+), 37 deletions(-) diff --git a/frame/3/gemm/bli_gemm_blk_var1f.c b/frame/3/gemm/bli_gemm_blk_var1f.c index ad67dca7a..d42839214 100644 --- a/frame/3/gemm/bli_gemm_blk_var1f.c +++ b/frame/3/gemm/bli_gemm_blk_var1f.c @@ -130,6 +130,8 @@ void bli_gemm_blk_var1f( obj_t* a, c1_pack, cntl_sub_gemm( cntl ), gemm_thread_sub_gemm( thread ) ); + + thread_ibarrier( thread ); // Unpack C1 (if C1 was packed). // Currently must be done by 1 thread diff --git a/frame/3/gemm/bli_gemm_blk_var2f.c b/frame/3/gemm/bli_gemm_blk_var2f.c index d6e5b3ec5..26b5d2e52 100644 --- a/frame/3/gemm/bli_gemm_blk_var2f.c +++ b/frame/3/gemm/bli_gemm_blk_var2f.c @@ -129,6 +129,8 @@ void bli_gemm_blk_var2f( obj_t* a, c1_pack, cntl_sub_gemm( cntl ), gemm_thread_sub_gemm( thread ) ); + + thread_ibarrier( thread ); // Unpack C1 (if C1 was packed). // Currently must be done by 1 thread diff --git a/frame/3/gemm/bli_gemm_blk_var3f.c b/frame/3/gemm/bli_gemm_blk_var3f.c index 3ab9fb307..996f465a4 100644 --- a/frame/3/gemm/bli_gemm_blk_var3f.c +++ b/frame/3/gemm/bli_gemm_blk_var3f.c @@ -131,7 +131,7 @@ void bli_gemm_blk_var3f( obj_t* a, // And since c_pack is a local obj_t, we can simply overwrite the // internal beta scalar with BLIS_ONE once it has been used in the // first iteration. - if ( i == 0 ) thread_ibarrier( thread ); + thread_ibarrier( thread ); if ( i == 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack ); } diff --git a/frame/3/herk/bli_herk_blk_var1f.c b/frame/3/herk/bli_herk_blk_var1f.c index ae82121f1..11cc5edcf 100644 --- a/frame/3/herk/bli_herk_blk_var1f.c +++ b/frame/3/herk/bli_herk_blk_var1f.c @@ -127,6 +127,8 @@ void bli_herk_blk_var1f( obj_t* a, cntl_sub_gemm( cntl ), herk_thread_sub_herk( thread ) ); + thread_ibarrier( thread ); + // Unpack C1 (if C1 was packed). bli_unpackm_int( c1_pack, &c1, cntl_sub_unpackm_c( cntl ), diff --git a/frame/3/herk/bli_herk_blk_var2f.c b/frame/3/herk/bli_herk_blk_var2f.c index 2bf16ec0f..f491d985f 100644 --- a/frame/3/herk/bli_herk_blk_var2f.c +++ b/frame/3/herk/bli_herk_blk_var2f.c @@ -142,6 +142,8 @@ void bli_herk_blk_var2f( obj_t* a, cntl_sub_gemm( cntl ), herk_thread_sub_herk( thread ) ); + thread_ibarrier( thread ); + // Unpack C1 (if C1 was packed). bli_unpackm_int( c1S_pack, &c1S, cntl_sub_unpackm_c( cntl ), diff --git a/frame/3/herk/bli_herk_blk_var3f.c b/frame/3/herk/bli_herk_blk_var3f.c index 6efa3ba1a..cb54f717b 100644 --- a/frame/3/herk/bli_herk_blk_var3f.c +++ b/frame/3/herk/bli_herk_blk_var3f.c @@ -128,7 +128,7 @@ void bli_herk_blk_var3f( obj_t* a, // And since c_pack is a local obj_t, we can simply overwrite the // internal beta scalar with BLIS_ONE once it has been used in the // first iteration. - if ( i == 0 ) thread_ibarrier( thread ); + thread_ibarrier( thread ); if ( i == 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack ); } diff --git a/frame/3/herk/bli_herk_threading.c b/frame/3/herk/bli_herk_threading.c index 261d20a45..8bc3d3e23 100644 --- a/frame/3/herk/bli_herk_threading.c +++ b/frame/3/herk/bli_herk_threading.c @@ -160,38 +160,56 @@ herk_thrinfo_t** bli_create_herk_thrinfo_paths( ) dim_t jc_comm_id = b*kc_nt + kc_comm_id; dim_t global_comm_id = a*jc_nt + jc_comm_id; + // Macrokernel loops herk_thrinfo_t* ir_info = bli_create_herk_thrinfo_node( jr_comm, jr_comm_id, ir_comm, ir_comm_id, - ir_way, e, + ir_way, e, NULL, NULL, NULL); herk_thrinfo_t* jr_info = bli_create_herk_thrinfo_node( ic_comm, ic_comm_id, jr_comm, jr_comm_id, - jr_way, d, + jr_way, d, NULL, NULL, ir_info); - - packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - - packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id, + //blk_var_1 + packm_thrinfo_t* pack_ic_in = bli_create_packm_thread_info( ic_comm, ic_comm_id, jr_comm, jr_comm_id, ic_nt, ic_comm_id ); + packm_thrinfo_t* pack_ic_out = bli_create_packm_thread_info( kc_comm, kc_comm_id, + ic_comm, ic_comm_id, + kc_nt, kc_comm_id ); + herk_thrinfo_t* ic_info = bli_create_herk_thrinfo_node( kc_comm, kc_comm_id, ic_comm, ic_comm_id, - ic_way, c, - packb, packa, jr_info); + ic_way, c, + pack_ic_out, pack_ic_in, jr_info); + //blk_var_3 + packm_thrinfo_t* pack_kc_in = bli_create_packm_thread_info( kc_comm, kc_comm_id, + ic_comm, ic_comm_id, + kc_nt, kc_comm_id ); + + packm_thrinfo_t* pack_kc_out = bli_create_packm_thread_info( jc_comm, jc_comm_id, + jc_comm, jc_comm_id, + jc_nt, jc_comm_id ); herk_thrinfo_t* kc_info = bli_create_herk_thrinfo_node( jc_comm, jc_comm_id, kc_comm, kc_comm_id, kc_way, b, - NULL, NULL, ic_info); + pack_kc_out, pack_kc_in, ic_info); + + //blk_var_2 + packm_thrinfo_t* pack_jc_in = bli_create_packm_thread_info( jc_comm, jc_comm_id, + kc_comm, kc_comm_id, + jc_nt, jc_comm_id ); + + packm_thrinfo_t* pack_jc_out = bli_create_packm_thread_info( global_comm, global_comm_id, + jc_comm, jc_comm_id, + global_num_threads, global_comm_id ); herk_thrinfo_t* jc_info = bli_create_herk_thrinfo_node( global_comm, global_comm_id, jc_comm, jc_comm_id, - jc_way, a, - NULL, NULL, kc_info); + jc_way, a, + pack_jc_out, pack_jc_in, kc_info); paths[global_comm_id] = jc_info; } diff --git a/frame/3/trmm/bli_trmm_blk_var1f.c b/frame/3/trmm/bli_trmm_blk_var1f.c index 2f82ddcb2..906d5f875 100644 --- a/frame/3/trmm/bli_trmm_blk_var1f.c +++ b/frame/3/trmm/bli_trmm_blk_var1f.c @@ -138,6 +138,7 @@ void bli_trmm_blk_var1f( obj_t* a, c1_pack, cntl_sub_gemm( cntl ), trmm_thread_sub_trmm( thread ) ); + thread_ibarrier( thread ); // Unpack C1 (if C1 was packed). bli_unpackm_int( c1_pack, &c1, diff --git a/frame/3/trmm/bli_trmm_blk_var2b.c b/frame/3/trmm/bli_trmm_blk_var2b.c index 5a368ca21..e7f92ec87 100644 --- a/frame/3/trmm/bli_trmm_blk_var2b.c +++ b/frame/3/trmm/bli_trmm_blk_var2b.c @@ -126,6 +126,7 @@ void bli_trmm_blk_var2b( obj_t* a, c1_pack, cntl_sub_gemm( cntl ), trmm_thread_sub_trmm( thread ) ); + thread_ibarrier( thread ); // Unpack C1 (if C1 was packed). bli_unpackm_int( c1_pack, &c1, diff --git a/frame/3/trmm/bli_trmm_blk_var2f.c b/frame/3/trmm/bli_trmm_blk_var2f.c index e3e32bd02..6af42b061 100644 --- a/frame/3/trmm/bli_trmm_blk_var2f.c +++ b/frame/3/trmm/bli_trmm_blk_var2f.c @@ -126,6 +126,7 @@ void bli_trmm_blk_var2f( obj_t* a, c1_pack, cntl_sub_gemm( cntl ), trmm_thread_sub_trmm( thread ) ); + thread_ibarrier( thread ); // Unpack C1 (if C1 was packed). bli_unpackm_int( c1_pack, &c1, diff --git a/frame/3/trmm/bli_trmm_blk_var3b.c b/frame/3/trmm/bli_trmm_blk_var3b.c index 492c2e034..fc355deed 100644 --- a/frame/3/trmm/bli_trmm_blk_var3b.c +++ b/frame/3/trmm/bli_trmm_blk_var3b.c @@ -124,6 +124,7 @@ void bli_trmm_blk_var3b( obj_t* a, c_pack, cntl_sub_gemm( cntl ), trmm_thread_sub_trmm( thread ) ); + thread_ibarrier( thread ); } thread_obarrier( thread ); diff --git a/frame/3/trmm/bli_trmm_blk_var3f.c b/frame/3/trmm/bli_trmm_blk_var3f.c index 02e2d6462..8e385e74d 100644 --- a/frame/3/trmm/bli_trmm_blk_var3f.c +++ b/frame/3/trmm/bli_trmm_blk_var3f.c @@ -124,6 +124,7 @@ void bli_trmm_blk_var3f( obj_t* a, c_pack, cntl_sub_gemm( cntl ), trmm_thread_sub_trmm( thread ) ); + thread_ibarrier( thread ); } thread_obarrier( thread ); diff --git a/frame/3/trmm/bli_trmm_threading.c b/frame/3/trmm/bli_trmm_threading.c index 43afa6fc2..d15a45b0c 100644 --- a/frame/3/trmm/bli_trmm_threading.c +++ b/frame/3/trmm/bli_trmm_threading.c @@ -164,7 +164,8 @@ trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( bool_t jc_dependency ) dim_t kc_comm_id = c*ic_nt + ic_comm_id; dim_t jc_comm_id = b*kc_nt + kc_comm_id; dim_t global_comm_id = a*jc_nt + jc_comm_id; - + + // Macrokernel loops trmm_thrinfo_t* ir_info = bli_create_trmm_thrinfo_node( jr_comm, jr_comm_id, ir_comm, ir_comm_id, ir_way, e, @@ -174,29 +175,46 @@ trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( bool_t jc_dependency ) jr_comm, jr_comm_id, jr_way, d, NULL, NULL, ir_info); - - packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - - packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id, + //blk_var_1 + packm_thrinfo_t* pack_ic_in = bli_create_packm_thread_info( ic_comm, ic_comm_id, jr_comm, jr_comm_id, ic_nt, ic_comm_id ); + packm_thrinfo_t* pack_ic_out = bli_create_packm_thread_info( kc_comm, kc_comm_id, + ic_comm, ic_comm_id, + kc_nt, kc_comm_id ); + trmm_thrinfo_t* ic_info = bli_create_trmm_thrinfo_node( kc_comm, kc_comm_id, ic_comm, ic_comm_id, ic_way, c, - packb, packa, jr_info); + pack_ic_out, pack_ic_in, jr_info); + //blk_var_3 + packm_thrinfo_t* pack_kc_in = bli_create_packm_thread_info( kc_comm, kc_comm_id, + ic_comm, ic_comm_id, + kc_nt, kc_comm_id ); + + packm_thrinfo_t* pack_kc_out = bli_create_packm_thread_info( jc_comm, jc_comm_id, + jc_comm, jc_comm_id, + jc_nt, jc_comm_id ); trmm_thrinfo_t* kc_info = bli_create_trmm_thrinfo_node( jc_comm, jc_comm_id, kc_comm, kc_comm_id, kc_way, b, - NULL, NULL, ic_info); + pack_kc_out, pack_kc_in, ic_info); + //blk_var_2 + packm_thrinfo_t* pack_jc_in = bli_create_packm_thread_info( jc_comm, jc_comm_id, + kc_comm, kc_comm_id, + jc_nt, jc_comm_id ); + + packm_thrinfo_t* pack_jc_out = bli_create_packm_thread_info( global_comm, global_comm_id, + jc_comm, jc_comm_id, + global_num_threads, global_comm_id ); trmm_thrinfo_t* jc_info = bli_create_trmm_thrinfo_node( global_comm, global_comm_id, jc_comm, jc_comm_id, jc_way, a, - NULL, NULL, kc_info); + pack_jc_out, pack_jc_in, kc_info); + paths[global_comm_id] = jc_info; } } diff --git a/frame/3/trsm/bli_trsm_blk_var1b.c b/frame/3/trsm/bli_trsm_blk_var1b.c index 5f72b44ad..9b69d86bd 100644 --- a/frame/3/trsm/bli_trsm_blk_var1b.c +++ b/frame/3/trsm/bli_trsm_blk_var1b.c @@ -121,6 +121,7 @@ void bli_trsm_blk_var1b( obj_t* a, &c1, cntl_sub_trsm( cntl ), trsm_thread_sub_trsm( thread ) ); + thread_ibarrier( thread ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/3/trsm/bli_trsm_blk_var1f.c b/frame/3/trsm/bli_trsm_blk_var1f.c index f1696119a..f697011ff 100644 --- a/frame/3/trsm/bli_trsm_blk_var1f.c +++ b/frame/3/trsm/bli_trsm_blk_var1f.c @@ -120,6 +120,7 @@ void bli_trsm_blk_var1f( obj_t* a, &c1, cntl_sub_trsm( cntl ), trsm_thread_sub_trsm( thread ) ); + thread_ibarrier( thread ); } // If any packing buffers were acquired within packm, release them back diff --git a/frame/3/trsm/bli_trsm_blk_var2b.c b/frame/3/trsm/bli_trsm_blk_var2b.c index 848c1a0eb..6802a625d 100644 --- a/frame/3/trsm/bli_trsm_blk_var2b.c +++ b/frame/3/trsm/bli_trsm_blk_var2b.c @@ -128,6 +128,7 @@ void bli_trsm_blk_var2b( obj_t* a, c1_pack, cntl_sub_trsm( cntl ), trsm_thread_sub_trsm( thread ) ); + thread_ibarrier( thread ); // Unpack C1 (if C1 was packed). bli_unpackm_int( c1_pack, &c1, diff --git a/frame/3/trsm/bli_trsm_blk_var2f.c b/frame/3/trsm/bli_trsm_blk_var2f.c index 8fb03fa77..25eedd431 100644 --- a/frame/3/trsm/bli_trsm_blk_var2f.c +++ b/frame/3/trsm/bli_trsm_blk_var2f.c @@ -128,6 +128,7 @@ void bli_trsm_blk_var2f( obj_t* a, c1_pack, cntl_sub_trsm( cntl ), trsm_thread_sub_trsm( thread ) ); + thread_ibarrier( thread ); // Unpack C1 (if C1 was packed). bli_unpackm_int( c1_pack, &c1, diff --git a/frame/3/trsm/bli_trsm_blk_var3b.c b/frame/3/trsm/bli_trsm_blk_var3b.c index 052ae9df7..db4a36135 100644 --- a/frame/3/trsm/bli_trsm_blk_var3b.c +++ b/frame/3/trsm/bli_trsm_blk_var3b.c @@ -129,7 +129,7 @@ void bli_trsm_blk_var3b( obj_t* a, // This variant executes multiple rank-k updates. Therefore, if the // internal alpha scalars on A/B and C are non-zero, we must ensure // that they are only used in the first iteration. - if ( i == 0 ) thread_ibarrier( thread ); + thread_ibarrier( thread ); if ( i == 0 && thread_am_ichief( thread ) ) { bli_obj_scalar_reset( a ); bli_obj_scalar_reset( b ); diff --git a/frame/3/trsm/bli_trsm_blk_var3f.c b/frame/3/trsm/bli_trsm_blk_var3f.c index 9be752bdb..baf7f3502 100644 --- a/frame/3/trsm/bli_trsm_blk_var3f.c +++ b/frame/3/trsm/bli_trsm_blk_var3f.c @@ -129,7 +129,7 @@ void bli_trsm_blk_var3f( obj_t* a, // This variant executes multiple rank-k updates. Therefore, if the // internal alpha scalars on A/B and C are non-zero, we must ensure // that they are only used in the first iteration. - if ( i == 0 ) thread_ibarrier( thread ); + thread_ibarrier( thread ); if ( i == 0 && thread_am_ichief( thread ) ) { bli_obj_scalar_reset( a ); bli_obj_scalar_reset( b ); diff --git a/frame/3/trsm/bli_trsm_threading.c b/frame/3/trsm/bli_trsm_threading.c index b60a28d39..1cfd61ebb 100644 --- a/frame/3/trsm/bli_trsm_threading.c +++ b/frame/3/trsm/bli_trsm_threading.c @@ -167,6 +167,8 @@ trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( bool_t right_sided ) dim_t jc_comm_id = b*kc_nt + kc_comm_id; dim_t global_comm_id = a*jc_nt + jc_comm_id; + + // Macrokernel loops trsm_thrinfo_t* ir_info = bli_create_trsm_thrinfo_node( jr_comm, jr_comm_id, ir_comm, ir_comm_id, ir_way, e, @@ -176,29 +178,46 @@ trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( bool_t right_sided ) jr_comm, jr_comm_id, jr_way, d, NULL, NULL, ir_info); + //blk_var_1 + packm_thrinfo_t* pack_ic_in = bli_create_packm_thread_info( ic_comm, ic_comm_id, + jr_comm, jr_comm_id, + ic_nt, ic_comm_id ); - packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id, - ic_comm, ic_comm_id, - kc_nt, kc_comm_id ); - - packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id, - jr_comm, jr_comm_id, - ic_nt, ic_comm_id ); + packm_thrinfo_t* pack_ic_out = bli_create_packm_thread_info( kc_comm, kc_comm_id, + ic_comm, ic_comm_id, + kc_nt, kc_comm_id ); trsm_thrinfo_t* ic_info = bli_create_trsm_thrinfo_node( kc_comm, kc_comm_id, ic_comm, ic_comm_id, ic_way, c, - packb, packa, jr_info); + pack_ic_out, pack_ic_in, jr_info); + //blk_var_3 + packm_thrinfo_t* pack_kc_in = bli_create_packm_thread_info( kc_comm, kc_comm_id, + ic_comm, ic_comm_id, + kc_nt, kc_comm_id ); + + packm_thrinfo_t* pack_kc_out = bli_create_packm_thread_info( jc_comm, jc_comm_id, + jc_comm, jc_comm_id, + jc_nt, jc_comm_id ); trsm_thrinfo_t* kc_info = bli_create_trsm_thrinfo_node( jc_comm, jc_comm_id, kc_comm, kc_comm_id, kc_way, b, - NULL, NULL, ic_info); + pack_kc_out, pack_kc_in, ic_info); + //blk_var_2 + packm_thrinfo_t* pack_jc_in = bli_create_packm_thread_info( jc_comm, jc_comm_id, + kc_comm, kc_comm_id, + jc_nt, jc_comm_id ); + + packm_thrinfo_t* pack_jc_out = bli_create_packm_thread_info( global_comm, global_comm_id, + jc_comm, jc_comm_id, + global_num_threads, global_comm_id ); trsm_thrinfo_t* jc_info = bli_create_trsm_thrinfo_node( global_comm, global_comm_id, jc_comm, jc_comm_id, jc_way, a, - NULL, NULL, kc_info); + pack_jc_out, pack_jc_in, kc_info); + paths[global_comm_id] = jc_info; } }