From f97a86f322a6e3e31f33c89befc66189b0b8c64f Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Sat, 2 Jun 2018 20:28:20 -0500 Subject: [PATCH] Updated setting/querying pack schema (cntx->cntl). - Query pack schemas in level-3 bli_*_front() functions and store those values in the schema bitfields of the corresponding obj_t's when the cntx's method is not BLIS_NAT. (When method is BLIS_NAT, the default native schemas are stored to the obj_t's.) - In bli_l3_cntl_create_if(), query the schemas stored to the obj_t's in bli_*_front(), clear the schema bitfields, and pass the queried values into bli_gemm_cntl_create() and bli_trsm_cntl_create(). - Updated APIs for bli_gemm_cntl_create() and bli_trsm_cntl_create() to take schemas for A and B, and use these values to initialize the appropriate control tree nodes. (Also cpp-disabled the panel-block cntl tree creation variant, bli_gemmpb_cntl_create(), as it has not been employed by BLIS in quite some time.) - Simplified querying of schema in bli_packm_init() thanks to above changes. - Updated openmp and pthreads definitions of bli_l3_thread_decorator() so that thread-local aliases of matrix operands are guaranteed, even if aliasing is disabled within the internal back-end functions (e.g. bli_gemm_int.c). Also added a comment to bli_thrcomm_single.c explaining why the extra aliasing is not needed there. - Change bli_gemm() and level-3 friends so that the operation's ind() function is called only if all matrix operands have the same datatype, and only if that datatype is complex. The former condition is needed in preparation for work related to mixed domain operands, while the latter helps with readability, especially for those who don't want to venture into frame/ind. - Reshuffled arguments in bli_cntx_set_thrloop_from_env() to be consistent with BLIS calling conventions (modified argument(s) are last), and updated all invocations in the level-3 _front() functions. - Comment updates to bli_cntx_set_thrloop_from_env(). 
--- frame/1m/packm/bli_packm_init.c | 53 +++++++++---- frame/3/bli_l3_cntl.c | 19 ++++- frame/3/bli_l3_oapi.c | 114 ++++++++++++++++++---------- frame/3/gemm/bli_gemm_cntl.c | 19 +++-- frame/3/gemm/bli_gemm_cntl.h | 12 ++- frame/3/gemm/bli_gemm_front.c | 32 +++++++- frame/3/hemm/bli_hemm_front.c | 32 +++++++- frame/3/her2k/bli_her2k_front.c | 36 ++++++++- frame/3/herk/bli_herk_front.c | 32 +++++++- frame/3/symm/bli_symm_front.c | 32 +++++++- frame/3/syr2k/bli_syr2k_front.c | 36 ++++++++- frame/3/syrk/bli_syrk_front.c | 32 +++++++- frame/3/trmm/bli_trmm_front.c | 32 +++++++- frame/3/trmm3/bli_trmm3_front.c | 32 +++++++- frame/3/trsm/bli_trsm_cntl.c | 24 +++--- frame/3/trsm/bli_trsm_cntl.h | 10 ++- frame/3/trsm/bli_trsm_front.c | 32 +++++++- frame/base/bli_cntx.c | 6 +- frame/base/bli_cntx.h | 15 ++-- frame/thread/bli_thrcomm_openmp.c | 17 +++-- frame/thread/bli_thrcomm_pthreads.c | 17 +++-- frame/thread/bli_thrcomm_single.c | 6 ++ 22 files changed, 503 insertions(+), 137 deletions(-) diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index 404498d60..0437b722a 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -56,8 +56,8 @@ siz_t bli_packm_init bool_t does_invert_diag; bool_t rev_iter_if_upper; bool_t rev_iter_if_lower; - //pack_t pack_schema; - packbuf_t pack_buf_type; + pack_t schema; + //packbuf_t pack_buf_type; siz_t size_needed; // Check parameters. @@ -70,8 +70,8 @@ siz_t bli_packm_init does_invert_diag = bli_cntl_packm_params_does_invert_diag( cntl ); rev_iter_if_upper = bli_cntl_packm_params_rev_iter_if_upper( cntl ); rev_iter_if_lower = bli_cntl_packm_params_rev_iter_if_lower( cntl ); - //pack_schema = bli_cntl_packm_params_pack_schema( cntl ); - pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl ); + schema = bli_cntl_packm_params_pack_schema( cntl ); + //pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl ); #if 0 // Let us now check to see if the object has already been packed. 
First @@ -112,30 +112,51 @@ siz_t bli_packm_init return 0; } - // We now ignore the pack_schema field in the control tree and - // extract the schema from the context, depending on whether we are - // preparing to pack a block of A or panel of B. For A and B, we must - // obtain the schema from the context since the induced methods reuse - // the same control trees used by native execution, and those induced - // methods specify the schema used by the current execution phase - // within the context (whereas the control tree does not change). +#if 0 pack_t schema; - if ( pack_buf_type == BLIS_BUFFER_FOR_A_BLOCK ) + if ( bli_cntx_method( cntx ) != BLIS_NAT ) { - schema = bli_cntx_schema_a_block( cntx ); + // We now ignore the pack_schema field in the control tree and + // extract the schema from the context, depending on whether we are + // preparing to pack a block of A or panel of B. For A and B, we must + // obtain the schema from the context since the induced methods reuse + // the same control trees used by native execution, and those induced + // methods specify the schema used by the current execution phase + // within the context (whereas the control tree does not change). + + if ( pack_buf_type == BLIS_BUFFER_FOR_A_BLOCK ) + { + schema = bli_cntx_schema_a_block( cntx ); + } + else if ( pack_buf_type == BLIS_BUFFER_FOR_B_PANEL ) + { + schema = bli_cntx_schema_b_panel( cntx ); + } + else // if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL ) + { + schema = bli_cntl_packm_params_pack_schema( cntl ); + } } - else if ( pack_buf_type == BLIS_BUFFER_FOR_B_PANEL ) + else // ( bli_cntx_method( cntx ) == BLIS_NAT ) { - schema = bli_cntx_schema_b_panel( cntx ); + // For native execution, we obtain the schema from the control tree + // node. (Notice that it doesn't matter if the pack_buf_type is for + // A or B.) 
+ schema = bli_cntl_packm_params_pack_schema( cntl ); } - else // if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL ) + // This is no longer needed now that we branch between native and + // non-native cases above. +#if 0 + if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL ) { // If we get a request to pack C for some reason, it is likely // not part of an induced method, and so it would be safe (and // necessary) to read the pack schema from the control tree. schema = bli_cntl_packm_params_pack_schema( cntl ); } +#endif +#endif // Prepare a few other variables based on properties of the control // tree. diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c index 0ea06715a..33c64edcb 100644 --- a/frame/3/bli_l3_cntl.c +++ b/frame/3/bli_l3_cntl.c @@ -45,6 +45,21 @@ void bli_l3_cntl_create_if cntl_t** cntl_use ) { + // This is part of a hack to support mixed domain in bli_gemm_front(). + // Sometimes we need to specify a non-standard schema for A and B, and + // we decided to transmit them via the schema field in the obj_t's + // rather than pass them in as function parameters. Once the values + // have been read, we immediately reset them back to their expected + // values for unpacked objects. Notice that we do this even if the + // caller passed in a custom control tree; that's because we still need + // to reset the pack schema of a and b, which were modified by the + // operation's _front() function. + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + bli_obj_set_pack_schema( BLIS_NOT_PACKED, a ); + bli_obj_set_pack_schema( BLIS_NOT_PACKED, b ); + // If the control tree pointer is NULL, we construct a default // tree as a function of the operation family. 
if ( cntl_orig == NULL ) @@ -53,7 +68,7 @@ void bli_l3_cntl_create_if family == BLIS_HERK || family == BLIS_TRMM ) { - *cntl_use = bli_gemm_cntl_create( family ); + *cntl_use = bli_gemm_cntl_create( family, schema_a, schema_b ); } else // if ( family == BLIS_TRSM ) { @@ -62,7 +77,7 @@ void bli_l3_cntl_create_if if ( bli_obj_is_triangular( a ) ) side = BLIS_LEFT; else side = BLIS_RIGHT; - *cntl_use = bli_trsm_cntl_create( side ); + *cntl_use = bli_trsm_cntl_create( side, schema_a, schema_b ); } } else diff --git a/frame/3/bli_l3_oapi.c b/frame/3/bli_l3_oapi.c index 5f4bc9932..94e563c24 100644 --- a/frame/3/bli_l3_oapi.c +++ b/frame/3/bli_l3_oapi.c @@ -57,20 +57,25 @@ void PASTEMAC(opname,EX_SUF) \ \ BLIS_OAPI_CNTX_DECL \ \ - /* Invoke the operation's "ind" function--its induced method front-end. - This function will call native execution for real domain problems. - For complex problems, it calls the highest priority induced method - that is available (ie: implemented and enabled), and if none are - enabled, it calls native execution. */ \ - PASTEMAC(opname,ind) \ - ( \ - alpha, \ - a, \ - b, \ - beta, \ - c, \ - cntx \ - ); \ + /* Only proceed with an induced method if all operands have the same + (complex) datatype. If any datatypes differ, skip the induced method + chooser function and proceed directly with native execution, which is + where mixed datatype support will be implemented (if at all). */ \ + if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \ + bli_obj_dt( b ) == bli_obj_dt( c ) && \ + bli_obj_is_complex( c ) ) \ + { \ + /* Invoke the operation's "ind" function--its induced method front-end. + For complex problems, it calls the highest priority induced method + that is available (ie: implemented and enabled), and if none are + enabled, it calls native execution. (For real problems, it calls + the operation's native execution interface.) 
*/ \ + PASTEMAC(opname,ind)( alpha, a, b, beta, c, cntx ); \ + } \ + else \ + { \ + PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx ); \ + } \ } GENFRONT( gemm ) @@ -96,16 +101,25 @@ void PASTEMAC(opname,EX_SUF) \ \ BLIS_OAPI_CNTX_DECL \ \ - PASTEMAC(opname,ind) \ - ( \ - side, \ - alpha, \ - a, \ - b, \ - beta, \ - c, \ - cntx \ - ); \ + /* Only proceed with an induced method if all operands have the same + (complex) datatype. If any datatypes differ, skip the induced method + chooser function and proceed directly with native execution, which is + where mixed datatype support will be implemented (if at all). */ \ + if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \ + bli_obj_dt( b ) == bli_obj_dt( c ) && \ + bli_obj_is_complex( c ) ) \ + { \ + /* Invoke the operation's "ind" function--its induced method front-end. + For complex problems, it calls the highest priority induced method + that is available (ie: implemented and enabled), and if none are + enabled, it calls native execution. (For real problems, it calls + the operation's native execution interface.) */ \ + PASTEMAC(opname,ind)( side, alpha, a, b, beta, c, cntx ); \ + } \ + else \ + { \ + PASTEMAC(opname,nat)( side, alpha, a, b, beta, c, cntx ); \ + } \ } GENFRONT( hemm ) @@ -129,14 +143,24 @@ void PASTEMAC(opname,EX_SUF) \ \ BLIS_OAPI_CNTX_DECL \ \ - PASTEMAC(opname,ind) \ - ( \ - alpha, \ - a, \ - beta, \ - c, \ - cntx \ - ); \ + /* Only proceed with an induced method if all operands have the same + (complex) datatype. If any datatypes differ, skip the induced method + chooser function and proceed directly with native execution, which is + where mixed datatype support will be implemented (if at all). */ \ + if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \ + bli_obj_is_complex( c ) ) \ + { \ + /* Invoke the operation's "ind" function--its induced method front-end. 
+ For complex problems, it calls the highest priority induced method + that is available (ie: implemented and enabled), and if none are + enabled, it calls native execution. (For real problems, it calls + the operation's native execution interface.) */ \ + PASTEMAC(opname,ind)( alpha, a, beta, c, cntx ); \ + } \ + else \ + { \ + PASTEMAC(opname,nat)( alpha, a, beta, c, cntx ); \ + } \ } GENFRONT( herk ) @@ -159,14 +183,24 @@ void PASTEMAC(opname,EX_SUF) \ \ BLIS_OAPI_CNTX_DECL \ \ - PASTEMAC(opname,ind) \ - ( \ - side, \ - alpha, \ - a, \ - b, \ - cntx \ - ); \ + /* Only proceed with an induced method if all operands have the same + (complex) datatype. If any datatypes differ, skip the induced method + chooser function and proceed directly with native execution, which is + where mixed datatype support will be implemented (if at all). */ \ + if ( bli_obj_dt( a ) == bli_obj_dt( b ) && \ + bli_obj_is_complex( b ) ) \ + { \ + /* Invoke the operation's "ind" function--its induced method front-end. + For complex problems, it calls the highest priority induced method + that is available (ie: implemented and enabled), and if none are + enabled, it calls native execution. (For real problems, it calls + the operation's native execution interface.) 
*/ \ + PASTEMAC(opname,ind)( side, alpha, a, b, cntx ); \ + } \ + else \ + { \ + PASTEMAC(opname,nat)( side, alpha, a, b, cntx ); \ + } \ } GENFRONT( trmm ) diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index b17ce10ac..3e13f23fa 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -36,17 +36,21 @@ cntl_t* bli_gemm_cntl_create ( - opid_t family + opid_t family, + pack_t schema_a, + pack_t schema_b ) { - return bli_gemmbp_cntl_create( family ); + return bli_gemmbp_cntl_create( family, schema_a, schema_b ); } // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( - opid_t family + opid_t family, + pack_t schema_a, + pack_t schema_b ) { void* macro_kernel_p = bli_gemm_ker_var2; @@ -82,7 +86,7 @@ cntl_t* bli_gemmbp_cntl_create FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? - BLIS_PACKED_ROW_PANELS, + schema_a, // normally BLIS_PACKED_ROW_PANELS BLIS_BUFFER_FOR_A_BLOCK, gemm_cntl_bp_bu ); @@ -106,7 +110,7 @@ cntl_t* bli_gemmbp_cntl_create FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? - BLIS_PACKED_COL_PANELS, + schema_b, // normally BLIS_PACKED_COL_PANELS BLIS_BUFFER_FOR_B_PANEL, gemm_cntl_op_bp ); @@ -134,6 +138,10 @@ cntl_t* bli_gemmbp_cntl_create // ----------------------------------------------------------------------------- +// This control tree creation function is disabled because it is no longer used. +// (It was originally created in the run up to publishing the 1m journal article, +// but was disabled to reduce complexity.) 
+#if 0 cntl_t* bli_gemmpb_cntl_create ( opid_t family @@ -223,6 +231,7 @@ cntl_t* bli_gemmpb_cntl_create return gemm_cntl_vl_mm; } +#endif // ----------------------------------------------------------------------------- diff --git a/frame/3/gemm/bli_gemm_cntl.h b/frame/3/gemm/bli_gemm_cntl.h index 3b643e1fc..3b3cb1cf2 100644 --- a/frame/3/gemm/bli_gemm_cntl.h +++ b/frame/3/gemm/bli_gemm_cntl.h @@ -34,20 +34,26 @@ cntl_t* bli_gemm_cntl_create ( - opid_t family + opid_t family, + pack_t schema_a, + pack_t schema_b ); // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( - opid_t family + opid_t family, + pack_t schema_a, + pack_t schema_b ); +#if 0 cntl_t* bli_gemmpb_cntl_create ( - opid_t family + opid_t family, ); +#endif // ----------------------------------------------------------------------------- diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index f2600d791..8aae5b476 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -87,10 +87,34 @@ void bli_gemm_front } // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_GEMM, + BLIS_LEFT, // ignored for gemm/hemm/symm + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pach schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). 
+ if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &b_local ); + } // Invoke the internal back-end via the thread handler. bli_l3_thread_decorator diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index 2406ee1d5..b12424d63 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -88,10 +88,34 @@ void bli_hemm_front } // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_HEMM, BLIS_LEFT, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_HEMM, + BLIS_LEFT, // ignored for gemm/hemm/symm + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pach schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). + if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &b_local ); + } // Invoke the internal back-end. 
bli_l3_thread_decorator diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c index 9448b881e..15ee65fad 100644 --- a/frame/3/her2k/bli_her2k_front.c +++ b/frame/3/her2k/bli_her2k_front.c @@ -106,10 +106,38 @@ void bli_her2k_front } // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_HER2K, BLIS_LEFT, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_HER2K, + BLIS_LEFT, // ignored for her[2]k/syr[2]k + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pach schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). + if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &bh_local ); + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &b_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &ah_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &bh_local ); + bli_obj_set_pack_schema( schema_a, &b_local ); + bli_obj_set_pack_schema( schema_b, &ah_local ); + } // Invoke herk twice, using beta only the first time. diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c index 8b2379a66..f6e5b55a3 100644 --- a/frame/3/herk/bli_herk_front.c +++ b/frame/3/herk/bli_herk_front.c @@ -86,10 +86,34 @@ void bli_herk_front } // Record the threading for each level within the context. 
- bli_cntx_set_thrloop_from_env( BLIS_HERK, BLIS_LEFT, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_HERK, + BLIS_LEFT, // ignored for her[2]k/syr[2]k + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pach schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). + if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &ah_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &ah_local ); + } // Invoke the internal back-end. bli_l3_thread_decorator diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index 0c229ef9b..84263bc9d 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -87,10 +87,34 @@ void bli_symm_front } // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_SYMM, BLIS_LEFT, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_SYMM, + BLIS_LEFT, // ignored for gemm/hemm/symm + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pach schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). 
This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). + if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &b_local ); + } // Invoke the internal back-end. bli_l3_thread_decorator diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c index 32981cb89..769ca56a0 100644 --- a/frame/3/syr2k/bli_syr2k_front.c +++ b/frame/3/syr2k/bli_syr2k_front.c @@ -87,10 +87,38 @@ void bli_syr2k_front } // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_SYR2K, BLIS_LEFT, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_SYR2K, + BLIS_LEFT, // ignored for her[2]k/syr[2]k + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pach schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). 
+ if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &bt_local ); + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &b_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &at_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &bt_local ); + bli_obj_set_pack_schema( schema_a, &b_local ); + bli_obj_set_pack_schema( schema_b, &at_local ); + } // Invoke herk twice, using beta only the first time. diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c index eed5f0ebc..7a66ad68a 100644 --- a/frame/3/syrk/bli_syrk_front.c +++ b/frame/3/syrk/bli_syrk_front.c @@ -80,10 +80,34 @@ void bli_syrk_front } // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_SYRK, BLIS_LEFT, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_SYRK, + BLIS_LEFT, // ignored for her[2]k/syr[2]k + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pach schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). 
+ if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &at_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &at_local ); + } // Invoke the internal back-end. bli_l3_thread_decorator diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index afdabbbd2..935972442 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -130,10 +130,34 @@ void bli_trmm_front bli_obj_set_as_root( &c_local ); // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_TRMM, side, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_TRMM, + side, + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pach schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). + if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &b_local ); + } // Invoke the internal back-end. 
bli_l3_thread_decorator diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index 664a7fd51..0f772f0fb 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -129,10 +129,34 @@ void bli_trmm3_front bli_obj_set_as_root( &c_local ); // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_TRMM3, side, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_TRMM3, + side, + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pach schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). + if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &b_local ); + } // Invoke the internal back-end. 
bli_l3_thread_decorator diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index e05fc3d20..df9b831a3 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -36,16 +36,21 @@ cntl_t* bli_trsm_cntl_create ( - side_t side + side_t side, + pack_t schema_a, + pack_t schema_b ) { - if ( bli_is_left( side ) ) return bli_trsm_l_cntl_create(); - else return bli_trsm_r_cntl_create(); + if ( bli_is_left( side ) ) + return bli_trsm_l_cntl_create( schema_a, schema_b ); + else + return bli_trsm_r_cntl_create( schema_a, schema_b ); } cntl_t* bli_trsm_l_cntl_create ( - void + pack_t schema_a, + pack_t schema_b ) { void* macro_kernel_p = bli_trsm_xx_ker_var2; @@ -79,7 +84,7 @@ cntl_t* bli_trsm_l_cntl_create TRUE, // do NOT invert diagonal TRUE, // reverse iteration if upper? FALSE, // reverse iteration if lower? - BLIS_PACKED_ROW_PANELS, + schema_a, // normally BLIS_PACKED_ROW_PANELS BLIS_BUFFER_FOR_A_BLOCK, trsm_cntl_bp_bu ); @@ -103,7 +108,7 @@ cntl_t* bli_trsm_l_cntl_create FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? - BLIS_PACKED_COL_PANELS, + schema_b, // normally BLIS_PACKED_COL_PANELS BLIS_BUFFER_FOR_B_PANEL, trsm_cntl_op_bp ); @@ -131,7 +136,8 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* bli_trsm_r_cntl_create ( - void + pack_t schema_a, + pack_t schema_b ) { void* macro_kernel_p = bli_trsm_xx_ker_var2; @@ -165,7 +171,7 @@ cntl_t* bli_trsm_r_cntl_create FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? - BLIS_PACKED_ROW_PANELS, + schema_a, // normally BLIS_PACKED_ROW_PANELS BLIS_BUFFER_FOR_A_BLOCK, trsm_cntl_bp_bu ); @@ -189,7 +195,7 @@ cntl_t* bli_trsm_r_cntl_create TRUE, // do NOT invert diagonal FALSE, // reverse iteration if upper? TRUE, // reverse iteration if lower? 
- BLIS_PACKED_COL_PANELS, + schema_b, // normally BLIS_PACKED_COL_PANELS BLIS_BUFFER_FOR_B_PANEL, trsm_cntl_op_bp ); diff --git a/frame/3/trsm/bli_trsm_cntl.h b/frame/3/trsm/bli_trsm_cntl.h index cfd20cad3..77c36aec2 100644 --- a/frame/3/trsm/bli_trsm_cntl.h +++ b/frame/3/trsm/bli_trsm_cntl.h @@ -34,17 +34,21 @@ cntl_t* bli_trsm_cntl_create ( - side_t side + side_t side, + pack_t schema_a, + pack_t schema_b ); cntl_t* bli_trsm_l_cntl_create ( - void + pack_t schema_a, + pack_t schema_b ); cntl_t* bli_trsm_r_cntl_create ( - void + pack_t schema_a, + pack_t schema_b ); void bli_trsm_cntl_free diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index 2bc6d0186..081a2c284 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -121,10 +121,34 @@ void bli_trsm_front bli_obj_set_as_root( &c_local ); // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_TRSM, side, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_TRSM, + side, + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pach schemas for A and B + // to bli_trsm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). 
+ if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &b_local ); + } // Invoke the internal back-end. bli_l3_thread_decorator diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index 981b01c3e..d36a20ded 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -869,10 +869,10 @@ void bli_cntx_set_thrloop_from_env ( opid_t l3_op, side_t side, - cntx_t* cntx, dim_t m, dim_t n, - dim_t k + dim_t k, + cntx_t* cntx ) { dim_t jc, pc, ic, jr, ir; @@ -988,7 +988,7 @@ void bli_cntx_set_thrloop_from_env ); } } - else // if ( l3_op == BLIS_GEMM || l3_op == BLIS_HERK ) + else // any other level-3 operation besides trmm/trsm { bli_cntx_set_thrloop ( diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index ac43312bc..4aaec97c4 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -674,12 +674,15 @@ void bli_cntx_set_l1f_kers( dim_t n_kers, ... ); void bli_cntx_set_l1v_kers( dim_t n_kers, ... ); void bli_cntx_set_packm_kers( dim_t n_kers, ... 
); -void bli_cntx_set_thrloop_from_env( opid_t l3_op, - side_t side, - cntx_t* cntx, - dim_t m, - dim_t n, - dim_t k ); +void bli_cntx_set_thrloop_from_env + ( + opid_t l3_op, + side_t side, + dim_t m, + dim_t n, + dim_t k, + cntx_t* cntx + ); void bli_cntx_print( cntx_t* cntx ); diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c index 131f70973..f2197597f 100644 --- a/frame/thread/bli_thrcomm_openmp.c +++ b/frame/thread/bli_thrcomm_openmp.c @@ -231,11 +231,18 @@ void bli_l3_thread_decorator { dim_t id = omp_get_thread_num(); + obj_t a_t, b_t, c_t; cntl_t* cntl_use; thrinfo_t* thread; + // Alias thread-local copies of A, B, and C. These will be the objects + // we pass into the thread functions. + bli_obj_alias_to( a, &a_t ); + bli_obj_alias_to( b, &b_t ); + bli_obj_alias_to( c, &c_t ); + // Create a default control tree for the operation, if needed. - bli_l3_cntl_create_if( family, a, b, c, cntl, &cntl_use ); + bli_l3_cntl_create_if( family, &a_t, &b_t, &c_t, cntl, &cntl_use ); // Create the root node of the current thread's thrinfo_t structure. bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread ); @@ -243,17 +250,17 @@ void bli_l3_thread_decorator func ( alpha, - a, - b, + &a_t, + &b_t, beta, - c, + &c_t, cntx, cntl_use, thread ); // Free the control tree, if one was created locally. - bli_l3_cntl_free_if( a, b, c, cntl, cntl_use, thread ); + bli_l3_cntl_free_if( &a_t, &b_t, &c_t, cntl, cntl_use, thread ); #ifdef PRINT_THRINFO threads[id] = thread; diff --git a/frame/thread/bli_thrcomm_pthreads.c b/frame/thread/bli_thrcomm_pthreads.c index e2fa35c35..132fb6740 100644 --- a/frame/thread/bli_thrcomm_pthreads.c +++ b/frame/thread/bli_thrcomm_pthreads.c @@ -161,11 +161,18 @@ void* bli_l3_thread_entry( void* data_void ) dim_t id = data->id; thrcomm_t* gl_comm = data->gl_comm; + obj_t a_t, b_t, c_t; cntl_t* cntl_use; thrinfo_t* thread; + // Alias thread-local copies of A, B, and C. 
These will be the objects + // we pass into the thread function. + bli_obj_alias_to( a, &a_t ); + bli_obj_alias_to( b, &b_t ); + bli_obj_alias_to( c, &c_t ); + // Create a default control tree for the operation, if needed. - bli_l3_cntl_create_if( family, a, b, c, cntl, &cntl_use ); + bli_l3_cntl_create_if( family, &a_t, &b_t, &c_t, cntl, &cntl_use ); // Create the root node of the current thread's thrinfo_t structure. bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread ); @@ -173,17 +180,17 @@ void* bli_l3_thread_entry( void* data_void ) func ( alpha, - a, - b, + &a_t, + &b_t, beta, - c, + &c_t, cntx, cntl_use, thread ); // Free the control tree, if one was created locally. - bli_l3_cntl_free_if( a, b, c, cntl, cntl_use, thread ); + bli_l3_cntl_free_if( &a_t, &b_t, &c_t, cntl, cntl_use, thread ); // Free the current thread's thrinfo_t structure. bli_l3_thrinfo_free( thread ); diff --git a/frame/thread/bli_thrcomm_single.c b/frame/thread/bli_thrcomm_single.c index cb0bc2ae4..068b7eda5 100644 --- a/frame/thread/bli_thrcomm_single.c +++ b/frame/thread/bli_thrcomm_single.c @@ -94,6 +94,12 @@ void bli_l3_thread_decorator cntl_t* cntl_use; thrinfo_t* thread; + // NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't + // need to alias objects for A, B, and C since they were already aliased + // in bli_*_front(). (We only needed thread-local copies so each could + // safely reset their internal (beta) scalars on c after the first + // iteration of the pc (kc) loop.) + // Create a default control tree for the operation, if needed. bli_l3_cntl_create_if( family, a, b, c, cntl, &cntl_use );