diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index 404498d60..0437b722a 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -56,8 +56,8 @@ siz_t bli_packm_init bool_t does_invert_diag; bool_t rev_iter_if_upper; bool_t rev_iter_if_lower; - //pack_t pack_schema; - packbuf_t pack_buf_type; + pack_t schema; + //packbuf_t pack_buf_type; siz_t size_needed; // Check parameters. @@ -70,8 +70,8 @@ siz_t bli_packm_init does_invert_diag = bli_cntl_packm_params_does_invert_diag( cntl ); rev_iter_if_upper = bli_cntl_packm_params_rev_iter_if_upper( cntl ); rev_iter_if_lower = bli_cntl_packm_params_rev_iter_if_lower( cntl ); - //pack_schema = bli_cntl_packm_params_pack_schema( cntl ); - pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl ); + schema = bli_cntl_packm_params_pack_schema( cntl ); + //pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl ); #if 0 // Let us now check to see if the object has already been packed. First @@ -112,30 +112,51 @@ siz_t bli_packm_init return 0; } - // We now ignore the pack_schema field in the control tree and - // extract the schema from the context, depending on whether we are - // preparing to pack a block of A or panel of B. For A and B, we must - // obtain the schema from the context since the induced methods reuse - // the same control trees used by native execution, and those induced - // methods specify the schema used by the current execution phase - // within the context (whereas the control tree does not change). +#if 0 pack_t schema; - if ( pack_buf_type == BLIS_BUFFER_FOR_A_BLOCK ) + if ( bli_cntx_method( cntx ) != BLIS_NAT ) { - schema = bli_cntx_schema_a_block( cntx ); + // We now ignore the pack_schema field in the control tree and + // extract the schema from the context, depending on whether we are + // preparing to pack a block of A or panel of B. 
For A and B, we must + // obtain the schema from the context since the induced methods reuse + // the same control trees used by native execution, and those induced + // methods specify the schema used by the current execution phase + // within the context (whereas the control tree does not change). + + if ( pack_buf_type == BLIS_BUFFER_FOR_A_BLOCK ) + { + schema = bli_cntx_schema_a_block( cntx ); + } + else if ( pack_buf_type == BLIS_BUFFER_FOR_B_PANEL ) + { + schema = bli_cntx_schema_b_panel( cntx ); + } + else // if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL ) + { + schema = bli_cntl_packm_params_pack_schema( cntl ); + } } - else if ( pack_buf_type == BLIS_BUFFER_FOR_B_PANEL ) + else // ( bli_cntx_method( cntx ) == BLIS_NAT ) { - schema = bli_cntx_schema_b_panel( cntx ); + // For native execution, we obtain the schema from the control tree + // node. (Notice that it doesn't matter if the pack_buf_type is for + // A or B.) + schema = bli_cntl_packm_params_pack_schema( cntl ); } - else // if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL ) + // This is no longer needed now that we branch between native and + // non-native cases above. +#if 0 + if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL ) { // If we get a request to pack C for some reason, it is likely // not part of an induced method, and so it would be safe (and // necessary) to read the pack schema from the control tree. schema = bli_cntl_packm_params_pack_schema( cntl ); } +#endif +#endif // Prepare a few other variables based on properties of the control // tree. diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c index 0ea06715a..33c64edcb 100644 --- a/frame/3/bli_l3_cntl.c +++ b/frame/3/bli_l3_cntl.c @@ -45,6 +45,21 @@ void bli_l3_cntl_create_if cntl_t** cntl_use ) { + // This is part of a hack to support mixed domain in bli_gemm_front(). 
+ // Sometimes we need to specify a non-standard schema for A and B, and + // we decided to transmit them via the schema field in the obj_t's + // rather than pass them in as function parameters. Once the values + // have been read, we immediately reset them back to their expected + // values for unpacked objects. Notice that we do this even if the + // caller passed in a custom control tree; that's because we still need + // to reset the pack schema of a and b, which were modified by the + // operation's _front() function. + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); + + bli_obj_set_pack_schema( BLIS_NOT_PACKED, a ); + bli_obj_set_pack_schema( BLIS_NOT_PACKED, b ); + // If the control tree pointer is NULL, we construct a default // tree as a function of the operation family. if ( cntl_orig == NULL ) @@ -53,7 +68,7 @@ void bli_l3_cntl_create_if family == BLIS_HERK || family == BLIS_TRMM ) { - *cntl_use = bli_gemm_cntl_create( family ); + *cntl_use = bli_gemm_cntl_create( family, schema_a, schema_b ); } else // if ( family == BLIS_TRSM ) { @@ -62,7 +77,7 @@ void bli_l3_cntl_create_if if ( bli_obj_is_triangular( a ) ) side = BLIS_LEFT; else side = BLIS_RIGHT; - *cntl_use = bli_trsm_cntl_create( side ); + *cntl_use = bli_trsm_cntl_create( side, schema_a, schema_b ); } } else diff --git a/frame/3/bli_l3_oapi.c b/frame/3/bli_l3_oapi.c index 5f4bc9932..94e563c24 100644 --- a/frame/3/bli_l3_oapi.c +++ b/frame/3/bli_l3_oapi.c @@ -57,20 +57,25 @@ void PASTEMAC(opname,EX_SUF) \ \ BLIS_OAPI_CNTX_DECL \ \ - /* Invoke the operation's "ind" function--its induced method front-end. - This function will call native execution for real domain problems. - For complex problems, it calls the highest priority induced method - that is available (ie: implemented and enabled), and if none are - enabled, it calls native execution. 
*/ \ - PASTEMAC(opname,ind) \ - ( \ - alpha, \ - a, \ - b, \ - beta, \ - c, \ - cntx \ - ); \ + /* Only proceed with an induced method if all operands have the same + (complex) datatype. If any datatypes differ, skip the induced method + chooser function and proceed directly with native execution, which is + where mixed datatype support will be implemented (if at all). */ \ + if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \ + bli_obj_dt( b ) == bli_obj_dt( c ) && \ + bli_obj_is_complex( c ) ) \ + { \ + /* Invoke the operation's "ind" function--its induced method front-end. + For complex problems, it calls the highest priority induced method + that is available (ie: implemented and enabled), and if none are + enabled, it calls native execution. (For real problems, it calls + the operation's native execution interface.) */ \ + PASTEMAC(opname,ind)( alpha, a, b, beta, c, cntx ); \ + } \ + else \ + { \ + PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx ); \ + } \ } GENFRONT( gemm ) @@ -96,16 +101,25 @@ void PASTEMAC(opname,EX_SUF) \ \ BLIS_OAPI_CNTX_DECL \ \ - PASTEMAC(opname,ind) \ - ( \ - side, \ - alpha, \ - a, \ - b, \ - beta, \ - c, \ - cntx \ - ); \ + /* Only proceed with an induced method if all operands have the same + (complex) datatype. If any datatypes differ, skip the induced method + chooser function and proceed directly with native execution, which is + where mixed datatype support will be implemented (if at all). */ \ + if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \ + bli_obj_dt( b ) == bli_obj_dt( c ) && \ + bli_obj_is_complex( c ) ) \ + { \ + /* Invoke the operation's "ind" function--its induced method front-end. + For complex problems, it calls the highest priority induced method + that is available (ie: implemented and enabled), and if none are + enabled, it calls native execution. (For real problems, it calls + the operation's native execution interface.) 
*/ \ + PASTEMAC(opname,ind)( side, alpha, a, b, beta, c, cntx ); \ + } \ + else \ + { \ + PASTEMAC(opname,nat)( side, alpha, a, b, beta, c, cntx ); \ + } \ } GENFRONT( hemm ) @@ -129,14 +143,24 @@ void PASTEMAC(opname,EX_SUF) \ \ BLIS_OAPI_CNTX_DECL \ \ - PASTEMAC(opname,ind) \ - ( \ - alpha, \ - a, \ - beta, \ - c, \ - cntx \ - ); \ + /* Only proceed with an induced method if all operands have the same + (complex) datatype. If any datatypes differ, skip the induced method + chooser function and proceed directly with native execution, which is + where mixed datatype support will be implemented (if at all). */ \ + if ( bli_obj_dt( a ) == bli_obj_dt( c ) && \ + bli_obj_is_complex( c ) ) \ + { \ + /* Invoke the operation's "ind" function--its induced method front-end. + For complex problems, it calls the highest priority induced method + that is available (ie: implemented and enabled), and if none are + enabled, it calls native execution. (For real problems, it calls + the operation's native execution interface.) */ \ + PASTEMAC(opname,ind)( alpha, a, beta, c, cntx ); \ + } \ + else \ + { \ + PASTEMAC(opname,nat)( alpha, a, beta, c, cntx ); \ + } \ } GENFRONT( herk ) @@ -159,14 +183,24 @@ void PASTEMAC(opname,EX_SUF) \ \ BLIS_OAPI_CNTX_DECL \ \ - PASTEMAC(opname,ind) \ - ( \ - side, \ - alpha, \ - a, \ - b, \ - cntx \ - ); \ + /* Only proceed with an induced method if all operands have the same + (complex) datatype. If any datatypes differ, skip the induced method + chooser function and proceed directly with native execution, which is + where mixed datatype support will be implemented (if at all). */ \ + if ( bli_obj_dt( a ) == bli_obj_dt( b ) && \ + bli_obj_is_complex( b ) ) \ + { \ + /* Invoke the operation's "ind" function--its induced method front-end. + For complex problems, it calls the highest priority induced method + that is available (ie: implemented and enabled), and if none are + enabled, it calls native execution. 
(For real problems, it calls + the operation's native execution interface.) */ \ + PASTEMAC(opname,ind)( side, alpha, a, b, cntx ); \ + } \ + else \ + { \ + PASTEMAC(opname,nat)( side, alpha, a, b, cntx ); \ + } \ } GENFRONT( trmm ) diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index b17ce10ac..3e13f23fa 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -36,17 +36,21 @@ cntl_t* bli_gemm_cntl_create ( - opid_t family + opid_t family, + pack_t schema_a, + pack_t schema_b ) { - return bli_gemmbp_cntl_create( family ); + return bli_gemmbp_cntl_create( family, schema_a, schema_b ); } // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( - opid_t family + opid_t family, + pack_t schema_a, + pack_t schema_b ) { void* macro_kernel_p = bli_gemm_ker_var2; @@ -82,7 +86,7 @@ cntl_t* bli_gemmbp_cntl_create FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? - BLIS_PACKED_ROW_PANELS, + schema_a, // normally BLIS_PACKED_ROW_PANELS BLIS_BUFFER_FOR_A_BLOCK, gemm_cntl_bp_bu ); @@ -106,7 +110,7 @@ cntl_t* bli_gemmbp_cntl_create FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? - BLIS_PACKED_COL_PANELS, + schema_b, // normally BLIS_PACKED_COL_PANELS BLIS_BUFFER_FOR_B_PANEL, gemm_cntl_op_bp ); @@ -134,6 +138,10 @@ cntl_t* bli_gemmbp_cntl_create // ----------------------------------------------------------------------------- +// This control tree creation function is disabled because it is no longer used. +// (It was originally created in the run up to publishing the 1m journal article, +// but was disabled to reduce complexity.) 
+#if 0 cntl_t* bli_gemmpb_cntl_create ( opid_t family @@ -223,6 +231,7 @@ cntl_t* bli_gemmpb_cntl_create return gemm_cntl_vl_mm; } +#endif // ----------------------------------------------------------------------------- diff --git a/frame/3/gemm/bli_gemm_cntl.h b/frame/3/gemm/bli_gemm_cntl.h index 3b643e1fc..3b3cb1cf2 100644 --- a/frame/3/gemm/bli_gemm_cntl.h +++ b/frame/3/gemm/bli_gemm_cntl.h @@ -34,20 +34,26 @@ cntl_t* bli_gemm_cntl_create ( - opid_t family + opid_t family, + pack_t schema_a, + pack_t schema_b ); // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( - opid_t family + opid_t family, + pack_t schema_a, + pack_t schema_b ); +#if 0 cntl_t* bli_gemmpb_cntl_create ( - opid_t family + opid_t family, ); +#endif // ----------------------------------------------------------------------------- diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index f2600d791..8aae5b476 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -87,10 +87,34 @@ void bli_gemm_front } // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_GEMM, + BLIS_LEFT, // ignored for gemm/hemm/symm + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pack schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). 
+ if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &b_local ); + } // Invoke the internal back-end via the thread handler. bli_l3_thread_decorator diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index 2406ee1d5..b12424d63 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -88,10 +88,34 @@ void bli_hemm_front } // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_HEMM, BLIS_LEFT, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_HEMM, + BLIS_LEFT, // ignored for gemm/hemm/symm + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pack schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). + if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &b_local ); + } // Invoke the internal back-end. 
bli_l3_thread_decorator diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c index 9448b881e..15ee65fad 100644 --- a/frame/3/her2k/bli_her2k_front.c +++ b/frame/3/her2k/bli_her2k_front.c @@ -106,10 +106,38 @@ void bli_her2k_front } // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_HER2K, BLIS_LEFT, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_HER2K, + BLIS_LEFT, // ignored for her[2]k/syr[2]k + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pack schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). + if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &bh_local ); + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &b_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &ah_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &bh_local ); + bli_obj_set_pack_schema( schema_a, &b_local ); + bli_obj_set_pack_schema( schema_b, &ah_local ); + } // Invoke herk twice, using beta only the first time. diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c index 8b2379a66..f6e5b55a3 100644 --- a/frame/3/herk/bli_herk_front.c +++ b/frame/3/herk/bli_herk_front.c @@ -86,10 +86,34 @@ void bli_herk_front } // Record the threading for each level within the context. 
- bli_cntx_set_thrloop_from_env( BLIS_HERK, BLIS_LEFT, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_HERK, + BLIS_LEFT, // ignored for her[2]k/syr[2]k + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pack schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). + if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &ah_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &ah_local ); + } // Invoke the internal back-end. bli_l3_thread_decorator diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index 0c229ef9b..84263bc9d 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -87,10 +87,34 @@ void bli_symm_front } // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_SYMM, BLIS_LEFT, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_SYMM, + BLIS_LEFT, // ignored for gemm/hemm/symm + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pack schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). 
This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). + if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &b_local ); + } // Invoke the internal back-end. bli_l3_thread_decorator diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c index 32981cb89..769ca56a0 100644 --- a/frame/3/syr2k/bli_syr2k_front.c +++ b/frame/3/syr2k/bli_syr2k_front.c @@ -87,10 +87,38 @@ void bli_syr2k_front } // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_SYR2K, BLIS_LEFT, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_SYR2K, + BLIS_LEFT, // ignored for her[2]k/syr[2]k + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pack schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). 
+ if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &bt_local ); + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &b_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &at_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &bt_local ); + bli_obj_set_pack_schema( schema_a, &b_local ); + bli_obj_set_pack_schema( schema_b, &at_local ); + } // Invoke herk twice, using beta only the first time. diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c index eed5f0ebc..7a66ad68a 100644 --- a/frame/3/syrk/bli_syrk_front.c +++ b/frame/3/syrk/bli_syrk_front.c @@ -80,10 +80,34 @@ void bli_syrk_front } // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_SYRK, BLIS_LEFT, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_SYRK, + BLIS_LEFT, // ignored for her[2]k/syr[2]k + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pack schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). 
+ if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &at_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &at_local ); + } // Invoke the internal back-end. bli_l3_thread_decorator diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index afdabbbd2..935972442 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -130,10 +130,34 @@ void bli_trmm_front bli_obj_set_as_root( &c_local ); // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_TRMM, side, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_TRMM, + side, + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pack schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). + if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &b_local ); + } // Invoke the internal back-end. 
bli_l3_thread_decorator diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index 664a7fd51..0f772f0fb 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -129,10 +129,34 @@ void bli_trmm3_front bli_obj_set_as_root( &c_local ); // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_TRMM3, side, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_TRMM3, + side, + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pack schemas for A and B + // to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). + if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &b_local ); + } // Invoke the internal back-end. 
bli_l3_thread_decorator diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index e05fc3d20..df9b831a3 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -36,16 +36,21 @@ cntl_t* bli_trsm_cntl_create ( - side_t side + side_t side, + pack_t schema_a, + pack_t schema_b ) { - if ( bli_is_left( side ) ) return bli_trsm_l_cntl_create(); - else return bli_trsm_r_cntl_create(); + if ( bli_is_left( side ) ) + return bli_trsm_l_cntl_create( schema_a, schema_b ); + else + return bli_trsm_r_cntl_create( schema_a, schema_b ); } cntl_t* bli_trsm_l_cntl_create ( - void + pack_t schema_a, + pack_t schema_b ) { void* macro_kernel_p = bli_trsm_xx_ker_var2; @@ -79,7 +84,7 @@ cntl_t* bli_trsm_l_cntl_create TRUE, // do NOT invert diagonal TRUE, // reverse iteration if upper? FALSE, // reverse iteration if lower? - BLIS_PACKED_ROW_PANELS, + schema_a, // normally BLIS_PACKED_ROW_PANELS BLIS_BUFFER_FOR_A_BLOCK, trsm_cntl_bp_bu ); @@ -103,7 +108,7 @@ cntl_t* bli_trsm_l_cntl_create FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? - BLIS_PACKED_COL_PANELS, + schema_b, // normally BLIS_PACKED_COL_PANELS BLIS_BUFFER_FOR_B_PANEL, trsm_cntl_op_bp ); @@ -131,7 +136,8 @@ cntl_t* bli_trsm_l_cntl_create cntl_t* bli_trsm_r_cntl_create ( - void + pack_t schema_a, + pack_t schema_b ) { void* macro_kernel_p = bli_trsm_xx_ker_var2; @@ -165,7 +171,7 @@ cntl_t* bli_trsm_r_cntl_create FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? - BLIS_PACKED_ROW_PANELS, + schema_a, // normally BLIS_PACKED_ROW_PANELS BLIS_BUFFER_FOR_A_BLOCK, trsm_cntl_bp_bu ); @@ -189,7 +195,7 @@ cntl_t* bli_trsm_r_cntl_create TRUE, // do NOT invert diagonal FALSE, // reverse iteration if upper? TRUE, // reverse iteration if lower? 
- BLIS_PACKED_COL_PANELS, + schema_b, // normally BLIS_PACKED_COL_PANELS BLIS_BUFFER_FOR_B_PANEL, trsm_cntl_op_bp ); diff --git a/frame/3/trsm/bli_trsm_cntl.h b/frame/3/trsm/bli_trsm_cntl.h index cfd20cad3..77c36aec2 100644 --- a/frame/3/trsm/bli_trsm_cntl.h +++ b/frame/3/trsm/bli_trsm_cntl.h @@ -34,17 +34,21 @@ cntl_t* bli_trsm_cntl_create ( - side_t side + side_t side, + pack_t schema_a, + pack_t schema_b ); cntl_t* bli_trsm_l_cntl_create ( - void + pack_t schema_a, + pack_t schema_b ); cntl_t* bli_trsm_r_cntl_create ( - void + pack_t schema_a, + pack_t schema_b ); void bli_trsm_cntl_free diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index 2bc6d0186..081a2c284 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -121,10 +121,34 @@ void bli_trsm_front bli_obj_set_as_root( &c_local ); // Record the threading for each level within the context. - bli_cntx_set_thrloop_from_env( BLIS_TRSM, side, cntx, - bli_obj_length( &c_local ), - bli_obj_width( &c_local ), - bli_obj_width( &a_local ) ); + bli_cntx_set_thrloop_from_env + ( + BLIS_TRSM, + side, + bli_obj_length( &c_local ), + bli_obj_width( &c_local ), + bli_obj_width( &a_local ), + cntx + ); + + // A sort of hack for communicating the desired pack schemas for A and B + // to bli_trsm_cntl_create() (via bli_l3_thread_decorator() and + // bli_l3_cntl_create_if()). This allows us to access the schemas from + // the control tree, which hopefully reduces some confusion, particularly + // in bli_packm_init(). 
+ if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local ); + bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local ); + } + else // if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + pack_t schema_a = bli_cntx_schema_a_block( cntx ); + pack_t schema_b = bli_cntx_schema_b_panel( cntx ); + + bli_obj_set_pack_schema( schema_a, &a_local ); + bli_obj_set_pack_schema( schema_b, &b_local ); + } // Invoke the internal back-end. bli_l3_thread_decorator diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index 981b01c3e..d36a20ded 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -869,10 +869,10 @@ void bli_cntx_set_thrloop_from_env ( opid_t l3_op, side_t side, - cntx_t* cntx, dim_t m, dim_t n, - dim_t k + dim_t k, + cntx_t* cntx ) { dim_t jc, pc, ic, jr, ir; @@ -988,7 +988,7 @@ void bli_cntx_set_thrloop_from_env ); } } - else // if ( l3_op == BLIS_GEMM || l3_op == BLIS_HERK ) + else // any other level-3 operation besides trmm/trsm { bli_cntx_set_thrloop ( diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index ac43312bc..4aaec97c4 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -674,12 +674,15 @@ void bli_cntx_set_l1f_kers( dim_t n_kers, ... ); void bli_cntx_set_l1v_kers( dim_t n_kers, ... ); void bli_cntx_set_packm_kers( dim_t n_kers, ... 
); -void bli_cntx_set_thrloop_from_env( opid_t l3_op, - side_t side, - cntx_t* cntx, - dim_t m, - dim_t n, - dim_t k ); +void bli_cntx_set_thrloop_from_env + ( + opid_t l3_op, + side_t side, + dim_t m, + dim_t n, + dim_t k, + cntx_t* cntx + ); void bli_cntx_print( cntx_t* cntx ); diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c index 131f70973..f2197597f 100644 --- a/frame/thread/bli_thrcomm_openmp.c +++ b/frame/thread/bli_thrcomm_openmp.c @@ -231,11 +231,18 @@ void bli_l3_thread_decorator { dim_t id = omp_get_thread_num(); + obj_t a_t, b_t, c_t; cntl_t* cntl_use; thrinfo_t* thread; + // Alias thread-local copies of A, B, and C. These will be the objects + // we pass into the thread functions. + bli_obj_alias_to( a, &a_t ); + bli_obj_alias_to( b, &b_t ); + bli_obj_alias_to( c, &c_t ); + // Create a default control tree for the operation, if needed. - bli_l3_cntl_create_if( family, a, b, c, cntl, &cntl_use ); + bli_l3_cntl_create_if( family, &a_t, &b_t, &c_t, cntl, &cntl_use ); // Create the root node of the current thread's thrinfo_t structure. bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread ); @@ -243,17 +250,17 @@ void bli_l3_thread_decorator func ( alpha, - a, - b, + &a_t, + &b_t, beta, - c, + &c_t, cntx, cntl_use, thread ); // Free the control tree, if one was created locally. - bli_l3_cntl_free_if( a, b, c, cntl, cntl_use, thread ); + bli_l3_cntl_free_if( &a_t, &b_t, &c_t, cntl, cntl_use, thread ); #ifdef PRINT_THRINFO threads[id] = thread; diff --git a/frame/thread/bli_thrcomm_pthreads.c b/frame/thread/bli_thrcomm_pthreads.c index e2fa35c35..132fb6740 100644 --- a/frame/thread/bli_thrcomm_pthreads.c +++ b/frame/thread/bli_thrcomm_pthreads.c @@ -161,11 +161,18 @@ void* bli_l3_thread_entry( void* data_void ) dim_t id = data->id; thrcomm_t* gl_comm = data->gl_comm; + obj_t a_t, b_t, c_t; cntl_t* cntl_use; thrinfo_t* thread; + // Alias thread-local copies of A, B, and C. 
These will be the objects + // we pass into the thread function. + bli_obj_alias_to( a, &a_t ); + bli_obj_alias_to( b, &b_t ); + bli_obj_alias_to( c, &c_t ); + // Create a default control tree for the operation, if needed. - bli_l3_cntl_create_if( family, a, b, c, cntl, &cntl_use ); + bli_l3_cntl_create_if( family, &a_t, &b_t, &c_t, cntl, &cntl_use ); // Create the root node of the current thread's thrinfo_t structure. bli_l3_thrinfo_create_root( id, gl_comm, cntx, cntl_use, &thread ); @@ -173,17 +180,17 @@ void* bli_l3_thread_entry( void* data_void ) func ( alpha, - a, - b, + &a_t, + &b_t, beta, - c, + &c_t, cntx, cntl_use, thread ); // Free the control tree, if one was created locally. - bli_l3_cntl_free_if( a, b, c, cntl, cntl_use, thread ); + bli_l3_cntl_free_if( &a_t, &b_t, &c_t, cntl, cntl_use, thread ); // Free the current thread's thrinfo_t structure. bli_l3_thrinfo_free( thread ); diff --git a/frame/thread/bli_thrcomm_single.c b/frame/thread/bli_thrcomm_single.c index cb0bc2ae4..068b7eda5 100644 --- a/frame/thread/bli_thrcomm_single.c +++ b/frame/thread/bli_thrcomm_single.c @@ -94,6 +94,12 @@ void bli_l3_thread_decorator cntl_t* cntl_use; thrinfo_t* thread; + // NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't + // need to alias objects for A, B, and C since they were already aliased + // in bli_*_front(). (We only needed thread-local copies so each could + // safely reset their internal (beta) scalars on c after the first + // iteration of the pc (kc) loop.) + // Create a default control tree for the operation, if needed. bli_l3_cntl_create_if( family, a, b, c, cntl, &cntl_use );