diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index ccf88f3cb..d828f698d 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -121,11 +121,11 @@ siz_t bli_packm_init if ( pack_buf_type == BLIS_BUFFER_FOR_A_BLOCK ) { - schema = bli_cntx_get_pack_schema_a( cntx ); + schema = bli_cntx_get_pack_schema_a_block( cntx ); } else if ( pack_buf_type == BLIS_BUFFER_FOR_B_PANEL ) { - schema = bli_cntx_get_pack_schema_b( cntx ); + schema = bli_cntx_get_pack_schema_b_panel( cntx ); } else // if ( pack_buf_type == BLIS_BUFFER_FOR_C_PANEL ) { diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c index a8dfee1ba..4fe3fe7f5 100644 --- a/frame/3/bli_l3_cntl.c +++ b/frame/3/bli_l3_cntl.c @@ -70,8 +70,8 @@ void bli_l3_cntl_create_if else { // If the user provided a control tree, create a copy and use it - // instead (so that it can be used to cache things like pack mem_t - // entries). + // instead (so that threads can use its local tree as a place to + // cache things like pack mem_t entries). *cntl_use = bli_cntl_copy( cntl_orig ); } } diff --git a/frame/3/bli_l3_cntx.c b/frame/3/bli_l3_cntx.c index 8b4b01572..161e68160 100644 --- a/frame/3/bli_l3_cntx.c +++ b/frame/3/bli_l3_cntx.c @@ -63,9 +63,8 @@ void bli_gemm_cntx_init( num_t dt, cntx_t* cntx ) cntx ); // Set the pack_t schemas for native execution. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS, - BLIS_PACKED_COL_PANELS, - cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx ); } void bli_gemm_cntx_finalize( cntx_t* cntx ) @@ -106,9 +105,8 @@ void bli_trsm_cntx_init( num_t dt, cntx_t* cntx ) cntx ); // Set the pack_t schemas for native execution. 
- bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS, - BLIS_PACKED_COL_PANELS, - cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx ); } void bli_trsm_cntx_finalize( cntx_t* cntx ) diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index b3494b174..775ca2544 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -39,8 +39,17 @@ cntl_t* bli_gemm_cntl_create opid_t family ) { - void* macro_kernel_p = bli_gemm_ker_var2; + return bli_gemmbp_cntl_create( family ); +} +// ----------------------------------------------------------------------------- + +cntl_t* bli_gemmbp_cntl_create + ( + opid_t family + ) +{ + void* macro_kernel_p = bli_gemm_ker_var2; // Change the macro-kernel if the operation family is herk or trmm. if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2; @@ -64,7 +73,7 @@ cntl_t* bli_gemm_cntl_create // Create a node for packing matrix A. cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create ( - bli_gemm_packa, + bli_gemm_packa, // pack the left-hand operand bli_packm_blk_var1, BLIS_MR, BLIS_KR, @@ -87,7 +96,7 @@ cntl_t* bli_gemm_cntl_create // Create a node for packing matrix B. cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create ( - bli_gemm_packb, + bli_gemm_packb, // pack the right-hand operand bli_packm_blk_var1, BLIS_KR, BLIS_NR, @@ -118,6 +127,95 @@ cntl_t* bli_gemm_cntl_create return gemm_cntl_vl_mm; } +// ----------------------------------------------------------------------------- + +cntl_t* bli_gemmpb_cntl_create + ( + opid_t family + ) +{ + void* macro_kernel_p = bli_gemm_ker_var1; + + // Change the macro-kernel if the operation family is herk or trmm. + //if ( family == BLIS_HERK ) macro_kernel_p = bli_herk_x_ker_var2; + //else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2; + + // Create two nodes for the macro-kernel. 
+ cntl_t* gemm_cntl_ub_ke = bli_gemm_cntl_obj_create + ( + BLIS_MR, // needed for bli_thrinfo_rgrow() + NULL, // variant function pointer not used + NULL // no sub-node; this is the leaf of the tree. + ); + + cntl_t* gemm_cntl_pb_ub = bli_gemm_cntl_obj_create + ( + BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() + macro_kernel_p, + gemm_cntl_ub_ke + ); + + // Create a node for packing matrix A (which is really the right-hand + // operand "B"). + cntl_t* gemm_cntl_packb = bli_packm_cntl_obj_create + ( + bli_gemm_packb, // pack the right-hand operand + bli_packm_blk_var1, + BLIS_KR, + BLIS_MR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_COL_PANELS, + BLIS_BUFFER_FOR_A_BLOCK, + gemm_cntl_pb_ub + ); + + // Create a node for partitioning the n dimension by MC. + cntl_t* gemm_cntl_op_pb = bli_gemm_cntl_obj_create + ( + BLIS_MC, + bli_gemm_blk_var2, + gemm_cntl_packb + ); + + // Create a node for packing matrix B (which is really the left-hand + // operand "A"). + cntl_t* gemm_cntl_packa = bli_packm_cntl_obj_create + ( + bli_gemm_packa, // pack the left-hand operand + bli_packm_blk_var1, + BLIS_NR, + BLIS_KR, + FALSE, // do NOT invert diagonal + FALSE, // reverse iteration if upper? + FALSE, // reverse iteration if lower? + BLIS_PACKED_ROW_PANELS, + BLIS_BUFFER_FOR_B_PANEL, + gemm_cntl_op_pb + ); + + // Create a node for partitioning the k dimension by KC. + cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_obj_create + ( + BLIS_KC, + bli_gemm_blk_var3, + gemm_cntl_packa + ); + + // Create a node for partitioning the m dimension by NC. 
+ cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_obj_create + ( + BLIS_NC, + bli_gemm_blk_var1, + gemm_cntl_mm_op + ); + + return gemm_cntl_vl_mm; +} + +// ----------------------------------------------------------------------------- + void bli_gemm_cntl_free ( cntl_t* cntl, diff --git a/frame/3/gemm/bli_gemm_cntl.h b/frame/3/gemm/bli_gemm_cntl.h index 5b985327c..6da6cd768 100644 --- a/frame/3/gemm/bli_gemm_cntl.h +++ b/frame/3/gemm/bli_gemm_cntl.h @@ -37,6 +37,20 @@ cntl_t* bli_gemm_cntl_create opid_t family ); +// ----------------------------------------------------------------------------- + +cntl_t* bli_gemmbp_cntl_create + ( + opid_t family + ); + +cntl_t* bli_gemmpb_cntl_create + ( + opid_t family + ); + +// ----------------------------------------------------------------------------- + void bli_gemm_cntl_free ( cntl_t* cntl, diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index ad645411e..850e2510c 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -46,11 +46,10 @@ void bli_gemm_front cntl_t* cntl ) { + #ifdef BLIS_SMALL_MATRIX_ENABLE -#ifndef BLIS_ENABLE_MULTITHREADING gint_t status = bli_gemm_small_matrix(alpha, a, b, beta, c, cntx, cntl); if(BLIS_SUCCESS != status) -#endif #endif { obj_t a_local; @@ -90,9 +89,6 @@ void bli_gemm_front bli_obj_induce_trans( c_local ); } - // Set the operation family id in the context. - bli_cntx_set_family( BLIS_GEMM, cntx ); - // Record the threading for each level within the context. 
bli_cntx_set_thrloop_from_env( BLIS_GEMM, BLIS_LEFT, cntx, bli_obj_length( c_local ), @@ -103,6 +99,7 @@ void bli_gemm_front bli_l3_thread_decorator ( bli_gemm_int, + BLIS_GEMM, // operation family id alpha, &a_local, &b_local, diff --git a/frame/3/gemm/bli_gemm_ker_var1.c b/frame/3/gemm/bli_gemm_ker_var1.c new file mode 100644 index 000000000..7b485a6b7 --- /dev/null +++ b/frame/3/gemm/bli_gemm_ker_var1.c @@ -0,0 +1,56 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_gemm_ker_var1 + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + // Implement _ker_var1() in terms of _ker_var2() by transposing the + // entire suboperation (which also requires swapping A and B). + + bli_obj_induce_trans( *a ); + bli_obj_induce_trans( *b ); + bli_obj_induce_trans( *c ); + + bli_gemm_ker_var2( b, a, c, cntx, cntl, thread ); +} + diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h index c66587fda..88412c3d8 100644 --- a/frame/3/gemm/bli_gemm_var.h +++ b/frame/3/gemm/bli_gemm_var.h @@ -56,6 +56,7 @@ GENPROT( gemm_blk_var3 ) GENPROT( gemm_packa ) GENPROT( gemm_packb ) +GENPROT( gemm_ker_var1 ) GENPROT( gemm_ker_var2 ) // Headers for induced algorithms: diff --git a/frame/base/bli_cntl.c b/frame/base/bli_cntl.c index 2b45a5de3..cac290da9 100644 --- a/frame/base/bli_cntl.c +++ b/frame/base/bli_cntl.c @@ -97,6 +97,16 @@ void bli_cntl_free cntl_t* cntl, thrinfo_t* thread ) +{ + if ( thread != NULL ) bli_cntl_free_w_thrinfo( cntl, thread ); + else bli_cntl_free_wo_thrinfo( cntl ); +} + +void bli_cntl_free_w_thrinfo + ( + cntl_t* cntl, + thrinfo_t* thread + ) { // Base case: simply return when asked to free NULL nodes. if ( cntl == NULL ) return; @@ -112,7 +122,7 @@ void bli_cntl_free { // Recursively free all memory associated with the sub-node and its // children. 
- bli_cntl_free( cntl_sub_node, thread_sub_node ); + bli_cntl_free_w_thrinfo( cntl_sub_node, thread_sub_node ); } // Free the current node's params field, if it is non-NULL. @@ -122,8 +132,8 @@ void bli_cntl_free } // Release the current node's pack mem_t entry back to the memory - // broker from which it originated, but only if the current thread - // is chief for its group, and only if the mem_t is allocated. + // broker from which it originated, but only if the mem_t entry is + // allocated, and only if the current thread is chief for its group. if ( bli_thread_am_ochief( thread ) ) if ( bli_mem_is_alloc( cntl_pack_mem ) ) { @@ -134,6 +144,42 @@ void bli_cntl_free bli_cntl_obj_free( cntl ); } +void bli_cntl_free_wo_thrinfo + ( + cntl_t* cntl + ) +{ + // Base case: simply return when asked to free NULL nodes. + if ( cntl == NULL ) return; + + cntl_t* cntl_sub_node = bli_cntl_sub_node( cntl ); + void* cntl_params = bli_cntl_params( cntl ); + mem_t* cntl_pack_mem = bli_cntl_pack_mem( cntl ); + + { + // Recursively free all memory associated with the sub-node and its + // children. + bli_cntl_free_wo_thrinfo( cntl_sub_node ); + } + + // Free the current node's params field, if it is non-NULL. + if ( cntl_params != NULL ) + { + bli_free_intl( cntl_params ); + } + + // Release the current node's pack mem_t entry back to the memory + // broker from which it originated, but only if the mem_t entry is + // allocated. + if ( bli_mem_is_alloc( cntl_pack_mem ) ) + { + bli_membrk_release( cntl_pack_mem ); + } + + // Free the current node. 
+ bli_cntl_obj_free( cntl ); +} + // ----------------------------------------------------------------------------- cntl_t* bli_cntl_copy diff --git a/frame/base/bli_cntl.h b/frame/base/bli_cntl.h index 7b6000bb9..fd0413f4f 100644 --- a/frame/base/bli_cntl.h +++ b/frame/base/bli_cntl.h @@ -75,12 +75,25 @@ void bli_cntl_obj_clear cntl_t* cntl ); +// ----------------------------------------------------------------------------- + void bli_cntl_free ( cntl_t* cntl, thrinfo_t* thread ); +void bli_cntl_free_w_thrinfo + ( + cntl_t* cntl, + thrinfo_t* thread + ); + +void bli_cntl_free_wo_thrinfo + ( + cntl_t* cntl + ); + cntl_t* bli_cntl_copy ( cntl_t* cntl diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index e4299eb49..f8cdf1fc4 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -330,14 +330,24 @@ ind_t bli_cntx_get_ind_method( cntx_t* cntx ) return bli_cntx_method( cntx ); } -pack_t bli_cntx_get_pack_schema_a( cntx_t* cntx ) +pack_t bli_cntx_get_pack_schema_a_block( cntx_t* cntx ) { - return bli_cntx_schema_a( cntx ); + return bli_cntx_schema_a_block( cntx ); } -pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx ) +pack_t bli_cntx_get_pack_schema_b_panel( cntx_t* cntx ) { - return bli_cntx_schema_b( cntx ); + return bli_cntx_schema_b_panel( cntx ); +} + +pack_t bli_cntx_get_pack_schema_c_panel( cntx_t* cntx ) +{ + return bli_cntx_schema_c_panel( cntx ); +} + +bool_t bli_cntx_get_ukr_anti_pref( cntx_t* cntx ) +{ + return bli_cntx_anti_pref( cntx ); } #endif @@ -705,31 +715,39 @@ void bli_cntx_set_ind_method( ind_t method, bli_cntx_set_method( method, cntx ); } -void bli_cntx_set_pack_schema_ab( pack_t schema_a, - pack_t schema_b, - cntx_t* cntx ) +void bli_cntx_set_pack_schema_ab_blockpanel( pack_t schema_a, + pack_t schema_b, + cntx_t* cntx ) { - bli_cntx_set_schema_a( schema_a, cntx ); - bli_cntx_set_schema_b( schema_b, cntx ); + bli_cntx_set_schema_a_block( schema_a, cntx ); + bli_cntx_set_schema_b_panel( schema_b, cntx ); } -void 
bli_cntx_set_pack_schema_a( pack_t schema_a, - cntx_t* cntx ) +void bli_cntx_set_pack_schema_a_block( pack_t schema_a, + cntx_t* cntx ) { - bli_cntx_set_schema_a( schema_a, cntx ); + bli_cntx_set_schema_a_block( schema_a, cntx ); } -void bli_cntx_set_pack_schema_b( pack_t schema_b, - cntx_t* cntx ) +void bli_cntx_set_pack_schema_b_panel( pack_t schema_b, + cntx_t* cntx ) { - bli_cntx_set_schema_b( schema_b, cntx ); + bli_cntx_set_schema_b_panel( schema_b, cntx ); } -void bli_cntx_set_pack_schema_c( pack_t schema_c, +void bli_cntx_set_pack_schema_c_panel( pack_t schema_c, + cntx_t* cntx ) +{ + bli_cntx_set_schema_c_panel( schema_c, cntx ); +} + +#if 0 +void bli_cntx_set_ukr_anti_pref( bool_t anti_pref, cntx_t* cntx ) { - bli_cntx_set_schema_c( schema_c, cntx ); + bli_cntx_set_anti_pref( anti_pref, cntx ); } +#endif void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx, dim_t m, dim_t n, dim_t k ) @@ -904,6 +922,32 @@ bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, return r_val; } +bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ) +{ + bool_t r_val = bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); + + // If the anti-preference is set, negate the result. + if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val; + + return r_val; +} + +bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ) +{ + bool_t r_val = bli_cntx_l3_nat_ukr_dislikes_storage_of( obj, ukr_id, cntx ); + + // If the anti-preference is set, negate the result. 
+ if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val; + + return r_val; +} + +// ----------------------------------------------------------------------------- + bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) @@ -953,6 +997,30 @@ bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj, return r_val; } +bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ) +{ + bool_t r_val = bli_cntx_l3_ukr_prefers_storage_of( obj, ukr_id, cntx ); + + // If the anti-preference is set, negate the result. + if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val; + + return r_val; +} + +bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ) +{ + bool_t r_val = bli_cntx_l3_ukr_dislikes_storage_of( obj, ukr_id, cntx ); + + // If the anti-preference is set, negate the result. + if ( bli_cntx_anti_pref( cntx ) ) r_val = !r_val; + + return r_val; +} + // ----------------------------------------------------------------------------- void bli_cntx_print( cntx_t* cntx ) diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 9c97c3312..a76cdd329 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -59,6 +59,8 @@ typedef struct cntx_s pack_t schema_b; pack_t schema_c; + bool_t anti_pref; + dim_t* thrloop; membrk_t* membrk; @@ -113,26 +115,30 @@ typedef struct cntx_s \ ( (cntx)->method ) -#define bli_cntx_schema_a( cntx ) \ +#define bli_cntx_schema_a_block( cntx ) \ \ - ( (cntx)->schema_a ) + ( (cntx)->schema_a_block ) -#define bli_cntx_schema_b( cntx ) \ +#define bli_cntx_schema_b_panel( cntx ) \ \ - ( (cntx)->schema_b ) + ( (cntx)->schema_b_panel ) -#define bli_cntx_schema_c( cntx ) \ +#define bli_cntx_schema_c_panel( cntx ) \ \ - ( (cntx)->schema_c ) + ( (cntx)->schema_c_panel ) -#define bli_cntx_membrk( cntx ) \ +#define bli_cntx_anti_pref( cntx ) \ \ - ( (cntx)->membrk ) + ( (cntx)->anti_pref ) #define bli_cntx_thrloop( cntx ) \ \ ( (cntx)->thrloop ) +#define 
bli_cntx_membrk( cntx ) \ +\ + ( (cntx)->membrk ) + #if 1 #define bli_cntx_jc_way( cntx ) \ \ @@ -211,24 +217,24 @@ typedef struct cntx_s (cntx_p)->method = _method; \ } -#define bli_cntx_set_schema_a( _schema_a, cntx_p ) \ +#define bli_cntx_set_schema_a_block( _schema_a_block, cntx_p ) \ { \ - (cntx_p)->schema_a = _schema_a; \ + (cntx_p)->schema_a_block = _schema_a_block; \ } -#define bli_cntx_set_schema_b( _schema_b, cntx_p ) \ +#define bli_cntx_set_schema_b_panel( _schema_b_panel, cntx_p ) \ { \ - (cntx_p)->schema_b = _schema_b; \ + (cntx_p)->schema_b_panel = _schema_b_panel; \ } -#define bli_cntx_set_schema_c( _schema_c, cntx_p ) \ +#define bli_cntx_set_schema_c_panel( _schema_c_panel, cntx_p ) \ { \ - (cntx_p)->schema_c = _schema_c; \ + (cntx_p)->schema_c_panel = _schema_c_panel; \ } -#define bli_cntx_set_membrk( _membrk, cntx_p ) \ +#define bli_cntx_set_anti_pref( _anti_pref, cntx_p ) \ { \ - (cntx_p)->membrk = _membrk; \ + (cntx_p)->anti_pref = _anti_pref; \ } #define bli_cntx_set_thrloop( jc_, pc_, ic_, jr_, ir_, cntx_p ) \ @@ -241,6 +247,11 @@ typedef struct cntx_s (cntx_p)->thrloop[ BLIS_KR ] = 1; \ } +#define bli_cntx_set_membrk( _membrk, cntx_p ) \ +{ \ + (cntx_p)->membrk = _membrk; \ +} + // cntx_t query (complex) #define bli_cntx_get_blksz_def_dt( dt, bs_id, cntx ) \ @@ -323,13 +334,17 @@ typedef struct cntx_s \ bli_cntx_method( cntx ) -#define bli_cntx_get_pack_schema_a( cntx ) \ +#define bli_cntx_get_pack_schema_a_block( cntx ) \ \ - bli_cntx_schema_a( cntx ) + bli_cntx_schema_a_block( cntx ) -#define bli_cntx_get_pack_schema_b( cntx ) \ +#define bli_cntx_get_pack_schema_b_panel( cntx ) \ \ - bli_cntx_schema_b( cntx ) + bli_cntx_schema_b_panel( cntx ) + +#define bli_cntx_get_pack_schema_c_panel( cntx ) \ +\ + bli_cntx_schema_c_panel( cntx ) #define bli_cntx_get_membrk( cntx ) \ \ @@ -395,9 +410,10 @@ func_t* bli_cntx_get_packm_ukr( cntx_t* cntx ); // l1vkr_t ker_id, // cntx_t* cntx ); //ind_t bli_cntx_get_ind_method( cntx_t* cntx ); -//pack_t 
bli_cntx_get_pack_schema_a( cntx_t* cntx ); -//pack_t bli_cntx_get_pack_schema_b( cntx_t* cntx ); -//pack_t bli_cntx_get_pack_schema_c( cntx_t* cntx ); +//pack_t bli_cntx_get_pack_schema_a_block( cntx_t* cntx ); +//pack_t bli_cntx_get_pack_schema_b_panel( cntx_t* cntx ); +//pack_t bli_cntx_get_pack_schema_c_panel( cntx_t* cntx ); +//bool_t bli_cntx_get_ukr_anti_pref( cntx_t* cntx ); dim_t bli_cntx_get_num_threads( cntx_t* cntx ); dim_t bli_cntx_get_num_threads_in( cntx_t* cntx, cntl_t* cntl ); @@ -425,15 +441,17 @@ void bli_cntx_set_packm_ukr( func_t* func, cntx_t* cntx ); void bli_cntx_set_ind_method( ind_t method, cntx_t* cntx ); -void bli_cntx_set_pack_schema_ab( pack_t schema_a, - pack_t schema_b, - cntx_t* cntx ); -void bli_cntx_set_pack_schema_a( pack_t schema_a, - cntx_t* cntx ); -void bli_cntx_set_pack_schema_b( pack_t schema_b, - cntx_t* cntx ); -void bli_cntx_set_pack_schema_c( pack_t schema_c, - cntx_t* cntx ); +void bli_cntx_set_pack_schema_ab_blockpanel( pack_t schema_a, + pack_t schema_b, + cntx_t* cntx ); +void bli_cntx_set_pack_schema_a_block( pack_t schema_a, + cntx_t* cntx ); +void bli_cntx_set_pack_schema_b_panel( pack_t schema_b, + cntx_t* cntx ); +void bli_cntx_set_pack_schema_c_panel( pack_t schema_c, + cntx_t* cntx ); +//void bli_cntx_set_ukr_anti_pref( bool_t anti_pref, +// cntx_t* cntx ); void bli_cntx_set_thrloop_from_env( opid_t l3_op, side_t side, cntx_t* cntx, @@ -455,6 +473,12 @@ bool_t bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ); +bool_t bli_cntx_l3_nat_ukr_eff_prefers_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ); +bool_t bli_cntx_l3_nat_ukr_eff_dislikes_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ); bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ); @@ -467,6 +491,12 @@ bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj, bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj, 
l3ukr_t ukr_id, cntx_t* cntx ); +bool_t bli_cntx_l3_ukr_eff_prefers_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ); +bool_t bli_cntx_l3_ukr_eff_dislikes_storage_of( obj_t* obj, + l3ukr_t ukr_id, + cntx_t* cntx ); // print function diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index 32f99a832..2ada1556e 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -606,6 +606,9 @@ void bli_gks_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr, mbool_t* cntx_l3_nat_ukr_pref = &cntx_l3_nat_ukr_prefs[ ukr ]; bli_gks_get_l3_nat_ukr_prefs( ukr, cntx_l3_nat_ukr_pref ); + + // Explicitly set the anti-preference to FALSE. + bli_cntx_set_anti_pref( FALSE, cntx ); } diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 0d5992900..a7a69243e 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -877,6 +877,12 @@ bli_obj_width_stored( obj ) (obj).n_panel = n0; \ } +#define bli_obj_set_panel_dims( m0, n0, obj ) \ +{ \ + bli_obj_set_panel_length( m0, obj ); \ + bli_obj_set_panel_width( n0, obj ); \ +} + #define bli_obj_set_panel_dim( panel_dim, obj ) \ { \ (obj).pd = panel_dim; \ @@ -985,6 +991,7 @@ bli_obj_width_stored( obj ) #define bli_obj_induce_trans( obj ) \ { \ { \ + /* Induce transposition among basic fields. */ \ dim_t m_ = bli_obj_length( obj ); \ dim_t n_ = bli_obj_width( obj ); \ inc_t rs_ = bli_obj_row_stride( obj ); \ @@ -1000,6 +1007,15 @@ bli_obj_width_stored( obj ) \ if ( bli_obj_is_upper_or_lower( obj ) ) \ bli_obj_toggle_uplo( obj ); \ +\ + /* Induce transposition among packed fields. */ \ + dim_t m_padded_ = bli_obj_padded_length( obj ); \ + dim_t n_padded_ = bli_obj_padded_width( obj ); \ + dim_t m_panel_ = bli_obj_panel_length( obj ); \ + dim_t n_panel_ = bli_obj_panel_width( obj ); \ +\ + bli_obj_set_padded_dims( n_padded_, m_padded_, obj ); \ + bli_obj_set_panel_dims( n_panel_, m_panel_, obj ); \ \ /* Note that this macro DOES NOT touch the transposition bit! 
If the calling code is using this macro to handle an object whose diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index c4cfd3514..1a120d5da 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -975,9 +975,11 @@ typedef struct cntx_s opid_t family; ind_t method; - pack_t schema_a; - pack_t schema_b; - pack_t schema_c; + pack_t schema_a_block; + pack_t schema_b_panel; + pack_t schema_c_panel; + + bool_t anti_pref; dim_t thrloop[ BLIS_NUM_LOOPS ]; diff --git a/frame/ind/cntx/bli_gemmind_cntx.c b/frame/ind/cntx/bli_gemmind_cntx.c index ce40bb105..5b7a70c3c 100644 --- a/frame/ind/cntx/bli_gemmind_cntx.c +++ b/frame/ind/cntx/bli_gemmind_cntx.c @@ -151,9 +151,8 @@ void bli_gemm3m1_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MI, - BLIS_PACKED_COL_PANELS_3MI, - cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_3MI, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MI, cntx ); } void bli_gemm3m1_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -200,9 +199,8 @@ void bli_gemm3m2_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MS, - BLIS_PACKED_COL_PANELS_3MI, - cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_3MS, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MI, cntx ); } void bli_gemm3m2_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -249,9 +247,8 @@ void bli_gemm3m3_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. 
- bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage() - BLIS_PACKED_COL_PANELS_3MS, - cntx ); + bli_cntx_set_pack_schema_a_block( 0, cntx ); // not yet needed; varies with _stage() + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_3MS, cntx ); } void bli_gemm3m3_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -259,15 +256,15 @@ void bli_gemm3m3_cntx_stage( dim_t stage, cntx_t* cntx ) // Set the pack_t schemas as a function of the stage of execution. if ( stage == 0 ) { - bli_cntx_set_pack_schema_a( BLIS_PACKED_ROW_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx ); } else if ( stage == 1 ) { - bli_cntx_set_pack_schema_a( BLIS_PACKED_ROW_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx ); } else // if ( stage == 2 ) { - bli_cntx_set_pack_schema_a( BLIS_PACKED_ROW_PANELS_RPI, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RPI, cntx ); } } @@ -311,9 +308,8 @@ void bli_gemm3mh_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage() - 0, // not yet needed; varies with _stage() - cntx ); + bli_cntx_set_pack_schema_a_block( 0, cntx ); // not yet needed; varies with _stage() + bli_cntx_set_pack_schema_b_panel( 0, cntx ); // not yet needed; varies with _stage() } void bli_gemm3mh_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -321,18 +317,18 @@ void bli_gemm3mh_cntx_stage( dim_t stage, cntx_t* cntx ) // Set the pack_t schemas as a function of the stage of execution. 
if ( stage == 0 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RO, - BLIS_PACKED_COL_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx ); } else if ( stage == 1 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_IO, - BLIS_PACKED_COL_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx ); } else // if ( stage == 2 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RPI, - BLIS_PACKED_COL_PANELS_RPI, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RPI, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RPI, cntx ); } } @@ -376,9 +372,8 @@ void bli_gemm4m1_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI, - BLIS_PACKED_COL_PANELS_4MI, - cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_4MI, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_4MI, cntx ); } void bli_gemm4m1_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -425,9 +420,8 @@ void bli_gemm4mb_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI, - BLIS_PACKED_COL_PANELS_4MI, - cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_4MI, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_4MI, cntx ); } void bli_gemm4mb_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -474,9 +468,8 @@ void bli_gemm4mh_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. 
- bli_cntx_set_pack_schema_ab( 0, // not yet needed; varies with _stage() - 0, // not yet needed; varies with _stage() - cntx ); + bli_cntx_set_pack_schema_a_block( 0, cntx ); // not yet needed; varies with _stage() + bli_cntx_set_pack_schema_b_panel( 0, cntx ); // not yet needed; varies with _stage() } void bli_gemm4mh_cntx_stage( dim_t stage, cntx_t* cntx ) @@ -484,23 +477,23 @@ void bli_gemm4mh_cntx_stage( dim_t stage, cntx_t* cntx ) // Set the pack_t schemas as a function of the stage of execution. if ( stage == 0 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RO, - BLIS_PACKED_COL_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx ); } else if ( stage == 1 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_IO, - BLIS_PACKED_COL_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx ); } else if ( stage == 2 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_RO, - BLIS_PACKED_COL_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_IO, cntx ); } else // if ( stage == 3 ) { - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_IO, - BLIS_PACKED_COL_PANELS_RO, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_IO, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_RO, cntx ); } } @@ -511,6 +504,22 @@ void bli_gemm4mh_cntx_finalize( cntx_t* cntx ) // ----------------------------------------------------------------------------- void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx ) +{ + // Default to context for block-panel algorithm. 
+ bli_gemm1mbp_cntx_init( dt, cntx ); +} + +void bli_gemm1mbp_cntx_init( num_t dt, cntx_t* cntx ) +{ + bli_gemm1mxx_cntx_init( dt, FALSE, cntx ); +} + +void bli_gemm1mpb_cntx_init( num_t dt, cntx_t* cntx ) +{ + bli_gemm1mxx_cntx_init( dt, TRUE, cntx ); +} + +void bli_gemm1mxx_cntx_init( num_t dt, bool_t is_pb, cntx_t* cntx ) { const ind_t method = BLIS_1M; @@ -529,8 +538,24 @@ void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx ) // Initialize the context with packm-related kernels. bli_packm_cntx_init( dt, cntx ); + // Initialize the blocksizes according to the micro-kernel preference as + // well as the algorithm. if ( bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) ) { + // This branch is used for algorithms 1m_c_bp, 1m_r_pb. + + // Set the pack_t schemas for the c_bp or r_pb algorithms. + if ( !is_pb ) + { + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1E, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1R, cntx ); + } + else // if ( is_pb ) + { + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_ROW_PANELS_1R, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_COL_PANELS_1E, cntx ); + } + // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. bli_gks_cntx_set_blkszs @@ -544,14 +569,23 @@ void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx ) BLIS_KR, BLIS_KR, 1.0, 1.0, cntx ); - - // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1E, - BLIS_PACKED_COL_PANELS_1R, - cntx ); } else // if ( bli_cntx_l3_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) ) { + // This branch is used for algorithms 1m_r_bp, 1m_c_pb. + + // Set the pack_t schemas for the r_bp or c_pb algorithms. 
+ if ( !is_pb ) + { + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_ROW_PANELS_1R, cntx ); + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_COL_PANELS_1E, cntx ); + } + else // if ( is_pb ) + { + bli_cntx_set_pack_schema_b_panel( BLIS_PACKED_ROW_PANELS_1E, cntx ); + bli_cntx_set_pack_schema_a_block( BLIS_PACKED_COL_PANELS_1R, cntx ); + } + // Initialize the context with the current architecture's register // and cache blocksizes (and multiples), and the induced method. bli_gks_cntx_set_blkszs @@ -565,12 +599,15 @@ void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx ) BLIS_KR, BLIS_KR, 1.0, 1.0, cntx ); - - // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1R, - BLIS_PACKED_COL_PANELS_1E, - cntx ); } + + // Set the anti-preference field to TRUE when executing a panel-block + // algorithm, and FALSE otherwise. This will cause higher-level generic + // code to establish (if needed) disagreement between the storage of C and + // the micro-kernel output preference so that the two will come back into + // agreement in the panel-block macro-kernel (which is implemented in terms + // of the block-panel macro-kernel with some induced transpositions).
+ bli_cntx_set_anti_pref( is_pb, cntx ); } void bli_gemm1m_cntx_stage( dim_t stage, cntx_t* cntx ) diff --git a/frame/ind/cntx/bli_gemmind_cntx.h b/frame/ind/cntx/bli_gemmind_cntx.h index f49744c3f..ea47968b1 100644 --- a/frame/ind/cntx/bli_gemmind_cntx.h +++ b/frame/ind/cntx/bli_gemmind_cntx.h @@ -65,6 +65,9 @@ void bli_gemm4m1_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemm4m1_cntx_finalize( cntx_t* cntx ); void bli_gemm1m_cntx_init( num_t dt, cntx_t* cntx ); +void bli_gemm1mbp_cntx_init( num_t dt, cntx_t* cntx ); +void bli_gemm1mpb_cntx_init( num_t dt, cntx_t* cntx ); +void bli_gemm1mxx_cntx_init( num_t dt, bool_t is_pb, cntx_t* cntx ); void bli_gemm1m_cntx_stage( dim_t stage, cntx_t* cntx ); void bli_gemm1m_cntx_finalize( cntx_t* cntx ); diff --git a/frame/ind/cntx/bli_trsmind_cntx.c b/frame/ind/cntx/bli_trsmind_cntx.c index 4cb0bf6ba..a13d0d05a 100644 --- a/frame/ind/cntx/bli_trsmind_cntx.c +++ b/frame/ind/cntx/bli_trsmind_cntx.c @@ -74,9 +74,9 @@ void bli_trsm3m1_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for native execution. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_3MI, - BLIS_PACKED_COL_PANELS_3MI, - cntx ); + bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_3MI, + BLIS_PACKED_COL_PANELS_3MI, + cntx ); } void bli_trsm3m1_cntx_finalize( cntx_t* cntx ) @@ -123,9 +123,9 @@ void bli_trsm4m1_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for native execution. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_4MI, - BLIS_PACKED_COL_PANELS_4MI, - cntx ); + bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_4MI, + BLIS_PACKED_COL_PANELS_4MI, + cntx ); } void bli_trsm4m1_cntx_finalize( cntx_t* cntx ) @@ -174,9 +174,9 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. 
- bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1E, - BLIS_PACKED_COL_PANELS_1R, - cntx ); + bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1E, + BLIS_PACKED_COL_PANELS_1R, + cntx ); } else // if ( bli_cntx_l3_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) ) { @@ -195,9 +195,9 @@ void bli_trsm1m_cntx_init( num_t dt, cntx_t* cntx ) ); // Set the pack_t schemas for the current induced method. - bli_cntx_set_pack_schema_ab( BLIS_PACKED_ROW_PANELS_1R, - BLIS_PACKED_COL_PANELS_1E, - cntx ); + bli_cntx_set_pack_schema_ab_blockpanel( BLIS_PACKED_ROW_PANELS_1R, + BLIS_PACKED_COL_PANELS_1E, + cntx ); } } diff --git a/frame/ind/oapi/bli_l3_1mbppb_oapi.c b/frame/ind/oapi/bli_l3_1mbppb_oapi.c new file mode 100644 index 000000000..e91f27ea2 --- /dev/null +++ b/frame/ind/oapi/bli_l3_1mbppb_oapi.c @@ -0,0 +1,85 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +// -- gemmbp/gemmpb ------------------------------------------------------------ + +#undef GENFRONT +#define GENFRONT( opname, imeth, alg ) \ +\ +void PASTEMAC2(opname,imeth,alg) \ + ( \ + obj_t* alpha, \ + obj_t* a, \ + obj_t* b, \ + obj_t* beta, \ + obj_t* c \ + ) \ +{ \ + num_t dt = bli_obj_datatype( *c ); \ + cntx_t cntx; \ + cntl_t* cntl_p; \ +\ + /* If the objects are in the real domain, execute the native + implementation. */ \ + if ( bli_obj_is_real( *c ) ) \ + { \ + PASTEMAC(opname,nat)( alpha, a, b, beta, c, NULL ); \ + return; \ + } \ +\ + /* Initialize a local 1m context for the current algorithm (bp or pb). */ \ + PASTEMAC3(opname,imeth,alg,_cntx_init)( dt, &cntx ); \ +\ + /* Create a control tree for the current algorithm (bp or pb). */ \ + cntl_p = PASTEMAC2(opname,alg,_cntl_create)( BLIS_GEMM ); \ +\ + /* Invoke the operation's front end using the context and control + tree we just created. */ \ + PASTEMAC(opname,_front)( alpha, a, b, beta, c, &cntx, cntl_p ); \ +\ + /* Free the control tree. Since the implementation will only make + copies of it (and not use it directly) we do not need to supply + a thread object. */ \ + bli_cntl_free( cntl_p, NULL ); \ +\ + /* Finalize the local context. 
*/ \ + PASTEMAC2(opname,imeth,_cntx_finalize)( &cntx ); \ +} + +// gemm +GENFRONT( gemm, 1m, bp ) +GENFRONT( gemm, 1m, pb ) + diff --git a/frame/ind/oapi/bli_l3_3m4m1m_oapi.c b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c index cb966d71c..36281f543 100644 --- a/frame/ind/oapi/bli_l3_3m4m1m_oapi.c +++ b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c @@ -62,6 +62,14 @@ void PASTEMAC(opname,imeth) \ PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx ); \ return; \ } \ +\ + /* A temporary hack to easily specify the 1m algorithm (block-panel or + panel-block). */ \ + if ( PASTEMAC(opname,imeth) == bli_gemm1m ) \ + { \ + bli_gemm1mbp( alpha, a, b, beta, c ); \ + return; \ + } \ \ /* Initialize a local context if the one provided is NULL. */ \ bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ diff --git a/frame/ind/oapi/bli_l3_ind_oapi.h b/frame/ind/oapi/bli_l3_ind_oapi.h index 7f8ae194c..f5907d414 100644 --- a/frame/ind/oapi/bli_l3_ind_oapi.h +++ b/frame/ind/oapi/bli_l3_ind_oapi.h @@ -80,3 +80,17 @@ GENPROT_NO2OP( 3m2 ) GENPROT_NO2OP( 4mh ) GENPROT_NO2OP( 4mb ) + +// +// Generate object-based prototypes for 1m methods that specify an algorithm +// (e.g., block-panel or panel-block). 
+// + +#undef GENPROT +#define GENPROT( imeth, alg ) \ +\ +void PASTEMAC2(gemm,imeth,alg) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c ); \ + +GENPROT( 1m, bp ) +GENPROT( 1m, pb ) + diff --git a/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c index f686aa7ac..ff23a36f4 100644 --- a/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c +++ b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c @@ -55,7 +55,7 @@ void PASTEMAC(ch,varname) \ PASTECH(chr,gemm_ukr_ft) \ rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ - const bool_t row_pref = !col_pref; \ + /*const bool_t row_pref = !col_pref;*/ \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ @@ -77,10 +77,8 @@ void PASTEMAC(ch,varname) \ ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \ ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ \ - const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \ - const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \ -\ - ctype_r beta_use; \ + ctype_r* restrict beta_r = &PASTEMAC(ch,real)( *beta ); \ + ctype_r* restrict beta_i = &PASTEMAC(ch,imag)( *beta ); \ \ ctype_r* c_use; \ inc_t rs_c_use; \ @@ -96,75 +94,71 @@ void PASTEMAC(ch,varname) \ if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ +\ + /* Sanity check: These should never occur because storage/preference + agreement is handled at a higher level. 
*/ \ + /* + if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) bli_abort(); \ + else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) bli_abort(); \ + */ \ +\ \ /* If beta has a non-zero imaginary component OR if c is stored with - general stride OR if for some reason the storage of c is not the - preferred storage of the micro-kernel, then we compute the - alpha*a*b product into temporary storage and then accumulate that - result into c afterwards. */ \ - if ( !PASTEMAC(chr,eq0)( beta_i ) ) using_ct = TRUE; \ - else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \ - else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE; \ + general stride, then we compute the alpha*a*b product into temporary + storage and then accumulate that result into c afterwards. Note that + the other two cases concerning disagreement between the storage of C + and the output preference of the micro-kernel, should never occur + (though we could handle them if they did occur). */ \ + if ( !PASTEMAC(chr,eq0)( *beta_i ) ) using_ct = TRUE; \ + /*else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \ + else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE;*/ \ else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \ else using_ct = FALSE; \ \ \ if ( using_ct ) \ { \ + /* In the atypical cases, we compute the result into temporary + workspace ct and then accumulate it back to c at the end. */ \ +\ /* Set the strides of ct based on the preference of the underlying native real domain gemm micro-kernel. Note that we set the ct strides in units of complex elements. */ \ if ( col_pref ) { rs_ct = 1; cs_ct = mr; } \ else { rs_ct = nr; cs_ct = 1; } \ \ - beta_use = *zero_r; \ c_use = ( ctype_r* )ct; \ rs_c_use = rs_ct; \ cs_c_use = cs_ct; \ - } \ - else \ - { \ - /* In a typical case, we use the real part of beta and accumulate - directly into the output matrix c.
*/ \ - beta_use = beta_r; \ - c_use = ( ctype_r* )c; \ - rs_c_use = rs_c; \ - cs_c_use = cs_c; \ - } \ \ + /* Convert the strides from being in units of complex elements to + be in units of real elements. Note that we don't need to check for + general storage here because that case corresponds to the scenario + where we are using the ct buffer and its rs_ct/cs_ct strides. */ \ + if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \ + else rs_c_use *= 2; \ \ - /* Convert the strides from being in units of complex elements to - be in units of real elements. Note that we don't need to check for - general storage here because that case corresponds to the scenario - where we are using the ct buffer and its rs_ct/cs_ct strides. */ \ - if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \ - else rs_c_use *= 2; \ + /* The following gemm micro-kernel call implements the 1m method, + which induces a complex matrix multiplication by calling the + real matrix micro-kernel on micro-panels that have been packed + according to the 1e and 1r formats. */ \ \ + /* c = beta * c + alpha_r * a * b; */ \ + rgemm_ukr \ + ( \ + k2, \ + alpha_r, \ + a_r, \ + b_r, \ + zero_r, \ + c_use, rs_c_use, cs_c_use, \ + data, \ + cntx \ + ); \ \ - /* The following gemm micro-kernel call implements the 1m method, - which induces a complex matrix multiplication by calling the - real matrix micro-kernel on micro-panels that have been packed - according to the 1e and 1r formats. */ \ -\ - /* c = beta * c + alpha_r * a * b; */ \ - rgemm_ukr \ - ( \ - k2, \ - alpha_r, \ - a_r, \ - b_r, \ - &beta_use, \ - c_use, rs_c_use, cs_c_use, \ - data, \ - cntx \ - ); \ -\ -\ - /* If necessary, accumulate the final result in ct back to c. */ \ - if ( using_ct ) \ - { \ dim_t i, j; \ \ + /* Accumulate the final result in ct back to c. 
*/ \ for ( j = 0; j < nr; ++j ) \ for ( i = 0; i < mr; ++i ) \ { \ @@ -173,6 +167,40 @@ void PASTEMAC(ch,varname) \ *(c + i*rs_c + j*cs_c ) ); \ } \ } \ + else \ + { \ + /* In the typical cases, we use the real part of beta and + accumulate directly into the output matrix c. */ \ +\ + c_use = ( ctype_r* )c; \ + rs_c_use = rs_c; \ + cs_c_use = cs_c; \ +\ + /* Convert the strides from being in units of complex elements to + be in units of real elements. Note that we don't need to check for + general storage here because that case corresponds to the scenario + where we are using the ct buffer and its rs_ct/cs_ct strides. */ \ + if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \ + else rs_c_use *= 2; \ +\ + /* The following gemm micro-kernel call implements the 1m method, + which induces a complex matrix multiplication by calling the + real matrix micro-kernel on micro-panels that have been packed + according to the 1e and 1r formats. */ \ +\ + /* c = beta * c + alpha_r * a * b; */ \ + rgemm_ukr \ + ( \ + k2, \ + alpha_r, \ + a_r, \ + b_r, \ + beta_r, \ + c_use, rs_c_use, cs_c_use, \ + data, \ + cntx \ + ); \ + } \ } INSERT_GENTFUNCCO_BASIC( gemm1m_ukr_ref, BLIS_GEMM_UKR ) diff --git a/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c.prev b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c.prev new file mode 100644 index 000000000..3760bdd7c --- /dev/null +++ b/frame/ind/ukernels/gemm/bli_gemm1m_ukr_ref.c.prev @@ -0,0 +1,188 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +*/ + +#include "blis.h" + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmkerid ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict data, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ + const num_t dt_r = PASTEMAC(chr,type); \ +\ + PASTECH(chr,gemm_ukr_ft) \ + rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, gemmkerid, cntx ); \ + const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + /*const bool_t row_pref = !col_pref;*/ \ +\ + const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ +\ + const dim_t k2 = 2 * k; \ +\ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype_r ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + inc_t rs_ct; \ + inc_t cs_ct; \ +\ + ctype_r* restrict a_r = ( ctype_r* )a; \ +\ + ctype_r* restrict b_r = ( ctype_r* )b; \ +\ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ +\ + ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \ + ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ +\ + const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \ + const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \ +\ + ctype_r beta_use; \ +\ + ctype_r* c_use; \ + inc_t rs_c_use; \ + inc_t cs_c_use; \ +\ + bool_t using_ct; \ +\ +\ + /* SAFETY CHECK: The higher level implementation should never + allow an alpha with non-zero imaginary component to be passed + in, because it can't be applied properly using the 1m method. + If alpha is not real, then something is very wrong. */ \ + if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ +\ +\ + /* Sanity check: These should never occur because storage/preference + agreement is handled at a higher level. 
*/ \ + /* + if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) bli_abort(); \ + else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) bli_abort(); \ + */ \ +\ +\ + /* If beta has a non-zero imaginary component OR if c is stored with + general stride, then we compute the alpha*a*b product into temporary + storage and then accumulate that result into c afterwards. Note that + the other two cases concerning disagreement between the storage of C + and the output preference of the micro-kernel, should never occur + (though we could handle them if they did occur). */ \ + if ( !PASTEMAC(chr,eq0)( beta_i ) ) using_ct = TRUE; \ + /*else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \ + else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE;*/ \ + else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \ + else using_ct = FALSE; \ +\ +\ + if ( using_ct ) \ + { \ + /* Set the strides of ct based on the preference of the underlying + native real domain gemm micro-kernel. Note that we set the ct + strides in units of complex elements. */ \ + if ( col_pref ) { rs_ct = 1; cs_ct = mr; } \ + else { rs_ct = nr; cs_ct = 1; } \ +\ + beta_use = *zero_r; \ + c_use = ( ctype_r* )ct; \ + rs_c_use = rs_ct; \ + cs_c_use = cs_ct; \ + } \ + else \ + { \ + /* In a typical case, we use the real part of beta and accumulate + directly into the output matrix c. */ \ + beta_use = beta_r; \ + c_use = ( ctype_r* )c; \ + rs_c_use = rs_c; \ + cs_c_use = cs_c; \ + } \ +\ +\ + /* Convert the strides from being in units of complex elements to + be in units of real elements. Note that we don't need to check for + general storage here because that case corresponds to the scenario + where we are using the ct buffer and its rs_ct/cs_ct strides. 
*/ \ + if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \ + else rs_c_use *= 2; \ +\ +\ + /* The following gemm micro-kernel call implements the 1m method, + which induces a complex matrix multiplication by calling the + real matrix micro-kernel on micro-panels that have been packed + according to the 1e and 1r formats. */ \ +\ + /* c = beta * c + alpha_r * a * b; */ \ + rgemm_ukr \ + ( \ + k2, \ + alpha_r, \ + a_r, \ + b_r, \ + &beta_use, \ + c_use, rs_c_use, cs_c_use, \ + data, \ + cntx \ + ); \ +\ +\ + /* If necessary, accumulate the final result in ct back to c. */ \ + if ( using_ct ) \ + { \ + dim_t i, j; \ +\ + for ( j = 0; j < nr; ++j ) \ + for ( i = 0; i < mr; ++i ) \ + { \ + PASTEMAC(ch,xpbys)( *(ct + i*rs_ct + j*cs_ct), \ + *beta, \ + *(c + i*rs_c + j*cs_c ) ); \ + } \ + } \ +} + +INSERT_GENTFUNCCO_BASIC( gemm1m_ukr_ref, BLIS_GEMM_UKR ) + diff --git a/frame/ind/ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c b/frame/ind/ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c index 7d746304c..c4ec44b54 100644 --- a/frame/ind/ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c +++ b/frame/ind/ukernels/trsm/bli_gemmtrsm1m_ukr_ref.c @@ -78,7 +78,7 @@ void PASTEMAC(ch,varname) \ \ const dim_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ \ - const pack_t schema_b = bli_cntx_schema_b( cntx ); \ + const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \ \ const dim_t k2 = 2 * k; \ \ diff --git a/frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c b/frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c index 92da659ca..ab5617795 100644 --- a/frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c +++ b/frame/ind/ukernels/trsm/bli_trsm1m_ukr_ref.c @@ -67,7 +67,7 @@ void PASTEMAC(ch,varname) \ const inc_t ld_a = cs_a; \ const inc_t ld_b = rs_b; \ \ - const pack_t schema_b = bli_cntx_schema_b( cntx ); \ + const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \ \ dim_t iter, i, j, l; \ dim_t n_behind; \ @@ -273,7 +273,7 @@ void PASTEMAC(ch,varname) \ const inc_t ld_a = cs_a; \ const inc_t ld_b = rs_b; 
\ \ - const pack_t schema_b = bli_cntx_schema_b( cntx ); \ + const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \ \ dim_t iter, i, j, l; \ dim_t n_behind; \