diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index a01a88ce4..05dd16d53 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -34,8 +34,6 @@ #include "blis.h" -extern blksz_t* gemm_upanel_align; - void bli_packm_init( obj_t* a, obj_t* p, packm_t* cntl ) @@ -161,6 +159,9 @@ void bli_packm_init( obj_t* a, } +extern blksz_t* gemm_upanel_a_align; +extern blksz_t* gemm_upanel_b_align; + void bli_packm_init_pack( invdiag_t invert_diag, pack_t pack_schema, packord_t pack_ord_if_up, @@ -309,7 +310,7 @@ void bli_packm_init_pack( invdiag_t invert_diag, { dim_t m_panel; dim_t ps_p; - dim_t upanel_align; + dim_t upanel_a_align; // The panel dimension (for each datatype) should be equal to the // register blocksize in the m dimension. @@ -333,8 +334,8 @@ void bli_packm_init_pack( invdiag_t invert_diag, // dimension of the matrix is not a whole multiple of MR. ps_p = cs_p * n_p_pad; - // Query the micro-panel alignment. - upanel_align = bli_blksz_for_type( dt, gemm_upanel_align ); + // Query the micro-panel alignment for A. + upanel_a_align = bli_blksz_for_type( dt, gemm_upanel_a_align ); // Here, we adjust the panel stride, if necessary. Remember: ps_p is // always interpreted as being in units of the datatype of the object @@ -348,21 +349,21 @@ void bli_packm_init_pack( invdiag_t invert_diag, ps_p = ( ps_p * 3 ) / 2; // Align the panel stride according to the micro-panel alignment. - ps_p = bli_align_dim_to_size( ps_p, elem_size_p, upanel_align ); + ps_p = bli_align_dim_to_size( ps_p, elem_size_p, upanel_a_align ); } else if ( bli_is_ro_packed( pack_schema ) || bli_is_io_packed( pack_schema ) || bli_is_rpi_packed( pack_schema ) ) { // Align the panel stride according to the micro-panel alignment. - ps_p = bli_align_dim_to_size( ps_p, elem_size_p, upanel_align ); + ps_p = bli_align_dim_to_size( ps_p, elem_size_p, upanel_a_align ); ps_p = ps_p / 2; } else { // Align the panel stride according to the micro-panel alignment. - ps_p = bli_align_dim_to_size( ps_p, elem_size_p, upanel_align ); + ps_p = bli_align_dim_to_size( ps_p, elem_size_p, upanel_a_align ); } // Store the strides and panel dimension in p. @@ -380,7 +381,7 @@ void bli_packm_init_pack( invdiag_t invert_diag, { dim_t n_panel; dim_t ps_p; - dim_t upanel_align; + dim_t upanel_b_align; // The panel dimension (for each datatype) should be equal to the // register blocksize in the n dimension. @@ -404,8 +405,8 @@ void bli_packm_init_pack( invdiag_t invert_diag, // dimension of the matrix is not a whole multiple of NR. ps_p = m_p_pad * rs_p; - // Query the micro-panel alignment. - upanel_align = bli_blksz_for_type( dt, gemm_upanel_align ); + // Query the micro-panel alignment for B. + upanel_b_align = bli_blksz_for_type( dt, gemm_upanel_b_align ); // Here, we adjust the panel stride, if necessary. Remember: ps_p is // always interpreted as being in units of the datatype of the object @@ -419,21 +420,21 @@ void bli_packm_init_pack( invdiag_t invert_diag, ps_p = ( ps_p * 3 ) / 2; // Align the panel stride according to the micro-panel alignment. - ps_p = bli_align_dim_to_size( ps_p, elem_size_p, upanel_align ); + ps_p = bli_align_dim_to_size( ps_p, elem_size_p, upanel_b_align ); } else if ( bli_is_ro_packed( pack_schema ) || bli_is_io_packed( pack_schema ) || bli_is_rpi_packed( pack_schema ) ) { // Align the panel stride according to the micro-panel alignment. - ps_p = bli_align_dim_to_size( ps_p, elem_size_p, upanel_align ); + ps_p = bli_align_dim_to_size( ps_p, elem_size_p, upanel_b_align ); ps_p = ps_p / 2; } else { // Align the panel stride according to the micro-panel alignment. - ps_p = bli_align_dim_to_size( ps_p, elem_size_p, upanel_align ); + ps_p = bli_align_dim_to_size( ps_p, elem_size_p, upanel_b_align ); } // Store the strides and panel dimension in p. diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 2cdbae176..eb940be7b 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -43,7 +43,8 @@ blksz_t* gemm_mr; blksz_t* gemm_nr; blksz_t* gemm_kr; -blksz_t* gemm_upanel_align; +blksz_t* gemm_upanel_a_align; +blksz_t* gemm_upanel_b_align; func_t* gemm_ukrs; @@ -98,13 +99,19 @@ void bli_gemm_cntl_init() BLIS_DEFAULT_KR_Z, BLIS_PACKDIM_KR_Z ); - // Create object for micro-panel alignment (in bytes). - gemm_upanel_align + // Create objects for micro-panel alignment (in bytes). + gemm_upanel_a_align = - bli_blksz_obj_create( BLIS_UPANEL_ALIGN_SIZE_S, 0, - BLIS_UPANEL_ALIGN_SIZE_D, 0, - BLIS_UPANEL_ALIGN_SIZE_C, 0, - BLIS_UPANEL_ALIGN_SIZE_Z, 0 ); + bli_blksz_obj_create( BLIS_UPANEL_A_ALIGN_SIZE_S, 0, + BLIS_UPANEL_A_ALIGN_SIZE_D, 0, + BLIS_UPANEL_A_ALIGN_SIZE_C, 0, + BLIS_UPANEL_A_ALIGN_SIZE_Z, 0 ); + gemm_upanel_b_align + = + bli_blksz_obj_create( BLIS_UPANEL_B_ALIGN_SIZE_S, 0, + BLIS_UPANEL_B_ALIGN_SIZE_D, 0, + BLIS_UPANEL_B_ALIGN_SIZE_C, 0, + BLIS_UPANEL_B_ALIGN_SIZE_Z, 0 ); // Attach the register blksz_t objects as sub-blocksizes to the cache @@ -222,7 +229,8 @@ void bli_gemm_cntl_finalize() bli_blksz_obj_free( gemm_nr ); bli_blksz_obj_free( gemm_kr ); - bli_blksz_obj_free( gemm_upanel_align ); + bli_blksz_obj_free( gemm_upanel_a_align ); + bli_blksz_obj_free( gemm_upanel_b_align ); bli_func_obj_free( gemm_ukrs ); diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c index 1ccd7edb4..8c99db722 100644 --- a/frame/base/bli_info.c +++ b/frame/base/bli_info.c @@ -93,18 +93,10 @@ extern blksz_t* gemm_mc; extern blksz_t* gemm_nc; extern blksz_t* gemm_kc; -extern blksz_t* gemm_mr; -extern blksz_t* gemm_nr; -extern blksz_t* gemm_kr; - extern blksz_t* gemm4m_mc; extern blksz_t* gemm4m_nc; extern blksz_t* gemm4m_kc; -extern blksz_t* gemm4m_mr; -extern blksz_t* gemm4m_nr; -extern blksz_t* gemm4m_kr; - // -- Default cache blocksizes -- // MC default blocksizes @@ -227,6 +219,14 @@ gint_t bli_info_get_maximum_kc_z( void ) { bli_init(); return bli_blksz_max_for_ // -- Default register blocksizes -- +extern blksz_t* gemm_mr; +extern blksz_t* gemm_nr; +extern blksz_t* gemm_kr; + +extern blksz_t* gemm4m_mr; +extern blksz_t* gemm4m_nr; +extern blksz_t* gemm4m_kr; + // MR default blocksizes gint_t bli_info_get_default_mr( num_t dt ) @@ -325,6 +325,42 @@ gint_t bli_info_get_packdim_nr_z( void ) { bli_init(); return bli_blksz_max_for_ ( bli_4m_is_enabled_z() ? gemm4m_nr : gemm_nr ) ); } +// -- Micro-panel alignment -- + +extern blksz_t* gemm_upanel_a_align; +extern blksz_t* gemm_upanel_b_align; + +// Micro-panel alignment of A + +gint_t bli_info_get_upanel_a_align_size( num_t dt ) +{ + if ( bli_is_float ( dt ) ) return bli_info_get_upanel_a_align_size_s(); + else if ( bli_is_double ( dt ) ) return bli_info_get_upanel_a_align_size_d(); + else if ( bli_is_scomplex( dt ) ) return bli_info_get_upanel_a_align_size_c(); + else if ( bli_is_dcomplex( dt ) ) return bli_info_get_upanel_a_align_size_z(); + else return 0; +} +gint_t bli_info_get_upanel_a_align_size_s( void ) { bli_init(); return bli_blksz_for_type( BLIS_FLOAT, gemm_upanel_a_align ); } +gint_t bli_info_get_upanel_a_align_size_d( void ) { bli_init(); return bli_blksz_for_type( BLIS_DOUBLE, gemm_upanel_a_align ); } +gint_t bli_info_get_upanel_a_align_size_c( void ) { bli_init(); return bli_blksz_for_type( BLIS_SCOMPLEX, gemm_upanel_a_align ); } +gint_t bli_info_get_upanel_a_align_size_z( void ) { bli_init(); return bli_blksz_for_type( BLIS_DCOMPLEX, gemm_upanel_a_align ); } + +// Micro-panel alignment of B + +gint_t bli_info_get_upanel_b_align_size( num_t dt ) +{ + if ( bli_is_float ( dt ) ) return bli_info_get_upanel_b_align_size_s(); + else if ( bli_is_double ( dt ) ) return bli_info_get_upanel_b_align_size_d(); + else if ( bli_is_scomplex( dt ) ) return bli_info_get_upanel_b_align_size_c(); + else if ( bli_is_dcomplex( dt ) ) return bli_info_get_upanel_b_align_size_z(); + else return 0; +} +gint_t bli_info_get_upanel_b_align_size_s( void ) { bli_init(); return bli_blksz_for_type( BLIS_FLOAT, gemm_upanel_b_align ); } +gint_t bli_info_get_upanel_b_align_size_d( void ) { bli_init(); return bli_blksz_for_type( BLIS_DOUBLE, gemm_upanel_b_align ); } +gint_t bli_info_get_upanel_b_align_size_c( void ) { bli_init(); return bli_blksz_for_type( BLIS_SCOMPLEX, gemm_upanel_b_align ); } +gint_t bli_info_get_upanel_b_align_size_z( void ) { bli_init(); return bli_blksz_for_type( BLIS_DCOMPLEX, gemm_upanel_b_align ); } + + // -- Level-2 cache blocksizes -- extern blksz_t* gemv_mc; diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h index 29cd84c1c..92e454ace 100644 --- a/frame/base/bli_info.h +++ b/frame/base/bli_info.h @@ -135,6 +135,22 @@ gint_t bli_info_get_packdim_nr_d( void ); gint_t bli_info_get_packdim_nr_c( void ); gint_t bli_info_get_packdim_nr_z( void ); +// -- Micro-panel alignment for A -- + +gint_t bli_info_get_upanel_a_align_size( num_t dt ); +gint_t bli_info_get_upanel_a_align_size_s( void ); +gint_t bli_info_get_upanel_a_align_size_d( void ); +gint_t bli_info_get_upanel_a_align_size_c( void ); +gint_t bli_info_get_upanel_a_align_size_z( void ); + +// -- Micro-panel alignment for B -- + +gint_t bli_info_get_upanel_b_align_size( num_t dt ); +gint_t bli_info_get_upanel_b_align_size_s( void ); +gint_t bli_info_get_upanel_b_align_size_d( void ); +gint_t bli_info_get_upanel_b_align_size_c( void ); +gint_t bli_info_get_upanel_b_align_size_z( void ); + // -- Level-2 cache blocksizes -- diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h index 68101cc91..b91284d73 100644 --- a/frame/include/bli_kernel_macro_defs.h +++ b/frame/include/bli_kernel_macro_defs.h @@ -1082,8 +1082,6 @@ #define BLIS_PACKDIM_KR_Z BLIS_DEFAULT_KR_Z #endif - - // // Define level-2 blocksizes. // @@ -1229,24 +1227,38 @@ #endif - // -- Define micro-panel alignment --------------------------------------------- -// In this section, we consider each datatype-specific micro-panel alignment -// size macro. If it is undefined, we define it to the default value (the size -// of the datatype). +// In this section, we consider each datatype-specific alignment sizes for +// micro-panels of A and B. If any definition is undefined, we define it to +// a safe default value (the size of the datatype). -#ifndef BLIS_UPANEL_ALIGN_SIZE_S -#define BLIS_UPANEL_ALIGN_SIZE_S BLIS_SIZEOF_S +// Alignment for micro-panels of A +#ifndef BLIS_UPANEL_A_ALIGN_SIZE_S +#define BLIS_UPANEL_A_ALIGN_SIZE_S BLIS_SIZEOF_S #endif -#ifndef BLIS_UPANEL_ALIGN_SIZE_D -#define BLIS_UPANEL_ALIGN_SIZE_D BLIS_SIZEOF_D +#ifndef BLIS_UPANEL_A_ALIGN_SIZE_D +#define BLIS_UPANEL_A_ALIGN_SIZE_D BLIS_SIZEOF_D #endif -#ifndef BLIS_UPANEL_ALIGN_SIZE_C -#define BLIS_UPANEL_ALIGN_SIZE_C BLIS_SIZEOF_C +#ifndef BLIS_UPANEL_A_ALIGN_SIZE_C +#define BLIS_UPANEL_A_ALIGN_SIZE_C BLIS_SIZEOF_C #endif -#ifndef BLIS_UPANEL_ALIGN_SIZE_Z -#define BLIS_UPANEL_ALIGN_SIZE_Z BLIS_SIZEOF_Z +#ifndef BLIS_UPANEL_A_ALIGN_SIZE_Z +#define BLIS_UPANEL_A_ALIGN_SIZE_Z BLIS_SIZEOF_Z +#endif + +// Alignment for micro-panels of B +#ifndef BLIS_UPANEL_B_ALIGN_SIZE_S +#define BLIS_UPANEL_B_ALIGN_SIZE_S BLIS_SIZEOF_S +#endif +#ifndef BLIS_UPANEL_B_ALIGN_SIZE_D +#define BLIS_UPANEL_B_ALIGN_SIZE_D BLIS_SIZEOF_D +#endif +#ifndef BLIS_UPANEL_B_ALIGN_SIZE_C +#define BLIS_UPANEL_B_ALIGN_SIZE_C BLIS_SIZEOF_C +#endif +#ifndef BLIS_UPANEL_B_ALIGN_SIZE_Z +#define BLIS_UPANEL_B_ALIGN_SIZE_Z BLIS_SIZEOF_Z #endif diff --git a/frame/include/bli_mem_pool_macro_defs.h b/frame/include/bli_mem_pool_macro_defs.h index 4cd33826e..8cab42b29 100644 --- a/frame/include/bli_mem_pool_macro_defs.h +++ b/frame/include/bli_mem_pool_macro_defs.h @@ -232,14 +232,14 @@ #define BLIS_MK_BLOCK_SIZE_S ( BLIS_POOL_MC_S * \ ( BLIS_POOL_KC_S + \ - ( BLIS_UPANEL_ALIGN_SIZE_S / \ + ( BLIS_UPANEL_A_ALIGN_SIZE_S / \ BLIS_SIZEOF_S ) \ ) * \ BLIS_SIZEOF_S \ ) #define BLIS_KN_BLOCK_SIZE_S ( \ ( BLIS_POOL_KC_S + \ - ( BLIS_UPANEL_ALIGN_SIZE_S / \ + ( BLIS_UPANEL_B_ALIGN_SIZE_S / \ BLIS_SIZEOF_S ) \ ) * \ BLIS_POOL_NC_S * \ @@ -256,14 +256,14 @@ #define BLIS_MK_BLOCK_SIZE_D ( BLIS_POOL_MC_D * \ ( BLIS_POOL_KC_D + \ - ( BLIS_UPANEL_ALIGN_SIZE_D / \ + ( BLIS_UPANEL_A_ALIGN_SIZE_D / \ BLIS_SIZEOF_D ) \ ) * \ BLIS_SIZEOF_D \ ) #define BLIS_KN_BLOCK_SIZE_D ( \ ( BLIS_POOL_KC_D + \ - ( BLIS_UPANEL_ALIGN_SIZE_D / \ + ( BLIS_UPANEL_B_ALIGN_SIZE_D / \ BLIS_SIZEOF_D ) \ ) * \ BLIS_POOL_NC_D * \ @@ -280,14 +280,14 @@ #define BLIS_MK_BLOCK_SIZE_C ( BLIS_POOL_MC_C * \ ( BLIS_POOL_KC_C + \ - ( BLIS_UPANEL_ALIGN_SIZE_C / \ + ( BLIS_UPANEL_A_ALIGN_SIZE_C / \ BLIS_SIZEOF_C ) \ ) * \ BLIS_SIZEOF_C \ ) #define BLIS_KN_BLOCK_SIZE_C ( \ ( BLIS_POOL_KC_C + \ - ( BLIS_UPANEL_ALIGN_SIZE_C / \ + ( BLIS_UPANEL_B_ALIGN_SIZE_C / \ BLIS_SIZEOF_C ) \ ) * \ BLIS_POOL_NC_C * \ @@ -304,14 +304,14 @@ #define BLIS_MK_BLOCK_SIZE_Z ( BLIS_POOL_MC_Z * \ ( BLIS_POOL_KC_Z + \ - ( BLIS_UPANEL_ALIGN_SIZE_Z / \ + ( BLIS_UPANEL_A_ALIGN_SIZE_Z / \ BLIS_SIZEOF_Z ) \ ) * \ BLIS_SIZEOF_Z \ ) #define BLIS_KN_BLOCK_SIZE_Z ( \ ( BLIS_POOL_KC_Z + \ - ( BLIS_UPANEL_ALIGN_SIZE_Z / \ + ( BLIS_UPANEL_B_ALIGN_SIZE_Z / \ BLIS_SIZEOF_Z ) \ ) * \ BLIS_POOL_NC_Z * \ @@ -328,14 +328,14 @@ #define BLIS_MK_BLOCK_SIZE_4M_C ( BLIS_POOL_4M_MC_C * \ ( BLIS_POOL_4M_KC_C + \ - ( BLIS_UPANEL_ALIGN_SIZE_C / \ + ( BLIS_UPANEL_A_ALIGN_SIZE_C / \ BLIS_SIZEOF_C ) \ ) * \ BLIS_SIZEOF_C \ ) #define BLIS_KN_BLOCK_SIZE_4M_C ( \ ( BLIS_POOL_4M_KC_C + \ - ( BLIS_UPANEL_ALIGN_SIZE_C / \ + ( BLIS_UPANEL_B_ALIGN_SIZE_C / \ BLIS_SIZEOF_C ) \ ) * \ BLIS_POOL_4M_NC_C * \ @@ -352,14 +352,14 @@ #define BLIS_MK_BLOCK_SIZE_4M_Z ( BLIS_POOL_4M_MC_Z * \ ( BLIS_POOL_4M_KC_Z + \ - ( BLIS_UPANEL_ALIGN_SIZE_Z / \ + ( BLIS_UPANEL_A_ALIGN_SIZE_Z / \ BLIS_SIZEOF_Z ) \ ) * \ BLIS_SIZEOF_Z \ ) #define BLIS_KN_BLOCK_SIZE_4M_Z ( \ ( BLIS_POOL_4M_KC_Z + \ - ( BLIS_UPANEL_ALIGN_SIZE_Z / \ + ( BLIS_UPANEL_B_ALIGN_SIZE_Z / \ BLIS_SIZEOF_Z ) \ ) * \ BLIS_POOL_4M_NC_Z * \ @@ -378,7 +378,7 @@ #define BLIS_MK_BLOCK_SIZE_3M_C ( BLIS_POOL_3M_MC_C * \ ( BLIS_POOL_3M_KC_C + \ - ( BLIS_UPANEL_ALIGN_SIZE_C / \ + ( BLIS_UPANEL_A_ALIGN_SIZE_C / \ BLIS_SIZEOF_C ) \ ) * \ ( BLIS_SIZEOF_C * \ @@ -387,7 +387,7 @@ ) #define BLIS_KN_BLOCK_SIZE_3M_C ( \ ( BLIS_POOL_3M_KC_C + \ - ( BLIS_UPANEL_ALIGN_SIZE_C / \ + ( BLIS_UPANEL_B_ALIGN_SIZE_C / \ BLIS_SIZEOF_C ) \ ) * \ BLIS_POOL_3M_NC_C * \ @@ -410,7 +410,7 @@ #define BLIS_MK_BLOCK_SIZE_3M_Z ( BLIS_POOL_3M_MC_Z * \ ( BLIS_POOL_3M_KC_Z + \ - ( BLIS_UPANEL_ALIGN_SIZE_Z / \ + ( BLIS_UPANEL_A_ALIGN_SIZE_Z / \ BLIS_SIZEOF_Z ) \ ) * \ ( BLIS_SIZEOF_Z * \ @@ -419,7 +419,7 @@ ) #define BLIS_KN_BLOCK_SIZE_3M_Z ( \ ( BLIS_POOL_3M_KC_Z + \ - ( BLIS_UPANEL_ALIGN_SIZE_Z / \ + ( BLIS_UPANEL_B_ALIGN_SIZE_Z / \ BLIS_SIZEOF_Z ) \ ) * \ BLIS_POOL_3M_NC_Z * \ diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index 42a323f9f..4a1c8e088 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -686,12 +686,17 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) ( int )bli_info_get_packdim_nr_c(), ( int )bli_info_get_packdim_nr_z() ); libblis_test_fprintf_c( os, "\n" ); - libblis_test_fprintf_c( os, "micro-panel alignment s d c z \n" ); - libblis_test_fprintf_c( os, " sizes (bytes) %7d %7d %7d %7d\n", - ( int )BLIS_UPANEL_ALIGN_SIZE_S, - ( int )BLIS_UPANEL_ALIGN_SIZE_D, - ( int )BLIS_UPANEL_ALIGN_SIZE_C, - ( int )BLIS_UPANEL_ALIGN_SIZE_Z ); + libblis_test_fprintf_c( os, "micro-panel alignment (bytes) s d c z \n" ); + libblis_test_fprintf_c( os, " A (left matrix) %7d %7d %7d %7d\n", + ( int )bli_info_get_upanel_a_align_size_s(), + ( int )bli_info_get_upanel_a_align_size_d(), + ( int )bli_info_get_upanel_a_align_size_c(), + ( int )bli_info_get_upanel_a_align_size_z() ); + libblis_test_fprintf_c( os, " B (right matrix) %7d %7d %7d %7d\n", + ( int )bli_info_get_upanel_b_align_size_s(), + ( int )bli_info_get_upanel_b_align_size_d(), + ( int )bli_info_get_upanel_b_align_size_c(), + ( int )bli_info_get_upanel_b_align_size_z() ); libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "level-2 cache blocksizes s d c z \n" ); libblis_test_fprintf_c( os, " m dimension %7d %7d %7d %7d\n",