diff --git a/frame/3/gemm/3m/bli_gemm3m_cntl.c b/frame/3/gemm/3m/bli_gemm3m_cntl.c index 0d9e5f63b..1f12f323f 100644 --- a/frame/3/gemm/3m/bli_gemm3m_cntl.c +++ b/frame/3/gemm/3m/bli_gemm3m_cntl.c @@ -106,10 +106,13 @@ void bli_gemm3m_cntl_init() // Create function pointer object for each datatype-specific gemm // micro-kernel. - gemm3m_ukrs = bli_func_obj_create( NULL, - NULL, - BLIS_CGEMM3M_UKERNEL, - BLIS_ZGEMM3M_UKERNEL ); + gemm3m_ukrs + = + bli_func_obj_create( + NULL, FALSE, + NULL, FALSE, + BLIS_CGEMM3M_UKERNEL, BLIS_CGEMM3M_UKERNEL_PREFERS_CONTIG_ROWS, + BLIS_ZGEMM3M_UKERNEL, BLIS_ZGEMM3M_UKERNEL_PREFERS_CONTIG_ROWS ); // Create control tree objects for packm operations. @@ -161,7 +164,7 @@ void bli_gemm3m_cntl_init() bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm3m_mc, - NULL, + gemm3m_ukrs, NULL, gemm3m_packa_cntl, gemm3m_packb_cntl, @@ -176,7 +179,7 @@ void bli_gemm3m_cntl_init() bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm3m_kc, - NULL, + gemm3m_ukrs, NULL, NULL, NULL, @@ -191,7 +194,7 @@ void bli_gemm3m_cntl_init() bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm3m_nc, - NULL, + gemm3m_ukrs, NULL, NULL, NULL, diff --git a/frame/3/gemm/4m/bli_gemm4m_cntl.c b/frame/3/gemm/4m/bli_gemm4m_cntl.c index 297aa8ea9..3f83cedd3 100644 --- a/frame/3/gemm/4m/bli_gemm4m_cntl.c +++ b/frame/3/gemm/4m/bli_gemm4m_cntl.c @@ -106,10 +106,13 @@ void bli_gemm4m_cntl_init() // Create function pointer object for each datatype-specific gemm // micro-kernel. - gemm4m_ukrs = bli_func_obj_create( NULL, - NULL, - BLIS_CGEMM4M_UKERNEL, - BLIS_ZGEMM4M_UKERNEL ); + gemm4m_ukrs + = + bli_func_obj_create( + NULL, FALSE, + NULL, FALSE, + BLIS_CGEMM4M_UKERNEL, BLIS_CGEMM4M_UKERNEL_PREFERS_CONTIG_ROWS, + BLIS_ZGEMM4M_UKERNEL, BLIS_ZGEMM4M_UKERNEL_PREFERS_CONTIG_ROWS ); // Create control tree objects for packm operations. 
@@ -161,7 +164,7 @@ void bli_gemm4m_cntl_init() bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4m_mc, - NULL, + gemm4m_ukrs, NULL, gemm4m_packa_cntl, gemm4m_packb_cntl, @@ -176,7 +179,7 @@ void bli_gemm4m_cntl_init() bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm4m_kc, - NULL, + gemm4m_ukrs, NULL, NULL, NULL, @@ -191,7 +194,7 @@ void bli_gemm4m_cntl_init() bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4m_nc, - NULL, + gemm4m_ukrs, NULL, NULL, NULL, diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 2e3c6324a..59101375b 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -105,10 +105,12 @@ void bli_gemm_cntl_init() // Create function pointer object for each datatype-specific gemm // micro-kernel. - gemm_ukrs = bli_func_obj_create( BLIS_SGEMM_UKERNEL, - BLIS_DGEMM_UKERNEL, - BLIS_CGEMM_UKERNEL, - BLIS_ZGEMM_UKERNEL ); + gemm_ukrs + = + bli_func_obj_create( BLIS_SGEMM_UKERNEL, BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS, + BLIS_DGEMM_UKERNEL, BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS, + BLIS_CGEMM_UKERNEL, BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS, + BLIS_ZGEMM_UKERNEL, BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS ); // Create control tree objects for packm operations. 
@@ -160,7 +162,7 @@ void bli_gemm_cntl_init() bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_mc, - NULL, + gemm_ukrs, NULL, gemm_packa_cntl, gemm_packb_cntl, @@ -175,7 +177,7 @@ void bli_gemm_cntl_init() bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm_kc, - NULL, + gemm_ukrs, NULL, NULL, NULL, @@ -190,7 +192,7 @@ void bli_gemm_cntl_init() bli_gemm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm_nc, - NULL, + gemm_ukrs, NULL, NULL, NULL, diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 090a87972..fce54dcf2 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -61,11 +61,18 @@ void bli_gemm_front( obj_t* alpha, bli_obj_alias_to( *b, b_local ); bli_obj_alias_to( *c, c_local ); - // An optimization: If C is row-stored, transpose the entire operation - // so as to allow the macro-kernel more favorable access patterns - // through C. (The effect of the transposition of A and B is negligible - // because those operands are always packed to contiguous memory.) - if ( bli_obj_is_row_stored( *c ) ) + // An optimization: If C is stored by rows and the micro-kernel prefers + // contiguous columns, or if C is stored by columns and the micro-kernel + // prefers contiguous rows, transpose the entire operation to allow the + // micro-kernel to access elements of C in its preferred manner. 
+ if ( + ( bli_obj_is_row_stored( c_local ) && + bli_func_prefers_contig_cols( bli_obj_datatype( c_local ), + cntl_gemm_ukrs( cntl ) ) ) || + ( bli_obj_is_col_stored( c_local ) && + bli_func_prefers_contig_rows( bli_obj_datatype( c_local ), + cntl_gemm_ukrs( cntl ) ) ) + ) { bli_obj_swap( a_local, b_local ); diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index d7379a2f4..b83a69e8d 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -62,10 +62,18 @@ void bli_hemm_front( side_t side, bli_obj_alias_to( *b, b_local ); bli_obj_alias_to( *c, c_local ); - // An optimization: If C is row-stored, transpose the entire operation - // so as to allow the macro-kernel more favorable access patterns - // through C. - if ( bli_obj_is_row_stored( *c ) ) + // An optimization: If C is stored by rows and the micro-kernel prefers + // contiguous columns, or if C is stored by columns and the micro-kernel + // prefers contiguous rows, transpose the entire operation to allow the + // micro-kernel to access elements of C in its preferred manner. + if ( + ( bli_obj_is_row_stored( c_local ) && + bli_func_prefers_contig_cols( bli_obj_datatype( c_local ), + cntl_gemm_ukrs( cntl ) ) ) || + ( bli_obj_is_col_stored( c_local ) && + bli_func_prefers_contig_rows( bli_obj_datatype( c_local ), + cntl_gemm_ukrs( cntl ) ) ) + ) { bli_toggle_side( side ); bli_obj_toggle_conj( a_local ); diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c index dd0ed5321..a76d34b2b 100644 --- a/frame/3/her2k/bli_her2k_front.c +++ b/frame/3/her2k/bli_her2k_front.c @@ -82,11 +82,18 @@ void bli_her2k_front( obj_t* alpha, alpha, &alpha_conj ); - // An optimization: If C is row-stored, transpose the entire operation - // so as to allow the macro-kernel more favorable access patterns - // through C. (The effect of the transposition of A and A' is negligible - // because those operands are always packed to contiguous memory.) 
- if ( bli_obj_is_row_stored( c_local ) ) + // An optimization: If C is stored by rows and the micro-kernel prefers + // contiguous columns, or if C is stored by columns and the micro-kernel + // prefers contiguous rows, transpose the entire operation to allow the + // micro-kernel to access elements of C in its preferred manner. + if ( + ( bli_obj_is_row_stored( c_local ) && + bli_func_prefers_contig_cols( bli_obj_datatype( c_local ), + cntl_gemm_ukrs( cntl ) ) ) || + ( bli_obj_is_col_stored( c_local ) && + bli_func_prefers_contig_rows( bli_obj_datatype( c_local ), + cntl_gemm_ukrs( cntl ) ) ) + ) { bli_obj_swap( a_local, bh_local ); bli_obj_swap( b_local, ah_local ); diff --git a/frame/3/herk/3m/bli_herk3m_cntl.c b/frame/3/herk/3m/bli_herk3m_cntl.c index 565357c71..c313d907d 100644 --- a/frame/3/herk/3m/bli_herk3m_cntl.c +++ b/frame/3/herk/3m/bli_herk3m_cntl.c @@ -103,7 +103,7 @@ void bli_herk3m_cntl_init() bli_herk_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm3m_mc, - NULL, + gemm3m_ukrs, NULL, herk3m_packa_cntl, herk3m_packb_cntl, @@ -118,7 +118,7 @@ void bli_herk3m_cntl_init() bli_herk_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm3m_kc, - NULL, + gemm3m_ukrs, NULL, NULL, NULL, @@ -133,7 +133,7 @@ void bli_herk3m_cntl_init() bli_herk_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm3m_nc, - NULL, + gemm3m_ukrs, NULL, NULL, NULL, diff --git a/frame/3/herk/4m/bli_herk4m_cntl.c b/frame/3/herk/4m/bli_herk4m_cntl.c index 0674a6f54..61b029b33 100644 --- a/frame/3/herk/4m/bli_herk4m_cntl.c +++ b/frame/3/herk/4m/bli_herk4m_cntl.c @@ -103,7 +103,7 @@ void bli_herk4m_cntl_init() bli_herk_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4m_mc, - NULL, + gemm4m_ukrs, NULL, herk4m_packa_cntl, herk4m_packb_cntl, @@ -118,7 +118,7 @@ void bli_herk4m_cntl_init() bli_herk_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm4m_kc, - NULL, + gemm4m_ukrs, NULL, NULL, NULL, @@ -133,7 +133,7 @@ void bli_herk4m_cntl_init() bli_herk_cntl_obj_create( BLIS_BLOCKED, 
BLIS_VARIANT2, gemm4m_nc, - NULL, + gemm4m_ukrs, NULL, NULL, NULL, diff --git a/frame/3/herk/bli_herk_cntl.c b/frame/3/herk/bli_herk_cntl.c index 12a27013e..3a394bf23 100644 --- a/frame/3/herk/bli_herk_cntl.c +++ b/frame/3/herk/bli_herk_cntl.c @@ -103,7 +103,7 @@ void bli_herk_cntl_init() bli_herk_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_mc, - NULL, + gemm_ukrs, NULL, herk_packa_cntl, herk_packb_cntl, @@ -118,7 +118,7 @@ void bli_herk_cntl_init() bli_herk_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm_kc, - NULL, + gemm_ukrs, NULL, NULL, NULL, @@ -133,7 +133,7 @@ void bli_herk_cntl_init() bli_herk_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm_nc, - NULL, + gemm_ukrs, NULL, NULL, NULL, diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c index 0b0b23dcd..ca180a9ab 100644 --- a/frame/3/herk/bli_herk_front.c +++ b/frame/3/herk/bli_herk_front.c @@ -67,11 +67,18 @@ void bli_herk_front( obj_t* alpha, bli_obj_induce_trans( ah_local ); bli_obj_toggle_conj( ah_local ); - // An optimization: If C is row-stored, transpose the entire operation - // so as to allow the macro-kernel more favorable access patterns - // through C. (The effect of the transposition of A and A' is negligible - // because those operands are always packed to contiguous memory.) - if ( bli_obj_is_row_stored( c_local ) ) + // An optimization: If C is stored by rows and the micro-kernel prefers + // contiguous columns, or if C is stored by columns and the micro-kernel + // prefers contiguous rows, transpose the entire operation to allow the + // micro-kernel to access elements of C in its preferred manner. 
+ if ( + ( bli_obj_is_row_stored( c_local ) && + bli_func_prefers_contig_cols( bli_obj_datatype( c_local ), + cntl_gemm_ukrs( cntl ) ) ) || + ( bli_obj_is_col_stored( c_local ) && + bli_func_prefers_contig_rows( bli_obj_datatype( c_local ), + cntl_gemm_ukrs( cntl ) ) ) + ) { bli_obj_toggle_conj( a_local ); bli_obj_toggle_conj( ah_local ); diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index 7ad10b1b5..c8b8b0aaa 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -62,10 +62,18 @@ void bli_symm_front( side_t side, bli_obj_alias_to( *b, b_local ); bli_obj_alias_to( *c, c_local ); - // An optimization: If C is row-stored, transpose the entire operation - // so as to allow the macro-kernel more favorable access patterns - // through C. - if ( bli_obj_is_row_stored( *c ) ) + // An optimization: If C is stored by rows and the micro-kernel prefers + // contiguous columns, or if C is stored by columns and the micro-kernel + // prefers contiguous rows, transpose the entire operation to allow the + // micro-kernel to access elements of C in its preferred manner. + if ( + ( bli_obj_is_row_stored( c_local ) && + bli_func_prefers_contig_cols( bli_obj_datatype( c_local ), + cntl_gemm_ukrs( cntl ) ) ) || + ( bli_obj_is_col_stored( c_local ) && + bli_func_prefers_contig_rows( bli_obj_datatype( c_local ), + cntl_gemm_ukrs( cntl ) ) ) + ) { bli_toggle_side( side ); bli_obj_induce_trans( b_local ); diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c index 588eac655..d92a2a23d 100644 --- a/frame/3/syr2k/bli_syr2k_front.c +++ b/frame/3/syr2k/bli_syr2k_front.c @@ -71,11 +71,18 @@ void bli_syr2k_front( obj_t* alpha, bli_obj_alias_to( *a, at_local ); bli_obj_induce_trans( at_local ); - // An optimization: If C is row-stored, transpose the entire operation - // so as to allow the macro-kernel more favorable access patterns - // through C. 
(The effect of the transposition of A and A' is negligible - // because those operands are always packed to contiguous memory.) - if ( bli_obj_is_row_stored( c_local ) ) + // An optimization: If C is stored by rows and the micro-kernel prefers + // contiguous columns, or if C is stored by columns and the micro-kernel + // prefers contiguous rows, transpose the entire operation to allow the + // micro-kernel to access elements of C in its preferred manner. + if ( + ( bli_obj_is_row_stored( c_local ) && + bli_func_prefers_contig_cols( bli_obj_datatype( c_local ), + cntl_gemm_ukrs( cntl ) ) ) || + ( bli_obj_is_col_stored( c_local ) && + bli_func_prefers_contig_rows( bli_obj_datatype( c_local ), + cntl_gemm_ukrs( cntl ) ) ) + ) { bli_obj_induce_trans( c_local ); } diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c index 7c5d6f526..358a2c4b7 100644 --- a/frame/3/syrk/bli_syrk_front.c +++ b/frame/3/syrk/bli_syrk_front.c @@ -64,11 +64,18 @@ void bli_syrk_front( obj_t* alpha, bli_obj_alias_to( *a, at_local ); bli_obj_induce_trans( at_local ); - // An optimization: If C is row-stored, transpose the entire operation - // so as to allow the macro-kernel more favorable access patterns - // through C. (The effect of the transposition of A and A^T is negligible - // because those operands are always packed to contiguous memory.) - if ( bli_obj_is_row_stored( c_local ) ) + // An optimization: If C is stored by rows and the micro-kernel prefers + // contiguous columns, or if C is stored by columns and the micro-kernel + // prefers contiguous rows, transpose the entire operation to allow the + // micro-kernel to access elements of C in its preferred manner. 
+ if ( + ( bli_obj_is_row_stored( c_local ) && + bli_func_prefers_contig_cols( bli_obj_datatype( c_local ), + cntl_gemm_ukrs( cntl ) ) ) || + ( bli_obj_is_col_stored( c_local ) && + bli_func_prefers_contig_rows( bli_obj_datatype( c_local ), + cntl_gemm_ukrs( cntl ) ) ) + ) { bli_obj_induce_trans( c_local ); } diff --git a/frame/3/trmm/3m/bli_trmm3m_cntl.c b/frame/3/trmm/3m/bli_trmm3m_cntl.c index 91d7d5c4d..953ec75d3 100644 --- a/frame/3/trmm/3m/bli_trmm3m_cntl.c +++ b/frame/3/trmm/3m/bli_trmm3m_cntl.c @@ -149,7 +149,7 @@ void bli_trmm3m_cntl_init() bli_trmm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm3m_mc, - NULL, + gemm3m_ukrs, NULL, trmm3m_l_packa_cntl, trmm3m_l_packb_cntl, @@ -165,7 +165,7 @@ void bli_trmm3m_cntl_init() bli_trmm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm3m_kc, - NULL, + gemm3m_ukrs, NULL, NULL, NULL, @@ -181,7 +181,7 @@ void bli_trmm3m_cntl_init() bli_trmm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm3m_nc, - NULL, + gemm3m_ukrs, NULL, NULL, NULL, @@ -197,7 +197,7 @@ void bli_trmm3m_cntl_init() bli_trmm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm3m_mc, - NULL, + gemm3m_ukrs, NULL, trmm3m_r_packa_cntl, trmm3m_r_packb_cntl, @@ -213,7 +213,7 @@ void bli_trmm3m_cntl_init() bli_trmm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm3m_kc, - NULL, + gemm3m_ukrs, NULL, NULL, NULL, @@ -229,7 +229,7 @@ void bli_trmm3m_cntl_init() bli_trmm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm3m_nc, - NULL, + gemm3m_ukrs, NULL, NULL, NULL, diff --git a/frame/3/trmm/4m/bli_trmm4m_cntl.c b/frame/3/trmm/4m/bli_trmm4m_cntl.c index 6e08956b8..f7d30b31c 100644 --- a/frame/3/trmm/4m/bli_trmm4m_cntl.c +++ b/frame/3/trmm/4m/bli_trmm4m_cntl.c @@ -149,7 +149,7 @@ void bli_trmm4m_cntl_init() bli_trmm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4m_mc, - NULL, + gemm4m_ukrs, NULL, trmm4m_l_packa_cntl, trmm4m_l_packb_cntl, @@ -165,7 +165,7 @@ void bli_trmm4m_cntl_init() bli_trmm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm4m_kc, - 
NULL, + gemm4m_ukrs, NULL, NULL, NULL, @@ -181,7 +181,7 @@ void bli_trmm4m_cntl_init() bli_trmm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4m_nc, - NULL, + gemm4m_ukrs, NULL, NULL, NULL, @@ -197,7 +197,7 @@ void bli_trmm4m_cntl_init() bli_trmm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4m_mc, - NULL, + gemm4m_ukrs, NULL, trmm4m_r_packa_cntl, trmm4m_r_packb_cntl, @@ -213,7 +213,7 @@ void bli_trmm4m_cntl_init() bli_trmm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm4m_kc, - NULL, + gemm4m_ukrs, NULL, NULL, NULL, @@ -229,7 +229,7 @@ void bli_trmm4m_cntl_init() bli_trmm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4m_nc, - NULL, + gemm4m_ukrs, NULL, NULL, NULL, diff --git a/frame/3/trmm/bli_trmm_cntl.c b/frame/3/trmm/bli_trmm_cntl.c index 0bc88fe2b..6c46cd40d 100644 --- a/frame/3/trmm/bli_trmm_cntl.c +++ b/frame/3/trmm/bli_trmm_cntl.c @@ -149,7 +149,7 @@ void bli_trmm_cntl_init() bli_trmm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_mc, - NULL, + gemm_ukrs, NULL, trmm_l_packa_cntl, trmm_l_packb_cntl, @@ -165,7 +165,7 @@ void bli_trmm_cntl_init() bli_trmm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm_kc, - NULL, + gemm_ukrs, NULL, NULL, NULL, @@ -181,7 +181,7 @@ void bli_trmm_cntl_init() bli_trmm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm_nc, - NULL, + gemm_ukrs, NULL, NULL, NULL, @@ -197,7 +197,7 @@ void bli_trmm_cntl_init() bli_trmm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_mc, - NULL, + gemm_ukrs, NULL, trmm_r_packa_cntl, trmm_r_packb_cntl, @@ -213,7 +213,7 @@ void bli_trmm_cntl_init() bli_trmm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm_kc, - NULL, + gemm_ukrs, NULL, NULL, NULL, @@ -229,7 +229,7 @@ void bli_trmm_cntl_init() bli_trmm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm_nc, - NULL, + gemm_ukrs, NULL, NULL, NULL, diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index 04d2dd5f3..9ff4fbb4c 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ 
-80,17 +80,39 @@ void bli_trmm_front( side_t side, } #if 0 + + // If A is being multiplied from the right, transpose all operands + // so that we can perform the computation as if A were being multiplied + // from the left. if ( bli_is_right( side ) ) { + bli_toggle_side( side ); + bli_obj_induce_trans( a_local ); + bli_obj_induce_trans( b_local ); + bli_obj_induce_trans( c_local ); + } + +#else + + // An optimization: If C is stored by rows and the micro-kernel prefers + // contiguous columns, or if C is stored by columns and the micro-kernel + // prefers contiguous rows, transpose the entire operation to allow the + // micro-kernel to access elements of C in its preferred manner. + if ( + ( bli_obj_is_row_stored( c_local ) && + bli_func_prefers_contig_cols( bli_obj_datatype( c_local ), + cntl_gemm_ukrs( l_cntl ) ) ) || + ( bli_obj_is_col_stored( c_local ) && + bli_func_prefers_contig_rows( bli_obj_datatype( c_local ), + cntl_gemm_ukrs( l_cntl ) ) ) + ) + { + bli_toggle_side( side ); bli_obj_induce_trans( a_local ); bli_obj_induce_trans( b_local ); bli_obj_induce_trans( c_local ); - - bli_toggle_side( side ); } -#endif -#if 1 // If A is being multiplied from the right, swap A and B so that // the matrix will actually be on the right. if ( bli_is_right( side ) ) @@ -98,20 +120,6 @@ void bli_trmm_front( side_t side, bli_obj_swap( a_local, b_local ); } - // An optimization: If C is row-stored, transpose the entire operation - // so as to allow the macro-kernel more favorable access patterns - // through C. (The effect of the transposition of A and B is negligible - // because those operands are always packed to contiguous memory.) - if ( bli_obj_is_row_stored( c_local ) ) - { - bli_obj_swap( a_local, b_local ); - - bli_obj_induce_trans( a_local ); - bli_obj_induce_trans( b_local ); - bli_obj_induce_trans( c_local ); - - bli_toggle_side( side ); - } #endif // Set each alias as the root object. 
diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index be67037f3..e8ab2b942 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -82,17 +82,39 @@ void bli_trmm3_front( side_t side, } #if 0 + + // If A is being multiplied from the right, transpose all operands + // so that we can perform the computation as if A were being multiplied + // from the left. if ( bli_is_right( side ) ) { + bli_toggle_side( side ); + bli_obj_induce_trans( a_local ); + bli_obj_induce_trans( b_local ); + bli_obj_induce_trans( c_local ); + } + +#else + + // An optimization: If C is stored by rows and the micro-kernel prefers + // contiguous columns, or if C is stored by columns and the micro-kernel + // prefers contiguous rows, transpose the entire operation to allow the + // micro-kernel to access elements of C in its preferred manner. + if ( + ( bli_obj_is_row_stored( c_local ) && + bli_func_prefers_contig_cols( bli_obj_datatype( c_local ), + cntl_gemm_ukrs( l_cntl ) ) ) || + ( bli_obj_is_col_stored( c_local ) && + bli_func_prefers_contig_rows( bli_obj_datatype( c_local ), + cntl_gemm_ukrs( l_cntl ) ) ) + ) + { + bli_toggle_side( side ); bli_obj_induce_trans( a_local ); bli_obj_induce_trans( b_local ); bli_obj_induce_trans( c_local ); - - bli_toggle_side( side ); } -#endif -#if 1 // If A is being multiplied from the right, swap A and B so that // the matrix will actually be on the right. if ( bli_is_right( side ) ) @@ -100,20 +122,6 @@ void bli_trmm3_front( side_t side, bli_obj_swap( a_local, b_local ); } - // An optimization: If C is row-stored, transpose the entire operation - // so as to allow the macro-kernel more favorable access patterns - // through C. (The effect of the transposition of A and B is negligible - // because those operands are always packed to contiguous memory.) 
- if ( bli_obj_is_row_stored( c_local ) ) - { - bli_obj_swap( a_local, b_local ); - - bli_obj_induce_trans( a_local ); - bli_obj_induce_trans( b_local ); - bli_obj_induce_trans( c_local ); - - bli_toggle_side( side ); - } #endif // Set each alias as the root object. diff --git a/frame/3/trsm/3m/bli_trsm3m_cntl.c b/frame/3/trsm/3m/bli_trsm3m_cntl.c index d4db6ff39..d40c7b44c 100644 --- a/frame/3/trsm/3m/bli_trsm3m_cntl.c +++ b/frame/3/trsm/3m/bli_trsm3m_cntl.c @@ -73,15 +73,19 @@ void bli_trsm3m_cntl_init() // Create function pointer objects for each datatype-specific // gemmtrsm3m_l and gemmtrsm3m_u micro-kernel. - gemmtrsm3m_l_ukrs = bli_func_obj_create( NULL, - NULL, - BLIS_CGEMMTRSM3M_L_UKERNEL, - BLIS_ZGEMMTRSM3M_L_UKERNEL ); + gemmtrsm3m_l_ukrs + = + bli_func_obj_create( NULL, FALSE, + NULL, FALSE, + BLIS_CGEMMTRSM3M_L_UKERNEL, FALSE, + BLIS_ZGEMMTRSM3M_L_UKERNEL, FALSE ); - gemmtrsm3m_u_ukrs = bli_func_obj_create( NULL, - NULL, - BLIS_CGEMMTRSM3M_U_UKERNEL, - BLIS_ZGEMMTRSM3M_U_UKERNEL ); + gemmtrsm3m_u_ukrs + = + bli_func_obj_create( NULL, FALSE, + NULL, FALSE, + BLIS_CGEMMTRSM3M_U_UKERNEL, FALSE, + BLIS_ZGEMMTRSM3M_U_UKERNEL, FALSE ); // Create control tree objects for packm operations (left side). 
@@ -162,7 +166,7 @@ void bli_trsm3m_cntl_init() bli_trsm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm3m_mc, - NULL, NULL, NULL, + gemm3m_ukrs, NULL, NULL, NULL, trsm3m_l_packa_cntl, trsm3m_l_packb_cntl, @@ -178,7 +182,7 @@ void bli_trsm3m_cntl_init() bli_trsm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm3m_kc, - NULL, NULL, NULL, + gemm3m_ukrs, NULL, NULL, NULL, NULL, NULL, @@ -194,7 +198,7 @@ void bli_trsm3m_cntl_init() bli_trsm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm3m_nc, - NULL, NULL, NULL, + gemm3m_ukrs, NULL, NULL, NULL, NULL, NULL, @@ -210,7 +214,7 @@ void bli_trsm3m_cntl_init() bli_trsm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm3m_mc, - NULL, NULL, NULL, + gemm3m_ukrs, NULL, NULL, NULL, trsm3m_r_packa_cntl, trsm3m_r_packb_cntl, @@ -226,7 +230,7 @@ void bli_trsm3m_cntl_init() bli_trsm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm3m_kc, - NULL, NULL, NULL, + gemm3m_ukrs, NULL, NULL, NULL, NULL, NULL, @@ -242,7 +246,7 @@ void bli_trsm3m_cntl_init() bli_trsm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm3m_nc, - NULL, NULL, NULL, + gemm3m_ukrs, NULL, NULL, NULL, NULL, NULL, diff --git a/frame/3/trsm/4m/bli_trsm4m_cntl.c b/frame/3/trsm/4m/bli_trsm4m_cntl.c index 7f2520bcd..9fb9e9251 100644 --- a/frame/3/trsm/4m/bli_trsm4m_cntl.c +++ b/frame/3/trsm/4m/bli_trsm4m_cntl.c @@ -73,15 +73,20 @@ void bli_trsm4m_cntl_init() // Create function pointer objects for each datatype-specific // gemmtrsm4m_l and gemmtrsm4m_u micro-kernel. 
- gemmtrsm4m_l_ukrs = bli_func_obj_create( NULL, - NULL, - BLIS_CGEMMTRSM4M_L_UKERNEL, - BLIS_ZGEMMTRSM4M_L_UKERNEL ); + gemmtrsm4m_l_ukrs + = + bli_func_obj_create( NULL, FALSE, + NULL, FALSE, + BLIS_CGEMMTRSM4M_L_UKERNEL, FALSE, + BLIS_ZGEMMTRSM4M_L_UKERNEL, FALSE ); + + gemmtrsm4m_u_ukrs + = + bli_func_obj_create( NULL, FALSE, + NULL, FALSE, + BLIS_CGEMMTRSM4M_U_UKERNEL, FALSE, + BLIS_ZGEMMTRSM4M_U_UKERNEL, FALSE ); - gemmtrsm4m_u_ukrs = bli_func_obj_create( NULL, - NULL, - BLIS_CGEMMTRSM4M_U_UKERNEL, - BLIS_ZGEMMTRSM4M_U_UKERNEL ); // Create control tree objects for packm operations (left side). @@ -162,7 +167,7 @@ void bli_trsm4m_cntl_init() bli_trsm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4m_mc, - NULL, NULL, NULL, + gemm4m_ukrs, NULL, NULL, NULL, trsm4m_l_packa_cntl, trsm4m_l_packb_cntl, @@ -178,7 +183,7 @@ void bli_trsm4m_cntl_init() bli_trsm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm4m_kc, - NULL, NULL, NULL, + gemm4m_ukrs, NULL, NULL, NULL, NULL, NULL, @@ -194,7 +199,7 @@ void bli_trsm4m_cntl_init() bli_trsm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4m_nc, - NULL, NULL, NULL, + gemm4m_ukrs, NULL, NULL, NULL, NULL, NULL, @@ -210,7 +215,7 @@ void bli_trsm4m_cntl_init() bli_trsm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm4m_mc, - NULL, NULL, NULL, + gemm4m_ukrs, NULL, NULL, NULL, trsm4m_r_packa_cntl, trsm4m_r_packb_cntl, @@ -226,7 +231,7 @@ void bli_trsm4m_cntl_init() bli_trsm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm4m_kc, - NULL, NULL, NULL, + gemm4m_ukrs, NULL, NULL, NULL, NULL, NULL, @@ -242,7 +247,7 @@ void bli_trsm4m_cntl_init() bli_trsm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm4m_nc, - NULL, NULL, NULL, + gemm4m_ukrs, NULL, NULL, NULL, NULL, NULL, diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index b26b6a43d..3dce8d72d 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -75,15 +75,19 @@ void bli_trsm_cntl_init() // Create function pointer objects 
for each datatype-specific // gemmtrsm_l and gemmtrsm_u micro-kernel. - gemmtrsm_l_ukrs = bli_func_obj_create( BLIS_SGEMMTRSM_L_UKERNEL, - BLIS_DGEMMTRSM_L_UKERNEL, - BLIS_CGEMMTRSM_L_UKERNEL, - BLIS_ZGEMMTRSM_L_UKERNEL ); + gemmtrsm_l_ukrs + = + bli_func_obj_create( BLIS_SGEMMTRSM_L_UKERNEL, FALSE, + BLIS_DGEMMTRSM_L_UKERNEL, FALSE, + BLIS_CGEMMTRSM_L_UKERNEL, FALSE, + BLIS_ZGEMMTRSM_L_UKERNEL, FALSE ); - gemmtrsm_u_ukrs = bli_func_obj_create( BLIS_SGEMMTRSM_U_UKERNEL, - BLIS_DGEMMTRSM_U_UKERNEL, - BLIS_CGEMMTRSM_U_UKERNEL, - BLIS_ZGEMMTRSM_U_UKERNEL ); + gemmtrsm_u_ukrs + = + bli_func_obj_create( BLIS_SGEMMTRSM_U_UKERNEL, FALSE, + BLIS_DGEMMTRSM_U_UKERNEL, FALSE, + BLIS_CGEMMTRSM_U_UKERNEL, FALSE, + BLIS_ZGEMMTRSM_U_UKERNEL, FALSE ); // Create control tree objects for packm operations (left side). @@ -164,7 +168,7 @@ void bli_trsm_cntl_init() bli_trsm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_mc, - NULL, NULL, NULL, + gemm_ukrs, NULL, NULL, NULL, trsm_l_packa_cntl, trsm_l_packb_cntl, @@ -180,7 +184,7 @@ void bli_trsm_cntl_init() bli_trsm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm_kc, - NULL, NULL, NULL, + gemm_ukrs, NULL, NULL, NULL, NULL, NULL, @@ -196,7 +200,7 @@ void bli_trsm_cntl_init() bli_trsm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm_nc, - NULL, NULL, NULL, + gemm_ukrs, NULL, NULL, NULL, NULL, NULL, @@ -212,7 +216,7 @@ void bli_trsm_cntl_init() bli_trsm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, gemm_mc, - NULL, NULL, NULL, + gemm_ukrs, NULL, NULL, NULL, trsm_r_packa_cntl, trsm_r_packb_cntl, @@ -228,7 +232,7 @@ void bli_trsm_cntl_init() bli_trsm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT3, gemm_kc, - NULL, NULL, NULL, + gemm_ukrs, NULL, NULL, NULL, NULL, NULL, @@ -244,7 +248,7 @@ void bli_trsm_cntl_init() bli_trsm_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, gemm_nc, - NULL, NULL, NULL, + gemm_ukrs, NULL, NULL, NULL, NULL, NULL, diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index 
04a683117..686ce6bce 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -80,38 +80,27 @@ void bli_trsm_front( side_t side, } #if 0 + + // If A is being solved against from the right, transpose all operands + // so that we can perform the computation as if A were being solved + // from the left. if ( bli_is_right( side ) ) { + bli_toggle_side( side ); bli_obj_induce_trans( a_local ); bli_obj_induce_trans( b_local ); bli_obj_induce_trans( c_local ); - - bli_toggle_side( side ); } -#endif -#if 1 +#else + // If A is being solved against from the right, swap A and B so that - // the matrix will actually be on the right. + // the triangular matrix will actually be on the right. if ( bli_is_right( side ) ) { bli_obj_swap( a_local, b_local ); } - // An optimization: If C is row-stored, transpose the entire operation - // so as to allow the macro-kernel more favorable access patterns - // through C. (The effect of the transposition of A and B is negligible - // because those operands are always packed to contiguous memory.) - if ( bli_obj_is_row_stored( c_local ) ) - { - bli_obj_swap( a_local, b_local ); - - bli_obj_induce_trans( a_local ); - bli_obj_induce_trans( b_local ); - bli_obj_induce_trans( c_local ); - - bli_toggle_side( side ); - } #endif // Set each alias as the root object. 
diff --git a/frame/base/bli_func.c b/frame/base/bli_func.c index f8dd58a3d..f75596af7 100644 --- a/frame/base/bli_func.c +++ b/frame/base/bli_func.c @@ -35,35 +35,40 @@ #include "blis.h" -func_t* bli_func_obj_create( void* f_s, - void* f_d, - void* f_c, - void* f_z ) +func_t* bli_func_obj_create( void* ptr_s, bool_t pref_s, + void* ptr_d, bool_t pref_d, + void* ptr_c, bool_t pref_c, + void* ptr_z, bool_t pref_z ) { func_t* f; f = ( func_t* ) bli_malloc( sizeof(func_t) ); bli_func_obj_init( f, - f_s, - f_d, - f_c, - f_z ); + ptr_s, pref_s, + ptr_d, pref_d, + ptr_c, pref_c, + ptr_z, pref_z ); return f; } void bli_func_obj_init( func_t* f, - void* f_s, - void* f_d, - void* f_c, - void* f_z ) + void* ptr_s, bool_t pref_s, + void* ptr_d, bool_t pref_d, + void* ptr_c, bool_t pref_c, + void* ptr_z, bool_t pref_z ) { - f->f[BLIS_BITVAL_FLOAT_TYPE] = f_s; - f->f[BLIS_BITVAL_DOUBLE_TYPE] = f_d; - f->f[BLIS_BITVAL_SCOMPLEX_TYPE] = f_c; - f->f[BLIS_BITVAL_DCOMPLEX_TYPE] = f_z; + f->ptr[BLIS_BITVAL_FLOAT_TYPE] = ptr_s; + f->ptr[BLIS_BITVAL_DOUBLE_TYPE] = ptr_d; + f->ptr[BLIS_BITVAL_SCOMPLEX_TYPE] = ptr_c; + f->ptr[BLIS_BITVAL_DCOMPLEX_TYPE] = ptr_z; + + f->prefers_contig_rows[BLIS_BITVAL_FLOAT_TYPE] = pref_s; + f->prefers_contig_rows[BLIS_BITVAL_DOUBLE_TYPE] = pref_d; + f->prefers_contig_rows[BLIS_BITVAL_SCOMPLEX_TYPE] = pref_c; + f->prefers_contig_rows[BLIS_BITVAL_DCOMPLEX_TYPE] = pref_z; } @@ -76,6 +81,39 @@ void bli_func_obj_free( func_t* f ) void* bli_func_obj_query( num_t dt, func_t* f ) { - return f->f[ dt ]; + return f->ptr[ dt ]; +} + +bool_t bli_func_prefers_contig_rows( num_t dt, + func_t* f ) +{ + return f->prefers_contig_rows[ dt ]; +} + +bool_t bli_func_prefers_contig_cols( num_t dt, + func_t* f ) +{ + return !(f->prefers_contig_rows[ dt ]); +} + +bool_t bli_func_pref_is_sat_by( obj_t* a, + func_t* f ) +{ + num_t dt = bli_obj_datatype( *a ); + bool_t r_val = FALSE; + + if ( ( bli_obj_is_row_stored( *a ) && + bli_func_prefers_contig_rows( dt, f ) ) || + ( 
bli_obj_is_col_stored( *a ) && + bli_func_prefers_contig_cols( dt, f ) ) ) + r_val = TRUE; + + return r_val; +} + +bool_t bli_func_pref_is_unsat_by( obj_t* a, + func_t* f ) +{ + return !bli_func_pref_is_sat_by( a, f ); } diff --git a/frame/base/bli_func.h b/frame/base/bli_func.h index 963cf0e4c..441f7df82 100644 --- a/frame/base/bli_func.h +++ b/frame/base/bli_func.h @@ -33,16 +33,16 @@ */ -func_t* bli_func_obj_create( void* f_s, - void* f_d, - void* f_c, - void* f_z ); +func_t* bli_func_obj_create( void* ptr_s, bool_t pref_s, + void* ptr_d, bool_t pref_d, + void* ptr_c, bool_t pref_c, + void* ptr_z, bool_t pref_z ); void bli_func_obj_init( func_t* f, - void* f_s, - void* f_d, - void* f_c, - void* f_z ); + void* ptr_s, bool_t pref_s, + void* ptr_d, bool_t pref_d, + void* ptr_c, bool_t pref_c, + void* ptr_z, bool_t pref_z ); void bli_func_obj_free( func_t* f ); @@ -50,3 +50,15 @@ void bli_func_obj_free( func_t* f ); void* bli_func_obj_query( num_t dt, func_t* f ); +bool_t bli_func_prefers_contig_rows( num_t dt, + func_t* f ); + +bool_t bli_func_prefers_contig_cols( num_t dt, + func_t* f ); + +bool_t bli_func_pref_is_sat_by( obj_t* a, + func_t* f ); + +bool_t bli_func_pref_is_unsat_by( obj_t* a, + func_t* f ); + diff --git a/frame/include/bli_kernel_3m_macro_defs.h b/frame/include/bli_kernel_3m_macro_defs.h index 3e7bd738d..b21896d52 100644 --- a/frame/include/bli_kernel_3m_macro_defs.h +++ b/frame/include/bli_kernel_3m_macro_defs.h @@ -36,6 +36,16 @@ #define BLIS_KERNEL_3M_MACRO_DEFS_H +// -- Define row access bools -------------------------------------------------- + +// gemm3m micro-kernels + +#define BLIS_CGEMM3M_UKERNEL_PREFERS_CONTIG_ROWS \ + BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#define BLIS_ZGEMM3M_UKERNEL_PREFERS_CONTIG_ROWS \ + BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS + + // -- Define default 3m-specific kernel names ---------------------------------- // diff --git a/frame/include/bli_kernel_4m_macro_defs.h b/frame/include/bli_kernel_4m_macro_defs.h index 
8789f4cb6..0cdb8ba39 100644 --- a/frame/include/bli_kernel_4m_macro_defs.h +++ b/frame/include/bli_kernel_4m_macro_defs.h @@ -36,6 +36,16 @@ #define BLIS_KERNEL_4M_MACRO_DEFS_H +// -- Define row access bools -------------------------------------------------- + +// gemm4m micro-kernels + +#define BLIS_CGEMM4M_UKERNEL_PREFERS_CONTIG_ROWS \ + BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#define BLIS_ZGEMM4M_UKERNEL_PREFERS_CONTIG_ROWS \ + BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS + + // -- Define default 4m-specific kernel names ---------------------------------- // diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h index 284dbe07a..24c617579 100644 --- a/frame/include/bli_kernel_macro_defs.h +++ b/frame/include/bli_kernel_macro_defs.h @@ -36,16 +36,53 @@ #define BLIS_KERNEL_MACRO_DEFS_H -// -- Construct kernel function names ------------------------------------------ +// -- Define row access bools -------------------------------------------------- + +// In this section we consider each datatype-specific "prefers contiguous rows" +// macro. If it is defined, we re-define it to be 1 (TRUE); otherwise, we +// define it to be 0 (FALSE). 
+ +// gemm micro-kernels + +#ifdef BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#undef BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS 1 +#else +#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS 0 +#endif + +#ifdef BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#undef BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS 1 +#else +#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS 0 +#endif + +#ifdef BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#undef BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#define BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS 1 +#else +#define BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS 0 +#endif + +#ifdef BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#undef BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS +#define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS 1 +#else +#define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS 0 +#endif + + +// -- Define default kernel names ---------------------------------------------- // In this section we consider each datatype-specific micro-kernel macro; // if it is undefined, we define it to be the corresponding reference kernel. -// In the case of complex gemm micro-kernels, we also define special _VIA_4M -// macros so that later on we can tell whether or not to employ the 4m -// implementations. Note that in order to properly determine whether 4m is a -// viable option, we need to be able to test the existence of the real gemm -// micro-kernels, which means we must consider the complex gemm micro-kernel -// cases *BEFORE* the real cases. +// In the case of complex gemm micro-kernels, we also define special macros so +// that later on we can tell whether or not to employ the 4m implementations. +// Note that in order to properly determine whether 4m is a viable option, we +// need to be able to test the existence of the real gemm micro-kernels, which +// means we must consider the complex gemm micro-kernel cases *BEFORE* the +// real cases. 
// // Level-3 diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index f27cb3df7..1e01dabdf 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -515,8 +515,11 @@ typedef struct blksz_s typedef struct func_s { - // Primary blocksize values. - void* f[BLIS_NUM_FP_TYPES]; + // Kernel function address. + void* ptr[BLIS_NUM_FP_TYPES]; + + // Kernel row/column storage preference. + bool_t prefers_contig_rows[BLIS_NUM_FP_TYPES]; } func_t;