From 031978d2647cf08316858baf29c84ebba9c3133e Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 16 Nov 2016 14:04:33 -0600 Subject: [PATCH 1/2] Fixed inactive trsm_r blocksize constraint code. Details: - Changed a cpp macro that was meant to prevent using certain trsm_r code if BLIS_RELAX_MCNR_NCMR_CONSTRAINTS was defined. It was actually coded incorrectly at first. I've now fixed its location and changed its consequence to a compile-time #error message. --- frame/3/trsm/bli_trsm_front.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index c38a193f8..eda248b6b 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -86,9 +86,6 @@ void bli_trsm_front } #if 0 - // NOTE: Enabling this code requires that BLIS be configured with - // BLIS_RELAX_MCNR_NCMR_CONSTRAINTS defined. -#ifdef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS // If A is being solved against from the right, transpose all operands // so that we can perform the computation as if A were being solved @@ -101,9 +98,14 @@ void bli_trsm_front bli_obj_induce_trans( c_local ); } -#endif #else + // NOTE: Enabling this code requires that BLIS NOT be configured with + // BLIS_RELAX_MCNR_NCMR_CONSTRAINTS defined. +#ifdef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS + #error "BLIS_RELAX_MCNR_NCMR_CONSTRAINTS must not be defined for current trsm_r implementation." +#endif + // If A is being solved against from the right, swap A and B so that // the triangular matrix will actually be on the right. if ( bli_is_right( side ) ) From bdc0a264d2fb5940bfd09298b1de823674a39053 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 16 Nov 2016 14:13:08 -0600 Subject: [PATCH 2/2] Adjusted stride selection of ct in macrokernels. Details: - Updated the changes introduced in 618f433 so that the strides of the temporary microtile ct used in the macrokernels is determined based on the storage preference of the microkernel (via the new functions below), rather than the strides of c. In almost all cases, presently, this change results in no net effect, as a high-level optimization in the _front() functions aligns the storage of c to that of the microkernel's preference. However, I encountered some cases where this is not always the case in some development code that has yet to be committed, and therefore I'm generalizing the framework code in advance. - Defined two new functions in bli_cntx.c: bli_cntx_l3_ukr_prefers_rows_dt() bli_cntx_l3_ukr_prefers_cols_dt() which return bool_t's based on the current micro-kernel's storage preferences. For induced methods, the preference of the underlying real domain microkernel is returned. - Updated definition of bli_cntx_l3_ukr_dislikes_storage_of(), and by proxy bli_cntx_l3_ukr_prefers_storage_of(), to be in terms of the above functions, rather than querying the preferences of the native microkernel directly (which did the wrong thing for induced methods). --- frame/3/gemm/bli_gemm_ker_var2.c | 5 ++-- frame/3/gemm/ind/bli_gemm3m2_ker_var2.c | 5 ++-- frame/3/gemm/ind/bli_gemm4mb_ker_var2.c | 5 ++-- frame/3/herk/bli_herk_l_ker_var2.c | 5 ++-- frame/3/herk/bli_herk_u_ker_var2.c | 5 ++-- frame/3/trmm/bli_trmm_ll_ker_var2.c | 5 ++-- frame/3/trmm/bli_trmm_lu_ker_var2.c | 5 ++-- frame/3/trmm/bli_trmm_rl_ker_var2.c | 5 ++-- frame/3/trmm/bli_trmm_ru_ker_var2.c | 5 ++-- frame/3/trsm/bli_trsm_ll_ker_var2.c | 5 ++-- frame/3/trsm/bli_trsm_lu_ker_var2.c | 5 ++-- frame/3/trsm/bli_trsm_rl_ker_var2.c | 5 ++-- frame/3/trsm/bli_trsm_ru_ker_var2.c | 5 ++-- frame/base/bli_cntx.c | 33 +++++++++++++++++++------ frame/base/bli_cntx.h | 6 +++++ 15 files changed, 71 insertions(+), 33 deletions(-) diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 69b30a52e..8af29594d 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -172,8 +172,9 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \ - const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \ + const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict a_cast = a; \ diff --git a/frame/3/gemm/ind/bli_gemm3m2_ker_var2.c b/frame/3/gemm/ind/bli_gemm3m2_ker_var2.c index ea8904183..195a1ee28 100644 --- a/frame/3/gemm/ind/bli_gemm3m2_ker_var2.c +++ b/frame/3/gemm/ind/bli_gemm3m2_ker_var2.c @@ -169,8 +169,9 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ct = 1; \ - const inc_t cs_ct = MR; \ + const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict one = PASTEMAC(ch,1); \ diff --git a/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c b/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c index d9d714917..b5788ae5e 100644 --- a/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c +++ b/frame/3/gemm/ind/bli_gemm4mb_ker_var2.c @@ -169,8 +169,9 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ct = 1; \ - const inc_t cs_ct = MR; \ + const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict one = PASTEMAC(ch,1); \ diff --git a/frame/3/herk/bli_herk_l_ker_var2.c b/frame/3/herk/bli_herk_l_ker_var2.c index ce2bdb6b4..11cf2f5fa 100644 --- a/frame/3/herk/bli_herk_l_ker_var2.c +++ b/frame/3/herk/bli_herk_l_ker_var2.c @@ -177,8 +177,9 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \ - const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \ + const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict a_cast = a; \ diff --git a/frame/3/herk/bli_herk_u_ker_var2.c b/frame/3/herk/bli_herk_u_ker_var2.c index 771087c24..ab1e336e3 100644 --- a/frame/3/herk/bli_herk_u_ker_var2.c +++ b/frame/3/herk/bli_herk_u_ker_var2.c @@ -177,8 +177,9 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \ - const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \ + const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict a_cast = a; \ diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index 2aa127f76..378d8d4f2 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -169,8 +169,9 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \ - const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \ + const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ ctype* restrict one = PASTEMAC(ch,1); \ ctype* restrict zero = PASTEMAC(ch,0); \ diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index ee7308d49..b572fe500 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -169,8 +169,9 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \ - const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \ + const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ ctype* restrict one = PASTEMAC(ch,1); \ ctype* restrict zero = PASTEMAC(ch,0); \ diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index 5502d3cf4..a3b48065d 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -169,8 +169,9 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \ - const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \ + const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ ctype* restrict one = PASTEMAC(ch,1); \ ctype* restrict zero = PASTEMAC(ch,0); \ diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index fd5267dd6..47a878bfd 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -169,8 +169,9 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \ - const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \ + const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ ctype* restrict one = PASTEMAC(ch,1); \ ctype* restrict zero = PASTEMAC(ch,0); \ diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index f20c3a021..c5e9e06f5 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -173,8 +173,9 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \ - const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \ + const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict minus_one = PASTEMAC(ch,m1); \ diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 1f8a09d08..f8a363b50 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -173,8 +173,9 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \ - const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \ + const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict minus_one = PASTEMAC(ch,m1); \ diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 4e34454f1..91f5c8598 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -178,8 +178,9 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \ - const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \ + const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict minus_one = PASTEMAC(ch,m1); \ diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index 572c47402..8ec330c0b 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -178,8 +178,9 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? 1 : NR ); \ - const inc_t cs_ct = ( bli_is_col_stored( rs_c, cs_c ) ? MR : 1 ); \ + const bool_t col_pref = bli_cntx_l3_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict minus_one = PASTEMAC(ch,m1); \ diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index 084604cdc..bd9972332 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -867,6 +867,30 @@ bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, return r_val; } +bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt, + l3ukr_t ukr_id, + cntx_t* cntx ) +{ + // Reference the ukr storage preferences of the corresponding real + // micro-kernel for induced methods. + if ( bli_cntx_get_ind_method( cntx ) != BLIS_NAT ) + dt = bli_datatype_proj_to_real( dt ); + + return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); +} + +bool_t bli_cntx_l3_ukr_prefers_cols_dt( num_t dt, + l3ukr_t ukr_id, + cntx_t* cntx ) +{ + // Reference the ukr storage preferences of the corresponding real + // micro-kernel for induced methods. + if ( bli_cntx_get_ind_method( cntx ) != BLIS_NAT ) + dt = bli_datatype_proj_to_real( dt ); + + return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); +} + bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) @@ -880,15 +904,10 @@ bool_t bli_cntx_l3_ukr_dislikes_storage_of( obj_t* obj, { num_t dt = bli_obj_datatype( *obj ); - // Reference the ukr storage preferences of the corresponding real - // micro-kernel for induced methods. - if ( bli_cntx_get_ind_method( cntx ) != BLIS_NAT ) - dt = bli_obj_datatype_proj_to_real( *obj ); - const bool_t ukr_prefers_rows - = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); + = bli_cntx_l3_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool_t ukr_prefers_cols - = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); + = bli_cntx_l3_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool_t r_val = FALSE; if ( bli_obj_is_row_stored( *obj ) && ukr_prefers_cols ) r_val = TRUE; diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 8cfd67750..38bff6720 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -455,6 +455,12 @@ bool_t bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, bool_t bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ); +bool_t bli_cntx_l3_ukr_prefers_rows_dt( num_t dt, + l3ukr_t ukr_id, + cntx_t* cntx ); +bool_t bli_cntx_l3_ukr_prefers_cols_dt( num_t dt, + l3ukr_t ukr_id, + cntx_t* cntx ); bool_t bli_cntx_l3_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx );