mirror of
https://github.com/amd/blis.git
synced 2026-05-11 17:50:00 +00:00
Retired trmm_t control tree definitions, usage.
Details: - Replaced all trmm_t control tree instances and usage with that of gemm_t. This change is similar to the recent retirement of the herk_t control tree. - Tweaked packm blocked variants so that the triangular code does NOT assume that k is a multiple of MR (when A is triangular) or NR (when B is triangular). This means that bottom-right micro-panels packed for trmm will have different zero-padding when k is not already a multiple of the relevant register blocksize. While this creates a seemingly arbitrary and unnecessary distinction between trmm and trsm packing, it actually allows trmm to be handled with one control tree, instead of one for left and one for right side cases. Furthermore, since only one tree is required, it can now be handled by the gemm tree, and thus the trmm control tree definitions can be disposed of entirely. - Tweaked trmm macro-kernels so that they do NOT inflate k up to a multiple of MR (when A is triangular) or NR (when B is triangular). - Misc. tweaks and cleanups to bli_packm_struc_cxk_4m.c and _3m.c, some of which are to facilitate above-mentioned changes whereby k is no longer required to be a multiple of register blocksize when packing triangular micro-panels. - Adjusted trmm3 according to above changes. - Retired trmm_t control tree creation/initialization functions.
This commit is contained in:
@@ -336,7 +336,8 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
panel_off_i = 0; \
|
||||
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
|
||||
panel_len_max_i = bli_abs( diagoffc_i ) + panel_dim_max; \
|
||||
panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \
|
||||
panel_len_max ); \
|
||||
diagoffp_i = diagoffc_i; \
|
||||
} \
|
||||
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
|
||||
|
||||
@@ -391,7 +391,8 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
panel_off_i = 0; \
|
||||
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
|
||||
panel_len_max_i = bli_abs( diagoffc_i ) + panel_dim_max; \
|
||||
panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \
|
||||
panel_len_max ); \
|
||||
diagoffp_i = diagoffc_i; \
|
||||
} \
|
||||
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
|
||||
|
||||
@@ -210,6 +210,16 @@ void PASTEMAC(ch,varname)( \
|
||||
p_br, rs_p, cs_p ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/*
|
||||
if ( bli_is_col_packed( schema ) ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_struc_cxk: bp copied", m_panel_max, n_panel_max, \
|
||||
p, rs_p, cs_p, "%4.1f", "" ); \
|
||||
else if ( bli_is_row_packed( schema ) ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_struc_cxk: ap copied", m_panel_max, n_panel_max, \
|
||||
p, rs_p, cs_p, "%4.1f", "" ); \
|
||||
*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( packm_struc_cxk, packm_cxk )
|
||||
@@ -501,7 +511,6 @@ void PASTEMAC(ch,varname)( \
|
||||
p, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
\
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( packm_tri_cxk, packm_cxk )
|
||||
|
||||
@@ -557,18 +557,6 @@ void PASTEMAC(ch,varname)( \
|
||||
inc_t is_p, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
bool_t row_stored; \
|
||||
bool_t col_stored; \
|
||||
\
|
||||
\
|
||||
/* Create flags to incidate row or column storage. Note that the
|
||||
schema bit that encodes row or column is describing the form of
|
||||
micro-panel, not the storage in the micro-panel. Hence the
|
||||
mismatch in "row" and "column" semantics. */ \
|
||||
row_stored = bli_is_col_packed( schema ); \
|
||||
col_stored = bli_is_row_packed( schema ); \
|
||||
\
|
||||
\
|
||||
/* Pack the panel. */ \
|
||||
PASTEMAC(ch,kername)( conjc, \
|
||||
panel_dim, \
|
||||
@@ -580,10 +568,24 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
/* Tweak the panel according to its triangular structure */ \
|
||||
{ \
|
||||
ctype_r* p_r = ( ctype_r* )p + 0; \
|
||||
ctype_r* p_i = ( ctype_r* )p + is_p; \
|
||||
ctype_r* p_rpi = ( ctype_r* )p + 2*is_p; \
|
||||
\
|
||||
dim_t j = bli_abs( diagoffp ); \
|
||||
ctype_r* p11_r = ( ctype_r* )p + (j )*ldp; \
|
||||
ctype_r* p11_i = ( ctype_r* )p + is_p + (j )*ldp; \
|
||||
ctype_r* p11_rpi = ( ctype_r* )p + 2*is_p + (j )*ldp; \
|
||||
ctype_r* p11_r = p_r + (j )*ldp; \
|
||||
ctype_r* p11_i = p_i + (j )*ldp; \
|
||||
ctype_r* p11_rpi = p_rpi + (j )*ldp; \
|
||||
\
|
||||
dim_t p11_m = m_panel; \
|
||||
dim_t p11_n = n_panel; \
|
||||
\
|
||||
dim_t min_p11_m_n; \
|
||||
\
|
||||
if ( diagoffp < 0 ) p11_m -= j; \
|
||||
else if ( diagoffp > 0 ) p11_n -= j; \
|
||||
\
|
||||
min_p11_m_n = bli_min( p11_m, p11_n ); \
|
||||
\
|
||||
\
|
||||
/* If the diagonal of c is implicitly unit, explicitly set the
|
||||
@@ -594,21 +596,21 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \
|
||||
dim_t i; \
|
||||
\
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
PASTEMAC(chr,setd)( diagoffp, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
&kappa_r, \
|
||||
p11_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
p_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setd)( diagoffp, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
&kappa_i, \
|
||||
p11_i, rs_p, cs_p ); \
|
||||
p_i, rs_p, cs_p ); \
|
||||
\
|
||||
/* Update the diagonal of the p11 section of the rpi panel.
|
||||
It simply needs to contain the sum of diagonals of p11_r
|
||||
and p11_i. */ \
|
||||
for ( i = 0; i < panel_dim; ++i ) \
|
||||
for ( i = 0; i < min_p11_m_n; ++i ) \
|
||||
{ \
|
||||
ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \
|
||||
ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \
|
||||
@@ -626,7 +628,7 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
dim_t i; \
|
||||
\
|
||||
for ( i = 0; i < panel_dim; ++i ) \
|
||||
for ( i = 0; i < min_p11_m_n; ++i ) \
|
||||
{ \
|
||||
ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \
|
||||
ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \
|
||||
@@ -644,34 +646,33 @@ void PASTEMAC(ch,varname)( \
|
||||
micro-kernel; however, zero-filling is needed for trmm, which
|
||||
uses the gemm micro-kernel.*/ \
|
||||
{ \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
uplo_t uplop11 = uploc; \
|
||||
doff_t diagoffp11 = 0; \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
uplo_t uplop = uploc; \
|
||||
\
|
||||
bli_toggle_uplo( uplop11 ); \
|
||||
bli_shift_diag_offset_to_shrink_uplo( uplop11, diagoffp11 ); \
|
||||
bli_toggle_uplo( uplop ); \
|
||||
bli_shift_diag_offset_to_shrink_uplo( uplop, diagoffp ); \
|
||||
\
|
||||
PASTEMAC(chr,setm)( diagoffp11, \
|
||||
PASTEMAC(chr,setm)( diagoffp, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop11, \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
uplop, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
zero_r, \
|
||||
p11_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( diagoffp11, \
|
||||
p_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( diagoffp, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop11, \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
uplop, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
zero_r, \
|
||||
p11_i, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( diagoffp11, \
|
||||
p_i, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( diagoffp, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop11, \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
uplop, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
zero_r, \
|
||||
p11_rpi, rs_p, cs_p ); \
|
||||
p_rpi, rs_p, cs_p ); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
@@ -529,18 +529,6 @@ void PASTEMAC(ch,varname)( \
|
||||
inc_t is_p, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
bool_t row_stored; \
|
||||
bool_t col_stored; \
|
||||
\
|
||||
\
|
||||
/* Create flags to incidate row or column storage. Note that the
|
||||
schema bit that encodes row or column is describing the form of
|
||||
micro-panel, not the storage in the micro-panel. Hence the
|
||||
mismatch in "row" and "column" semantics. */ \
|
||||
row_stored = bli_is_col_packed( schema ); \
|
||||
col_stored = bli_is_row_packed( schema ); \
|
||||
\
|
||||
\
|
||||
/* Pack the panel. */ \
|
||||
PASTEMAC(ch,kername)( conjc, \
|
||||
panel_dim, \
|
||||
@@ -552,9 +540,12 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
/* Tweak the panel according to its triangular structure */ \
|
||||
{ \
|
||||
ctype_r* p_r = ( ctype_r* )p; \
|
||||
ctype_r* p_i = ( ctype_r* )p + is_p; \
|
||||
\
|
||||
dim_t j = bli_abs( diagoffp ); \
|
||||
ctype_r* p11_r = ( ctype_r* )p + (j )*ldp; \
|
||||
ctype_r* p11_i = ( ctype_r* )p + is_p + (j )*ldp; \
|
||||
ctype_r* p11_r = p_r + (j )*ldp; \
|
||||
ctype_r* p11_i = p_i + (j )*ldp; \
|
||||
\
|
||||
/* If the diagonal of c is implicitly unit, explicitly set the
|
||||
the diagonal of the packed panel to kappa. */ \
|
||||
@@ -563,16 +554,16 @@ void PASTEMAC(ch,varname)( \
|
||||
ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \
|
||||
ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \
|
||||
\
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
PASTEMAC(chr,setd)( diagoffp, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
&kappa_r, \
|
||||
p11_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
p_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setd)( diagoffp, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
&kappa_i, \
|
||||
p11_i, rs_p, cs_p ); \
|
||||
p_i, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
\
|
||||
@@ -600,27 +591,26 @@ void PASTEMAC(ch,varname)( \
|
||||
micro-kernel; however, zero-filling is needed for trmm, which
|
||||
uses the gemm micro-kernel.*/ \
|
||||
{ \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
uplo_t uplop11 = uploc; \
|
||||
doff_t diagoffp11 = 0; \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
uplo_t uplop = uploc; \
|
||||
\
|
||||
bli_toggle_uplo( uplop11 ); \
|
||||
bli_shift_diag_offset_to_shrink_uplo( uplop11, diagoffp11 ); \
|
||||
bli_toggle_uplo( uplop ); \
|
||||
bli_shift_diag_offset_to_shrink_uplo( uplop, diagoffp ); \
|
||||
\
|
||||
PASTEMAC(chr,setm)( diagoffp11, \
|
||||
PASTEMAC(chr,setm)( diagoffp, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop11, \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
uplop, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
zero_r, \
|
||||
p11_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( diagoffp11, \
|
||||
p_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( diagoffp, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop11, \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
uplop, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
zero_r, \
|
||||
p11_i, rs_p, cs_p ); \
|
||||
p_i, rs_p, cs_p ); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
@@ -32,7 +32,6 @@
|
||||
|
||||
*/
|
||||
|
||||
#include "bli_trmm3m_cntl.h"
|
||||
#include "bli_trmm3m_entry.h"
|
||||
|
||||
|
||||
|
||||
@@ -53,18 +53,18 @@ packm_t* trmm3m_l_packb_cntl;
|
||||
packm_t* trmm3m_r_packa_cntl;
|
||||
packm_t* trmm3m_r_packb_cntl;
|
||||
|
||||
trmm_t* trmm3m_cntl_bp_ke;
|
||||
gemm_t* trmm3m_cntl_bp_ke;
|
||||
|
||||
trmm_t* trmm3m_l_cntl_op_bp;
|
||||
trmm_t* trmm3m_l_cntl_mm_op;
|
||||
trmm_t* trmm3m_l_cntl_vl_mm;
|
||||
gemm_t* trmm3m_l_cntl_op_bp;
|
||||
gemm_t* trmm3m_l_cntl_mm_op;
|
||||
gemm_t* trmm3m_l_cntl_vl_mm;
|
||||
|
||||
trmm_t* trmm3m_r_cntl_op_bp;
|
||||
trmm_t* trmm3m_r_cntl_mm_op;
|
||||
trmm_t* trmm3m_r_cntl_vl_mm;
|
||||
gemm_t* trmm3m_r_cntl_op_bp;
|
||||
gemm_t* trmm3m_r_cntl_mm_op;
|
||||
gemm_t* trmm3m_r_cntl_vl_mm;
|
||||
|
||||
trmm_t* trmm3m_l_cntl;
|
||||
trmm_t* trmm3m_r_cntl;
|
||||
gemm_t* trmm3m_l_cntl;
|
||||
gemm_t* trmm3m_r_cntl;
|
||||
|
||||
|
||||
void bli_trmm3m_cntl_init()
|
||||
@@ -77,7 +77,7 @@ void bli_trmm3m_cntl_init()
|
||||
// IMPORTANT: for consistency with trsm, "k" dim
|
||||
// multiple is set to mr.
|
||||
gemm3m_mr,
|
||||
gemm3m_mr,
|
||||
gemm3m_kr,
|
||||
TRUE, // densify
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
@@ -91,9 +91,9 @@ void bli_trmm3m_cntl_init()
|
||||
BLIS_VARIANT2,
|
||||
// IMPORTANT: m dim multiple here must be mr
|
||||
// since "k" dim multiple is set to mr above.
|
||||
gemm3m_mr,
|
||||
gemm3m_kr,
|
||||
gemm3m_nr,
|
||||
FALSE, // already dense
|
||||
TRUE, // already dense
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
|
||||
@@ -34,8 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern trmm_t* trmm3m_l_cntl;
|
||||
extern trmm_t* trmm3m_r_cntl;
|
||||
extern gemm_t* gemm3m_cntl;
|
||||
|
||||
void bli_trmm3m_entry( side_t side,
|
||||
obj_t* alpha,
|
||||
@@ -43,7 +42,6 @@ void bli_trmm3m_entry( side_t side,
|
||||
obj_t* b )
|
||||
{
|
||||
bli_trmm_front( side, alpha, a, b,
|
||||
trmm3m_l_cntl,
|
||||
trmm3m_r_cntl );
|
||||
gemm3m_cntl );
|
||||
}
|
||||
|
||||
|
||||
@@ -32,7 +32,6 @@
|
||||
|
||||
*/
|
||||
|
||||
#include "bli_trmm4m_cntl.h"
|
||||
#include "bli_trmm4m_entry.h"
|
||||
|
||||
|
||||
|
||||
@@ -53,18 +53,18 @@ packm_t* trmm4m_l_packb_cntl;
|
||||
packm_t* trmm4m_r_packa_cntl;
|
||||
packm_t* trmm4m_r_packb_cntl;
|
||||
|
||||
trmm_t* trmm4m_cntl_bp_ke;
|
||||
gemm_t* trmm4m_cntl_bp_ke;
|
||||
|
||||
trmm_t* trmm4m_l_cntl_op_bp;
|
||||
trmm_t* trmm4m_l_cntl_mm_op;
|
||||
trmm_t* trmm4m_l_cntl_vl_mm;
|
||||
gemm_t* trmm4m_l_cntl_op_bp;
|
||||
gemm_t* trmm4m_l_cntl_mm_op;
|
||||
gemm_t* trmm4m_l_cntl_vl_mm;
|
||||
|
||||
trmm_t* trmm4m_r_cntl_op_bp;
|
||||
trmm_t* trmm4m_r_cntl_mm_op;
|
||||
trmm_t* trmm4m_r_cntl_vl_mm;
|
||||
gemm_t* trmm4m_r_cntl_op_bp;
|
||||
gemm_t* trmm4m_r_cntl_mm_op;
|
||||
gemm_t* trmm4m_r_cntl_vl_mm;
|
||||
|
||||
trmm_t* trmm4m_l_cntl;
|
||||
trmm_t* trmm4m_r_cntl;
|
||||
gemm_t* trmm4m_l_cntl;
|
||||
gemm_t* trmm4m_r_cntl;
|
||||
|
||||
|
||||
void bli_trmm4m_cntl_init()
|
||||
@@ -77,7 +77,7 @@ void bli_trmm4m_cntl_init()
|
||||
// IMPORTANT: for consistency with trsm, "k" dim
|
||||
// multiple is set to mr.
|
||||
gemm4m_mr,
|
||||
gemm4m_mr,
|
||||
gemm4m_kr,
|
||||
TRUE, // densify
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
@@ -91,9 +91,9 @@ void bli_trmm4m_cntl_init()
|
||||
BLIS_VARIANT2,
|
||||
// IMPORTANT: m dim multiple here must be mr
|
||||
// since "k" dim multiple is set to mr above.
|
||||
gemm4m_mr,
|
||||
gemm4m_kr,
|
||||
gemm4m_nr,
|
||||
FALSE, // already dense
|
||||
TRUE, // already dense
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
|
||||
@@ -34,8 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern trmm_t* trmm4m_l_cntl;
|
||||
extern trmm_t* trmm4m_r_cntl;
|
||||
extern gemm_t* gemm4m_cntl;
|
||||
|
||||
void bli_trmm4m_entry( side_t side,
|
||||
obj_t* alpha,
|
||||
@@ -43,7 +42,6 @@ void bli_trmm4m_entry( side_t side,
|
||||
obj_t* b )
|
||||
{
|
||||
bli_trmm_front( side, alpha, a, b,
|
||||
trmm4m_l_cntl,
|
||||
trmm4m_r_cntl );
|
||||
gemm4m_cntl );
|
||||
}
|
||||
|
||||
|
||||
@@ -32,7 +32,6 @@
|
||||
|
||||
*/
|
||||
|
||||
#include "bli_trmm_cntl.h"
|
||||
#include "bli_trmm_check.h"
|
||||
#include "bli_trmm_entry.h"
|
||||
#include "bli_trmm_front.h"
|
||||
|
||||
@@ -37,7 +37,7 @@
|
||||
void bli_trmm_blk_var1f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl,
|
||||
gemm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
obj_t b_pack_s;
|
||||
@@ -136,7 +136,7 @@ void bli_trmm_blk_var1f( obj_t* a,
|
||||
b_pack,
|
||||
&BLIS_ONE,
|
||||
c1_pack,
|
||||
cntl_sub_trmm( cntl ),
|
||||
cntl_sub_gemm( cntl ),
|
||||
trmm_thread_sub_trmm( thread ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
|
||||
@@ -35,6 +35,6 @@
|
||||
void bli_trmm_blk_var1f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl,
|
||||
gemm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -37,8 +37,8 @@
|
||||
void bli_trmm_blk_var2b( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread)
|
||||
gemm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a_pack_s;
|
||||
obj_t b1_pack_s, c1_pack_s;
|
||||
@@ -124,7 +124,7 @@ void bli_trmm_blk_var2b( obj_t* a,
|
||||
b1_pack,
|
||||
&BLIS_ONE,
|
||||
c1_pack,
|
||||
cntl_sub_trmm( cntl ),
|
||||
cntl_sub_gemm( cntl ),
|
||||
trmm_thread_sub_trmm( thread ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
|
||||
@@ -35,6 +35,6 @@
|
||||
void bli_trmm_blk_var2b( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl,
|
||||
gemm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -37,8 +37,8 @@
|
||||
void bli_trmm_blk_var2f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread)
|
||||
gemm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a_pack_s;
|
||||
obj_t b1_pack_s, c1_pack_s;
|
||||
@@ -124,7 +124,7 @@ void bli_trmm_blk_var2f( obj_t* a,
|
||||
b1_pack,
|
||||
&BLIS_ONE,
|
||||
c1_pack,
|
||||
cntl_sub_trmm( cntl ),
|
||||
cntl_sub_gemm( cntl ),
|
||||
trmm_thread_sub_trmm( thread ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
|
||||
@@ -35,6 +35,6 @@
|
||||
void bli_trmm_blk_var2f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl,
|
||||
gemm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@
|
||||
void bli_trmm_blk_var3b( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl,
|
||||
gemm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
obj_t c_pack_s;
|
||||
@@ -119,7 +119,7 @@ void bli_trmm_blk_var3b( obj_t* a,
|
||||
b1_pack,
|
||||
&BLIS_ONE,
|
||||
c_pack,
|
||||
cntl_sub_trmm( cntl ),
|
||||
cntl_sub_gemm( cntl ),
|
||||
trmm_thread_sub_trmm( thread ) );
|
||||
}
|
||||
|
||||
|
||||
@@ -35,6 +35,6 @@
|
||||
void bli_trmm_blk_var3b( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl,
|
||||
gemm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@
|
||||
void bli_trmm_blk_var3f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl,
|
||||
gemm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
obj_t c_pack_s;
|
||||
@@ -119,7 +119,7 @@ void bli_trmm_blk_var3f( obj_t* a,
|
||||
b1_pack,
|
||||
&BLIS_ONE,
|
||||
c_pack,
|
||||
cntl_sub_trmm( cntl ),
|
||||
cntl_sub_gemm( cntl ),
|
||||
trmm_thread_sub_trmm( thread ) );
|
||||
}
|
||||
|
||||
|
||||
@@ -35,6 +35,6 @@
|
||||
void bli_trmm_blk_var3f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl,
|
||||
gemm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -116,7 +116,7 @@ void bli_trmm_int_check( obj_t* alpha,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
trmm_t* cntl )
|
||||
gemm_t* cntl )
|
||||
{
|
||||
err_t e_val;
|
||||
|
||||
|
||||
@@ -49,5 +49,5 @@ void bli_trmm_int_check( obj_t* alpha,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
trmm_t* cntl );
|
||||
gemm_t* cntl );
|
||||
|
||||
|
||||
@@ -34,8 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern trmm_t* trmm_l_cntl;
|
||||
extern trmm_t* trmm_r_cntl;
|
||||
extern gemm_t* gemm_cntl;
|
||||
|
||||
void bli_trmm_entry( side_t side,
|
||||
obj_t* alpha,
|
||||
@@ -43,7 +42,6 @@ void bli_trmm_entry( side_t side,
|
||||
obj_t* b )
|
||||
{
|
||||
bli_trmm_front( side, alpha, a, b,
|
||||
trmm_l_cntl,
|
||||
trmm_r_cntl );
|
||||
gemm_cntl );
|
||||
}
|
||||
|
||||
|
||||
@@ -38,10 +38,8 @@ void bli_trmm_front( side_t side,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* l_cntl,
|
||||
trmm_t* r_cntl )
|
||||
gemm_t* cntl )
|
||||
{
|
||||
trmm_t* cntl;
|
||||
obj_t a_local;
|
||||
obj_t b_local;
|
||||
obj_t c_local;
|
||||
@@ -101,10 +99,10 @@ void bli_trmm_front( side_t side,
|
||||
if (
|
||||
( bli_obj_is_row_stored( c_local ) &&
|
||||
bli_func_prefers_contig_cols( bli_obj_datatype( c_local ),
|
||||
cntl_gemm_ukrs( l_cntl ) ) ) ||
|
||||
cntl_gemm_ukrs( cntl ) ) ) ||
|
||||
( bli_obj_is_col_stored( c_local ) &&
|
||||
bli_func_prefers_contig_rows( bli_obj_datatype( c_local ),
|
||||
cntl_gemm_ukrs( l_cntl ) ) )
|
||||
cntl_gemm_ukrs( cntl ) ) )
|
||||
)
|
||||
{
|
||||
bli_toggle_side( side );
|
||||
@@ -129,9 +127,6 @@ void bli_trmm_front( side_t side,
|
||||
bli_obj_set_as_root( b_local );
|
||||
bli_obj_set_as_root( c_local );
|
||||
|
||||
// Choose the control tree.
|
||||
if ( bli_is_left( side ) ) cntl = l_cntl;
|
||||
else cntl = r_cntl;
|
||||
|
||||
trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( bli_is_right( side ) );
|
||||
dim_t n_threads = thread_num_threads( infos[0] );
|
||||
|
||||
@@ -36,6 +36,5 @@ void bli_trmm_front( side_t side,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
trmm_t* l_cntl,
|
||||
trmm_t* r_cntl );
|
||||
gemm_t* cntl );
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@
|
||||
typedef void (*FUNCPTR_T)( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl,
|
||||
gemm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
static FUNCPTR_T vars[2][2][4][3] =
|
||||
@@ -89,7 +89,7 @@ void bli_trmm_int( obj_t* alpha,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
trmm_t* cntl,
|
||||
gemm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a_local;
|
||||
|
||||
@@ -37,5 +37,5 @@ void bli_trmm_int( obj_t* alpha,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
trmm_t* cntl,
|
||||
gemm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
@@ -56,7 +56,7 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2);
|
||||
void bli_trmm_ll_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl,
|
||||
gemm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
@@ -217,12 +217,14 @@ void PASTEMAC(ch,varname)( \
|
||||
if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
|
||||
\
|
||||
/* Compute the storage stride for the triangular matrix A, which is
|
||||
usually PACKMR. However, in the case of 3m, the storage stride
|
||||
captures the (PACKMR * 3/2) factor embedded in the panel stride.
|
||||
Notice that we must first inflate k up to a multiple of MR, since
|
||||
the panel stride was originally computed using this inflated k
|
||||
dimension. */ \
|
||||
k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
|
||||
usually PACKMR. However, in the case of 3m, the storage stride
|
||||
captures the (PACKMR * 3/2) factor embedded in the panel stride.
|
||||
Note that trmm does NOT require k to be a multiple of MR or NR
|
||||
(depending on whether A or B is the triangular matrix), so we can
|
||||
use k as-is. By contrast, trsm must use an "inflated" version of
|
||||
k since trsm requires that k be a multiple of MR (when A is
|
||||
triangular) or NR (when B is triangular). */ \
|
||||
k_full = k; \
|
||||
ss_a = ps_a / k_full; \
|
||||
\
|
||||
/* If there is a zero region above where the diagonal of A intersects the
|
||||
@@ -238,13 +240,6 @@ void PASTEMAC(ch,varname)( \
|
||||
diagoffa = 0; \
|
||||
c_cast = c_cast + (i )*rs_c; \
|
||||
} \
|
||||
\
|
||||
/* For consistency with the trsm macro-kernels, we inflate k to be a
|
||||
multiple of MR, if necessary. This is needed because we typically
|
||||
use the same packm variant for trmm as for trsm, and trsm has this
|
||||
constraint that k must be a multiple of MR so that it can safely
|
||||
handle bottom-right corner edges of the triangle. */ \
|
||||
if ( k % MR != 0 ) k += MR - ( k % MR ); \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
@@ -313,7 +308,7 @@ void PASTEMAC(ch,varname)( \
|
||||
packed so we can index into the corresponding location in
|
||||
b1. */ \
|
||||
off_a1011 = 0; \
|
||||
k_a1011 = diagoffa_i + MR; \
|
||||
k_a1011 = bli_min( diagoffa_i + MR, k ); \
|
||||
\
|
||||
if( trmm_l_ir_my_iter( i, ir_thread ) ) \
|
||||
{ \
|
||||
@@ -436,6 +431,8 @@ void PASTEMAC(ch,varname)( \
|
||||
b1 += cstep_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( trmm_ll_ker_var2, gemm_ukr_t )
|
||||
|
||||
@@ -39,7 +39,7 @@
|
||||
void bli_trmm_ll_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl,
|
||||
gemm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
|
||||
@@ -56,7 +56,7 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2);
|
||||
void bli_trmm_lu_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl,
|
||||
gemm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
@@ -219,10 +219,12 @@ void PASTEMAC(ch,varname)( \
|
||||
/* Compute the storage stride for the triangular matrix A, which is
|
||||
usually PACKMR. However, in the case of 3m, the storage stride
|
||||
captures the (PACKMR * 3/2) factor embedded in the panel stride.
|
||||
Notice that we must first inflate k up to a multiple of MR, since
|
||||
the panel stride was originally computed using this inflated k
|
||||
dimension. */ \
|
||||
k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
|
||||
Note that trmm does NOT require k to be a multiple of MR or NR
|
||||
(depending on whether A or B is the triangular matrix), so we can
|
||||
use k as-is. By contrast, trsm must use an "inflated" version of
|
||||
k since trsm requires that k be a multiple of MR (when A is
|
||||
triangular) or NR (when B is triangular). */ \
|
||||
k_full = k; \
|
||||
ss_a = ps_a / k_full; \
|
||||
\
|
||||
/* If there is a zero region to the left of where the diagonal of A
|
||||
@@ -245,13 +247,6 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
m = -diagoffa + k; \
|
||||
} \
|
||||
\
|
||||
/* For consistency with the trsm macro-kernels, we inflate k to be a
|
||||
multiple of MR, if necessary. This is needed because we typically
|
||||
use the same packm variant for trmm as for trsm, and trsm has this
|
||||
constraint that k must be a multiple of MR so that it can safely
|
||||
handle bottom-right corner edges of the triangle. */ \
|
||||
if ( k % MR != 0 ) k += MR - ( k % MR ); \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
|
||||
@@ -39,7 +39,7 @@
|
||||
void bli_trmm_lu_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl,
|
||||
gemm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
|
||||
@@ -56,7 +56,7 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2);
|
||||
void bli_trmm_rl_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl,
|
||||
gemm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
@@ -219,10 +219,12 @@ void PASTEMAC(ch,varname)( \
|
||||
/* Compute the storage stride for the triangular matrix B, which is
|
||||
usually PACKNR. However, in the case of 3m, the storage stride
|
||||
captures the (PACKNR * 3/2) factor embedded in the panel stride.
|
||||
Notice that we must first inflate k up to a multiple of NR, since
|
||||
the panel stride was originally computed using this inflated k
|
||||
dimension. */ \
|
||||
k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \
|
||||
Note that trmm does NOT require k to be a multiple of MR or NR
|
||||
(depending on whether A or B is the triangular matrix), so we can
|
||||
use k as-is. By contrast, trsm must use an "inflated" version of
|
||||
k since trsm requires that k be a multiple of MR (when A is
|
||||
triangular) or NR (when B is triangular). */ \
|
||||
k_full = k; \
|
||||
ss_b = ps_b / k_full; \
|
||||
\
|
||||
/* If there is a zero region above where the diagonal of B intersects
|
||||
@@ -245,13 +247,6 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
n = diagoffb + k; \
|
||||
} \
|
||||
\
|
||||
/* For consistency with the trsm macro-kernels, we inflate k to be a
|
||||
multiple of NR, if necessary. This is needed because we typically
|
||||
use the same packm variant for trmm as for trsm, and trsm has this
|
||||
constraint that k must be a multiple of NR so that it can safely
|
||||
handle bottom-right corner edges of the triangle. */ \
|
||||
if ( k % NR != 0 ) k += NR - ( k % NR ); \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
|
||||
@@ -39,7 +39,7 @@
|
||||
void bli_trmm_rl_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl,
|
||||
gemm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
|
||||
@@ -56,7 +56,7 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2);
|
||||
void bli_trmm_ru_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl,
|
||||
gemm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
@@ -219,10 +219,12 @@ void PASTEMAC(ch,varname)( \
|
||||
/* Compute the storage stride for the triangular matrix B, which is
|
||||
usually PACKNR. However, in the case of 3m, the storage stride
|
||||
captures the (PACKNR * 3/2) factor embedded in the panel stride.
|
||||
Notice that we must first inflate k up to a multiple of NR, since
|
||||
the panel stride was originally computed using this inflated k
|
||||
dimension. */ \
|
||||
k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \
|
||||
Note that trmm does NOT require k to be a multiple of MR or NR
|
||||
(depending on whether A or B is the triangular matrix), so we can
|
||||
use k as-is. By contrast, trsm must use an "inflated" version of
|
||||
k since trsm requires that k be a multiple of MR (when A is
|
||||
triangular) or NR (when B is triangular). */ \
|
||||
k_full = k; \
|
||||
ss_b = ps_b / k_full; \
|
||||
\
|
||||
/* If there is a zero region to the left of where the diagonal of B
|
||||
@@ -246,13 +248,6 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
k = -diagoffb + n; \
|
||||
} \
|
||||
\
|
||||
/* For consistency with the trsm macro-kernels, we inflate k to be a
|
||||
multiple of NR, if necessary. This is needed because we typically
|
||||
use the same packm variant for trmm as for trsm, and trsm has this
|
||||
constraint that k must be a multiple of NR so that it can safely
|
||||
handle bottom-right corner edges of the triangle. */ \
|
||||
if ( k % NR != 0 ) k += NR - ( k % NR ); \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
|
||||
@@ -39,7 +39,7 @@
|
||||
void bli_trmm_ru_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl,
|
||||
gemm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
|
||||
@@ -50,21 +50,13 @@ extern gemm_t* gemm_cntl_bp_ke;
|
||||
packm_t* trmm_l_packa_cntl;
|
||||
packm_t* trmm_l_packb_cntl;
|
||||
|
||||
packm_t* trmm_r_packa_cntl;
|
||||
packm_t* trmm_r_packb_cntl;
|
||||
|
||||
trmm_t* trmm_cntl_bp_ke;
|
||||
|
||||
trmm_t* trmm_l_cntl_op_bp;
|
||||
trmm_t* trmm_l_cntl_mm_op;
|
||||
trmm_t* trmm_l_cntl_vl_mm;
|
||||
|
||||
trmm_t* trmm_r_cntl_op_bp;
|
||||
trmm_t* trmm_r_cntl_mm_op;
|
||||
trmm_t* trmm_r_cntl_vl_mm;
|
||||
|
||||
trmm_t* trmm_l_cntl;
|
||||
trmm_t* trmm_r_cntl;
|
||||
|
||||
|
||||
void bli_trmm_cntl_init()
|
||||
@@ -74,10 +66,10 @@ void bli_trmm_cntl_init()
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
// IMPORTANT: for consistency with trsm, "k" dim
|
||||
// multiple is set to mr.
|
||||
gemm_mr,
|
||||
// IMPORTANT: Unlike trsm, trmm does not require a
|
||||
// "k" dim multiple equal to mr.
|
||||
gemm_mr,
|
||||
gemm_kr,
|
||||
TRUE, // densify
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
@@ -89,40 +81,9 @@ void bli_trmm_cntl_init()
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
// IMPORTANT: m dim multiple here must be mr
|
||||
// since "k" dim multiple is set to mr above.
|
||||
gemm_mr,
|
||||
gemm_nr,
|
||||
FALSE, // already dense
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_COL_PANELS,
|
||||
BLIS_BUFFER_FOR_B_PANEL );
|
||||
|
||||
// Create control tree objects for packm operations (right side).
|
||||
trmm_r_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
// IMPORTANT: for consistency with trsm, "k" dim
|
||||
// multiple is set to nr.
|
||||
gemm_mr,
|
||||
gemm_nr,
|
||||
FALSE, // already dense
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_ROW_PANELS,
|
||||
BLIS_BUFFER_FOR_A_BLOCK );
|
||||
|
||||
trmm_r_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
// IMPORTANT: m dim multiple here must be nr
|
||||
// since "k" dim multiple is set to nr above.
|
||||
gemm_nr,
|
||||
// IMPORTANT: Unlike trsm, trmm does not require a
|
||||
// "k" dim multiple equal to mr.
|
||||
gemm_kr,
|
||||
gemm_nr,
|
||||
TRUE, // densify
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -131,7 +92,6 @@ void bli_trmm_cntl_init()
|
||||
BLIS_PACKED_COL_PANELS,
|
||||
BLIS_BUFFER_FOR_B_PANEL );
|
||||
|
||||
|
||||
// Create control tree object for lowest-level block-panel kernel.
|
||||
trmm_cntl_bp_ke
|
||||
=
|
||||
@@ -190,74 +150,20 @@ void bli_trmm_cntl_init()
|
||||
NULL,
|
||||
NULL );
|
||||
|
||||
// Create control tree object for outer panel (to block-panel)
|
||||
// problem (right side).
|
||||
trmm_r_cntl_op_bp
|
||||
=
|
||||
bli_trmm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
gemm_mc,
|
||||
gemm_ukrs,
|
||||
NULL,
|
||||
trmm_r_packa_cntl,
|
||||
trmm_r_packb_cntl,
|
||||
NULL,
|
||||
trmm_cntl_bp_ke,
|
||||
gemm_cntl_bp_ke,
|
||||
NULL );
|
||||
|
||||
// Create control tree object for general problem via multiple
|
||||
// rank-k (outer panel) updates (right side).
|
||||
trmm_r_cntl_mm_op
|
||||
=
|
||||
bli_trmm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
gemm_kc,
|
||||
gemm_ukrs,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
trmm_r_cntl_op_bp,
|
||||
NULL,
|
||||
NULL );
|
||||
|
||||
// Create control tree object for very large problem via multiple
|
||||
// general problems (right side).
|
||||
trmm_r_cntl_vl_mm
|
||||
=
|
||||
bli_trmm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
gemm_nc,
|
||||
gemm_ukrs,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
trmm_r_cntl_mm_op,
|
||||
NULL,
|
||||
NULL );
|
||||
|
||||
// Alias the "master" trmm control trees to shorter names.
|
||||
trmm_l_cntl = trmm_l_cntl_vl_mm;
|
||||
trmm_r_cntl = trmm_r_cntl_vl_mm;
|
||||
}
|
||||
|
||||
void bli_trmm_cntl_finalize()
|
||||
{
|
||||
bli_cntl_obj_free( trmm_l_packa_cntl );
|
||||
bli_cntl_obj_free( trmm_l_packb_cntl );
|
||||
bli_cntl_obj_free( trmm_r_packa_cntl );
|
||||
bli_cntl_obj_free( trmm_r_packb_cntl );
|
||||
|
||||
bli_cntl_obj_free( trmm_cntl_bp_ke );
|
||||
|
||||
bli_cntl_obj_free( trmm_l_cntl_op_bp );
|
||||
bli_cntl_obj_free( trmm_l_cntl_mm_op );
|
||||
bli_cntl_obj_free( trmm_l_cntl_vl_mm );
|
||||
bli_cntl_obj_free( trmm_r_cntl_op_bp );
|
||||
bli_cntl_obj_free( trmm_r_cntl_mm_op );
|
||||
bli_cntl_obj_free( trmm_r_cntl_vl_mm );
|
||||
}
|
||||
|
||||
trmm_t* bli_trmm_cntl_obj_create( impl_t impl_type,
|
||||
@@ -48,7 +48,7 @@ struct trmm_s
|
||||
};
|
||||
typedef struct trmm_s trmm_t;
|
||||
|
||||
#define cntl_sub_trmm( cntl ) cntl->sub_trmm
|
||||
#define cntl_sub_gemm( cntl ) cntl->sub_trmm
|
||||
|
||||
void bli_trmm_cntl_init( void );
|
||||
void bli_trmm_cntl_finalize( void );
|
||||
@@ -115,7 +115,7 @@ void bli_trmm_ll_blk_var1( obj_t* alpha,
|
||||
&b_pack,
|
||||
beta,
|
||||
&c1_pack,
|
||||
cntl_sub_trmm( cntl ) );
|
||||
cntl_sub_gemm( cntl ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( &c1_pack, &c1,
|
||||
|
||||
@@ -127,7 +127,7 @@ void bli_trmm_ll_blk_var4( obj_t* alpha,
|
||||
&b_pack_inc,
|
||||
beta,
|
||||
&c1_pack_inc,
|
||||
cntl_sub_trmm( cntl ) );
|
||||
cntl_sub_gemm( cntl ) );
|
||||
}
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
@@ -172,7 +172,7 @@ void bli_trmm_ll_blk_var4( obj_t* alpha,
|
||||
&b_pack,
|
||||
beta,
|
||||
&c1_pack,
|
||||
cntl_sub_trmm( cntl ) );
|
||||
cntl_sub_gemm( cntl ) );
|
||||
else
|
||||
bli_gemm_int( alpha,
|
||||
&a1_pack,
|
||||
|
||||
@@ -112,7 +112,7 @@ void bli_trmm_lu_blk_var1( obj_t* alpha,
|
||||
&b_pack,
|
||||
beta,
|
||||
&c1_pack,
|
||||
cntl_sub_trmm( cntl ) );
|
||||
cntl_sub_gemm( cntl ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( &c1_pack, &c1,
|
||||
|
||||
@@ -125,7 +125,7 @@ void bli_trmm_lu_blk_var4( obj_t* alpha,
|
||||
&b_pack_inc,
|
||||
beta,
|
||||
&c1_pack_inc,
|
||||
cntl_sub_trmm( cntl ) );
|
||||
cntl_sub_gemm( cntl ) );
|
||||
}
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
@@ -170,7 +170,7 @@ void bli_trmm_lu_blk_var4( obj_t* alpha,
|
||||
&b_pack,
|
||||
beta,
|
||||
&c1_pack,
|
||||
cntl_sub_trmm( cntl ) );
|
||||
cntl_sub_gemm( cntl ) );
|
||||
else
|
||||
bli_gemm_int( alpha,
|
||||
&a1_pack,
|
||||
|
||||
@@ -34,8 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern trmm_t* trmm3m_l_cntl;
|
||||
extern trmm_t* trmm3m_r_cntl;
|
||||
extern gemm_t* gemm3m_cntl;
|
||||
|
||||
void bli_trmm33m_entry( side_t side,
|
||||
obj_t* alpha,
|
||||
@@ -45,7 +44,6 @@ void bli_trmm33m_entry( side_t side,
|
||||
obj_t* c )
|
||||
{
|
||||
bli_trmm3_front( side, alpha, a, b, beta, c,
|
||||
trmm3m_l_cntl,
|
||||
trmm3m_r_cntl );
|
||||
gemm3m_cntl );
|
||||
}
|
||||
|
||||
|
||||
@@ -34,8 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern trmm_t* trmm4m_l_cntl;
|
||||
extern trmm_t* trmm4m_r_cntl;
|
||||
extern gemm_t* gemm4m_cntl;
|
||||
|
||||
void bli_trmm34m_entry( side_t side,
|
||||
obj_t* alpha,
|
||||
@@ -45,7 +44,6 @@ void bli_trmm34m_entry( side_t side,
|
||||
obj_t* c )
|
||||
{
|
||||
bli_trmm3_front( side, alpha, a, b, beta, c,
|
||||
trmm4m_l_cntl,
|
||||
trmm4m_r_cntl );
|
||||
gemm4m_cntl );
|
||||
}
|
||||
|
||||
|
||||
@@ -34,8 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern trmm_t* trmm_l_cntl;
|
||||
extern trmm_t* trmm_r_cntl;
|
||||
extern gemm_t* gemm_cntl;
|
||||
|
||||
void bli_trmm3_entry( side_t side,
|
||||
obj_t* alpha,
|
||||
@@ -45,7 +44,6 @@ void bli_trmm3_entry( side_t side,
|
||||
obj_t* c )
|
||||
{
|
||||
bli_trmm3_front( side, alpha, a, b, beta, c,
|
||||
trmm_l_cntl,
|
||||
trmm_r_cntl );
|
||||
gemm_cntl );
|
||||
}
|
||||
|
||||
|
||||
@@ -40,10 +40,8 @@ void bli_trmm3_front( side_t side,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
trmm_t* l_cntl,
|
||||
trmm_t* r_cntl )
|
||||
gemm_t* cntl )
|
||||
{
|
||||
trmm_t* cntl;
|
||||
obj_t a_local;
|
||||
obj_t b_local;
|
||||
obj_t c_local;
|
||||
@@ -103,10 +101,10 @@ void bli_trmm3_front( side_t side,
|
||||
if (
|
||||
( bli_obj_is_row_stored( c_local ) &&
|
||||
bli_func_prefers_contig_cols( bli_obj_datatype( c_local ),
|
||||
cntl_gemm_ukrs( l_cntl ) ) ) ||
|
||||
cntl_gemm_ukrs( cntl ) ) ) ||
|
||||
( bli_obj_is_col_stored( c_local ) &&
|
||||
bli_func_prefers_contig_rows( bli_obj_datatype( c_local ),
|
||||
cntl_gemm_ukrs( l_cntl ) ) )
|
||||
cntl_gemm_ukrs( cntl ) ) )
|
||||
)
|
||||
{
|
||||
bli_toggle_side( side );
|
||||
@@ -131,9 +129,6 @@ void bli_trmm3_front( side_t side,
|
||||
bli_obj_set_as_root( b_local );
|
||||
bli_obj_set_as_root( c_local );
|
||||
|
||||
// Choose the control tree.
|
||||
if ( bli_is_left( side ) ) cntl = l_cntl;
|
||||
else cntl = r_cntl;
|
||||
|
||||
trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( FALSE );
|
||||
dim_t n_threads = thread_num_threads( infos[0] );
|
||||
|
||||
@@ -38,5 +38,4 @@ void bli_trmm3_front( side_t side,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
trmm_t* l_cntl,
|
||||
trmm_t* r_cntl );
|
||||
gemm_t* cntl );
|
||||
|
||||
@@ -57,17 +57,14 @@ void bli_cntl_init( void )
|
||||
|
||||
// Level-3
|
||||
bli_gemm_cntl_init();
|
||||
bli_trmm_cntl_init();
|
||||
bli_trsm_cntl_init();
|
||||
|
||||
// Level-3 via 4m
|
||||
bli_gemm4m_cntl_init();
|
||||
bli_trmm4m_cntl_init();
|
||||
bli_trsm4m_cntl_init();
|
||||
|
||||
// Level-3 via 3m
|
||||
bli_gemm3m_cntl_init();
|
||||
bli_trmm3m_cntl_init();
|
||||
bli_trsm3m_cntl_init();
|
||||
}
|
||||
|
||||
@@ -94,17 +91,14 @@ void bli_cntl_finalize( void )
|
||||
|
||||
// Level-3
|
||||
bli_gemm_cntl_finalize();
|
||||
bli_trmm_cntl_finalize();
|
||||
bli_trsm_cntl_finalize();
|
||||
|
||||
// Level-3 via 4m
|
||||
bli_gemm4m_cntl_finalize();
|
||||
bli_trmm4m_cntl_finalize();
|
||||
bli_trsm4m_cntl_finalize();
|
||||
|
||||
// Level-3 via 3m
|
||||
bli_gemm3m_cntl_finalize();
|
||||
bli_trmm3m_cntl_finalize();
|
||||
bli_trsm3m_cntl_finalize();
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user