Retired trmm_t control tree definitions, usage.

Details:
- Replaced all trmm_t control tree instances and usage with that of
  gemm_t. This change is similar to the recent retirement of the herk_t
  control tree.
- Tweaked packm blocked variants so that the triangular code does NOT
  assume that k is a multiple of MR (when A is triangular) or NR (when
  B is triangular). This means that bottom-right micro-panels packed for
  trmm will have different zero-padding when k is not already a multiple
  of the relevant register blocksize. While this creates a seemingly
  arbitrary and unnecessary distinction between trmm and trsm packing,
  it actually allows trmm to be handled with one control tree, instead
  of one for left and one for right side cases. Furthermore, since only
  one tree is required, it can now be handled by the gemm tree, and thus
  the trmm control tree definitions can be disposed of entirely.
- Tweaked trmm macro-kernels so that they do NOT inflate k up to a
  multiple of MR (when A is triangular) or NR (when B is triangular).
- Misc. tweaks and cleanups to bli_packm_struc_cxk_4m.c and _3m.c, some
  of which are to facilitate above-mentioned changes whereby k is no
  longer required to be a multiple of register blocksize when packing
  triangular micro-panels.
- Adjusted trmm3 according to above changes.
- Retired trmm_t control tree creation/initialization functions.
This commit is contained in:
Field G. Van Zee
2014-09-08 14:49:50 -05:00
parent 576e9e9255
commit 7b2f469d54
49 changed files with 195 additions and 338 deletions

View File

@@ -336,7 +336,8 @@ void PASTEMAC(ch,varname)( \
{ \
panel_off_i = 0; \
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
panel_len_max_i = bli_abs( diagoffc_i ) + panel_dim_max; \
panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \
panel_len_max ); \
diagoffp_i = diagoffc_i; \
} \
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \

View File

@@ -391,7 +391,8 @@ void PASTEMAC(ch,varname)( \
{ \
panel_off_i = 0; \
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
panel_len_max_i = bli_abs( diagoffc_i ) + panel_dim_max; \
panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \
panel_len_max ); \
diagoffp_i = diagoffc_i; \
} \
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \

View File

@@ -210,6 +210,16 @@ void PASTEMAC(ch,varname)( \
p_br, rs_p, cs_p ); \
} \
} \
\
\
/*
if ( bli_is_col_packed( schema ) ) \
PASTEMAC(ch,fprintm)( stdout, "packm_struc_cxk: bp copied", m_panel_max, n_panel_max, \
p, rs_p, cs_p, "%4.1f", "" ); \
else if ( bli_is_row_packed( schema ) ) \
PASTEMAC(ch,fprintm)( stdout, "packm_struc_cxk: ap copied", m_panel_max, n_panel_max, \
p, rs_p, cs_p, "%4.1f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC( packm_struc_cxk, packm_cxk )
@@ -501,7 +511,6 @@ void PASTEMAC(ch,varname)( \
p, rs_p, cs_p ); \
} \
\
\
}
INSERT_GENTFUNC_BASIC( packm_tri_cxk, packm_cxk )

View File

@@ -557,18 +557,6 @@ void PASTEMAC(ch,varname)( \
inc_t is_p, inc_t ldp \
) \
{ \
bool_t row_stored; \
bool_t col_stored; \
\
\
/* Create flags to indicate row or column storage. Note that the
schema bit that encodes row or column is describing the form of
micro-panel, not the storage in the micro-panel. Hence the
mismatch in "row" and "column" semantics. */ \
row_stored = bli_is_col_packed( schema ); \
col_stored = bli_is_row_packed( schema ); \
\
\
/* Pack the panel. */ \
PASTEMAC(ch,kername)( conjc, \
panel_dim, \
@@ -580,10 +568,24 @@ void PASTEMAC(ch,varname)( \
\
/* Tweak the panel according to its triangular structure */ \
{ \
ctype_r* p_r = ( ctype_r* )p + 0; \
ctype_r* p_i = ( ctype_r* )p + is_p; \
ctype_r* p_rpi = ( ctype_r* )p + 2*is_p; \
\
dim_t j = bli_abs( diagoffp ); \
ctype_r* p11_r = ( ctype_r* )p + (j )*ldp; \
ctype_r* p11_i = ( ctype_r* )p + is_p + (j )*ldp; \
ctype_r* p11_rpi = ( ctype_r* )p + 2*is_p + (j )*ldp; \
ctype_r* p11_r = p_r + (j )*ldp; \
ctype_r* p11_i = p_i + (j )*ldp; \
ctype_r* p11_rpi = p_rpi + (j )*ldp; \
\
dim_t p11_m = m_panel; \
dim_t p11_n = n_panel; \
\
dim_t min_p11_m_n; \
\
if ( diagoffp < 0 ) p11_m -= j; \
else if ( diagoffp > 0 ) p11_n -= j; \
\
min_p11_m_n = bli_min( p11_m, p11_n ); \
\
\
/* If the diagonal of c is implicitly unit, explicitly set the
@@ -594,21 +596,21 @@ void PASTEMAC(ch,varname)( \
ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \
dim_t i; \
\
PASTEMAC(chr,setd)( 0, \
PASTEMAC(chr,setd)( diagoffp, \
m_panel, \
n_panel, \
&kappa_r, \
p11_r, rs_p, cs_p ); \
PASTEMAC(chr,setd)( 0, \
p_r, rs_p, cs_p ); \
PASTEMAC(chr,setd)( diagoffp, \
m_panel, \
n_panel, \
&kappa_i, \
p11_i, rs_p, cs_p ); \
p_i, rs_p, cs_p ); \
\
/* Update the diagonal of the p11 section of the rpi panel.
It simply needs to contain the sum of diagonals of p11_r
and p11_i. */ \
for ( i = 0; i < panel_dim; ++i ) \
for ( i = 0; i < min_p11_m_n; ++i ) \
{ \
ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \
ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \
@@ -626,7 +628,7 @@ void PASTEMAC(ch,varname)( \
{ \
dim_t i; \
\
for ( i = 0; i < panel_dim; ++i ) \
for ( i = 0; i < min_p11_m_n; ++i ) \
{ \
ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \
ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \
@@ -644,34 +646,33 @@ void PASTEMAC(ch,varname)( \
micro-kernel; however, zero-filling is needed for trmm, which
uses the gemm micro-kernel.*/ \
{ \
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
uplo_t uplop11 = uploc; \
doff_t diagoffp11 = 0; \
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
uplo_t uplop = uploc; \
\
bli_toggle_uplo( uplop11 ); \
bli_shift_diag_offset_to_shrink_uplo( uplop11, diagoffp11 ); \
bli_toggle_uplo( uplop ); \
bli_shift_diag_offset_to_shrink_uplo( uplop, diagoffp ); \
\
PASTEMAC(chr,setm)( diagoffp11, \
PASTEMAC(chr,setm)( diagoffp, \
BLIS_NONUNIT_DIAG, \
uplop11, \
panel_dim, \
panel_dim, \
uplop, \
m_panel, \
n_panel, \
zero_r, \
p11_r, rs_p, cs_p ); \
PASTEMAC(chr,setm)( diagoffp11, \
p_r, rs_p, cs_p ); \
PASTEMAC(chr,setm)( diagoffp, \
BLIS_NONUNIT_DIAG, \
uplop11, \
panel_dim, \
panel_dim, \
uplop, \
m_panel, \
n_panel, \
zero_r, \
p11_i, rs_p, cs_p ); \
PASTEMAC(chr,setm)( diagoffp11, \
p_i, rs_p, cs_p ); \
PASTEMAC(chr,setm)( diagoffp, \
BLIS_NONUNIT_DIAG, \
uplop11, \
panel_dim, \
panel_dim, \
uplop, \
m_panel, \
n_panel, \
zero_r, \
p11_rpi, rs_p, cs_p ); \
p_rpi, rs_p, cs_p ); \
} \
} \
}

View File

@@ -529,18 +529,6 @@ void PASTEMAC(ch,varname)( \
inc_t is_p, inc_t ldp \
) \
{ \
bool_t row_stored; \
bool_t col_stored; \
\
\
/* Create flags to indicate row or column storage. Note that the
schema bit that encodes row or column is describing the form of
micro-panel, not the storage in the micro-panel. Hence the
mismatch in "row" and "column" semantics. */ \
row_stored = bli_is_col_packed( schema ); \
col_stored = bli_is_row_packed( schema ); \
\
\
/* Pack the panel. */ \
PASTEMAC(ch,kername)( conjc, \
panel_dim, \
@@ -552,9 +540,12 @@ void PASTEMAC(ch,varname)( \
\
/* Tweak the panel according to its triangular structure */ \
{ \
ctype_r* p_r = ( ctype_r* )p; \
ctype_r* p_i = ( ctype_r* )p + is_p; \
\
dim_t j = bli_abs( diagoffp ); \
ctype_r* p11_r = ( ctype_r* )p + (j )*ldp; \
ctype_r* p11_i = ( ctype_r* )p + is_p + (j )*ldp; \
ctype_r* p11_r = p_r + (j )*ldp; \
ctype_r* p11_i = p_i + (j )*ldp; \
\
/* If the diagonal of c is implicitly unit, explicitly set the
diagonal of the packed panel to kappa. */ \
@@ -563,16 +554,16 @@ void PASTEMAC(ch,varname)( \
ctype_r kappa_r = PASTEMAC(ch,real)( *kappa ); \
ctype_r kappa_i = PASTEMAC(ch,imag)( *kappa ); \
\
PASTEMAC(chr,setd)( 0, \
PASTEMAC(chr,setd)( diagoffp, \
m_panel, \
n_panel, \
&kappa_r, \
p11_r, rs_p, cs_p ); \
PASTEMAC(chr,setd)( 0, \
p_r, rs_p, cs_p ); \
PASTEMAC(chr,setd)( diagoffp, \
m_panel, \
n_panel, \
&kappa_i, \
p11_i, rs_p, cs_p ); \
p_i, rs_p, cs_p ); \
} \
\
\
@@ -600,27 +591,26 @@ void PASTEMAC(ch,varname)( \
micro-kernel; however, zero-filling is needed for trmm, which
uses the gemm micro-kernel.*/ \
{ \
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
uplo_t uplop11 = uploc; \
doff_t diagoffp11 = 0; \
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
uplo_t uplop = uploc; \
\
bli_toggle_uplo( uplop11 ); \
bli_shift_diag_offset_to_shrink_uplo( uplop11, diagoffp11 ); \
bli_toggle_uplo( uplop ); \
bli_shift_diag_offset_to_shrink_uplo( uplop, diagoffp ); \
\
PASTEMAC(chr,setm)( diagoffp11, \
PASTEMAC(chr,setm)( diagoffp, \
BLIS_NONUNIT_DIAG, \
uplop11, \
panel_dim, \
panel_dim, \
uplop, \
m_panel, \
n_panel, \
zero_r, \
p11_r, rs_p, cs_p ); \
PASTEMAC(chr,setm)( diagoffp11, \
p_r, rs_p, cs_p ); \
PASTEMAC(chr,setm)( diagoffp, \
BLIS_NONUNIT_DIAG, \
uplop11, \
panel_dim, \
panel_dim, \
uplop, \
m_panel, \
n_panel, \
zero_r, \
p11_i, rs_p, cs_p ); \
p_i, rs_p, cs_p ); \
} \
} \
}

View File

@@ -32,7 +32,6 @@
*/
#include "bli_trmm3m_cntl.h"
#include "bli_trmm3m_entry.h"

View File

@@ -53,18 +53,18 @@ packm_t* trmm3m_l_packb_cntl;
packm_t* trmm3m_r_packa_cntl;
packm_t* trmm3m_r_packb_cntl;
trmm_t* trmm3m_cntl_bp_ke;
gemm_t* trmm3m_cntl_bp_ke;
trmm_t* trmm3m_l_cntl_op_bp;
trmm_t* trmm3m_l_cntl_mm_op;
trmm_t* trmm3m_l_cntl_vl_mm;
gemm_t* trmm3m_l_cntl_op_bp;
gemm_t* trmm3m_l_cntl_mm_op;
gemm_t* trmm3m_l_cntl_vl_mm;
trmm_t* trmm3m_r_cntl_op_bp;
trmm_t* trmm3m_r_cntl_mm_op;
trmm_t* trmm3m_r_cntl_vl_mm;
gemm_t* trmm3m_r_cntl_op_bp;
gemm_t* trmm3m_r_cntl_mm_op;
gemm_t* trmm3m_r_cntl_vl_mm;
trmm_t* trmm3m_l_cntl;
trmm_t* trmm3m_r_cntl;
gemm_t* trmm3m_l_cntl;
gemm_t* trmm3m_r_cntl;
void bli_trmm3m_cntl_init()
@@ -77,7 +77,7 @@ void bli_trmm3m_cntl_init()
// IMPORTANT: Unlike trsm, trmm does not require a
// "k" dim multiple equal to mr.
gemm3m_mr,
gemm3m_mr,
gemm3m_kr,
TRUE, // densify
FALSE, // do NOT invert diagonal
FALSE, // reverse iteration if upper?
@@ -91,9 +91,9 @@ void bli_trmm3m_cntl_init()
BLIS_VARIANT2,
// IMPORTANT: Unlike trsm, trmm does not require a
// "k" dim multiple equal to mr.
gemm3m_mr,
gemm3m_kr,
gemm3m_nr,
FALSE, // already dense
TRUE, // densify
FALSE, // do NOT invert diagonal
FALSE, // reverse iteration if upper?
FALSE, // reverse iteration if lower?

View File

@@ -34,8 +34,7 @@
#include "blis.h"
extern trmm_t* trmm3m_l_cntl;
extern trmm_t* trmm3m_r_cntl;
extern gemm_t* gemm3m_cntl;
void bli_trmm3m_entry( side_t side,
obj_t* alpha,
@@ -43,7 +42,6 @@ void bli_trmm3m_entry( side_t side,
obj_t* b )
{
bli_trmm_front( side, alpha, a, b,
trmm3m_l_cntl,
trmm3m_r_cntl );
gemm3m_cntl );
}

View File

@@ -32,7 +32,6 @@
*/
#include "bli_trmm4m_cntl.h"
#include "bli_trmm4m_entry.h"

View File

@@ -53,18 +53,18 @@ packm_t* trmm4m_l_packb_cntl;
packm_t* trmm4m_r_packa_cntl;
packm_t* trmm4m_r_packb_cntl;
trmm_t* trmm4m_cntl_bp_ke;
gemm_t* trmm4m_cntl_bp_ke;
trmm_t* trmm4m_l_cntl_op_bp;
trmm_t* trmm4m_l_cntl_mm_op;
trmm_t* trmm4m_l_cntl_vl_mm;
gemm_t* trmm4m_l_cntl_op_bp;
gemm_t* trmm4m_l_cntl_mm_op;
gemm_t* trmm4m_l_cntl_vl_mm;
trmm_t* trmm4m_r_cntl_op_bp;
trmm_t* trmm4m_r_cntl_mm_op;
trmm_t* trmm4m_r_cntl_vl_mm;
gemm_t* trmm4m_r_cntl_op_bp;
gemm_t* trmm4m_r_cntl_mm_op;
gemm_t* trmm4m_r_cntl_vl_mm;
trmm_t* trmm4m_l_cntl;
trmm_t* trmm4m_r_cntl;
gemm_t* trmm4m_l_cntl;
gemm_t* trmm4m_r_cntl;
void bli_trmm4m_cntl_init()
@@ -77,7 +77,7 @@ void bli_trmm4m_cntl_init()
// IMPORTANT: Unlike trsm, trmm does not require a
// "k" dim multiple equal to mr.
gemm4m_mr,
gemm4m_mr,
gemm4m_kr,
TRUE, // densify
FALSE, // do NOT invert diagonal
FALSE, // reverse iteration if upper?
@@ -91,9 +91,9 @@ void bli_trmm4m_cntl_init()
BLIS_VARIANT2,
// IMPORTANT: Unlike trsm, trmm does not require a
// "k" dim multiple equal to mr.
gemm4m_mr,
gemm4m_kr,
gemm4m_nr,
FALSE, // already dense
TRUE, // densify
FALSE, // do NOT invert diagonal
FALSE, // reverse iteration if upper?
FALSE, // reverse iteration if lower?

View File

@@ -34,8 +34,7 @@
#include "blis.h"
extern trmm_t* trmm4m_l_cntl;
extern trmm_t* trmm4m_r_cntl;
extern gemm_t* gemm4m_cntl;
void bli_trmm4m_entry( side_t side,
obj_t* alpha,
@@ -43,7 +42,6 @@ void bli_trmm4m_entry( side_t side,
obj_t* b )
{
bli_trmm_front( side, alpha, a, b,
trmm4m_l_cntl,
trmm4m_r_cntl );
gemm4m_cntl );
}

View File

@@ -32,7 +32,6 @@
*/
#include "bli_trmm_cntl.h"
#include "bli_trmm_check.h"
#include "bli_trmm_entry.h"
#include "bli_trmm_front.h"

View File

@@ -37,7 +37,7 @@
void bli_trmm_blk_var1f( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl,
gemm_t* cntl,
trmm_thrinfo_t* thread )
{
obj_t b_pack_s;
@@ -136,7 +136,7 @@ void bli_trmm_blk_var1f( obj_t* a,
b_pack,
&BLIS_ONE,
c1_pack,
cntl_sub_trmm( cntl ),
cntl_sub_gemm( cntl ),
trmm_thread_sub_trmm( thread ) );
// Unpack C1 (if C1 was packed).

View File

@@ -35,6 +35,6 @@
void bli_trmm_blk_var1f( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl,
gemm_t* cntl,
trmm_thrinfo_t* thread );

View File

@@ -37,8 +37,8 @@
void bli_trmm_blk_var2b( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl,
trmm_thrinfo_t* thread)
gemm_t* cntl,
trmm_thrinfo_t* thread )
{
obj_t a_pack_s;
obj_t b1_pack_s, c1_pack_s;
@@ -124,7 +124,7 @@ void bli_trmm_blk_var2b( obj_t* a,
b1_pack,
&BLIS_ONE,
c1_pack,
cntl_sub_trmm( cntl ),
cntl_sub_gemm( cntl ),
trmm_thread_sub_trmm( thread ) );
// Unpack C1 (if C1 was packed).

View File

@@ -35,6 +35,6 @@
void bli_trmm_blk_var2b( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl,
gemm_t* cntl,
trmm_thrinfo_t* thread );

View File

@@ -37,8 +37,8 @@
void bli_trmm_blk_var2f( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl,
trmm_thrinfo_t* thread)
gemm_t* cntl,
trmm_thrinfo_t* thread )
{
obj_t a_pack_s;
obj_t b1_pack_s, c1_pack_s;
@@ -124,7 +124,7 @@ void bli_trmm_blk_var2f( obj_t* a,
b1_pack,
&BLIS_ONE,
c1_pack,
cntl_sub_trmm( cntl ),
cntl_sub_gemm( cntl ),
trmm_thread_sub_trmm( thread ) );
// Unpack C1 (if C1 was packed).

View File

@@ -35,6 +35,6 @@
void bli_trmm_blk_var2f( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl,
gemm_t* cntl,
trmm_thrinfo_t* thread );

View File

@@ -37,7 +37,7 @@
void bli_trmm_blk_var3b( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl,
gemm_t* cntl,
trmm_thrinfo_t* thread )
{
obj_t c_pack_s;
@@ -119,7 +119,7 @@ void bli_trmm_blk_var3b( obj_t* a,
b1_pack,
&BLIS_ONE,
c_pack,
cntl_sub_trmm( cntl ),
cntl_sub_gemm( cntl ),
trmm_thread_sub_trmm( thread ) );
}

View File

@@ -35,6 +35,6 @@
void bli_trmm_blk_var3b( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl,
gemm_t* cntl,
trmm_thrinfo_t* thread );

View File

@@ -37,7 +37,7 @@
void bli_trmm_blk_var3f( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl,
gemm_t* cntl,
trmm_thrinfo_t* thread )
{
obj_t c_pack_s;
@@ -119,7 +119,7 @@ void bli_trmm_blk_var3f( obj_t* a,
b1_pack,
&BLIS_ONE,
c_pack,
cntl_sub_trmm( cntl ),
cntl_sub_gemm( cntl ),
trmm_thread_sub_trmm( thread ) );
}

View File

@@ -35,6 +35,6 @@
void bli_trmm_blk_var3f( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl,
gemm_t* cntl,
trmm_thrinfo_t* thread );

View File

@@ -116,7 +116,7 @@ void bli_trmm_int_check( obj_t* alpha,
obj_t* b,
obj_t* beta,
obj_t* c,
trmm_t* cntl )
gemm_t* cntl )
{
err_t e_val;

View File

@@ -49,5 +49,5 @@ void bli_trmm_int_check( obj_t* alpha,
obj_t* b,
obj_t* beta,
obj_t* c,
trmm_t* cntl );
gemm_t* cntl );

View File

@@ -34,8 +34,7 @@
#include "blis.h"
extern trmm_t* trmm_l_cntl;
extern trmm_t* trmm_r_cntl;
extern gemm_t* gemm_cntl;
void bli_trmm_entry( side_t side,
obj_t* alpha,
@@ -43,7 +42,6 @@ void bli_trmm_entry( side_t side,
obj_t* b )
{
bli_trmm_front( side, alpha, a, b,
trmm_l_cntl,
trmm_r_cntl );
gemm_cntl );
}

View File

@@ -38,10 +38,8 @@ void bli_trmm_front( side_t side,
obj_t* alpha,
obj_t* a,
obj_t* b,
trmm_t* l_cntl,
trmm_t* r_cntl )
gemm_t* cntl )
{
trmm_t* cntl;
obj_t a_local;
obj_t b_local;
obj_t c_local;
@@ -101,10 +99,10 @@ void bli_trmm_front( side_t side,
if (
( bli_obj_is_row_stored( c_local ) &&
bli_func_prefers_contig_cols( bli_obj_datatype( c_local ),
cntl_gemm_ukrs( l_cntl ) ) ) ||
cntl_gemm_ukrs( cntl ) ) ) ||
( bli_obj_is_col_stored( c_local ) &&
bli_func_prefers_contig_rows( bli_obj_datatype( c_local ),
cntl_gemm_ukrs( l_cntl ) ) )
cntl_gemm_ukrs( cntl ) ) )
)
{
bli_toggle_side( side );
@@ -129,9 +127,6 @@ void bli_trmm_front( side_t side,
bli_obj_set_as_root( b_local );
bli_obj_set_as_root( c_local );
// Choose the control tree.
if ( bli_is_left( side ) ) cntl = l_cntl;
else cntl = r_cntl;
trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( bli_is_right( side ) );
dim_t n_threads = thread_num_threads( infos[0] );

View File

@@ -36,6 +36,5 @@ void bli_trmm_front( side_t side,
obj_t* alpha,
obj_t* a,
obj_t* b,
trmm_t* l_cntl,
trmm_t* r_cntl );
gemm_t* cntl );

View File

@@ -39,7 +39,7 @@
typedef void (*FUNCPTR_T)( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl,
gemm_t* cntl,
trmm_thrinfo_t* thread );
static FUNCPTR_T vars[2][2][4][3] =
@@ -89,7 +89,7 @@ void bli_trmm_int( obj_t* alpha,
obj_t* b,
obj_t* beta,
obj_t* c,
trmm_t* cntl,
gemm_t* cntl,
trmm_thrinfo_t* thread )
{
obj_t a_local;

View File

@@ -37,5 +37,5 @@ void bli_trmm_int( obj_t* alpha,
obj_t* b,
obj_t* beta,
obj_t* c,
trmm_t* cntl,
gemm_t* cntl,
trmm_thrinfo_t* thread );

View File

@@ -56,7 +56,7 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2);
void bli_trmm_ll_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl,
gemm_t* cntl,
trmm_thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );
@@ -217,12 +217,14 @@ void PASTEMAC(ch,varname)( \
if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute the storage stride for the triangular matrix A, which is
usually PACKMR. However, in the case of 3m, the storage stride
captures the (PACKMR * 3/2) factor embedded in the panel stride.
Notice that we must first inflate k up to a multiple of MR, since
the panel stride was originally computed using this inflated k
dimension. */ \
k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
usually PACKMR. However, in the case of 3m, the storage stride
captures the (PACKMR * 3/2) factor embedded in the panel stride.
Note that trmm does NOT require k to be a multiple of MR or NR
(depending on whether A or B is the triangular matrix), so we can
use k as-is. By contrast, trsm must use an "inflated" version of
k since trsm requires that k be a multiple of MR (when A is
triangular) or NR (when B is triangular). */ \
k_full = k; \
ss_a = ps_a / k_full; \
\
/* If there is a zero region above where the diagonal of A intersects the
@@ -238,13 +240,6 @@ void PASTEMAC(ch,varname)( \
diagoffa = 0; \
c_cast = c_cast + (i )*rs_c; \
} \
\
/* For consistency with the trsm macro-kernels, we inflate k to be a
multiple of MR, if necessary. This is needed because we typically
use the same packm variant for trmm as for trsm, and trsm has this
constraint that k must be a multiple of MR so that it can safely
handle bottom-right corner edges of the triangle. */ \
if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
@@ -313,7 +308,7 @@ void PASTEMAC(ch,varname)( \
packed so we can index into the corresponding location in
b1. */ \
off_a1011 = 0; \
k_a1011 = diagoffa_i + MR; \
k_a1011 = bli_min( diagoffa_i + MR, k ); \
\
if( trmm_l_ir_my_iter( i, ir_thread ) ) \
{ \
@@ -436,6 +431,8 @@ void PASTEMAC(ch,varname)( \
b1 += cstep_b; \
c1 += cstep_c; \
} \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC( trmm_ll_ker_var2, gemm_ukr_t )

View File

@@ -39,7 +39,7 @@
void bli_trmm_ll_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl,
gemm_t* cntl,
trmm_thrinfo_t* thread );

View File

@@ -56,7 +56,7 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2);
void bli_trmm_lu_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl,
gemm_t* cntl,
trmm_thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );
@@ -219,10 +219,12 @@ void PASTEMAC(ch,varname)( \
/* Compute the storage stride for the triangular matrix A, which is
usually PACKMR. However, in the case of 3m, the storage stride
captures the (PACKMR * 3/2) factor embedded in the panel stride.
Notice that we must first inflate k up to a multiple of MR, since
the panel stride was originally computed using this inflated k
dimension. */ \
k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
Note that trmm does NOT require k to be a multiple of MR or NR
(depending on whether A or B is the triangular matrix), so we can
use k as-is. By contrast, trsm must use an "inflated" version of
k since trsm requires that k be a multiple of MR (when A is
triangular) or NR (when B is triangular). */ \
k_full = k; \
ss_a = ps_a / k_full; \
\
/* If there is a zero region to the left of where the diagonal of A
@@ -245,13 +247,6 @@ void PASTEMAC(ch,varname)( \
{ \
m = -diagoffa + k; \
} \
\
/* For consistency with the trsm macro-kernels, we inflate k to be a
multiple of MR, if necessary. This is needed because we typically
use the same packm variant for trmm as for trsm, and trsm has this
constraint that k must be a multiple of MR so that it can safely
handle bottom-right corner edges of the triangle. */ \
if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \

View File

@@ -39,7 +39,7 @@
void bli_trmm_lu_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl,
gemm_t* cntl,
trmm_thrinfo_t* thread );

View File

@@ -56,7 +56,7 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2);
void bli_trmm_rl_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl,
gemm_t* cntl,
trmm_thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );
@@ -219,10 +219,12 @@ void PASTEMAC(ch,varname)( \
/* Compute the storage stride for the triangular matrix B, which is
usually PACKNR. However, in the case of 3m, the storage stride
captures the (PACKNR * 3/2) factor embedded in the panel stride.
Notice that we must first inflate k up to a multiple of NR, since
the panel stride was originally computed using this inflated k
dimension. */ \
k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \
Note that trmm does NOT require k to be a multiple of MR or NR
(depending on whether A or B is the triangular matrix), so we can
use k as-is. By contrast, trsm must use an "inflated" version of
k since trsm requires that k be a multiple of MR (when A is
triangular) or NR (when B is triangular). */ \
k_full = k; \
ss_b = ps_b / k_full; \
\
/* If there is a zero region above where the diagonal of B intersects
@@ -245,13 +247,6 @@ void PASTEMAC(ch,varname)( \
{ \
n = diagoffb + k; \
} \
\
/* For consistency with the trsm macro-kernels, we inflate k to be a
multiple of NR, if necessary. This is needed because we typically
use the same packm variant for trmm as for trsm, and trsm has this
constraint that k must be a multiple of NR so that it can safely
handle bottom-right corner edges of the triangle. */ \
if ( k % NR != 0 ) k += NR - ( k % NR ); \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \

View File

@@ -39,7 +39,7 @@
void bli_trmm_rl_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl,
gemm_t* cntl,
trmm_thrinfo_t* thread );

View File

@@ -56,7 +56,7 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2);
void bli_trmm_ru_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl,
gemm_t* cntl,
trmm_thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );
@@ -219,10 +219,12 @@ void PASTEMAC(ch,varname)( \
/* Compute the storage stride for the triangular matrix B, which is
usually PACKNR. However, in the case of 3m, the storage stride
captures the (PACKNR * 3/2) factor embedded in the panel stride.
Notice that we must first inflate k up to a multiple of NR, since
the panel stride was originally computed using this inflated k
dimension. */ \
k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \
Note that trmm does NOT require k to be a multiple of MR or NR
(depending on whether A or B is the triangular matrix), so we can
use k as-is. By contrast, trsm must use an "inflated" version of
k since trsm requires that k be a multiple of MR (when A is
triangular) or NR (when B is triangular). */ \
k_full = k; \
ss_b = ps_b / k_full; \
\
/* If there is a zero region to the left of where the diagonal of B
@@ -246,13 +248,6 @@ void PASTEMAC(ch,varname)( \
{ \
k = -diagoffb + n; \
} \
\
/* For consistency with the trsm macro-kernels, we inflate k to be a
multiple of NR, if necessary. This is needed because we typically
use the same packm variant for trmm as for trsm, and trsm has this
constraint that k must be a multiple of NR so that it can safely
handle bottom-right corner edges of the triangle. */ \
if ( k % NR != 0 ) k += NR - ( k % NR ); \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \

View File

@@ -39,7 +39,7 @@
void bli_trmm_ru_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl,
gemm_t* cntl,
trmm_thrinfo_t* thread );

View File

@@ -50,21 +50,13 @@ extern gemm_t* gemm_cntl_bp_ke;
packm_t* trmm_l_packa_cntl;
packm_t* trmm_l_packb_cntl;
packm_t* trmm_r_packa_cntl;
packm_t* trmm_r_packb_cntl;
trmm_t* trmm_cntl_bp_ke;
trmm_t* trmm_l_cntl_op_bp;
trmm_t* trmm_l_cntl_mm_op;
trmm_t* trmm_l_cntl_vl_mm;
trmm_t* trmm_r_cntl_op_bp;
trmm_t* trmm_r_cntl_mm_op;
trmm_t* trmm_r_cntl_vl_mm;
trmm_t* trmm_l_cntl;
trmm_t* trmm_r_cntl;
void bli_trmm_cntl_init()
@@ -74,10 +66,10 @@ void bli_trmm_cntl_init()
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT1,
// IMPORTANT: for consistency with trsm, "k" dim
// multiple is set to mr.
gemm_mr,
// IMPORTANT: Unlike trsm, trmm does not require a
// "k" dim multiple equal to mr.
gemm_mr,
gemm_kr,
TRUE, // densify
FALSE, // do NOT invert diagonal
FALSE, // reverse iteration if upper?
@@ -89,40 +81,9 @@ void bli_trmm_cntl_init()
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT1,
// IMPORTANT: m dim multiple here must be mr
// since "k" dim multiple is set to mr above.
gemm_mr,
gemm_nr,
FALSE, // already dense
FALSE, // do NOT invert diagonal
FALSE, // reverse iteration if upper?
FALSE, // reverse iteration if lower?
BLIS_PACKED_COL_PANELS,
BLIS_BUFFER_FOR_B_PANEL );
// Create control tree objects for packm operations (right side).
trmm_r_packa_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT1,
// IMPORTANT: for consistency with trsm, "k" dim
// multiple is set to nr.
gemm_mr,
gemm_nr,
FALSE, // already dense
FALSE, // do NOT invert diagonal
FALSE, // reverse iteration if upper?
FALSE, // reverse iteration if lower?
BLIS_PACKED_ROW_PANELS,
BLIS_BUFFER_FOR_A_BLOCK );
trmm_r_packb_cntl
=
bli_packm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT1,
// IMPORTANT: m dim multiple here must be nr
// since "k" dim multiple is set to nr above.
gemm_nr,
// IMPORTANT: Unlike trsm, trmm does not require a
// "k" dim multiple equal to mr.
gemm_kr,
gemm_nr,
TRUE, // densify
FALSE, // do NOT invert diagonal
@@ -131,7 +92,6 @@ void bli_trmm_cntl_init()
BLIS_PACKED_COL_PANELS,
BLIS_BUFFER_FOR_B_PANEL );
// Create control tree object for lowest-level block-panel kernel.
trmm_cntl_bp_ke
=
@@ -190,74 +150,20 @@ void bli_trmm_cntl_init()
NULL,
NULL );
// Create control tree object for outer panel (to block-panel)
// problem (right side).
trmm_r_cntl_op_bp
=
bli_trmm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT1,
gemm_mc,
gemm_ukrs,
NULL,
trmm_r_packa_cntl,
trmm_r_packb_cntl,
NULL,
trmm_cntl_bp_ke,
gemm_cntl_bp_ke,
NULL );
// Create control tree object for general problem via multiple
// rank-k (outer panel) updates (right side).
trmm_r_cntl_mm_op
=
bli_trmm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT3,
gemm_kc,
gemm_ukrs,
NULL,
NULL,
NULL,
NULL,
trmm_r_cntl_op_bp,
NULL,
NULL );
// Create control tree object for very large problem via multiple
// general problems (right side).
trmm_r_cntl_vl_mm
=
bli_trmm_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
gemm_nc,
gemm_ukrs,
NULL,
NULL,
NULL,
NULL,
trmm_r_cntl_mm_op,
NULL,
NULL );
// Alias the "master" trmm control trees to shorter names.
trmm_l_cntl = trmm_l_cntl_vl_mm;
trmm_r_cntl = trmm_r_cntl_vl_mm;
}
void bli_trmm_cntl_finalize()
{
bli_cntl_obj_free( trmm_l_packa_cntl );
bli_cntl_obj_free( trmm_l_packb_cntl );
bli_cntl_obj_free( trmm_r_packa_cntl );
bli_cntl_obj_free( trmm_r_packb_cntl );
bli_cntl_obj_free( trmm_cntl_bp_ke );
bli_cntl_obj_free( trmm_l_cntl_op_bp );
bli_cntl_obj_free( trmm_l_cntl_mm_op );
bli_cntl_obj_free( trmm_l_cntl_vl_mm );
bli_cntl_obj_free( trmm_r_cntl_op_bp );
bli_cntl_obj_free( trmm_r_cntl_mm_op );
bli_cntl_obj_free( trmm_r_cntl_vl_mm );
}
trmm_t* bli_trmm_cntl_obj_create( impl_t impl_type,

View File

@@ -48,7 +48,7 @@ struct trmm_s
};
typedef struct trmm_s trmm_t;
#define cntl_sub_trmm( cntl ) cntl->sub_trmm
#define cntl_sub_gemm( cntl ) cntl->sub_trmm
void bli_trmm_cntl_init( void );
void bli_trmm_cntl_finalize( void );

View File

@@ -115,7 +115,7 @@ void bli_trmm_ll_blk_var1( obj_t* alpha,
&b_pack,
beta,
&c1_pack,
cntl_sub_trmm( cntl ) );
cntl_sub_gemm( cntl ) );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( &c1_pack, &c1,

View File

@@ -127,7 +127,7 @@ void bli_trmm_ll_blk_var4( obj_t* alpha,
&b_pack_inc,
beta,
&c1_pack_inc,
cntl_sub_trmm( cntl ) );
cntl_sub_gemm( cntl ) );
}
// Unpack C1 (if C1 was packed).
@@ -172,7 +172,7 @@ void bli_trmm_ll_blk_var4( obj_t* alpha,
&b_pack,
beta,
&c1_pack,
cntl_sub_trmm( cntl ) );
cntl_sub_gemm( cntl ) );
else
bli_gemm_int( alpha,
&a1_pack,

View File

@@ -112,7 +112,7 @@ void bli_trmm_lu_blk_var1( obj_t* alpha,
&b_pack,
beta,
&c1_pack,
cntl_sub_trmm( cntl ) );
cntl_sub_gemm( cntl ) );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( &c1_pack, &c1,

View File

@@ -125,7 +125,7 @@ void bli_trmm_lu_blk_var4( obj_t* alpha,
&b_pack_inc,
beta,
&c1_pack_inc,
cntl_sub_trmm( cntl ) );
cntl_sub_gemm( cntl ) );
}
// Unpack C1 (if C1 was packed).
@@ -170,7 +170,7 @@ void bli_trmm_lu_blk_var4( obj_t* alpha,
&b_pack,
beta,
&c1_pack,
cntl_sub_trmm( cntl ) );
cntl_sub_gemm( cntl ) );
else
bli_gemm_int( alpha,
&a1_pack,

View File

@@ -34,8 +34,7 @@
#include "blis.h"
extern trmm_t* trmm3m_l_cntl;
extern trmm_t* trmm3m_r_cntl;
extern gemm_t* gemm3m_cntl;
void bli_trmm33m_entry( side_t side,
obj_t* alpha,
@@ -45,7 +44,6 @@ void bli_trmm33m_entry( side_t side,
obj_t* c )
{
bli_trmm3_front( side, alpha, a, b, beta, c,
trmm3m_l_cntl,
trmm3m_r_cntl );
gemm3m_cntl );
}

View File

@@ -34,8 +34,7 @@
#include "blis.h"
extern trmm_t* trmm4m_l_cntl;
extern trmm_t* trmm4m_r_cntl;
extern gemm_t* gemm4m_cntl;
void bli_trmm34m_entry( side_t side,
obj_t* alpha,
@@ -45,7 +44,6 @@ void bli_trmm34m_entry( side_t side,
obj_t* c )
{
bli_trmm3_front( side, alpha, a, b, beta, c,
trmm4m_l_cntl,
trmm4m_r_cntl );
gemm4m_cntl );
}

View File

@@ -34,8 +34,7 @@
#include "blis.h"
extern trmm_t* trmm_l_cntl;
extern trmm_t* trmm_r_cntl;
extern gemm_t* gemm_cntl;
void bli_trmm3_entry( side_t side,
obj_t* alpha,
@@ -45,7 +44,6 @@ void bli_trmm3_entry( side_t side,
obj_t* c )
{
bli_trmm3_front( side, alpha, a, b, beta, c,
trmm_l_cntl,
trmm_r_cntl );
gemm_cntl );
}

View File

@@ -40,10 +40,8 @@ void bli_trmm3_front( side_t side,
obj_t* b,
obj_t* beta,
obj_t* c,
trmm_t* l_cntl,
trmm_t* r_cntl )
gemm_t* cntl )
{
trmm_t* cntl;
obj_t a_local;
obj_t b_local;
obj_t c_local;
@@ -103,10 +101,10 @@ void bli_trmm3_front( side_t side,
if (
( bli_obj_is_row_stored( c_local ) &&
bli_func_prefers_contig_cols( bli_obj_datatype( c_local ),
cntl_gemm_ukrs( l_cntl ) ) ) ||
cntl_gemm_ukrs( cntl ) ) ) ||
( bli_obj_is_col_stored( c_local ) &&
bli_func_prefers_contig_rows( bli_obj_datatype( c_local ),
cntl_gemm_ukrs( l_cntl ) ) )
cntl_gemm_ukrs( cntl ) ) )
)
{
bli_toggle_side( side );
@@ -131,9 +129,6 @@ void bli_trmm3_front( side_t side,
bli_obj_set_as_root( b_local );
bli_obj_set_as_root( c_local );
// Choose the control tree.
if ( bli_is_left( side ) ) cntl = l_cntl;
else cntl = r_cntl;
trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( FALSE );
dim_t n_threads = thread_num_threads( infos[0] );

View File

@@ -38,5 +38,4 @@ void bli_trmm3_front( side_t side,
obj_t* b,
obj_t* beta,
obj_t* c,
trmm_t* l_cntl,
trmm_t* r_cntl );
gemm_t* cntl );

View File

@@ -57,17 +57,14 @@ void bli_cntl_init( void )
// Level-3
bli_gemm_cntl_init();
bli_trmm_cntl_init();
bli_trsm_cntl_init();
// Level-3 via 4m
bli_gemm4m_cntl_init();
bli_trmm4m_cntl_init();
bli_trsm4m_cntl_init();
// Level-3 via 3m
bli_gemm3m_cntl_init();
bli_trmm3m_cntl_init();
bli_trsm3m_cntl_init();
}
@@ -94,17 +91,14 @@ void bli_cntl_finalize( void )
// Level-3
bli_gemm_cntl_finalize();
bli_trmm_cntl_finalize();
bli_trsm_cntl_finalize();
// Level-3 via 4m
bli_gemm4m_cntl_finalize();
bli_trmm4m_cntl_finalize();
bli_trsm4m_cntl_finalize();
// Level-3 via 3m
bli_gemm3m_cntl_finalize();
bli_trmm3m_cntl_finalize();
bli_trsm3m_cntl_finalize();
}