mirror of
https://github.com/amd/blis.git
synced 2026-04-20 07:38:53 +00:00
Merge commit 'cfa3db3f' into amd-main
* commit 'cfa3db3f':
Fixed bug in mixed-dt gemm introduced in e9da642.
Removed support for 3m, 4m induced methods.
Updated do_sde.sh to get SDE from GitHub.
Disable SDE testing of old AMD microarchitectures.
Fixed substitution bug in configure.
Allow use of 1m with mixing of row/col-pref ukrs.
AMD-Internal: [CPUPL-2698]
Change-Id: I961f0066243cf26aeb2e174e388b470133cc4a5f
This commit is contained in:
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -47,7 +47,7 @@
|
||||
|
||||
// -- Level-3 native micro-kernel prototype redefinitions ----------------------
|
||||
|
||||
// -- prototypes for completely generic level-3 microkernels --
|
||||
// -- Prototypes for completely generic level-3 microkernels --
|
||||
|
||||
#undef gemm_ukr_name
|
||||
#define gemm_ukr_name GENARNAME(gemm)
|
||||
@@ -66,46 +66,7 @@
|
||||
|
||||
// -- Level-3 virtual micro-kernel prototype redefinitions ---------------------
|
||||
|
||||
// -- 3mh --
|
||||
|
||||
#undef gemm3mh_ukr_name
|
||||
#define gemm3mh_ukr_name GENARNAME(gemm3mh)
|
||||
|
||||
// -- 3m1 --
|
||||
|
||||
#undef gemm3m1_ukr_name
|
||||
#define gemm3m1_ukr_name GENARNAME(gemm3m1)
|
||||
#undef gemmtrsm3m1_l_ukr_name
|
||||
#define gemmtrsm3m1_l_ukr_name GENARNAME(gemmtrsm3m1_l)
|
||||
#undef gemmtrsm3m1_u_ukr_name
|
||||
#define gemmtrsm3m1_u_ukr_name GENARNAME(gemmtrsm3m1_u)
|
||||
#undef trsm3m1_l_ukr_name
|
||||
#define trsm3m1_l_ukr_name GENARNAME(trsm3m1_l)
|
||||
#undef trsm3m1_u_ukr_name
|
||||
#define trsm3m1_u_ukr_name GENARNAME(trsm3m1_u)
|
||||
|
||||
// -- 4mh --
|
||||
|
||||
#undef gemm4mh_ukr_name
|
||||
#define gemm4mh_ukr_name GENARNAME(gemm4mh)
|
||||
|
||||
// -- 4mb --
|
||||
|
||||
#undef gemm4mb_ukr_name
|
||||
#define gemm4mb_ukr_name GENARNAME(gemm4mb)
|
||||
|
||||
// -- 4m1 --
|
||||
|
||||
#undef gemm4m1_ukr_name
|
||||
#define gemm4m1_ukr_name GENARNAME(gemm4m1)
|
||||
#undef gemmtrsm4m1_l_ukr_name
|
||||
#define gemmtrsm4m1_l_ukr_name GENARNAME(gemmtrsm4m1_l)
|
||||
#undef gemmtrsm4m1_u_ukr_name
|
||||
#define gemmtrsm4m1_u_ukr_name GENARNAME(gemmtrsm4m1_u)
|
||||
#undef trsm4m1_l_ukr_name
|
||||
#define trsm4m1_l_ukr_name GENARNAME(trsm4m1_l)
|
||||
#undef trsm4m1_u_ukr_name
|
||||
#define trsm4m1_u_ukr_name GENARNAME(trsm4m1_u)
|
||||
// -- Prototypes for induced method level-3 microkernels --
|
||||
|
||||
// -- 1m --
|
||||
|
||||
@@ -184,59 +145,6 @@
|
||||
#undef unpackm_16xk_ker_name
|
||||
#define unpackm_16xk_ker_name GENARNAME(unpackm_16xk)
|
||||
|
||||
#undef packm_2xk_3mis_ker_name
|
||||
#define packm_2xk_3mis_ker_name GENARNAME(packm_2xk_3mis)
|
||||
#undef packm_4xk_3mis_ker_name
|
||||
#define packm_4xk_3mis_ker_name GENARNAME(packm_4xk_3mis)
|
||||
#undef packm_6xk_3mis_ker_name
|
||||
#define packm_6xk_3mis_ker_name GENARNAME(packm_6xk_3mis)
|
||||
#undef packm_8xk_3mis_ker_name
|
||||
#define packm_8xk_3mis_ker_name GENARNAME(packm_8xk_3mis)
|
||||
#undef packm_10xk_3mis_ker_name
|
||||
#define packm_10xk_3mis_ker_name GENARNAME(packm_10xk_3mis)
|
||||
#undef packm_12xk_3mis_ker_name
|
||||
#define packm_12xk_3mis_ker_name GENARNAME(packm_12xk_3mis)
|
||||
#undef packm_14xk_3mis_ker_name
|
||||
#define packm_14xk_3mis_ker_name GENARNAME(packm_14xk_3mis)
|
||||
#undef packm_16xk_3mis_ker_name
|
||||
#define packm_16xk_3mis_ker_name GENARNAME(packm_16xk_3mis)
|
||||
|
||||
#undef packm_2xk_4mi_ker_name
|
||||
#define packm_2xk_4mi_ker_name GENARNAME(packm_2xk_4mi)
|
||||
#undef packm_3xk_4mi_ker_name
|
||||
#define packm_3xk_4mi_ker_name GENARNAME(packm_3xk_4mi)
|
||||
#undef packm_4xk_4mi_ker_name
|
||||
#define packm_4xk_4mi_ker_name GENARNAME(packm_4xk_4mi)
|
||||
#undef packm_6xk_4mi_ker_name
|
||||
#define packm_6xk_4mi_ker_name GENARNAME(packm_6xk_4mi)
|
||||
#undef packm_8xk_4mi_ker_name
|
||||
#define packm_8xk_4mi_ker_name GENARNAME(packm_8xk_4mi)
|
||||
#undef packm_10xk_4mi_ker_name
|
||||
#define packm_10xk_4mi_ker_name GENARNAME(packm_10xk_4mi)
|
||||
#undef packm_12xk_4mi_ker_name
|
||||
#define packm_12xk_4mi_ker_name GENARNAME(packm_12xk_4mi)
|
||||
#undef packm_14xk_4mi_ker_name
|
||||
#define packm_14xk_4mi_ker_name GENARNAME(packm_14xk_4mi)
|
||||
#undef packm_16xk_4mi_ker_name
|
||||
#define packm_16xk_4mi_ker_name GENARNAME(packm_16xk_4mi)
|
||||
|
||||
#undef packm_2xk_rih_ker_name
|
||||
#define packm_2xk_rih_ker_name GENARNAME(packm_2xk_rih)
|
||||
#undef packm_4xk_rih_ker_name
|
||||
#define packm_4xk_rih_ker_name GENARNAME(packm_4xk_rih)
|
||||
#undef packm_6xk_rih_ker_name
|
||||
#define packm_6xk_rih_ker_name GENARNAME(packm_6xk_rih)
|
||||
#undef packm_8xk_rih_ker_name
|
||||
#define packm_8xk_rih_ker_name GENARNAME(packm_8xk_rih)
|
||||
#undef packm_10xk_rih_ker_name
|
||||
#define packm_10xk_rih_ker_name GENARNAME(packm_10xk_rih)
|
||||
#undef packm_12xk_rih_ker_name
|
||||
#define packm_12xk_rih_ker_name GENARNAME(packm_12xk_rih)
|
||||
#undef packm_14xk_rih_ker_name
|
||||
#define packm_14xk_rih_ker_name GENARNAME(packm_14xk_rih)
|
||||
#undef packm_16xk_rih_ker_name
|
||||
#define packm_16xk_rih_ker_name GENARNAME(packm_16xk_rih)
|
||||
|
||||
#undef packm_2xk_1er_ker_name
|
||||
#define packm_2xk_1er_ker_name GENARNAME(packm_2xk_1er)
|
||||
#undef packm_4xk_1er_ker_name
|
||||
@@ -340,7 +248,14 @@
|
||||
PASTEMAC(c,opname), PASTEMAC(z,opname) ); \
|
||||
}
|
||||
|
||||
// -- Helper function for 1m ---------------------------------------------------
|
||||
|
||||
void GENBAINAME(cntx_init_blkszs)
|
||||
(
|
||||
ind_t method,
|
||||
num_t dt,
|
||||
cntx_t* cntx
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
@@ -404,8 +319,8 @@ void GENBARNAME(cntx_init)
|
||||
// NOTE: We set the virtual micro-kernel slots to contain the addresses
|
||||
// of the native micro-kernels. In general, the ukernels in the virtual
|
||||
// ukernel slots are always called, and if the function called happens to
|
||||
// be a virtual micro-kernel, it will then know to find its native
|
||||
// ukernel in the native ukernel slots.
|
||||
// be a virtual micro-kernel, it will then know to find its native ukernel
|
||||
// (i.e., in the native ukernel slots).
|
||||
gen_func_init( &funcs[ BLIS_GEMM_UKR ], gemm_ukr_name );
|
||||
gen_func_init( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name );
|
||||
gen_func_init( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm_u_ukr_name );
|
||||
@@ -700,10 +615,6 @@ void GENBARNAME(cntx_init)
|
||||
// -- Set miscellaneous fields ---------------------------------------------
|
||||
|
||||
bli_cntx_set_method( BLIS_NAT, cntx );
|
||||
|
||||
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS, cntx );
|
||||
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS, cntx );
|
||||
bli_cntx_set_schema_c_panel( BLIS_NOT_PACKED, cntx );
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
@@ -711,7 +622,6 @@ void GENBARNAME(cntx_init)
|
||||
void GENBAINAME(cntx_init)
|
||||
(
|
||||
ind_t method,
|
||||
num_t dt,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
@@ -728,41 +638,7 @@ void GENBAINAME(cntx_init)
|
||||
|
||||
funcs = bli_cntx_l3_vir_ukrs_buf( cntx );
|
||||
|
||||
// 3mh, 4mh, and 4mb do not support trsm.
|
||||
bli_func_init_null( &funcs[ BLIS_GEMMTRSM_L_UKR ] );
|
||||
bli_func_init_null( &funcs[ BLIS_GEMMTRSM_U_UKR ] );
|
||||
bli_func_init_null( &funcs[ BLIS_TRSM_L_UKR ] );
|
||||
bli_func_init_null( &funcs[ BLIS_TRSM_U_UKR ] );
|
||||
|
||||
if ( method == BLIS_3MH )
|
||||
{
|
||||
gen_func_init_co( &funcs[ BLIS_GEMM_UKR ], gemm3mh_ukr_name );
|
||||
}
|
||||
else if ( method == BLIS_3M1 )
|
||||
{
|
||||
gen_func_init_co( &funcs[ BLIS_GEMM_UKR ], gemm3m1_ukr_name );
|
||||
gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm3m1_l_ukr_name );
|
||||
gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm3m1_u_ukr_name );
|
||||
gen_func_init_co( &funcs[ BLIS_TRSM_L_UKR ], trsm3m1_l_ukr_name );
|
||||
gen_func_init_co( &funcs[ BLIS_TRSM_U_UKR ], trsm3m1_u_ukr_name );
|
||||
}
|
||||
else if ( method == BLIS_4MH )
|
||||
{
|
||||
gen_func_init_co( &funcs[ BLIS_GEMM_UKR ], gemm4mh_ukr_name );
|
||||
}
|
||||
else if ( method == BLIS_4M1B )
|
||||
{
|
||||
gen_func_init_co( &funcs[ BLIS_GEMM_UKR ], gemm4mb_ukr_name );
|
||||
}
|
||||
else if ( method == BLIS_4M1A )
|
||||
{
|
||||
gen_func_init_co( &funcs[ BLIS_GEMM_UKR ], gemm4m1_ukr_name );
|
||||
gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm4m1_l_ukr_name );
|
||||
gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm4m1_u_ukr_name );
|
||||
gen_func_init_co( &funcs[ BLIS_TRSM_L_UKR ], trsm4m1_l_ukr_name );
|
||||
gen_func_init_co( &funcs[ BLIS_TRSM_U_UKR ], trsm4m1_u_ukr_name );
|
||||
}
|
||||
else if ( method == BLIS_1M )
|
||||
if ( method == BLIS_1M )
|
||||
{
|
||||
gen_func_init_co( &funcs[ BLIS_GEMM_UKR ], gemm1m_ukr_name );
|
||||
gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm1m_l_ukr_name );
|
||||
@@ -781,7 +657,14 @@ void GENBAINAME(cntx_init)
|
||||
|
||||
// For 1m, we employ an optimization which requires that we copy the native
|
||||
// real domain gemm ukernel function pointers to the corresponding real
|
||||
// domain slots in the virtual gemm ukernel func_t.
|
||||
// domain slots in the virtual gemm ukernel func_t. This optimization allows
|
||||
// us to, under certain conditions, adjust various parameters within the gemm
|
||||
// macrokernel so that the real-domain macrokernel (which will query and use
|
||||
// the real-domain virtual gemm ukernel) can be called instead of calling the
|
||||
// complex-domain macrokernel and the corresponding complex-domain virtual
|
||||
// microkernel. The non-optimized code path would require an extra level of
|
||||
// function call overhead, which can be avoided in most cases (i.e., when
|
||||
// beta has a zero imaginary component and C is either row- or column-stored).
|
||||
if ( method == BLIS_1M )
|
||||
{
|
||||
func_t* gemm_nat_ukrs = bli_cntx_get_l3_nat_ukrs( BLIS_GEMM_UKR, cntx );
|
||||
@@ -802,40 +685,7 @@ void GENBAINAME(cntx_init)
|
||||
bli_func_init_null( &funcs[ i ] );
|
||||
}
|
||||
|
||||
if ( method == BLIS_3MH || method == BLIS_4MH )
|
||||
{
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_2XK_KER ], packm_2xk_rih_ker_name );
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_4XK_KER ], packm_4xk_rih_ker_name );
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_6XK_KER ], packm_6xk_rih_ker_name );
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_8XK_KER ], packm_8xk_rih_ker_name );
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_10XK_KER ], packm_10xk_rih_ker_name );
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_12XK_KER ], packm_12xk_rih_ker_name );
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_14XK_KER ], packm_14xk_rih_ker_name );
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_16XK_KER ], packm_16xk_rih_ker_name );
|
||||
}
|
||||
else if ( method == BLIS_3M1 )
|
||||
{
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_2XK_KER ], packm_2xk_3mis_ker_name );
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_4XK_KER ], packm_4xk_3mis_ker_name );
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_6XK_KER ], packm_6xk_3mis_ker_name );
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_8XK_KER ], packm_8xk_3mis_ker_name );
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_10XK_KER ], packm_10xk_3mis_ker_name );
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_12XK_KER ], packm_12xk_3mis_ker_name );
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_14XK_KER ], packm_14xk_3mis_ker_name );
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_16XK_KER ], packm_16xk_3mis_ker_name );
|
||||
}
|
||||
else if ( method == BLIS_4M1A || method == BLIS_4M1B )
|
||||
{
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_2XK_KER ], packm_2xk_4mi_ker_name );
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_4XK_KER ], packm_4xk_4mi_ker_name );
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_6XK_KER ], packm_6xk_4mi_ker_name );
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_8XK_KER ], packm_8xk_4mi_ker_name );
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_10XK_KER ], packm_10xk_4mi_ker_name );
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_12XK_KER ], packm_12xk_4mi_ker_name );
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_14XK_KER ], packm_14xk_4mi_ker_name );
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_16XK_KER ], packm_16xk_4mi_ker_name );
|
||||
}
|
||||
else if ( method == BLIS_1M )
|
||||
if ( method == BLIS_1M )
|
||||
{
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_2XK_KER ], packm_2xk_1er_ker_name );
|
||||
gen_func_init_co( &funcs[ BLIS_PACKM_4XK_KER ], packm_4xk_1er_ker_name );
|
||||
@@ -865,191 +715,75 @@ void GENBAINAME(cntx_init)
|
||||
|
||||
// Modify the context with cache and register blocksizes (and multiples)
|
||||
// appropriate for the current induced method.
|
||||
if ( method == BLIS_3MH )
|
||||
if ( method == BLIS_1M )
|
||||
{
|
||||
bli_cntx_set_ind_blkszs
|
||||
(
|
||||
method, 6,
|
||||
BLIS_NC, 1.0, 1.0,
|
||||
BLIS_KC, 1.0, 1.0,
|
||||
BLIS_MC, 1.0, 1.0,
|
||||
BLIS_NR, 1.0, 1.0,
|
||||
BLIS_MR, 1.0, 1.0,
|
||||
BLIS_KR, 1.0, 1.0,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
else if ( method == BLIS_3M1 )
|
||||
{
|
||||
bli_cntx_set_ind_blkszs
|
||||
(
|
||||
method, 6,
|
||||
BLIS_NC, 1.0, 1.0,
|
||||
BLIS_KC, 3.0, 3.0,
|
||||
BLIS_MC, 1.0, 1.0,
|
||||
BLIS_NR, 1.0, 1.0,
|
||||
BLIS_MR, 1.0, 1.0,
|
||||
BLIS_KR, 1.0, 1.0,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
else if ( method == BLIS_4MH )
|
||||
{
|
||||
bli_cntx_set_ind_blkszs
|
||||
(
|
||||
method, 6,
|
||||
BLIS_NC, 1.0, 1.0,
|
||||
BLIS_KC, 1.0, 1.0,
|
||||
BLIS_MC, 1.0, 1.0,
|
||||
BLIS_NR, 1.0, 1.0,
|
||||
BLIS_MR, 1.0, 1.0,
|
||||
BLIS_KR, 1.0, 1.0,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
else if ( method == BLIS_4M1B )
|
||||
{
|
||||
bli_cntx_set_ind_blkszs
|
||||
(
|
||||
method, 6,
|
||||
BLIS_NC, 2.0, 2.0,
|
||||
BLIS_KC, 1.0, 1.0,
|
||||
BLIS_MC, 2.0, 2.0,
|
||||
BLIS_NR, 1.0, 1.0,
|
||||
BLIS_MR, 1.0, 1.0,
|
||||
BLIS_KR, 1.0, 1.0,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
else if ( method == BLIS_4M1A )
|
||||
{
|
||||
bli_cntx_set_ind_blkszs
|
||||
(
|
||||
method, 6,
|
||||
BLIS_NC, 1.0, 1.0,
|
||||
BLIS_KC, 2.0, 2.0,
|
||||
BLIS_MC, 1.0, 1.0,
|
||||
BLIS_NR, 1.0, 1.0,
|
||||
BLIS_MR, 1.0, 1.0,
|
||||
BLIS_KR, 1.0, 1.0,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
else if ( method == BLIS_1M )
|
||||
{
|
||||
const bool is_pb = FALSE;
|
||||
//const bool is_pb = FALSE;
|
||||
|
||||
// We MUST set the induced method in the context prior to calling
|
||||
// bli_cntx_l3_ukr_prefers_cols_dt() because that function queries
|
||||
// the induced method. It needs the induced method value in order
|
||||
// to determine whether to evaluate the "prefers column storage"
|
||||
// predicate using the storage preference of the kernel for dt, or
|
||||
// the storage preference of the kernel for the real projection of
|
||||
// dt. Failing to set the induced method here can lead to strange
|
||||
// undefined behavior at runtime if the native complex kernel's
|
||||
// storage preference happens to not equal that of the native real
|
||||
// kernel.
|
||||
bli_cntx_set_method( method, cntx );
|
||||
|
||||
// Initialize the blocksizes according to the micro-kernel preference as
|
||||
// well as the algorithm.
|
||||
if ( bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
// This branch is used for algorithms 1m_c_bp, 1m_r_pb.
|
||||
|
||||
// Set the pack_t schemas for the c_bp or r_pb algorithms.
|
||||
if ( !is_pb )
|
||||
{
|
||||
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_1E, cntx );
|
||||
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_1R, cntx );
|
||||
}
|
||||
else // if ( is_pb )
|
||||
{
|
||||
bli_cntx_set_schema_b_panel( BLIS_PACKED_ROW_PANELS_1R, cntx );
|
||||
bli_cntx_set_schema_a_block( BLIS_PACKED_COL_PANELS_1E, cntx );
|
||||
}
|
||||
|
||||
bli_cntx_set_ind_blkszs
|
||||
(
|
||||
method, 6,
|
||||
BLIS_NC, 1.0, 1.0,
|
||||
BLIS_KC, 2.0, 2.0, // halve kc...
|
||||
BLIS_MC, 2.0, 2.0, // halve mc...
|
||||
BLIS_NR, 1.0, 1.0,
|
||||
BLIS_MR, 2.0, 1.0, // ...and mr (but NOT packmr)
|
||||
BLIS_KR, 1.0, 1.0,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
else // if ( bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
// This branch is used for algorithms 1m_r_bp, 1m_c_pb.
|
||||
|
||||
// Set the pack_t schemas for the r_bp or c_pb algorithms.
|
||||
if ( !is_pb )
|
||||
{
|
||||
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_1R, cntx );
|
||||
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_1E, cntx );
|
||||
}
|
||||
else // if ( is_pb )
|
||||
{
|
||||
bli_cntx_set_schema_b_panel( BLIS_PACKED_ROW_PANELS_1E, cntx );
|
||||
bli_cntx_set_schema_a_block( BLIS_PACKED_COL_PANELS_1R, cntx );
|
||||
}
|
||||
|
||||
bli_cntx_set_ind_blkszs
|
||||
(
|
||||
method, 6,
|
||||
BLIS_NC, 2.0, 2.0, // halve nc...
|
||||
BLIS_KC, 2.0, 2.0, // halve kc...
|
||||
BLIS_MC, 1.0, 1.0,
|
||||
BLIS_NR, 2.0, 1.0, // ...and nr (but NOT packnr)
|
||||
BLIS_MR, 1.0, 1.0,
|
||||
BLIS_KR, 1.0, 1.0,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
// Call a helper function to initialize blocksizes for each complex
|
||||
// datatype.
|
||||
GENBAINAME(cntx_init_blkszs)( method, BLIS_SCOMPLEX, cntx );
|
||||
GENBAINAME(cntx_init_blkszs)( method, BLIS_DCOMPLEX, cntx );
|
||||
}
|
||||
else // if ( method == BLIS_NAT )
|
||||
{
|
||||
// No change in blocksizes needed for native execution.
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
// -- Set misc. other fields -----------------------------------------------
|
||||
void GENBAINAME(cntx_init_blkszs)
|
||||
(
|
||||
ind_t method,
|
||||
num_t dt,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
// We MUST set the induced method in the context prior to calling
|
||||
// bli_cntx_l3_vir_ukr_prefers_cols_dt() because that function queries
|
||||
// the induced method. That function needs the induced method value in
|
||||
// order to determine whether to evaluate the "prefers column storage"
|
||||
// predicate using the storage preference of the kernel for dt, or
|
||||
// the storage preference of the kernel for the real projection of
|
||||
// dt. Failing to set the induced method here can lead to strange
|
||||
// undefined behavior at runtime if the native complex kernel's
|
||||
// storage preference happens to not equal that of the native real
|
||||
// kernel.
|
||||
bli_cntx_set_method( method, cntx );
|
||||
|
||||
if ( method == BLIS_3MH )
|
||||
// Initialize the blocksizes according to the micro-kernel preference as
|
||||
// well as the algorithm.
|
||||
if ( bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
// Schemas vary with _stage().
|
||||
}
|
||||
else if ( method == BLIS_3M1 )
|
||||
{
|
||||
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_3MI, cntx );
|
||||
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_3MI, cntx );
|
||||
}
|
||||
else if ( method == BLIS_4MH )
|
||||
{
|
||||
// Schemas vary with _stage().
|
||||
}
|
||||
else if ( method == BLIS_4M1A || method == BLIS_4M1B )
|
||||
{
|
||||
bli_cntx_set_schema_a_block( BLIS_PACKED_ROW_PANELS_4MI, cntx );
|
||||
bli_cntx_set_schema_b_panel( BLIS_PACKED_COL_PANELS_4MI, cntx );
|
||||
}
|
||||
else if ( method == BLIS_1M )
|
||||
{
|
||||
//const bool is_pb = FALSE;
|
||||
// This branch is used for algorithm 1m_c_bp.
|
||||
|
||||
// Set the anti-preference field to TRUE when executing a panel-block
|
||||
// algorithm, and FALSE otherwise. This will cause higher-level generic
|
||||
// code to establish (if needed) disagreement between the storage of C and
|
||||
// the micro-kernel output preference so that the two will come back into
|
||||
// agreement in the panel-block macro-kernel (which is implemented in terms
|
||||
// of the block-panel macro-kernel with some induced transpositions).
|
||||
//bli_cntx_set_anti_pref( is_pb, cntx );
|
||||
bli_cntx_set_ind_blkszs
|
||||
(
|
||||
method, dt, 6,
|
||||
BLIS_NC, 1.0, 1.0,
|
||||
BLIS_KC, 2.0, 2.0, // halve kc...
|
||||
BLIS_MC, 2.0, 2.0, // halve mc...
|
||||
BLIS_NR, 1.0, 1.0,
|
||||
BLIS_MR, 2.0, 1.0, // ...and mr (but NOT packmr)
|
||||
BLIS_KR, 1.0, 1.0,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
else // if ( method == BLIS_NAT )
|
||||
else // if ( bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
// This branch is used for algorithm 1m_r_bp.
|
||||
|
||||
bli_cntx_set_ind_blkszs
|
||||
(
|
||||
method, dt, 6,
|
||||
BLIS_NC, 2.0, 2.0, // halve nc...
|
||||
BLIS_KC, 2.0, 2.0, // halve kc...
|
||||
BLIS_MC, 1.0, 1.0,
|
||||
BLIS_NR, 2.0, 1.0, // ...and nr (but NOT packnr)
|
||||
BLIS_MR, 1.0, 1.0,
|
||||
BLIS_KR, 1.0, 1.0,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,336 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, arch, suf ) \
|
||||
\
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* restrict data, \
|
||||
cntx_t* restrict cntx \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt_r = PASTEMAC(chr,type); \
|
||||
\
|
||||
PASTECH(chr,gemm_ukr_ft) \
|
||||
rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \
|
||||
const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \
|
||||
\
|
||||
const dim_t m = mr; \
|
||||
const dim_t n = nr; \
|
||||
\
|
||||
ctype_r ab_r[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype_r ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
ctype_r ab_i[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype_r ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
ctype_r ab_rpi[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype_r ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
inc_t rs_ab; \
|
||||
inc_t cs_ab; \
|
||||
\
|
||||
const inc_t is_a = bli_auxinfo_is_a( data ); \
|
||||
const inc_t is_b = bli_auxinfo_is_b( data ); \
|
||||
\
|
||||
ctype_r* restrict a_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict a_i = ( ctype_r* )a + is_a; \
|
||||
ctype_r* restrict a_rpi = ( ctype_r* )a + 2*is_a; \
|
||||
\
|
||||
ctype_r* restrict b_r = ( ctype_r* )b; \
|
||||
ctype_r* restrict b_i = ( ctype_r* )b + is_b; \
|
||||
ctype_r* restrict b_rpi = ( ctype_r* )b + 2*is_b; \
|
||||
\
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
\
|
||||
ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \
|
||||
ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \
|
||||
\
|
||||
const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \
|
||||
const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \
|
||||
\
|
||||
void* a_next = bli_auxinfo_next_a( data ); \
|
||||
void* b_next = bli_auxinfo_next_b( data ); \
|
||||
\
|
||||
dim_t n_iter; \
|
||||
dim_t n_elem; \
|
||||
\
|
||||
inc_t incc, ldc; \
|
||||
inc_t incab, ldab; \
|
||||
\
|
||||
dim_t i, j; \
|
||||
\
|
||||
\
|
||||
/* SAFETY CHECK: The higher level implementation should never
|
||||
allow an alpha with non-zero imaginary component to be passed
|
||||
in, because it can't be applied properly using the 3m method.
|
||||
If alpha is not real, then something is very wrong. */ \
|
||||
if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
\
|
||||
/* An optimization: Set local strides and loop bounds based on the
|
||||
strides of c, so that (a) the micro-kernel accesses the ab buffers the same
|
||||
way it would if it were updating c directly, and (b) c is updated
|
||||
contiguously. For c with general stride, we access the ab buffers the same way
|
||||
we would as if it were column-stored. */ \
|
||||
if ( bli_is_row_stored( rs_c, cs_c ) ) \
|
||||
{ \
|
||||
rs_ab = n; n_iter = m; incc = cs_c; \
|
||||
cs_ab = 1; n_elem = n; ldc = rs_c; \
|
||||
} \
|
||||
else /* column-stored or general stride */ \
|
||||
{ \
|
||||
rs_ab = 1; n_iter = n; incc = rs_c; \
|
||||
cs_ab = m; n_elem = m; ldc = cs_c; \
|
||||
} \
|
||||
incab = 1; \
|
||||
ldab = n_elem; \
|
||||
\
|
||||
\
|
||||
/* The following gemm micro-kernel calls implement all "phases" of the
|
||||
3m method:
|
||||
|
||||
c = beta * c;
|
||||
c_r += + a_r * b_r - a_i * b_i;
|
||||
c_i += (a_r + a_i)(b_r + b_i) - a_r * b_r - a_i * b_i;
|
||||
|
||||
NOTE: Scaling by alpha_r is not shown above, but is implemented
|
||||
below. */ \
|
||||
\
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_i, b_i, data ); \
|
||||
\
|
||||
/* ab_r = alpha_r * a_r * b_r; */ \
|
||||
rgemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_r, \
|
||||
a_r, \
|
||||
b_r, \
|
||||
zero_r, \
|
||||
ab_r, rs_ab, cs_ab, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_rpi, b_rpi, data ); \
|
||||
\
|
||||
/* ab_i = alpha_r * a_i * b_i; */ \
|
||||
rgemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_r, \
|
||||
a_i, \
|
||||
b_i, \
|
||||
zero_r, \
|
||||
ab_i, rs_ab, cs_ab, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_next, b_next, data ); \
|
||||
\
|
||||
/* ab_rpi = alpha_r * a_rpi * b_rpi; */ \
|
||||
rgemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_r, \
|
||||
a_rpi, \
|
||||
b_rpi, \
|
||||
zero_r, \
|
||||
ab_rpi, rs_ab, cs_ab, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
\
|
||||
/* How we accumulate the intermediate matrix products stored in ab_r,
|
||||
ab_i, and ab_rpi depends on the value of beta. */ \
|
||||
if ( !PASTEMAC(chr,eq0)( beta_i ) ) \
|
||||
{ \
|
||||
/* c = beta * c;
|
||||
c_r = c_r + ab_r - ab_i;
|
||||
c_i = c_i + ab_rpi - ab_r - ab_i; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r alphabeta11_r = *(ab_r + i*incab + j*ldab); \
|
||||
const ctype_r alphabeta11_i = *(ab_i + i*incab + j*ldab); \
|
||||
const ctype_r alphabeta11_rpi = *(ab_rpi + i*incab + j*ldab); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
ctype_r gamma11t_r; \
|
||||
ctype_r gamma11t_i; \
|
||||
\
|
||||
PASTEMAC(ch,copyris)( alphabeta11_r, \
|
||||
-alphabeta11_r, \
|
||||
gamma11t_r, \
|
||||
gamma11t_i ); \
|
||||
\
|
||||
PASTEMAC(ch,subris)( alphabeta11_i, \
|
||||
alphabeta11_i, \
|
||||
gamma11t_r, \
|
||||
gamma11t_i ); \
|
||||
\
|
||||
PASTEMAC(chr,adds)( alphabeta11_rpi, \
|
||||
gamma11t_i ); \
|
||||
\
|
||||
PASTEMAC(ch,xpbyris)( gamma11t_r, \
|
||||
gamma11t_i, \
|
||||
beta_r, \
|
||||
beta_i, \
|
||||
*gamma11_r, \
|
||||
*gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
else if ( PASTEMAC(chr,eq1)( beta_r ) ) \
|
||||
{ \
|
||||
/* c_r = c_r + ab_r - ab_i;
|
||||
c_i = c_i + ab_rpi - ab_r - ab_i; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r alphabeta11_r = *(ab_r + i*incab + j*ldab); \
|
||||
const ctype_r alphabeta11_i = *(ab_i + i*incab + j*ldab); \
|
||||
const ctype_r alphabeta11_rpi = *(ab_rpi + i*incab + j*ldab); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
ctype_r gamma11t_r; \
|
||||
ctype_r gamma11t_i; \
|
||||
\
|
||||
PASTEMAC(ch,copyris)( alphabeta11_r, \
|
||||
-alphabeta11_r, \
|
||||
gamma11t_r, \
|
||||
gamma11t_i ); \
|
||||
\
|
||||
PASTEMAC(ch,subris)( alphabeta11_i, \
|
||||
alphabeta11_i, \
|
||||
gamma11t_r, \
|
||||
gamma11t_i ); \
|
||||
\
|
||||
PASTEMAC(chr,adds)( alphabeta11_rpi, \
|
||||
gamma11t_i ); \
|
||||
\
|
||||
PASTEMAC(ch,addris)( gamma11t_r, \
|
||||
gamma11t_i, \
|
||||
*gamma11_r, \
|
||||
*gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
else if ( !PASTEMAC(chr,eq0)( beta_r ) ) \
|
||||
{ \
|
||||
/* c_r = beta_r * c_r + ab_r - ab_i;
|
||||
c_i = beta_r * c_i + ab_rpi - ab_r - ab_i; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r alphabeta11_r = *(ab_r + i*incab + j*ldab); \
|
||||
const ctype_r alphabeta11_i = *(ab_i + i*incab + j*ldab); \
|
||||
const ctype_r alphabeta11_rpi = *(ab_rpi + i*incab + j*ldab); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
ctype_r gamma11t_r; \
|
||||
ctype_r gamma11t_i; \
|
||||
\
|
||||
PASTEMAC(ch,copyris)( alphabeta11_r, \
|
||||
-alphabeta11_r, \
|
||||
gamma11t_r, \
|
||||
gamma11t_i ); \
|
||||
\
|
||||
PASTEMAC(ch,subris)( alphabeta11_i, \
|
||||
alphabeta11_i, \
|
||||
gamma11t_r, \
|
||||
gamma11t_i ); \
|
||||
\
|
||||
PASTEMAC(chr,adds)( alphabeta11_rpi, \
|
||||
gamma11t_i ); \
|
||||
\
|
||||
PASTEMAC(chr,xpbys)( gamma11t_r, beta_r, *gamma11_r ); \
|
||||
PASTEMAC(chr,xpbys)( gamma11t_i, beta_r, *gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
else /* if ( PASTEMAC(chr,eq0)( beta_r ) ) */ \
|
||||
{ \
|
||||
/* c_r = ab_r - ab_i;
|
||||
c_i = ab_rpi - ab_r - ab_i; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r alphabeta11_r = *(ab_r + i*incab + j*ldab); \
|
||||
const ctype_r alphabeta11_i = *(ab_i + i*incab + j*ldab); \
|
||||
const ctype_r alphabeta11_rpi = *(ab_rpi + i*incab + j*ldab); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
ctype_r gamma11t_r; \
|
||||
ctype_r gamma11t_i; \
|
||||
\
|
||||
PASTEMAC(ch,copyris)( alphabeta11_r, \
|
||||
-alphabeta11_r, \
|
||||
gamma11t_r, \
|
||||
gamma11t_i ); \
|
||||
\
|
||||
PASTEMAC(ch,subris)( alphabeta11_i, \
|
||||
alphabeta11_i, \
|
||||
gamma11t_r, \
|
||||
gamma11t_i ); \
|
||||
\
|
||||
PASTEMAC(chr,adds)( alphabeta11_rpi, \
|
||||
gamma11t_i ); \
|
||||
\
|
||||
PASTEMAC(ch,copyris)( gamma11t_r, \
|
||||
gamma11t_i, \
|
||||
*gamma11_r, \
|
||||
*gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC2( gemm3m1, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
|
||||
@@ -1,297 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
   Template for the reference gemm3mh micro-kernel. The GENTFUNCCO
   definition below is consumed by INSERT_GENTFUNCCO_BASIC2 at the bottom
   (presumably once per complex datatype -- confirm against the macro's
   definition, which is not in this file).

   Each invocation makes a single call to the real-domain gemm micro-kernel
   queried from the context (BLIS_GEMM_UKR for the real datatype dt_r). The
   packed panels a and b are reinterpreted as real panels and beta is passed
   as zero, so the real product alpha_r * a * b is written to the local
   stack buffer ct. ct is then folded into the complex tile c according to
   the pack schema returned by bli_auxinfo_schema_a():

     ro schema:     c_r += ct;  c_i -= ct;   beta is applied, with separate
                                             loops for complex beta,
                                             beta_r == 1, other real beta,
                                             and beta_r == 0.
     io schema:     c_r -= ct;  c_i -= ct;   only beta_r == 1 is tested;
                                             otherwise c is overwritten.
     other schema:  c_i += ct;               only beta_r == 1 is tested;
                                             otherwise c_r = 0, c_i = ct.

   An alpha with a non-zero imaginary part is rejected via
   bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ).
*/
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, arch, suf ) \
|
||||
\
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* restrict data, \
|
||||
cntx_t* restrict cntx \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt_r = PASTEMAC(chr,type); \
|
||||
\
|
||||
PASTECH(chr,gemm_ukr_ft) \
|
||||
rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \
|
||||
const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \
|
||||
\
|
||||
const dim_t m = mr; \
|
||||
const dim_t n = nr; \
|
||||
\
|
||||
ctype_r ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype_r ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); /* real-valued scratch tile written by rgemm_ukr */ \
|
||||
inc_t rs_ct; \
|
||||
inc_t cs_ct; \
|
||||
\
|
||||
ctype_r* restrict a_cast = ( ctype_r* )a; \
|
||||
\
|
||||
ctype_r* restrict b_cast = ( ctype_r* )b; \
|
||||
\
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
\
|
||||
ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \
|
||||
ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \
|
||||
\
|
||||
const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \
|
||||
const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \
|
||||
\
|
||||
const pack_t schema = bli_auxinfo_schema_a( data ); \
|
||||
\
|
||||
dim_t n_iter; \
|
||||
dim_t n_elem; \
|
||||
\
|
||||
inc_t incc, ldc; \
|
||||
inc_t incct, ldct; \
|
||||
\
|
||||
dim_t i, j; \
|
||||
\
|
||||
\
|
||||
/* SAFETY CHECK: The higher level implementation should never
|
||||
allow an alpha with non-zero imaginary component to be passed
|
||||
in, because it can't be applied properly using the 3mh method.
|
||||
If alpha is not real, then something is very wrong. */ \
|
||||
if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
\
|
||||
/* An optimization: Set local strides and loop bounds based on the
|
||||
strides of c, so that (a) the micro-kernel accesses ct the same
|
||||
way it would if it were updating c directly, and (b) c is updated
|
||||
contiguously. For c with general stride, we access ct the same way
|
||||
we would as if it were column-stored. */ \
|
||||
if ( bli_is_row_stored( rs_c, cs_c ) ) \
|
||||
{ \
|
||||
rs_ct = n; n_iter = m; incc = cs_c; \
|
||||
cs_ct = 1; n_elem = n; ldc = rs_c; \
|
||||
} \
|
||||
else /* column-stored or general stride */ \
|
||||
{ \
|
||||
rs_ct = 1; n_iter = n; incc = rs_c; \
|
||||
cs_ct = m; n_elem = m; ldc = cs_c; \
|
||||
} \
|
||||
incct = 1; \
|
||||
ldct = n_elem; \
|
||||
\
|
||||
\
|
||||
/* The following gemm micro-kernel call implements one "phase" of the
|
||||
3m method:
|
||||
|
||||
|
||||
c = beta * c;
|
||||
c_r += + a_r * b_r - a_i * b_i;
|
||||
c_i += (a_r + a_i)(b_r + b_i) - a_r * b_r - a_i * b_i;
|
||||
|
||||
|
||||
NOTE: Scaling by alpha_r is not shown above, but is implemented
|
||||
below. Which of the three products this call contributes is
   selected further down by the pack schema. */ \
|
||||
\
|
||||
\
|
||||
/* ct = alpha_r * a * b; */ \
|
||||
rgemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_r, \
|
||||
a_cast, \
|
||||
b_cast, \
|
||||
zero_r, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(chr,fprintm)( stdout, "gemm3mh_ukr: ct", 4, 4, ct, rs_ct, cs_ct, "%4.1f", "" );*/ \
|
||||
\
|
||||
/* How we accumulate the intermediate matrix product stored in ct
|
||||
depends on (a) the schemas of A and B (they are always the same),
|
||||
and (b) the value of beta. */ \
|
||||
if ( bli_is_ro_packed( schema ) ) \
|
||||
{ \
|
||||
if ( !PASTEMAC(chr,eq0)( beta_i ) ) \
|
||||
{ \
|
||||
/* c = beta * c;
|
||||
c_r = c_r + ct;
|
||||
c_i = c_i - ct; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t = *(ct + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(ch,xpbyris)( gamma11t, \
|
||||
-gamma11t, \
|
||||
beta_r, \
|
||||
beta_i, \
|
||||
*gamma11_r, \
|
||||
*gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
else if ( PASTEMAC(chr,eq1)( beta_r ) ) \
|
||||
{ \
|
||||
/* c_r = c_r + ct;
|
||||
c_i = c_i - ct; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t = *(ct + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(chr,adds)( gamma11t, *gamma11_r ); \
|
||||
PASTEMAC(chr,subs)( gamma11t, *gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
else if ( !PASTEMAC(chr,eq0)( beta_r ) ) \
|
||||
{ \
|
||||
/* c_r = beta_r * c_r + ct;
|
||||
c_i = beta_r * c_i - ct; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t = *(ct + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(chr,xpbys)( gamma11t, beta_r, *gamma11_r ); \
|
||||
PASTEMAC(chr,xpbys)( -gamma11t, beta_r, *gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
else /* if PASTEMAC(chr,eq0)( beta_r ) */ \
|
||||
{ \
|
||||
/* c_r = ct;
|
||||
c_i = -ct; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t = *(ct + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(chr,copys)( gamma11t, *gamma11_r ); \
|
||||
PASTEMAC(chr,copys)( -gamma11t, *gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else if ( bli_is_io_packed( schema ) ) \
|
||||
{ \
|
||||
if ( PASTEMAC(chr,eq1)( beta_r ) ) \
|
||||
{ \
|
||||
/* c_r = c_r - ct;
|
||||
c_i = c_i - ct; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t = *(ct + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(chr,subs)( gamma11t, *gamma11_r ); \
|
||||
PASTEMAC(chr,subs)( gamma11t, *gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
else /* if PASTEMAC(chr,eq0)( beta_r ); note that this branch is taken
   for every beta_r != 1 and beta_i is never examined under this
   schema, so c is overwritten */ \
|
||||
{ \
|
||||
/* c_r = -ct;
|
||||
c_i = -ct; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t = *(ct + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(chr,copys)( -gamma11t, *gamma11_r ); \
|
||||
PASTEMAC(chr,copys)( -gamma11t, *gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else /* if ( bli_is_rpi_packed( schema ) ) */ \
|
||||
{ \
|
||||
if ( PASTEMAC(chr,eq1)( beta_r ) ) \
|
||||
{ \
|
||||
/* c_r = c_r + 0;
|
||||
c_i = c_i + ct; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t = *(ct + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(chr,adds)( gamma11t, *gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
else /* if PASTEMAC(chr,eq0)( beta_r ); note that this branch is taken
   for every beta_r != 1 and beta_i is never examined under this
   schema, so c is overwritten */ \
|
||||
{ \
|
||||
/* c_r = 0;
|
||||
c_i = ct; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t = *(ct + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(chr,set0s)( *gamma11_r ); \
|
||||
PASTEMAC(chr,copys)( gamma11t, *gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "gemm3mh_ukr: c", 4, 4, c, rs_c, cs_c, "%4.1f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
/*PASTEMAC(chr,fprintm)( stdout, "gemm3mh_ukr: b1", k, n, b_cast, n, 1, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "gemm3mh_ukr: a1", m, k, a_cast, 1, m, "%4.1f", "" );*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC2( gemm3mh, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
@@ -1,291 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
   Template for the reference gemm4m1 micro-kernel. The GENTFUNCCO
   definition below is consumed by INSERT_GENTFUNCCO_BASIC2 at the bottom
   (presumably once per complex datatype -- confirm against the macro's
   definition, which is not in this file).

   The packed panels a and b are viewed as real panels whose imaginary
   parts begin is_a and is_b real elements after the real parts (both
   offsets are read from the auxinfo object). Four calls to the real-domain
   gemm micro-kernel queried from the context build the product in two
   local real stack buffers:

     ct_r  =  alpha_r * a_r * b_r;
     ct_i  =  alpha_r * a_r * b_i;
     ct_i +=  alpha_r * a_i * b_r;
     ct_r += -alpha_r * a_i * b_i;

   Afterwards c = beta * c + ( ct_r, ct_i ) is applied, with separate loops
   for complex beta, beta_r == 1, other real beta, and beta_r == 0.

   Before each of the first three calls, the auxinfo next_a/next_b fields
   are pointed at the operands of the following call; the values supplied
   by the caller are put back before the fourth call.

   An alpha with a non-zero imaginary part is rejected via
   bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ).
*/
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, arch, suf ) \
|
||||
\
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* restrict data, \
|
||||
cntx_t* restrict cntx \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt_r = PASTEMAC(chr,type); \
|
||||
\
|
||||
PASTECH(chr,gemm_ukr_ft) \
|
||||
rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \
|
||||
const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \
|
||||
\
|
||||
const dim_t m = mr; \
|
||||
const dim_t n = nr; \
|
||||
\
|
||||
ctype_r ct_r[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype_r ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
ctype_r ct_i[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype_r ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
inc_t rs_ct; \
|
||||
inc_t cs_ct; \
|
||||
\
|
||||
const inc_t is_a = bli_auxinfo_is_a( data ); \
|
||||
const inc_t is_b = bli_auxinfo_is_b( data ); \
|
||||
\
|
||||
ctype_r* restrict a_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict a_i = ( ctype_r* )a + is_a; \
|
||||
\
|
||||
ctype_r* restrict b_r = ( ctype_r* )b; \
|
||||
ctype_r* restrict b_i = ( ctype_r* )b + is_b; \
|
||||
\
|
||||
ctype_r* restrict one_r = PASTEMAC(chr,1); \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
\
|
||||
ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \
|
||||
ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \
|
||||
\
|
||||
ctype_r m_alpha_r = -(*alpha_r); /* negated alpha_r, passed to the a_i * b_i call */ \
|
||||
\
|
||||
const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \
|
||||
const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \
|
||||
\
|
||||
void* a_next = bli_auxinfo_next_a( data ); \
|
||||
void* b_next = bli_auxinfo_next_b( data ); \
|
||||
\
|
||||
dim_t n_iter; \
|
||||
dim_t n_elem; \
|
||||
\
|
||||
inc_t incc, ldc; \
|
||||
inc_t incct, ldct; \
|
||||
\
|
||||
dim_t i, j; \
|
||||
\
|
||||
\
|
||||
/*
|
||||
PASTEMAC(chr,fprintm)( stdout, "gemm4m1_ukr: ap_r", m, k, \
|
||||
a_r, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "gemm4m1_ukr: ap_i", m, k, \
|
||||
a_i, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "gemm4m1_ukr: bp_r", k, n, \
|
||||
b_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "gemm4m1_ukr: bp_i", k, n, \
|
||||
b_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
\
|
||||
/* SAFETY CHECK: The higher level implementation should never
|
||||
allow an alpha with non-zero imaginary component to be passed
|
||||
in, because it can't be applied properly using the 4m method.
|
||||
If alpha is not real, then something is very wrong. */ \
|
||||
if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
\
|
||||
/* An optimization: Set local strides and loop bounds based on the
|
||||
strides of c, so that (a) the micro-kernel accesses ct the same
|
||||
way it would if it were updating c directly, and (b) c is updated
|
||||
contiguously. For c with general stride, we access ct the same way
|
||||
we would as if it were column-stored. */ \
|
||||
if ( bli_is_row_stored( rs_c, cs_c ) ) \
|
||||
{ \
|
||||
rs_ct = n; n_iter = m; incc = cs_c; \
|
||||
cs_ct = 1; n_elem = n; ldc = rs_c; \
|
||||
} \
|
||||
else /* column-stored or general stride */ \
|
||||
{ \
|
||||
rs_ct = 1; n_iter = n; incc = rs_c; \
|
||||
cs_ct = m; n_elem = m; ldc = cs_c; \
|
||||
} \
|
||||
incct = 1; \
|
||||
ldct = n_elem; \
|
||||
\
|
||||
\
|
||||
/* The following gemm micro-kernel calls implement all "phases" of
|
||||
the 4m method:
|
||||
|
||||
|
||||
c = beta * c;
|
||||
c_r += a_r * b_r - a_i * b_i;
|
||||
c_i += a_r * b_i + a_i * b_r;
|
||||
|
||||
|
||||
NOTE: Scaling by alpha_r is not shown above, but is implemented
|
||||
below. */ \
|
||||
\
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_r, b_i, data ); /* next_a/next_b := operands of the second call */ \
|
||||
\
|
||||
/* ct_r = alpha_r * a_r * b_r; */ \
|
||||
rgemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_r, \
|
||||
a_r, \
|
||||
b_r, \
|
||||
zero_r, \
|
||||
ct_r, rs_ct, cs_ct, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_i, b_r, data ); /* next_a/next_b := operands of the third call */ \
|
||||
\
|
||||
/* ct_i = alpha_r * a_r * b_i; */ \
|
||||
rgemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_r, \
|
||||
a_r, \
|
||||
b_i, \
|
||||
zero_r, \
|
||||
ct_i, rs_ct, cs_ct, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_i, b_i, data ); /* next_a/next_b := operands of the fourth call */ \
|
||||
\
|
||||
/* ct_i += alpha_r * a_i * b_r; */ \
|
||||
rgemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_r, \
|
||||
a_i, \
|
||||
b_r, \
|
||||
one_r, \
|
||||
ct_i, rs_ct, cs_ct, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_next, b_next, data ); /* put back the caller-supplied next_a/next_b */ \
|
||||
\
|
||||
/* ct_r += -alpha_r * a_i * b_i; */ \
|
||||
rgemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
&m_alpha_r, \
|
||||
a_i, \
|
||||
b_i, \
|
||||
one_r, \
|
||||
ct_r, rs_ct, cs_ct, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
\
|
||||
/* How we accumulate the intermediate matrix product stored in ct_r
|
||||
and ct_i depends on the value of beta. */ \
|
||||
if ( !PASTEMAC(chr,eq0)( beta_i ) ) \
|
||||
{ \
|
||||
/* c = beta * c + ct; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t_r = *(ct_r + i*incct + j*ldct); \
|
||||
const ctype_r gamma11t_i = *(ct_i + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(ch,xpbyris)( gamma11t_r, \
|
||||
gamma11t_i, \
|
||||
beta_r, \
|
||||
beta_i, \
|
||||
*gamma11_r, \
|
||||
*gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
else if ( PASTEMAC(chr,eq1)( beta_r ) ) \
|
||||
{ \
|
||||
/* c_r = c_r + ct_r; */ \
|
||||
/* c_i = c_i + ct_i; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t_r = *(ct_r + i*incct + j*ldct); \
|
||||
const ctype_r gamma11t_i = *(ct_i + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(chr,adds)( gamma11t_r, *gamma11_r ); \
|
||||
PASTEMAC(chr,adds)( gamma11t_i, *gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
else if ( !PASTEMAC(chr,eq0)( beta_r ) ) \
|
||||
{ \
|
||||
/* c_r = beta_r * c_r + ct_r; */ \
|
||||
/* c_i = beta_r * c_i + ct_i; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t_r = *(ct_r + i*incct + j*ldct); \
|
||||
const ctype_r gamma11t_i = *(ct_i + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(chr,xpbys)( gamma11t_r, beta_r, *gamma11_r ); \
|
||||
PASTEMAC(chr,xpbys)( gamma11t_i, beta_r, *gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
else /* if PASTEMAC(chr,eq0)( beta_r ) */ \
|
||||
{ \
|
||||
/* c_r = ct_r; */ \
|
||||
/* c_i = ct_i; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t_r = *(ct_r + i*incct + j*ldct); \
|
||||
const ctype_r gamma11t_i = *(ct_i + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(chr,copys)( gamma11t_r, *gamma11_r ); \
|
||||
PASTEMAC(chr,copys)( gamma11t_i, *gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC2( gemm4m1, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
|
||||
@@ -1,345 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, arch, suf ) \
|
||||
\
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* restrict data, \
|
||||
cntx_t* restrict cntx \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt_r = PASTEMAC(chr,type); \
|
||||
\
|
||||
PASTECH(chr,gemm_ukr_ft) \
|
||||
rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \
|
||||
const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \
|
||||
\
|
||||
const dim_t m = mr; \
|
||||
const dim_t n = nr; \
|
||||
\
|
||||
ctype_r ct_r[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype_r ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
ctype_r ct_i[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype_r ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
inc_t rs_ct; \
|
||||
inc_t cs_ct; \
|
||||
\
|
||||
const inc_t is_a = bli_auxinfo_is_a( data ); \
|
||||
const inc_t is_b = bli_auxinfo_is_b( data ); \
|
||||
\
|
||||
ctype_r* restrict a_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict a_i = ( ctype_r* )a + is_a; \
|
||||
\
|
||||
ctype_r* restrict b_r = ( ctype_r* )b; \
|
||||
ctype_r* restrict b_i = ( ctype_r* )b + is_b; \
|
||||
\
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
\
|
||||
ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \
|
||||
ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \
|
||||
\
|
||||
const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \
|
||||
const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \
|
||||
\
|
||||
ctype_r m_alpha_r = -PASTEMAC(ch,real)( *alpha ); \
|
||||
\
|
||||
const pack_t schema_b = bli_auxinfo_schema_b( data ); \
|
||||
\
|
||||
void* a_next = bli_auxinfo_next_a( data ); \
|
||||
void* b_next = bli_auxinfo_next_b( data ); \
|
||||
\
|
||||
dim_t n_iter; \
|
||||
dim_t n_elem; \
|
||||
\
|
||||
inc_t incc, ldc; \
|
||||
inc_t incct, ldct; \
|
||||
\
|
||||
dim_t i, j; \
|
||||
\
|
||||
\
|
||||
/* SAFETY CHECK: The higher level implementation should never
|
||||
allow an alpha with non-zero imaginary component to be passed
|
||||
in, because it can't be applied properly using the 4mb method.
|
||||
If alpha is not real, then something is very wrong. */ \
|
||||
if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
\
|
||||
/* An optimization: Set local strides and loop bounds based on the
|
||||
strides of c, so that (a) the micro-kernel accesses ct the same
|
||||
way it would if it were updating c directly, and (b) c is updated
|
||||
contiguously. For c with general stride, we access ct the same way
|
||||
we would as if it were column-stored. */ \
|
||||
if ( bli_is_row_stored( rs_c, cs_c ) ) \
|
||||
{ \
|
||||
rs_ct = n; n_iter = m; incc = cs_c; \
|
||||
cs_ct = 1; n_elem = n; ldc = rs_c; \
|
||||
} \
|
||||
else /* column-stored or general stride */ \
|
||||
{ \
|
||||
rs_ct = 1; n_iter = n; incc = rs_c; \
|
||||
cs_ct = m; n_elem = m; ldc = cs_c; \
|
||||
} \
|
||||
incct = 1; \
|
||||
ldct = n_elem; \
|
||||
\
|
||||
\
|
||||
\
|
||||
if ( bli_is_ro_packed( schema_b ) ) \
|
||||
{ \
|
||||
/* The following gemm micro-kernel calls implement the first half of
|
||||
the 4mb method (which uses b_r):
|
||||
|
||||
c = beta * c;
|
||||
c_r += a_r * b_r;
|
||||
c_i += a_i * b_r;
|
||||
|
||||
NOTE: Scaling by alpha_r is not shown above, but is implemented
|
||||
below. */ \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_i, b_r, data ); \
|
||||
\
|
||||
rgemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_r, \
|
||||
a_r, \
|
||||
b_r, \
|
||||
zero_r, \
|
||||
ct_r, rs_ct, cs_ct, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_next, b_next, data ); \
|
||||
\
|
||||
rgemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_r, \
|
||||
a_i, \
|
||||
b_r, \
|
||||
zero_r, \
|
||||
ct_i, rs_ct, cs_ct, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else /* if ( bli_is_io_packed( schema_b ) ) */ \
|
||||
{ \
|
||||
/* The following gemm micro-kernel calls implement the second half of
|
||||
the 4mb method (which uses b_i):
|
||||
|
||||
c_r += -a_i * b_i;
|
||||
c_i += a_r * b_i;
|
||||
|
||||
NOTE: Scaling by alpha_r is not shown above, but is implemented
|
||||
below. */ \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_i, b_i, data ); \
|
||||
\
|
||||
rgemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_r, \
|
||||
a_r, \
|
||||
b_i, \
|
||||
zero_r, \
|
||||
ct_i, rs_ct, cs_ct, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_next, b_next, data ); \
|
||||
\
|
||||
rgemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
&m_alpha_r, \
|
||||
a_i, \
|
||||
b_i, \
|
||||
zero_r, \
|
||||
ct_r, rs_ct, cs_ct, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
\
|
||||
\
|
||||
/* How we accumulate the intermediate matrix product stored in ct_r
|
||||
and ct_i depends on (a) the schema of B, and (b) the value of
|
||||
beta. */ \
|
||||
if ( bli_is_ro_packed( schema_b ) ) \
|
||||
{ \
|
||||
if ( !PASTEMAC(chr,eq0)( beta_i ) ) \
|
||||
{ \
|
||||
/* c = beta * c + ct; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t_r = *(ct_r + i*incct + j*ldct); \
|
||||
const ctype_r gamma11t_i = *(ct_i + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(ch,xpbyris)( gamma11t_r, \
|
||||
gamma11t_i, \
|
||||
beta_r, \
|
||||
beta_i, \
|
||||
*gamma11_r, \
|
||||
*gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
else if ( PASTEMAC(chr,eq1)( beta_r ) ) \
|
||||
{ \
|
||||
/* c_r = c_r + ct_r; */ \
|
||||
/* c_i = c_i + ct_i; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t_r = *(ct_r + i*incct + j*ldct); \
|
||||
const ctype_r gamma11t_i = *(ct_i + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(chr,adds)( gamma11t_r, *gamma11_r ); \
|
||||
PASTEMAC(chr,adds)( gamma11t_i, *gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
else if ( !PASTEMAC(chr,eq0)( beta_r ) ) \
|
||||
{ \
|
||||
/* c_r = beta_r * c_r + ct_r; */ \
|
||||
/* c_i = beta_r * c_i + ct_i; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t_r = *(ct_r + i*incct + j*ldct); \
|
||||
const ctype_r gamma11t_i = *(ct_i + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(chr,xpbys)( gamma11t_r, beta_r, *gamma11_r ); \
|
||||
PASTEMAC(chr,xpbys)( gamma11t_i, beta_r, *gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
else /* if PASTEMAC(chr,eq0)( beta_r ) */ \
|
||||
{ \
|
||||
/* c_r = ct_r; */ \
|
||||
/* c_i = ct_i; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t_r = *(ct_r + i*incct + j*ldct); \
|
||||
const ctype_r gamma11t_i = *(ct_i + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(chr,copys)( gamma11t_r, *gamma11_r ); \
|
||||
PASTEMAC(chr,copys)( gamma11t_i, *gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else /* if ( bli_is_io_packed( schema_b ) ) */ \
|
||||
{ \
|
||||
/* NOTE: If this branch executes, it means we are in the second
|
||||
half of the 4mb computation in which we multiply the b_i
|
||||
sub-panel by the entire block of A. Here, we know that beta
|
||||
will either be equal to one (for interior cases within gemm
|
||||
macro-kernel), or zero (for edge cases). */ \
|
||||
\
|
||||
if ( PASTEMAC(chr,eq1)( beta_r ) ) \
|
||||
{ \
|
||||
/* c_r = c_r + ct_r; */ \
|
||||
/* c_i = c_i + ct_i; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t_r = *(ct_r + i*incct + j*ldct); \
|
||||
const ctype_r gamma11t_i = *(ct_i + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(chr,adds)( gamma11t_r, *gamma11_r ); \
|
||||
PASTEMAC(chr,adds)( gamma11t_i, *gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
else /* if PASTEMAC(chr,eq0)( beta_r ) */ \
|
||||
{ \
|
||||
/* c_r = ct_r; */ \
|
||||
/* c_i = ct_i; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t_r = *(ct_r + i*incct + j*ldct); \
|
||||
const ctype_r gamma11t_i = *(ct_i + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(chr,copys)( gamma11t_r, *gamma11_r ); \
|
||||
PASTEMAC(chr,copys)( gamma11t_i, *gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/*PASTEMAC(chr,fprintm)( stdout, "gemm4mb_ukr: b1_r", k, n, b_r, n, 1, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "gemm4mb_ukr: b1_i", k, n, b_i, n, 1, "%4.1f", "" );*/ \
|
||||
/*PASTEMAC(chr,fprintm)( stdout, "gemm4mb_ukr: a1_r", m, k, a_r, 1, m, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "gemm4mb_ukr: a1_i", m, k, a_i, 1, m, "%4.1f", "" );*/ \
|
||||
/*PASTEMAC(chr,fprintm)( stdout, "gemm4mb_ukr: ct_r", 8, 6, ct_r, rs_ct, cs_ct, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "gemm4mb_ukr: ct_i", 8, 6, ct_i, rs_ct, cs_ct, "%4.1f", "" );*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC2( gemm4mb, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
|
||||
@@ -1,286 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
   Function template for the reference gemm4mh micro-kernel; it is expanded
   by the INSERT_GENTFUNCCO_BASIC2() invocation at the end of this block.

   Each generated function:
     1. rejects an alpha whose imaginary part is non-zero
        (bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ));
     2. calls the real-domain gemm micro-kernel once, with a and b
        reinterpreted as ctype_r arrays, alpha_r as alpha and zero as beta,
        writing the product into the local stack buffer ct;
     3. folds ct into c according to the pack schemas read from the auxinfo
        object:
          ro/ro        -> added to the real part of c (beta fully applied);
          ro/io, io/ro -> added to the imaginary part of c;
          io/io        -> subtracted from the real part of c.
        Outside the ro/ro case, beta_r is only compared against one; any
        other value takes the branch that overwrites c.
*/
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, arch, suf ) \
|
||||
\
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* restrict data, \
|
||||
cntx_t* restrict cntx \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt_r = PASTEMAC(chr,type); \
|
||||
\
|
||||
PASTECH(chr,gemm_ukr_ft) \
|
||||
rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \
|
||||
const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \
|
||||
\
|
||||
const dim_t m = mr; \
|
||||
const dim_t n = nr; \
|
||||
\
|
||||
ctype_r ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype_r ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
inc_t rs_ct; \
|
||||
inc_t cs_ct; \
|
||||
\
|
||||
ctype_r* restrict a_cast = ( ctype_r* )a; \
|
||||
\
|
||||
ctype_r* restrict b_cast = ( ctype_r* )b; \
|
||||
\
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
\
|
||||
ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \
|
||||
ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \
|
||||
\
|
||||
const ctype_r beta_r = PASTEMAC(ch,real)( *beta ); \
|
||||
const ctype_r beta_i = PASTEMAC(ch,imag)( *beta ); \
|
||||
\
|
||||
const pack_t schema_a = bli_auxinfo_schema_a( data ); \
|
||||
const pack_t schema_b = bli_auxinfo_schema_b( data ); \
|
||||
\
|
||||
dim_t n_iter; \
|
||||
dim_t n_elem; \
|
||||
\
|
||||
inc_t incc, ldc; \
|
||||
inc_t incct, ldct; \
|
||||
\
|
||||
dim_t i, j; \
|
||||
\
|
||||
\
|
||||
/* SAFETY CHECK: The higher level implementation should never
|
||||
allow an alpha with non-zero imaginary component to be passed
|
||||
in, because it can't be applied properly using the 4mh method.
|
||||
If alpha is not real, then something is very wrong. */ \
|
||||
if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
\
|
||||
/* An optimization: Set local strides and loop bounds based on the
|
||||
strides of c, so that (a) the micro-kernel accesses ct the same
|
||||
way it would if it were updating c directly, and (b) c is updated
|
||||
contiguously. For c with general stride, we access ct the same way
|
||||
we would as if it were column-stored. */ \
|
||||
if ( bli_is_row_stored( rs_c, cs_c ) ) \
|
||||
{ \
|
||||
rs_ct = n; n_iter = m; incc = cs_c; \
|
||||
cs_ct = 1; n_elem = n; ldc = rs_c; \
|
||||
} \
|
||||
else /* column-stored or general stride */ \
|
||||
{ \
|
||||
rs_ct = 1; n_iter = n; incc = rs_c; \
|
||||
cs_ct = m; n_elem = m; ldc = cs_c; \
|
||||
} \
|
||||
incct = 1; \
|
||||
ldct = n_elem; \
|
||||
\
|
||||
\
|
||||
/* The following gemm micro-kernel call implements one "phase" of the
|
||||
4m method:
|
||||
|
||||
c = beta * c;
|
||||
c_r += a_r * b_r - a_i * b_i;
|
||||
c_i += a_r * b_i + a_i * b_r;
|
||||
|
||||
NOTE: Scaling by alpha_r is not shown above, but is implemented
|
||||
below. */ \
|
||||
\
|
||||
\
|
||||
/* ct = alpha_r * a * b; */ \
|
||||
rgemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_r, \
|
||||
a_cast, \
|
||||
b_cast, \
|
||||
zero_r, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
\
|
||||
/* How we accumulate the intermediate matrix product stored in ct
|
||||
depends on (a) the schemas of A and B, and (b) the value of
|
||||
beta. */ \
|
||||
if ( bli_is_ro_packed( schema_a ) && \
|
||||
bli_is_ro_packed( schema_b ) ) \
|
||||
{ \
|
||||
if ( !PASTEMAC(chr,eq0)( beta_i ) ) \
|
||||
{ \
|
||||
/* c = beta * c;
|
||||
c_r = c_r + ct; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t = *(ct + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(ch,scals)( *beta, *gamma11 ); \
|
||||
PASTEMAC(chr,adds)( gamma11t, *gamma11_r ); \
|
||||
} \
|
||||
} \
|
||||
else if ( PASTEMAC(chr,eq1)( beta_r ) ) \
|
||||
{ \
|
||||
/* c_r = c_r + ct;
|
||||
c_i = c_i; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t = *(ct + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(chr,adds)( gamma11t, *gamma11_r ); \
|
||||
} \
|
||||
} \
|
||||
else if ( !PASTEMAC(chr,eq0)( beta_r ) ) \
|
||||
{ \
|
||||
/* c_r = beta_r * c_r + ct;
|
||||
c_i = beta_r * c_i; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t = *(ct + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(chr,xpbys)( gamma11t, beta_r, *gamma11_r ); \
|
||||
PASTEMAC(chr,scals)( beta_r, *gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
else /* if PASTEMAC(chr,eq0)( beta_r ) */ \
|
||||
{ \
|
||||
/* c_r = ct;
|
||||
c_i = 0; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t = *(ct + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(chr,copys)( gamma11t, *gamma11_r ); \
|
||||
PASTEMAC(chr,set0s)( *gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else if ( ( bli_is_ro_packed( schema_a ) && \
|
||||
bli_is_io_packed( schema_b ) ) || \
|
||||
( bli_is_io_packed( schema_a ) && \
|
||||
bli_is_ro_packed( schema_b ) ) \
|
||||
) \
|
||||
{ \
|
||||
if ( PASTEMAC(chr,eq1)( beta_r ) ) \
|
||||
{ \
|
||||
/* c_r = c_r + 0;
|
||||
c_i = c_i + ct; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t = *(ct + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(chr,adds)( gamma11t, *gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
else /* if PASTEMAC(chr,eq0)( beta_r ) */ \
|
||||
{ \
|
||||
/* c_r = 0;
|
||||
c_i = ct; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t = *(ct + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(chr,set0s)( *gamma11_r ); \
|
||||
PASTEMAC(chr,copys)( gamma11t, *gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else /* if ( bli_is_io_packed( schema_a ) && \
|
||||
bli_is_io_packed( schema_b ) ) */ \
|
||||
{ \
|
||||
if ( PASTEMAC(chr,eq1)( beta_r ) ) \
|
||||
{ \
|
||||
/* c_r = c_r - ct;
|
||||
c_i = c_i + 0; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t = *(ct + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(chr,subs)( gamma11t, *gamma11_r ); \
|
||||
} \
|
||||
} \
|
||||
else /* if PASTEMAC(chr,eq0)( beta_r ) */ \
|
||||
{ \
|
||||
/* c_r = -ct;
|
||||
c_i = 0; */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( i = 0; i < n_elem; ++i ) \
|
||||
{ \
|
||||
const ctype_r gamma11t = *(ct + i*incct + j*ldct); \
|
||||
ctype* restrict gamma11 = c + i*incc + j*ldc ; \
|
||||
ctype_r* restrict gamma11_r = &PASTEMAC(ch,real)( *gamma11 ); \
|
||||
ctype_r* restrict gamma11_i = &PASTEMAC(ch,imag)( *gamma11 ); \
|
||||
\
|
||||
PASTEMAC(chr,copys)( -gamma11t, *gamma11_r ); \
|
||||
PASTEMAC(chr,set0s)( *gamma11_i ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC2( gemm4mh, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
@@ -78,7 +78,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
|
||||
\
|
||||
const dim_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
|
||||
\
|
||||
const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \
|
||||
const pack_t schema_b = bli_auxinfo_schema_b( data ); \
|
||||
\
|
||||
const dim_t k2 = 2 * k; \
|
||||
\
|
||||
|
||||
@@ -1,248 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
   Function template for the reference gemmtrsm3m1 micro-kernels; it is
   expanded twice at the end of this block, once with BLIS_TRSM_L_UKR and
   once with BLIS_TRSM_U_UKR as trsmkerid.

   Each generated function:
     1. if alpha has a non-zero imaginary part, scales b11 by alpha in place
        and then uses 1.0 in place of alpha.r;
     2. calls the real-domain gemm micro-kernel three times, on the r, i and
        ri sub-panels of a1x and bx1 (located at offsets 0, is_a/is_b and
        2*is_a/2*is_b); the first two products go to the stack buffers ab_r
        and ab_i, the third is accumulated directly into b11.i;
     3. combines ab_r and ab_i into b11.r and b11.i and refreshes the ri
        sub-panel of b11;
     4. calls the trsm micro-kernel looked up with
        bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ) on a11, b11, c11.
*/
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, arch, suf, trsmkerid ) \
|
||||
\
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a1x, \
|
||||
ctype* restrict a11, \
|
||||
ctype* restrict bx1, \
|
||||
ctype* restrict b11, \
|
||||
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* restrict data, \
|
||||
cntx_t* restrict cntx \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
const num_t dt_r = PASTEMAC(chr,type); \
|
||||
\
|
||||
PASTECH(chr,gemm_ukr_ft) \
|
||||
rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
PASTECH(ch,trsm_ukr_ft) \
|
||||
ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \
|
||||
\
|
||||
const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \
|
||||
const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \
|
||||
\
|
||||
const dim_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \
|
||||
\
|
||||
const dim_t m = mr; \
|
||||
const dim_t n = nr; \
|
||||
\
|
||||
ctype_r ab_r[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype_r ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
ctype_r ab_i[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype_r ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const inc_t rs_ab = 1; \
|
||||
const inc_t cs_ab = mr; \
|
||||
\
|
||||
const inc_t is_a = bli_auxinfo_is_a( data ); \
|
||||
const inc_t is_b = bli_auxinfo_is_b( data ); \
|
||||
\
|
||||
ctype_r* restrict a1x_r = ( ctype_r* )a1x; \
|
||||
ctype_r* restrict a1x_i = ( ctype_r* )a1x + is_a; \
|
||||
ctype_r* restrict a1x_ri = ( ctype_r* )a1x + 2*is_a; \
|
||||
\
|
||||
ctype_r* restrict bx1_r = ( ctype_r* )bx1; \
|
||||
ctype_r* restrict bx1_i = ( ctype_r* )bx1 + is_b; \
|
||||
ctype_r* restrict bx1_ri = ( ctype_r* )bx1 + 2*is_b; \
|
||||
\
|
||||
ctype_r* restrict b11_r = ( ctype_r* )b11; \
|
||||
ctype_r* restrict b11_i = ( ctype_r* )b11 + is_b; \
|
||||
ctype_r* restrict b11_ri = ( ctype_r* )b11 + 2*is_b; \
|
||||
\
|
||||
const inc_t rs_b = packnr; \
|
||||
const inc_t cs_b = 1; \
|
||||
\
|
||||
ctype_r* restrict one_r = PASTEMAC(chr,1); \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \
|
||||
\
|
||||
ctype_r alpha_r = PASTEMAC(ch,real)( *alpha ); \
|
||||
ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \
|
||||
\
|
||||
void* a_next = bli_auxinfo_next_a( data ); \
|
||||
void* b_next = bli_auxinfo_next_b( data ); \
|
||||
\
|
||||
dim_t i, j; \
|
||||
\
|
||||
\
|
||||
/* If alpha has a non-zero imaginary component, pre-scale b11 by alpha here (no copy of c is made at this point). */ \
|
||||
if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \
|
||||
{ \
|
||||
/* We can handle a non-zero imaginary component on alpha, but to do
|
||||
so we have to manually scale b and then use alpha == 1 for the
|
||||
micro-kernel calls. */ \
|
||||
for ( i = 0; i < m; ++i ) \
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
PASTEMAC(ch,scalris)( alpha_r, \
|
||||
alpha_i, \
|
||||
*(b11_r + i*rs_b + j*cs_b), \
|
||||
*(b11_i + i*rs_b + j*cs_b) ); \
|
||||
\
|
||||
/* Use alpha.r == 1.0. */ \
|
||||
alpha_r = *one_r; \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* lower:
|
||||
b11.r = alpha.r * b11.r - ( + a10.r * b01.r - a10.i * b01.i );
|
||||
b11.i = alpha.r * b11.i - ( a10.ri * b01.ri - a10.r * b01.r - a10.i * b01.i );
|
||||
|
||||
upper:
|
||||
b11.r = alpha.r * b11.r - ( + a12.r * b21.r - a12.i * b21.i );
|
||||
b11.i = alpha.r * b11.i - ( a12.ri * b21.ri - a12.r * b21.r - a12.i * b21.i ); */ \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a1x_i, bx1_i, data ); \
|
||||
\
|
||||
/* lower: ab.r = a10.r * b01.r;
|
||||
upper: ab.r = a12.r * b21.r; */ \
|
||||
rgemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
one_r, \
|
||||
a1x_r, \
|
||||
bx1_r, \
|
||||
zero_r, \
|
||||
ab_r, rs_ab, cs_ab, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a1x_ri, bx1_ri, data ); \
|
||||
\
|
||||
/* lower: ab.i = a10.i * b01.i;
|
||||
upper: ab.i = a12.i * b21.i; */ \
|
||||
rgemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
one_r, \
|
||||
a1x_i, \
|
||||
bx1_i, \
|
||||
zero_r, \
|
||||
ab_i, rs_ab, cs_ab, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_next, b_next, data ); \
|
||||
\
|
||||
/* lower: b11.i = alpha.r * b11.i - a10.ri * b01.ri;
|
||||
upper: b11.i = alpha.r * b11.i - a12.ri * b21.ri; */ \
|
||||
rgemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
minus_one_r, \
|
||||
a1x_ri, \
|
||||
bx1_ri, \
|
||||
&alpha_r, \
|
||||
b11_i, rs_b, cs_b, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
\
|
||||
/* b11.r = alpha.r * b11.r - ab.r;
|
||||
b11.r = b11.r + ab.i;
|
||||
b11.i = b11.i + ab.r;
|
||||
b11.i = b11.i + ab.i; */ \
|
||||
for ( i = 0; i < m; ++i ) \
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
{ \
|
||||
ctype_r alphabeta_r = *(ab_r + i*rs_ab + j*cs_ab); \
|
||||
ctype_r alphabeta_i = *(ab_i + i*rs_ab + j*cs_ab); \
|
||||
ctype_r beta11_r = *(b11_r + i*rs_b + j*cs_b); \
|
||||
ctype_r beta11_i = *(b11_i + i*rs_b + j*cs_b); \
|
||||
\
|
||||
PASTEMAC(chr,scals)( alpha_r, beta11_r ); \
|
||||
\
|
||||
PASTEMAC(chr,subs)( alphabeta_r, beta11_r ); \
|
||||
PASTEMAC(chr,adds)( alphabeta_i, beta11_r ); \
|
||||
PASTEMAC(chr,adds)( alphabeta_r, beta11_i ); \
|
||||
PASTEMAC(chr,adds)( alphabeta_i, beta11_i ); \
|
||||
\
|
||||
/* Store the local values back to b11. */ \
|
||||
PASTEMAC(ch,copyris)( beta11_r, \
|
||||
beta11_i, \
|
||||
*(b11_r + i*rs_b + j*cs_b), \
|
||||
*(b11_i + i*rs_b + j*cs_b) ); \
|
||||
\
|
||||
/* Update the ri part of b11. */ \
|
||||
PASTEMAC(chr,add3s)( beta11_r, \
|
||||
beta11_i, \
|
||||
*(b11_ri + i*rs_b + j*cs_b) ); \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* b11 = inv(a11) * b11;
|
||||
c11 = b11; */ \
|
||||
ctrsm_vir_ukr \
|
||||
( \
|
||||
a11, \
|
||||
b11, \
|
||||
c11, rs_c, cs_c, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
\
|
||||
/*
|
||||
PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m1_l_ukr: b11_r after", m, n, \
|
||||
b11_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m1_l_ukr: b11_i after", m, n, \
|
||||
b11_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \
|
||||
*/ \
|
||||
/*
|
||||
PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m1_l_ukr: b01_r", k, n, \
|
||||
b01_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m1_l_ukr: b01_i", k, n, \
|
||||
b01_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m1_l_ukr: b11_r", m, n, \
|
||||
b11_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "gemmtrsm3m1_l_ukr: b11_i", m, n, \
|
||||
b11_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \
|
||||
*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC3( gemmtrsm3m1_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_L_UKR )
|
||||
INSERT_GENTFUNCCO_BASIC3( gemmtrsm3m1_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_U_UKR )
|
||||
@@ -1,230 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/*
   Function template for the reference gemmtrsm4m1 micro-kernels; it is
   expanded twice at the end of this block, once with BLIS_TRSM_L_UKR and
   once with BLIS_TRSM_U_UKR as trsmkerid.

   Each generated function:
     1. if alpha has a non-zero imaginary part, scales b11 by alpha in place
        and then uses 1.0 in place of alpha.r;
     2. calls the real-domain gemm micro-kernel four times on the r and i
        sub-panels of a1x and bx1 (the i sub-panels start is_a / is_b real
        elements after the r sub-panels), accumulating directly into b11.r
        and b11.i;
     3. calls the trsm micro-kernel looked up with
        bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ) on a11, b11, c11.

   Before each gemm call the auxinfo "next a/b" fields are pointed at the
   operands of the following call; the caller's original values are put back
   before the last call.
*/
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, arch, suf, trsmkerid ) \
|
||||
\
|
||||
void PASTEMAC3(ch,opname,arch,suf) \
|
||||
( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a1x, \
|
||||
ctype* restrict a11, \
|
||||
ctype* restrict bx1, \
|
||||
ctype* restrict b11, \
|
||||
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* restrict data, \
|
||||
cntx_t* restrict cntx \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
const num_t dt_r = PASTEMAC(chr,type); \
|
||||
\
|
||||
PASTECH(chr,gemm_ukr_ft) \
|
||||
rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
PASTECH(ch,trsm_ukr_ft) \
|
||||
ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \
|
||||
\
|
||||
const dim_t mr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \
|
||||
const dim_t nr = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \
|
||||
\
|
||||
const dim_t packnr = bli_cntx_get_blksz_max_dt( dt_r, BLIS_NR, cntx ); \
|
||||
\
|
||||
const dim_t m = mr; \
|
||||
const dim_t n = nr; \
|
||||
\
|
||||
const inc_t is_a = bli_auxinfo_is_a( data ); \
|
||||
const inc_t is_b = bli_auxinfo_is_b( data ); \
|
||||
\
|
||||
ctype_r* restrict a1x_r = ( ctype_r* )a1x; \
|
||||
ctype_r* restrict a1x_i = ( ctype_r* )a1x + is_a; \
|
||||
\
|
||||
ctype_r* restrict bx1_r = ( ctype_r* )bx1; \
|
||||
ctype_r* restrict bx1_i = ( ctype_r* )bx1 + is_b; \
|
||||
\
|
||||
ctype_r* restrict b11_r = ( ctype_r* )b11; \
|
||||
ctype_r* restrict b11_i = ( ctype_r* )b11 + is_b; \
|
||||
\
|
||||
const inc_t rs_b = packnr; \
|
||||
const inc_t cs_b = 1; \
|
||||
\
|
||||
ctype_r* restrict one_r = PASTEMAC(chr,1); \
|
||||
ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \
|
||||
\
|
||||
/* A hack to avoid a 'restrict' warning triggered by passing in the
|
||||
same address (one_r) for both alpha and beta when calling the last
|
||||
of the four matrix products. We now use one_r for alpha and this
|
||||
new local variable, onel, for beta. (See issue #328.) */ \
|
||||
ctype_r onel; \
|
||||
ctype_r* restrict onel_r = &onel; \
|
||||
PASTEMAC(chr,set1s)( onel ); \
|
||||
\
|
||||
ctype_r alpha_r = PASTEMAC(ch,real)( *alpha ); \
|
||||
ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \
|
||||
\
|
||||
void* a_next = bli_auxinfo_next_a( data ); \
|
||||
void* b_next = bli_auxinfo_next_b( data ); \
|
||||
\
|
||||
dim_t i, j; \
|
||||
\
|
||||
/*
|
||||
printf( "gemmtrsm4m1_l_ukr: is_a = %lu is_b = %lu\n", is_a, is_b ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: a1x11p_r", m, k+m, \
|
||||
a1x_r, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: a1x11p_i", m, k+m, \
|
||||
a1x_i, 1, PASTEMAC(chr,packmr), "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: bx111p_r", k+m, n, \
|
||||
bx1_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: bx111p_i", k+m, n, \
|
||||
bx1_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
/* If alpha has a non-zero imaginary component, pre-scale b11 by alpha here (no copy of c is made at this point). */ \
|
||||
if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \
|
||||
{ \
|
||||
/* We can handle a non-zero imaginary component on alpha, but to do
|
||||
so we have to manually scale b and then use alpha == 1 for the
|
||||
micro-kernel calls. */ \
|
||||
for ( i = 0; i < m; ++i ) \
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
PASTEMAC(ch,scalris)( alpha_r, \
|
||||
alpha_i, \
|
||||
*(b11_r + i*rs_b + j*cs_b), \
|
||||
*(b11_i + i*rs_b + j*cs_b) ); \
|
||||
\
|
||||
/* Use alpha.r == 1.0. */ \
|
||||
alpha_r = *one_r; \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* lower: b11.r = alpha.r * b11.r - ( a10.r * b01.r - a10.i * b01.i );
|
||||
b11.i = alpha.r * b11.i - ( a10.r * b01.i + a10.i * b01.r );
|
||||
|
||||
upper: b11.r = alpha.r * b11.r - ( a12.r * b21.r - a12.i * b21.i );
|
||||
b11.i = alpha.r * b11.i - ( a12.r * b21.i + a12.i * b21.r ); */ \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a1x_r, bx1_i, data ); \
|
||||
\
|
||||
/* lower: b11.r = alpha.r * b11.r - a10.r * b01.r;
|
||||
upper: b11.r = alpha.r * b11.r - a12.r * b21.r; */ \
|
||||
rgemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
minus_one_r, \
|
||||
a1x_r, \
|
||||
bx1_r, \
|
||||
&alpha_r, \
|
||||
b11_r, rs_b, cs_b, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a1x_i, bx1_r, data ); \
|
||||
\
|
||||
/* lower: b11.i = alpha.r * b11.i - a10.r * b01.i;
|
||||
upper: b11.i = alpha.r * b11.i - a12.r * b21.i; */ \
|
||||
rgemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
minus_one_r, \
|
||||
a1x_r, \
|
||||
bx1_i, \
|
||||
&alpha_r, \
|
||||
b11_i, rs_b, cs_b, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a1x_i, bx1_i, data ); \
|
||||
\
|
||||
/* lower: b11.i = 1.0 * b11.i - a10.i * b01.r;
|
||||
upper: b11.i = 1.0 * b11.i - a12.i * b21.r; */ \
|
||||
rgemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
minus_one_r, \
|
||||
a1x_i, \
|
||||
bx1_r, \
|
||||
one_r, \
|
||||
b11_i, rs_b, cs_b, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_next, b_next, data ); \
|
||||
\
|
||||
/* lower: b11.r = 1.0 * b11.r + a10.i * b01.i;
|
||||
upper: b11.r = 1.0 * b11.r + a12.i * b21.i; */ \
|
||||
rgemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
one_r, \
|
||||
a1x_i, \
|
||||
bx1_i, \
|
||||
onel_r, \
|
||||
b11_r, rs_b, cs_b, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
/*
|
||||
PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: bx111p_r post-gemm", k+m, n, \
|
||||
bx1_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: bx111p_i post-gemm", k+m, n, \
|
||||
bx1_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
/* b11 = inv(a11) * b11;
|
||||
c11 = b11; */ \
|
||||
ctrsm_vir_ukr \
|
||||
( \
|
||||
a11, \
|
||||
b11, \
|
||||
c11, rs_c, cs_c, \
|
||||
data, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: bx111p_r after", k+m, n, \
|
||||
bx1_r, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "gemmtrsm4m1_l_ukr: bx111p_i after", k+m, n, \
|
||||
bx1_i, PASTEMAC(chr,packnr), 1, "%4.1f", "" ); \
|
||||
*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC3( gemmtrsm4m1_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_L_UKR )
|
||||
INSERT_GENTFUNCCO_BASIC3( gemmtrsm4m1_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_U_UKR )
|
||||
@@ -67,7 +67,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
|
||||
const inc_t ld_a = cs_a; \
|
||||
const inc_t ld_b = rs_b; \
|
||||
\
|
||||
const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \
|
||||
const pack_t schema_b = bli_auxinfo_schema_b( data ); \
|
||||
\
|
||||
dim_t iter, i, j, l; \
|
||||
dim_t n_behind; \
|
||||
@@ -277,7 +277,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
|
||||
const inc_t ld_a = cs_a; \
|
||||
const inc_t ld_b = rs_b; \
|
||||
\
|
||||
const pack_t schema_b = bli_cntx_schema_b_panel( cntx ); \
|
||||
const pack_t schema_b = bli_auxinfo_schema_b( data ); \
|
||||
\
|
||||
dim_t iter, i, j, l; \
|
||||
dim_t n_behind; \
|
||||
|
||||
@@ -1,283 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
/*
   Reference trsm micro-kernel template, lower-triangular case, for the 3m1
   induced method.

   The real and imaginary parts of the packed operands live in separate
   real-valued panels: the imaginary panel of a starts is_a elements after
   its real panel, and b carries three panels (real, imaginary, and
   real+imaginary) spaced is_b elements apart. a is addressed with unit row
   stride and a column stride equal to the maximum MR block size; b is
   addressed with a row stride equal to the maximum NR block size and unit
   column stride.

   The MR x NR block of b is processed row by row from the top. For every
   element the kernel

     - subtracts the dot product of the current row of a (entries left of
       the diagonal) with the rows of b that have already been solved,
     - multiplies by the diagonal entry of a, which is stored pre-inverted so
       that no division is needed inside the kernel,
     - writes the result to c (strides rs_c, cs_c) and back into the real
       and imaginary panels of b, and
     - stores real+imaginary into the third panel of b.
*/
#undef  GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, arch, suf ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     ) \
{ \
	/* Block sizes are queried for the real projection of the datatype. */ \
	const num_t dt_real = PASTEMAC(chr,type); \
\
	const dim_t m_blk   = bli_cntx_get_blksz_def_dt( dt_real, BLIS_MR, cntx ); \
	const dim_t n_blk   = bli_cntx_get_blksz_def_dt( dt_real, BLIS_NR, cntx ); \
\
	/* Column stride of packed a and row stride of packed b. */ \
	const inc_t ld_a    = bli_cntx_get_blksz_max_dt( dt_real, BLIS_MR, cntx ); \
	const inc_t ld_b    = bli_cntx_get_blksz_max_dt( dt_real, BLIS_NR, cntx ); \
\
	/* Distance, in real elements, between consecutive panels. */ \
	const inc_t off_a   = bli_auxinfo_is_a( data ); \
	const inc_t off_b   = bli_auxinfo_is_b( data ); \
\
	ctype_r* restrict a_re  = ( ctype_r* )a; \
	ctype_r* restrict a_im  = ( ctype_r* )a + off_a; \
\
	ctype_r* restrict b_re  = ( ctype_r* )b; \
	ctype_r* restrict b_im  = ( ctype_r* )b + off_b; \
	ctype_r* restrict b_sum = ( ctype_r* )b + 2*off_b; \
\
	for ( dim_t row = 0; row < m_blk; ++row ) \
	{ \
		/* Row `row` of a, and row `row` of each panel of b and of c. */ \
		ctype_r* restrict a_row_re  = a_re  + row; \
		ctype_r* restrict a_row_im  = a_im  + row; \
		ctype_r* restrict x_row_re  = b_re  + row*ld_b; \
		ctype_r* restrict x_row_im  = b_im  + row*ld_b; \
		ctype_r* restrict x_row_sum = b_sum + row*ld_b; \
		ctype*   restrict c_row     = c     + row*rs_c; \
\
		for ( dim_t col = 0; col < n_blk; ++col ) \
		{ \
			ctype_r x_re = x_row_re[ col ]; \
			ctype_r x_im = x_row_im[ col ]; \
			ctype_r acc_re; \
			ctype_r acc_im; \
\
			/* acc = a( row, 0:row-1 ) * b( 0:row-1, col ) */ \
			PASTEMAC(chr,set0s)( acc_re ); \
			PASTEMAC(chr,set0s)( acc_im ); \
			for ( dim_t k = 0; k < row; ++k ) \
			{ \
				PASTEMAC(ch,axpyris)( a_row_re[ k*ld_a ], \
				                      a_row_im[ k*ld_a ], \
				                      b_re[ k*ld_b + col ], \
				                      b_im[ k*ld_b + col ], \
				                      acc_re, \
				                      acc_im ); \
			} \
\
			/* x = ( x - acc ) * a( row, row ), the diagonal being stored */ \
			/* as its inverse so a multiplication replaces the division.  */ \
			PASTEMAC(ch,subris)( acc_re, acc_im, x_re, x_im ); \
			PASTEMAC(ch,scalris)( a_row_re[ row*ld_a ], \
			                      a_row_im[ row*ld_a ], \
			                      x_re, \
			                      x_im ); \
\
			/* Emit the solved element to c. */ \
			PASTEMAC(ch,sets)( x_re, x_im, c_row[ col*cs_c ] ); \
\
			/* Write the solved element back into the packed panels. */ \
			PASTEMAC(chr,copys)( x_re, x_row_re[ col ] ); \
			PASTEMAC(chr,copys)( x_im, x_row_im[ col ] ); \
\
			/* Keep the real+imaginary panel in sync. */ \
			PASTEMAC(chr,add3s)( x_re, x_im, x_row_sum[ col ] ); \
		} \
	} \
}
|
||||
|
||||
/* Instantiate the GENTFUNCCO template defined above under the operation name
   trsm3m1_l. NOTE(review): INSERT_GENTFUNCCO_BASIC2 is defined elsewhere in
   BLIS; presumably it emits one function per complex datatype -- confirm. */
INSERT_GENTFUNCCO_BASIC2( trsm3m1_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
|
||||
|
||||
/*
   Reference trsm micro-kernel template, upper-triangular case, for the 3m1
   induced method.

   The real and imaginary parts of the packed operands live in separate
   real-valued panels: the imaginary panel of a starts is_a elements after
   its real panel, and b carries three panels (real, imaginary, and
   real+imaginary) spaced is_b elements apart. a is addressed with unit row
   stride and a column stride equal to the maximum MR block size; b is
   addressed with a row stride equal to the maximum NR block size and unit
   column stride.

   The MR x NR block of b is processed row by row from the bottom. For every
   element the kernel

     - subtracts the dot product of the current row of a (entries right of
       the diagonal) with the rows of b that have already been solved,
     - multiplies by the diagonal entry of a, which is stored pre-inverted so
       that no division is needed inside the kernel,
     - writes the result to c (strides rs_c, cs_c) and back into the real
       and imaginary panels of b, and
     - stores real+imaginary into the third panel of b.
*/
#undef  GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, arch, suf ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     ) \
{ \
	/* Block sizes are queried for the real projection of the datatype. */ \
	const num_t dt_real = PASTEMAC(chr,type); \
\
	const dim_t m_blk   = bli_cntx_get_blksz_def_dt( dt_real, BLIS_MR, cntx ); \
	const dim_t n_blk   = bli_cntx_get_blksz_def_dt( dt_real, BLIS_NR, cntx ); \
\
	/* Column stride of packed a and row stride of packed b. */ \
	const inc_t ld_a    = bli_cntx_get_blksz_max_dt( dt_real, BLIS_MR, cntx ); \
	const inc_t ld_b    = bli_cntx_get_blksz_max_dt( dt_real, BLIS_NR, cntx ); \
\
	/* Distance, in real elements, between consecutive panels. */ \
	const inc_t off_a   = bli_auxinfo_is_a( data ); \
	const inc_t off_b   = bli_auxinfo_is_b( data ); \
\
	ctype_r* restrict a_re  = ( ctype_r* )a; \
	ctype_r* restrict a_im  = ( ctype_r* )a + off_a; \
\
	ctype_r* restrict b_re  = ( ctype_r* )b; \
	ctype_r* restrict b_im  = ( ctype_r* )b + off_b; \
	ctype_r* restrict b_sum = ( ctype_r* )b + 2*off_b; \
\
	for ( dim_t done = 0; done < m_blk; ++done ) \
	{ \
		/* Rows are visited bottom-up; the `done` rows below are solved. */ \
		const dim_t row = m_blk - done - 1; \
\
		/* Row `row` of a, and row `row` of each panel of b and of c. */ \
		ctype_r* restrict a_row_re  = a_re  + row; \
		ctype_r* restrict a_row_im  = a_im  + row; \
		ctype_r* restrict x_row_re  = b_re  + row*ld_b; \
		ctype_r* restrict x_row_im  = b_im  + row*ld_b; \
		ctype_r* restrict x_row_sum = b_sum + row*ld_b; \
		ctype*   restrict c_row     = c     + row*rs_c; \
\
		for ( dim_t col = 0; col < n_blk; ++col ) \
		{ \
			ctype_r x_re = x_row_re[ col ]; \
			ctype_r x_im = x_row_im[ col ]; \
			ctype_r acc_re; \
			ctype_r acc_im; \
\
			/* acc = a( row, row+1:m-1 ) * b( row+1:m-1, col ) */ \
			PASTEMAC(chr,set0s)( acc_re ); \
			PASTEMAC(chr,set0s)( acc_im ); \
			for ( dim_t k = row + 1; k < m_blk; ++k ) \
			{ \
				PASTEMAC(ch,axpyris)( a_row_re[ k*ld_a ], \
				                      a_row_im[ k*ld_a ], \
				                      b_re[ k*ld_b + col ], \
				                      b_im[ k*ld_b + col ], \
				                      acc_re, \
				                      acc_im ); \
			} \
\
			/* x = ( x - acc ) * a( row, row ), the diagonal being stored */ \
			/* as its inverse so a multiplication replaces the division.  */ \
			PASTEMAC(ch,subris)( acc_re, acc_im, x_re, x_im ); \
			PASTEMAC(ch,scalris)( a_row_re[ row*ld_a ], \
			                      a_row_im[ row*ld_a ], \
			                      x_re, \
			                      x_im ); \
\
			/* Emit the solved element to c. */ \
			PASTEMAC(ch,sets)( x_re, x_im, c_row[ col*cs_c ] ); \
\
			/* Write the solved element back into the packed panels. */ \
			PASTEMAC(chr,copys)( x_re, x_row_re[ col ] ); \
			PASTEMAC(chr,copys)( x_im, x_row_im[ col ] ); \
\
			/* Keep the real+imaginary panel in sync. */ \
			PASTEMAC(chr,add3s)( x_re, x_im, x_row_sum[ col ] ); \
		} \
	} \
}
|
||||
|
||||
/* Instantiate the GENTFUNCCO template defined above under the operation name
   trsm3m1_u. NOTE(review): INSERT_GENTFUNCCO_BASIC2 is defined elsewhere in
   BLIS; presumably it emits one function per complex datatype -- confirm. */
INSERT_GENTFUNCCO_BASIC2( trsm3m1_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
@@ -1,284 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
/*
   Reference trsm micro-kernel template, lower-triangular case, for the 4m1
   induced method.

   The real and imaginary parts of the packed operands live in separate
   real-valued panels: the imaginary panel of a starts is_a elements after
   its real panel, and the imaginary panel of b starts is_b elements after
   its real panel. a is addressed with unit row stride and a column stride
   equal to the maximum MR block size; b is addressed with a row stride equal
   to the maximum NR block size and unit column stride.

   The MR x NR block of b is processed row by row from the top. For every
   element the kernel

     - subtracts the dot product of the current row of a (entries left of
       the diagonal) with the rows of b that have already been solved,
     - multiplies by the diagonal entry of a, which is stored pre-inverted so
       that no division is needed inside the kernel, and
     - writes the result to c (strides rs_c, cs_c) and back into the real
       and imaginary panels of b.
*/
#undef  GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, arch, suf ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     ) \
{ \
	/* Block sizes are queried for the real projection of the datatype. */ \
	const num_t dt_real = PASTEMAC(chr,type); \
\
	const dim_t m_blk   = bli_cntx_get_blksz_def_dt( dt_real, BLIS_MR, cntx ); \
	const dim_t n_blk   = bli_cntx_get_blksz_def_dt( dt_real, BLIS_NR, cntx ); \
\
	/* Column stride of packed a and row stride of packed b. */ \
	const inc_t ld_a    = bli_cntx_get_blksz_max_dt( dt_real, BLIS_MR, cntx ); \
	const inc_t ld_b    = bli_cntx_get_blksz_max_dt( dt_real, BLIS_NR, cntx ); \
\
	/* Distance, in real elements, between the real and imaginary panels. */ \
	const inc_t off_a   = bli_auxinfo_is_a( data ); \
	const inc_t off_b   = bli_auxinfo_is_b( data ); \
\
	ctype_r* restrict a_re = ( ctype_r* )a; \
	ctype_r* restrict a_im = ( ctype_r* )a + off_a; \
\
	ctype_r* restrict b_re = ( ctype_r* )b; \
	ctype_r* restrict b_im = ( ctype_r* )b + off_b; \
\
	for ( dim_t row = 0; row < m_blk; ++row ) \
	{ \
		/* Row `row` of a, and row `row` of each panel of b and of c. */ \
		ctype_r* restrict a_row_re = a_re + row; \
		ctype_r* restrict a_row_im = a_im + row; \
		ctype_r* restrict x_row_re = b_re + row*ld_b; \
		ctype_r* restrict x_row_im = b_im + row*ld_b; \
		ctype*   restrict c_row    = c    + row*rs_c; \
\
		for ( dim_t col = 0; col < n_blk; ++col ) \
		{ \
			ctype_r x_re = x_row_re[ col ]; \
			ctype_r x_im = x_row_im[ col ]; \
			ctype_r acc_re; \
			ctype_r acc_im; \
\
			/* acc = a( row, 0:row-1 ) * b( 0:row-1, col ) */ \
			PASTEMAC(chr,set0s)( acc_re ); \
			PASTEMAC(chr,set0s)( acc_im ); \
			for ( dim_t k = 0; k < row; ++k ) \
			{ \
				PASTEMAC(ch,axpyris)( a_row_re[ k*ld_a ], \
				                      a_row_im[ k*ld_a ], \
				                      b_re[ k*ld_b + col ], \
				                      b_im[ k*ld_b + col ], \
				                      acc_re, \
				                      acc_im ); \
			} \
\
			/* x = ( x - acc ) * a( row, row ), the diagonal being stored */ \
			/* as its inverse so a multiplication replaces the division.  */ \
			PASTEMAC(ch,subris)( acc_re, acc_im, x_re, x_im ); \
			PASTEMAC(ch,scalris)( a_row_re[ row*ld_a ], \
			                      a_row_im[ row*ld_a ], \
			                      x_re, \
			                      x_im ); \
\
			/* Emit the solved element to c. */ \
			PASTEMAC(ch,sets)( x_re, x_im, c_row[ col*cs_c ] ); \
\
			/* Write the solved element back into the packed panels. */ \
			PASTEMAC(chr,copys)( x_re, x_row_re[ col ] ); \
			PASTEMAC(chr,copys)( x_im, x_row_im[ col ] ); \
		} \
	} \
}
|
||||
|
||||
/* Instantiate the GENTFUNCCO template defined above under the operation name
   trsm4m1_l. NOTE(review): INSERT_GENTFUNCCO_BASIC2 is defined elsewhere in
   BLIS; presumably it emits one function per complex datatype -- confirm. */
INSERT_GENTFUNCCO_BASIC2( trsm4m1_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
|
||||
|
||||
/*
   Reference trsm micro-kernel template, upper-triangular case, for the 4m1
   induced method.

   The real and imaginary parts of the packed operands live in separate
   real-valued panels: the imaginary panel of a starts is_a elements after
   its real panel, and the imaginary panel of b starts is_b elements after
   its real panel. a is addressed with unit row stride and a column stride
   equal to the maximum MR block size; b is addressed with a row stride equal
   to the maximum NR block size and unit column stride.

   The MR x NR block of b is processed row by row from the bottom. For every
   element the kernel

     - subtracts the dot product of the current row of a (entries right of
       the diagonal) with the rows of b that have already been solved,
     - multiplies by the diagonal entry of a, which is stored pre-inverted so
       that no division is needed inside the kernel, and
     - writes the result to c (strides rs_c, cs_c) and back into the real
       and imaginary panels of b.
*/
#undef  GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, arch, suf ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     ) \
{ \
	/* Block sizes are queried for the real projection of the datatype. */ \
	const num_t dt_real = PASTEMAC(chr,type); \
\
	const dim_t m_blk   = bli_cntx_get_blksz_def_dt( dt_real, BLIS_MR, cntx ); \
	const dim_t n_blk   = bli_cntx_get_blksz_def_dt( dt_real, BLIS_NR, cntx ); \
\
	/* Column stride of packed a and row stride of packed b. */ \
	const inc_t ld_a    = bli_cntx_get_blksz_max_dt( dt_real, BLIS_MR, cntx ); \
	const inc_t ld_b    = bli_cntx_get_blksz_max_dt( dt_real, BLIS_NR, cntx ); \
\
	/* Distance, in real elements, between the real and imaginary panels. */ \
	const inc_t off_a   = bli_auxinfo_is_a( data ); \
	const inc_t off_b   = bli_auxinfo_is_b( data ); \
\
	ctype_r* restrict a_re = ( ctype_r* )a; \
	ctype_r* restrict a_im = ( ctype_r* )a + off_a; \
\
	ctype_r* restrict b_re = ( ctype_r* )b; \
	ctype_r* restrict b_im = ( ctype_r* )b + off_b; \
\
	for ( dim_t done = 0; done < m_blk; ++done ) \
	{ \
		/* Rows are visited bottom-up; the `done` rows below are solved. */ \
		const dim_t row = m_blk - done - 1; \
\
		/* Row `row` of a, and row `row` of each panel of b and of c. */ \
		ctype_r* restrict a_row_re = a_re + row; \
		ctype_r* restrict a_row_im = a_im + row; \
		ctype_r* restrict x_row_re = b_re + row*ld_b; \
		ctype_r* restrict x_row_im = b_im + row*ld_b; \
		ctype*   restrict c_row    = c    + row*rs_c; \
\
		for ( dim_t col = 0; col < n_blk; ++col ) \
		{ \
			ctype_r x_re = x_row_re[ col ]; \
			ctype_r x_im = x_row_im[ col ]; \
			ctype_r acc_re; \
			ctype_r acc_im; \
\
			/* acc = a( row, row+1:m-1 ) * b( row+1:m-1, col ) */ \
			PASTEMAC(chr,set0s)( acc_re ); \
			PASTEMAC(chr,set0s)( acc_im ); \
			for ( dim_t k = row + 1; k < m_blk; ++k ) \
			{ \
				PASTEMAC(ch,axpyris)( a_row_re[ k*ld_a ], \
				                      a_row_im[ k*ld_a ], \
				                      b_re[ k*ld_b + col ], \
				                      b_im[ k*ld_b + col ], \
				                      acc_re, \
				                      acc_im ); \
			} \
\
			/* x = ( x - acc ) * a( row, row ), the diagonal being stored */ \
			/* as its inverse so a multiplication replaces the division.  */ \
			PASTEMAC(ch,subris)( acc_re, acc_im, x_re, x_im ); \
			PASTEMAC(ch,scalris)( a_row_re[ row*ld_a ], \
			                      a_row_im[ row*ld_a ], \
			                      x_re, \
			                      x_im ); \
\
			/* Emit the solved element to c. */ \
			PASTEMAC(ch,sets)( x_re, x_im, c_row[ col*cs_c ] ); \
\
			/* Write the solved element back into the packed panels. */ \
			PASTEMAC(chr,copys)( x_re, x_row_re[ col ] ); \
			PASTEMAC(chr,copys)( x_im, x_row_im[ col ] ); \
		} \
	} \
}
|
||||
|
||||
/* Instantiate the GENTFUNCCO template defined above under the operation name
   trsm4m1_u. NOTE(review): INSERT_GENTFUNCCO_BASIC2 is defined elsewhere in
   BLIS; presumably it emits one function per complex datatype -- confirm. */
INSERT_GENTFUNCCO_BASIC2( trsm4m1_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
|
||||
Reference in New Issue
Block a user