mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Merge pull request #8 from tlrmchlsmth/master
Added multithreading to most level-3 operations.
This commit is contained in:
@@ -111,16 +111,16 @@
|
||||
// Alignment size used when allocating memory dynamically from the operating
|
||||
// system (eg: posix_memalign()). To disable heap alignment and just use
|
||||
// malloc() instead, set this to 1.
|
||||
#define BLIS_HEAP_ADDR_ALIGN_SIZE 32
|
||||
#define BLIS_HEAP_ADDR_ALIGN_SIZE 64
|
||||
|
||||
// Alignment size used when sizing leading dimensions of dynamically
|
||||
// allocated memory.
|
||||
#define BLIS_HEAP_STRIDE_ALIGN_SIZE 32
|
||||
#define BLIS_HEAP_STRIDE_ALIGN_SIZE 64
|
||||
|
||||
// Alignment size used when allocating entire blocks of contiguous memory
|
||||
// from the contiguous memory allocator.
|
||||
#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE
|
||||
|
||||
#define BLIS_CONTIG_STRIDE_ALIGN_SIZE 32
|
||||
|
||||
|
||||
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
|
||||
@@ -154,12 +154,13 @@
|
||||
// while 64 results in 64-bit integers. Any other value results in use of the
|
||||
// C99 type "long int". Note that this ONLY affects integers used within the
|
||||
// BLAS compatibility layer.
|
||||
#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 64
|
||||
#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32
|
||||
|
||||
// Fortran-77 name-mangling macros.
|
||||
#define PASTEF770(name) name ## _
|
||||
#define PASTEF77(ch1,name) ch1 ## name ## _
|
||||
#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _
|
||||
// Underscore is left out to work on BGQ systems
|
||||
#define PASTEF770(name) name //## _
|
||||
#define PASTEF77(ch1,name) ch1 ## name //## _
|
||||
#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name //## _
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -54,27 +54,22 @@
|
||||
// (b) NR (for triangular operations such as trmm and trsm).
|
||||
//
|
||||
|
||||
#define BLIS_DEFAULT_MC_S 256
|
||||
#define BLIS_DEFAULT_KC_S 256
|
||||
#define BLIS_DEFAULT_MC_S 1024
|
||||
#define BLIS_DEFAULT_KC_S 2048
|
||||
#define BLIS_DEFAULT_NC_S 8192
|
||||
|
||||
// 16 MPI RANKS CASE:
|
||||
//#define BLIS_DEFAULT_MC_D 256//1024
|
||||
//#define BLIS_DEFAULT_KC_D 512//2048
|
||||
//
|
||||
|
||||
// 1 MPI RANK CASE:
|
||||
#define BLIS_DEFAULT_MC_D 1008
|
||||
#define BLIS_DEFAULT_KC_D 2016
|
||||
#define BLIS_DEFAULT_NC_D 20480
|
||||
#define BLIS_DEFAULT_MC_D 1024
|
||||
#define BLIS_DEFAULT_KC_D 2048
|
||||
#define BLIS_DEFAULT_NC_D 10240
|
||||
|
||||
#define BLIS_DEFAULT_MC_C 128
|
||||
#define BLIS_DEFAULT_KC_C 256
|
||||
#define BLIS_DEFAULT_NC_C 4096
|
||||
#define BLIS_DEFAULT_MC_C 1024
|
||||
#define BLIS_DEFAULT_KC_C 2048
|
||||
#define BLIS_DEFAULT_NC_C 8192
|
||||
|
||||
#define BLIS_DEFAULT_MC_Z 64
|
||||
#define BLIS_DEFAULT_KC_Z 256
|
||||
#define BLIS_DEFAULT_NC_Z 2048
|
||||
#define BLIS_DEFAULT_MC_Z 768
|
||||
#define BLIS_DEFAULT_KC_Z 1536
|
||||
#define BLIS_DEFAULT_NC_Z 10240
|
||||
|
||||
// -- Register blocksizes --
|
||||
|
||||
@@ -87,7 +82,7 @@
|
||||
#define BLIS_DEFAULT_MR_C 8
|
||||
#define BLIS_DEFAULT_NR_C 4
|
||||
|
||||
#define BLIS_DEFAULT_MR_Z 8
|
||||
#define BLIS_DEFAULT_MR_Z 4
|
||||
#define BLIS_DEFAULT_NR_Z 4
|
||||
|
||||
// NOTE: If the micro-kernel, which is typically unrolled to a factor
|
||||
@@ -153,7 +148,7 @@
|
||||
// -- Default fusing factors for level-1f operations --
|
||||
|
||||
#define BLIS_L1F_FUSE_FAC_S 8
|
||||
#define BLIS_L1F_FUSE_FAC_D 4
|
||||
#define BLIS_L1F_FUSE_FAC_D 8
|
||||
#define BLIS_L1F_FUSE_FAC_C 4
|
||||
#define BLIS_L1F_FUSE_FAC_Z 2
|
||||
|
||||
@@ -182,7 +177,7 @@
|
||||
#include "bli_gemm_8x8.h"
|
||||
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_8x8
|
||||
#define BLIS_DGEMM_UKERNEL_MT bli_dgemm_8x8_mt
|
||||
#define BLIS_ZGEMM_UKERNEL bli_zgemm_8x8
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
|
||||
@@ -36,6 +36,9 @@
|
||||
#define BLIS_CONFIG_H
|
||||
|
||||
|
||||
#define BLIS_TREE_BARRIER
|
||||
#define BLIS_TREE_BARRIER_ARITY 4
|
||||
|
||||
// -- OPERATING SYSTEM ---------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
@@ -78,7 +78,7 @@ GIT_LOG := $(GIT) log --decorate
|
||||
# --- Determine the C compiler and related flags ---
|
||||
CC := icc
|
||||
CPPROCFLAGS :=
|
||||
CMISCFLAGS := -mmic -fasm-blocks -std=c99
|
||||
CMISCFLAGS := -mmic -fasm-blocks -std=c99 -openmp
|
||||
CDBGFLAGS :=
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -O3
|
||||
@@ -98,7 +98,7 @@ ARFLAGS := cru
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
LDFLAGS := -mmic -lm
|
||||
LDFLAGS := -mmic -lm -openmp
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -52,14 +52,16 @@ typedef void (*FUNCPTR_T)(
|
||||
void* kappa,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* p, inc_t rs_p, inc_t cs_p,
|
||||
dim_t pd_p, inc_t ps_p
|
||||
dim_t pd_p, inc_t ps_p,
|
||||
packm_thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1);
|
||||
|
||||
|
||||
void bli_packm_blk_var1( obj_t* c,
|
||||
obj_t* p )
|
||||
obj_t* p,
|
||||
packm_thrinfo_t* t )
|
||||
{
|
||||
num_t dt_cp = bli_obj_datatype( *c );
|
||||
|
||||
@@ -117,31 +119,33 @@ void bli_packm_blk_var1( obj_t* c,
|
||||
buf_kappa,
|
||||
buf_c, rs_c, cs_c,
|
||||
buf_p, rs_p, cs_p,
|
||||
pd_p, ps_p );
|
||||
pd_p, ps_p,
|
||||
t );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
trans_t transc, \
|
||||
bool_t invdiag, \
|
||||
bool_t revifup, \
|
||||
bool_t reviflo, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p \
|
||||
) \
|
||||
void PASTEMAC(ch,varname )( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
trans_t transc, \
|
||||
bool_t invdiag, \
|
||||
bool_t revifup, \
|
||||
bool_t reviflo, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
packm_thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
ctype* restrict kappa_cast = kappa; \
|
||||
ctype* restrict c_cast = c; \
|
||||
@@ -260,7 +264,7 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
p_begin = p_cast; \
|
||||
\
|
||||
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
|
||||
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
|
||||
ic += ic_inc, ip += ip_inc, it += 1 ) \
|
||||
{ \
|
||||
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
|
||||
@@ -315,6 +319,8 @@ void PASTEMAC(ch,varname)( \
|
||||
c_use = c_begin + (panel_off_i )*ldc; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_tri_cxk)( strucc, \
|
||||
diagoffp_i, \
|
||||
diagc, \
|
||||
@@ -328,6 +334,7 @@ void PASTEMAC(ch,varname)( \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p ); \
|
||||
}\
|
||||
\
|
||||
\
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
@@ -341,6 +348,8 @@ void PASTEMAC(ch,varname)( \
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_herm_cxk)( strucc, \
|
||||
diagoffc_i, \
|
||||
uploc, \
|
||||
@@ -352,6 +361,7 @@ void PASTEMAC(ch,varname)( \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
@@ -365,6 +375,8 @@ void PASTEMAC(ch,varname)( \
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_gen_cxk)( BLIS_GENERAL, \
|
||||
0, \
|
||||
BLIS_DENSE, \
|
||||
@@ -376,13 +388,13 @@ void PASTEMAC(ch,varname)( \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
\
|
||||
} \
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
\
|
||||
p_begin += p_inc; \
|
||||
p_begin += p_inc; \
|
||||
} \
|
||||
\
|
||||
\
|
||||
|
||||
@@ -33,7 +33,8 @@
|
||||
*/
|
||||
|
||||
void bli_packm_blk_var1( obj_t* c,
|
||||
obj_t* p );
|
||||
obj_t* p,
|
||||
packm_thrinfo_t* t );
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
@@ -55,7 +56,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
packm_thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_blk_var1 )
|
||||
|
||||
@@ -52,14 +52,16 @@ typedef void (*FUNCPTR_T)(
|
||||
void* kappa,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* p, inc_t rs_p, inc_t cs_p,
|
||||
dim_t pd_p, inc_t ps_p
|
||||
dim_t pd_p, inc_t ps_p,
|
||||
packm_thrinfo_t* thread
|
||||
);
|
||||
|
||||
//static FUNCPTR_T GENARRAY(ftypes,packm_blk_var3);
|
||||
|
||||
|
||||
void bli_packm_blk_var3( obj_t* c,
|
||||
obj_t* p )
|
||||
obj_t* p,
|
||||
packm_thrinfo_t* t )
|
||||
{
|
||||
num_t dt_cp = bli_obj_datatype( *c );
|
||||
|
||||
@@ -98,7 +100,7 @@ void bli_packm_blk_var3( obj_t* c,
|
||||
// in the real domain.
|
||||
if ( bli_is_real( dt_cp ) )
|
||||
{
|
||||
bli_packm_blk_var1( c, p );
|
||||
bli_packm_blk_var1( c, p, t );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -109,23 +111,26 @@ void bli_packm_blk_var3( obj_t* c,
|
||||
// real domain counterparts. (In the aforementioned situation,
|
||||
// applying a real scalar is easy, but applying a complex one is
|
||||
// harder, so we avoid the need altogether with the code below.)
|
||||
if ( bli_obj_scalar_has_nonzero_imag( p ) )
|
||||
{
|
||||
// Detach the scalar.
|
||||
bli_obj_scalar_detach( p, &kappa );
|
||||
if ( thread_am_ochief( t ) ) {
|
||||
if ( bli_obj_scalar_has_nonzero_imag( p ) )
|
||||
{
|
||||
// Detach the scalar.
|
||||
bli_obj_scalar_detach( p, &kappa );
|
||||
|
||||
// Reset the attached scalar (to 1.0).
|
||||
bli_obj_scalar_reset( p );
|
||||
// Reset the attached scalar (to 1.0).
|
||||
bli_obj_scalar_reset( p );
|
||||
|
||||
kappa_p = &kappa;
|
||||
}
|
||||
else
|
||||
{
|
||||
// If the internal scalar of A has only a real component, then
|
||||
// we will apply it later (in the micro-kernel), and so we will
|
||||
// use BLIS_ONE to indicate no scaling during packing.
|
||||
kappa_p = &BLIS_ONE;
|
||||
}
|
||||
kappa_p = &kappa;
|
||||
}
|
||||
else
|
||||
{
|
||||
// If the internal scalar of A has only a real component, then
|
||||
// we will apply it later (in the micro-kernel), and so we will
|
||||
// use BLIS_ONE to indicate no scaling during packing.
|
||||
kappa_p = &BLIS_ONE;
|
||||
}
|
||||
}
|
||||
kappa_p = thread_obroadcast( t, kappa_p );
|
||||
|
||||
|
||||
// Acquire the buffer to the kappa chosen above.
|
||||
@@ -154,7 +159,8 @@ void bli_packm_blk_var3( obj_t* c,
|
||||
buf_kappa,
|
||||
buf_c, rs_c, cs_c,
|
||||
buf_p, rs_p, cs_p,
|
||||
pd_p, ps_p );
|
||||
pd_p, ps_p,
|
||||
t );
|
||||
}
|
||||
|
||||
|
||||
@@ -177,7 +183,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
packm_thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
ctype* restrict kappa_cast = kappa; \
|
||||
@@ -297,8 +304,8 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
p_begin = p_cast; \
|
||||
\
|
||||
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
|
||||
ic += ic_inc, ip += ip_inc, it += 1 ) \
|
||||
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
|
||||
ic += ic_inc, ip += ip_inc, it += 1 ) \
|
||||
{ \
|
||||
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
|
||||
\
|
||||
@@ -352,6 +359,8 @@ void PASTEMAC(ch,varname)( \
|
||||
c_use = c_begin + (panel_off_i )*ldc; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_tri_cxk_ri3)( strucc, \
|
||||
diagoffp_i, \
|
||||
diagc, \
|
||||
@@ -365,6 +374,7 @@ void PASTEMAC(ch,varname)( \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
\
|
||||
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
|
||||
@@ -388,6 +398,8 @@ void PASTEMAC(ch,varname)( \
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_herm_cxk_ri3)( strucc, \
|
||||
diagoffc_i, \
|
||||
uploc, \
|
||||
@@ -400,6 +412,7 @@ void PASTEMAC(ch,varname)( \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
\
|
||||
} \
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
|
||||
} \
|
||||
@@ -412,6 +425,8 @@ void PASTEMAC(ch,varname)( \
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_gen_cxk_ri3)( BLIS_GENERAL, \
|
||||
0, \
|
||||
BLIS_DENSE, \
|
||||
@@ -423,6 +438,7 @@ void PASTEMAC(ch,varname)( \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
|
||||
@@ -438,7 +454,7 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
} \
|
||||
\
|
||||
p_begin += p_inc; \
|
||||
p_begin += p_inc; \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
@@ -33,7 +33,8 @@
|
||||
*/
|
||||
|
||||
void bli_packm_blk_var3( obj_t* c,
|
||||
obj_t* p );
|
||||
obj_t* p,
|
||||
packm_thrinfo_t* t );
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
@@ -55,7 +56,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
packm_thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_blk_var3 )
|
||||
|
||||
@@ -52,14 +52,16 @@ typedef void (*FUNCPTR_T)(
|
||||
void* kappa,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* p, inc_t rs_p, inc_t cs_p,
|
||||
dim_t pd_p, inc_t ps_p
|
||||
dim_t pd_p, inc_t ps_p,
|
||||
packm_thrinfo_t* thread
|
||||
);
|
||||
|
||||
//static FUNCPTR_T GENARRAY(ftypes,packm_blk_var4);
|
||||
|
||||
|
||||
void bli_packm_blk_var4( obj_t* c,
|
||||
obj_t* p )
|
||||
obj_t* p,
|
||||
packm_thrinfo_t* t )
|
||||
{
|
||||
num_t dt_cp = bli_obj_datatype( *c );
|
||||
|
||||
@@ -98,7 +100,7 @@ void bli_packm_blk_var4( obj_t* c,
|
||||
// in the real domain.
|
||||
if ( bli_is_real( dt_cp ) )
|
||||
{
|
||||
bli_packm_blk_var1( c, p );
|
||||
bli_packm_blk_var1( c, p, t );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -109,23 +111,26 @@ void bli_packm_blk_var4( obj_t* c,
|
||||
// real domain counterparts. (In the aforementioned situation,
|
||||
// applying a real scalar is easy, but applying a complex one is
|
||||
// harder, so we avoid the need altogether with the code below.)
|
||||
if ( bli_obj_scalar_has_nonzero_imag( p ) )
|
||||
{
|
||||
// Detach the scalar.
|
||||
bli_obj_scalar_detach( p, &kappa );
|
||||
if( thread_am_ochief( t ) ) {
|
||||
if ( bli_obj_scalar_has_nonzero_imag( p ) )
|
||||
{
|
||||
// Detach the scalar.
|
||||
bli_obj_scalar_detach( p, &kappa );
|
||||
|
||||
// Reset the attached scalar (to 1.0).
|
||||
bli_obj_scalar_reset( p );
|
||||
// Reset the attached scalar (to 1.0).
|
||||
bli_obj_scalar_reset( p );
|
||||
|
||||
kappa_p = &kappa;
|
||||
}
|
||||
else
|
||||
{
|
||||
// If the internal scalar of A has only a real component, then
|
||||
// we will apply it later (in the micro-kernel), and so we will
|
||||
// use BLIS_ONE to indicate no scaling during packing.
|
||||
kappa_p = &BLIS_ONE;
|
||||
}
|
||||
kappa_p = &kappa;
|
||||
}
|
||||
else
|
||||
{
|
||||
// If the internal scalar of A has only a real component, then
|
||||
// we will apply it later (in the micro-kernel), and so we will
|
||||
// use BLIS_ONE to indicate no scaling during packing.
|
||||
kappa_p = &BLIS_ONE;
|
||||
}
|
||||
}
|
||||
kappa_p = thread_obroadcast( t, kappa_p );
|
||||
|
||||
|
||||
// Acquire the buffer to the kappa chosen above.
|
||||
@@ -154,7 +159,8 @@ void bli_packm_blk_var4( obj_t* c,
|
||||
buf_kappa,
|
||||
buf_c, rs_c, cs_c,
|
||||
buf_p, rs_p, cs_p,
|
||||
pd_p, ps_p );
|
||||
pd_p, ps_p,
|
||||
t );
|
||||
}
|
||||
|
||||
|
||||
@@ -177,7 +183,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
packm_thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
ctype* restrict kappa_cast = kappa; \
|
||||
@@ -297,8 +304,8 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
p_begin = p_cast; \
|
||||
\
|
||||
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
|
||||
ic += ic_inc, ip += ip_inc, it += 1 ) \
|
||||
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
|
||||
ic += ic_inc, ip += ip_inc, it += 1 ) \
|
||||
{ \
|
||||
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
|
||||
\
|
||||
@@ -352,6 +359,8 @@ void PASTEMAC(ch,varname)( \
|
||||
c_use = c_begin + (panel_off_i )*ldc; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_tri_cxk_ri)( strucc, \
|
||||
diagoffp_i, \
|
||||
diagc, \
|
||||
@@ -365,6 +374,7 @@ void PASTEMAC(ch,varname)( \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
\
|
||||
@@ -395,6 +405,8 @@ void PASTEMAC(ch,varname)( \
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_herm_cxk_ri)( strucc, \
|
||||
diagoffc_i, \
|
||||
uploc, \
|
||||
@@ -406,6 +418,7 @@ void PASTEMAC(ch,varname)( \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
@@ -419,6 +432,8 @@ void PASTEMAC(ch,varname)( \
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
if( packm_thread_my_iter( it, thread ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_gen_cxk_ri)( BLIS_GENERAL, \
|
||||
0, \
|
||||
BLIS_DENSE, \
|
||||
@@ -430,6 +445,7 @@ void PASTEMAC(ch,varname)( \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
@@ -453,7 +469,7 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
} \
|
||||
\
|
||||
p_begin += p_inc; \
|
||||
p_begin += p_inc; \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
@@ -33,7 +33,8 @@
|
||||
*/
|
||||
|
||||
void bli_packm_blk_var4( obj_t* c,
|
||||
obj_t* p );
|
||||
obj_t* p,
|
||||
packm_thrinfo_t* t );
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
@@ -55,7 +56,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
packm_thrinfo_t* t \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_blk_var4 )
|
||||
|
||||
@@ -37,7 +37,8 @@
|
||||
#define FUNCPTR_T packm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)( obj_t* a,
|
||||
obj_t* p );
|
||||
obj_t* p,
|
||||
packm_thrinfo_t* t );
|
||||
|
||||
static FUNCPTR_T vars[6][3] =
|
||||
{
|
||||
@@ -52,7 +53,8 @@ static FUNCPTR_T vars[6][3] =
|
||||
|
||||
void bli_packm_int( obj_t* a,
|
||||
obj_t* p,
|
||||
packm_t* cntl )
|
||||
packm_t* cntl,
|
||||
packm_thrinfo_t* thread )
|
||||
{
|
||||
varnum_t n;
|
||||
impl_t i;
|
||||
@@ -119,6 +121,10 @@ void bli_packm_int( obj_t* a,
|
||||
|
||||
// Invoke the variant with kappa_use.
|
||||
f( a,
|
||||
p );
|
||||
p,
|
||||
thread );
|
||||
|
||||
// Barrier so that packing is done before computation
|
||||
thread_obarrier( thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -34,5 +34,6 @@
|
||||
|
||||
void bli_packm_int( obj_t* a,
|
||||
obj_t* p,
|
||||
packm_t* cntl );
|
||||
packm_t* cntl,
|
||||
packm_thrinfo_t* thread );
|
||||
|
||||
|
||||
64
frame/1m/packm/bli_packm_threading.c
Normal file
64
frame/1m/packm/bli_packm_threading.c
Normal file
@@ -0,0 +1,64 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// Release a packm thread-info structure.
//
// Only the structure itself is passed to bli_free(). The ocomm and icomm
// communicators it points to are assumed to be released by some other
// owner, so they are intentionally left untouched here.
void bli_packm_thrinfo_free( packm_thrinfo_t* thread )
{
	bli_free( thread );
}
|
||||
|
||||
// Create a packm thread-info object.
//
// All arguments are forwarded unchanged to the generic
// bli_create_thread_info(); its result is returned viewed as a
// packm_thrinfo_t*.
packm_thrinfo_t* bli_create_packm_thread_info( thread_comm_t* ocomm, dim_t ocomm_id,
                                               thread_comm_t* icomm, dim_t icomm_id,
                                               dim_t n_way, dim_t work_id )
{
	packm_thrinfo_t* info;

	info = ( packm_thrinfo_t* )bli_create_thread_info( ocomm, ocomm_id,
	                                                   icomm, icomm_id,
	                                                   n_way, work_id );

	return info;
}
|
||||
|
||||
// Fill in an existing packm thread-info object.
//
// The object is viewed as a generic thrinfo_t and handed, together with
// every other argument, to bli_setup_thread_info().
void bli_setup_packm_thread_info( packm_thrinfo_t* thread,
                                  thread_comm_t* ocomm, dim_t ocomm_id,
                                  thread_comm_t* icomm, dim_t icomm_id,
                                  dim_t n_way, dim_t work_id )
{
	thrinfo_t* generic = ( thrinfo_t* )thread;

	bli_setup_thread_info( generic,
	                       ocomm, ocomm_id,
	                       icomm, icomm_id,
	                       n_way, work_id );
}
|
||||
|
||||
// Initialize a packm thread-info object for the single-threaded case.
//
// Both communicators are pointed at BLIS_SINGLE_COMM with id 0, and the
// work partitioning is set to one way with work id 0.
void bli_setup_packm_single_threaded_info( packm_thrinfo_t* thread )
{
	// Work partitioning: one way, and this thread is work item 0.
	thread->n_way    = 1;
	thread->work_id  = 0;

	// Both communicators use BLIS_SINGLE_COMM, with this thread as id 0.
	thread->ocomm    = &BLIS_SINGLE_COMM;
	thread->icomm    = &BLIS_SINGLE_COMM;
	thread->ocomm_id = 0;
	thread->icomm_id = 0;
}
|
||||
54
frame/1m/packm/bli_packm_threading.h
Normal file
54
frame/1m/packm/bli_packm_threading.h
Normal file
@@ -0,0 +1,54 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
struct packm_thrinfo_s //implements thrinfo_t
|
||||
{
|
||||
thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level
|
||||
dim_t ocomm_id; //Our thread id within that thread comm
|
||||
thread_comm_t* icomm; //The thread communicator for the other threads sharing the same work at this level
|
||||
dim_t icomm_id; //Our thread id within that thread comm
|
||||
|
||||
dim_t n_way; //Number of distinct caucuses used to parallelize the loop
|
||||
dim_t work_id; //What we're working on
|
||||
};
|
||||
typedef struct packm_thrinfo_s packm_thrinfo_t;
|
||||
|
||||
// Evaluates to nonzero when loop iteration `index` belongs to `thread`,
// i.e. when index and thread->work_id are congruent modulo thread->n_way.
//
// Each argument and sub-expression is parenthesized so that compound
// arguments such as `it + 1` or `&info` expand with the intended
// precedence. Note that `thread` is still evaluated more than once, so
// it must not carry side effects.
#define packm_thread_my_iter( index, thread ) \
	( ( (index) % (thread)->n_way ) == ( (thread)->work_id % (thread)->n_way ) )
|
||||
|
||||
void bli_packm_thrinfo_free( packm_thrinfo_t* thread );
|
||||
packm_thrinfo_t* bli_create_packm_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id );
|
||||
void bli_setup_packm_thread_info( packm_thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id );
|
||||
void bli_setup_packm_single_threaded_info( packm_thrinfo_t* thread );
|
||||
@@ -56,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,packm_unb_var1);
|
||||
|
||||
|
||||
void bli_packm_unb_var1( obj_t* c,
|
||||
obj_t* p )
|
||||
obj_t* p,
|
||||
packm_thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_cp = bli_obj_datatype( *c );
|
||||
|
||||
@@ -98,20 +99,22 @@ void bli_packm_unb_var1( obj_t* c,
|
||||
// function pointer.
|
||||
f = ftypes[dt_cp];
|
||||
|
||||
// Invoke the function.
|
||||
f( strucc,
|
||||
diagoffc,
|
||||
diagc,
|
||||
uploc,
|
||||
transc,
|
||||
densify,
|
||||
m_p,
|
||||
n_p,
|
||||
m_max_p,
|
||||
n_max_p,
|
||||
buf_kappa,
|
||||
buf_c, rs_c, cs_c,
|
||||
buf_p, rs_p, cs_p );
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
// Invoke the function.
|
||||
f( strucc,
|
||||
diagoffc,
|
||||
diagc,
|
||||
uploc,
|
||||
transc,
|
||||
densify,
|
||||
m_p,
|
||||
n_p,
|
||||
m_max_p,
|
||||
n_max_p,
|
||||
buf_kappa,
|
||||
buf_c, rs_c, cs_c,
|
||||
buf_p, rs_p, cs_p );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -33,7 +33,8 @@
|
||||
*/
|
||||
|
||||
void bli_packm_unb_var1( obj_t* c,
|
||||
obj_t* p );
|
||||
obj_t* p,
|
||||
packm_thrinfo_t* thread );
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
|
||||
@@ -49,7 +49,8 @@ static FUNCPTR_T vars[2][3] =
|
||||
|
||||
void bli_unpackm_int( obj_t* p,
|
||||
obj_t* a,
|
||||
unpackm_t* cntl )
|
||||
unpackm_t* cntl,
|
||||
packm_thrinfo_t* thread )
|
||||
{
|
||||
// The unpackm operation consists of an optional post-process: castm.
|
||||
// (This post-process is analogous to the castm pre-process in packm.)
|
||||
@@ -122,9 +123,12 @@ void bli_unpackm_int( obj_t* p,
|
||||
f = vars[n][i];
|
||||
|
||||
// Invoke the variant.
|
||||
f( p,
|
||||
&c,
|
||||
cntl );
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
f( p,
|
||||
&c,
|
||||
cntl );
|
||||
}
|
||||
thread_obarrier( thread );
|
||||
|
||||
// Now, if necessary, we cast the contents of c to matrix a. If casting
|
||||
// was not necessary, then we are done because the call to the unpackm
|
||||
|
||||
@@ -34,7 +34,8 @@
|
||||
|
||||
void bli_unpackm_int( obj_t* p,
|
||||
obj_t* a,
|
||||
unpackm_t* cntl );
|
||||
unpackm_t* cntl,
|
||||
packm_thrinfo_t* thread );
|
||||
|
||||
/*
|
||||
void bli_unpackm_init_cast( obj_t* p,
|
||||
|
||||
@@ -76,7 +76,8 @@ void bli_gemv_blk_var1( obj_t* alpha,
|
||||
|
||||
// Copy/pack A1, y1 (if needed).
|
||||
bli_packm_int( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
cntl_sub_packm_a( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &y1, &y1_pack,
|
||||
cntl_sub_packv_y( cntl ) );
|
||||
|
||||
|
||||
@@ -81,7 +81,8 @@ void bli_gemv_blk_var2( obj_t* alpha,
|
||||
|
||||
// Copy/pack A1, x1 (if needed).
|
||||
bli_packm_int( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
cntl_sub_packm_a( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntl_sub_packv_x( cntl ) );
|
||||
|
||||
|
||||
@@ -75,7 +75,8 @@ void bli_ger_blk_var1( obj_t* alpha,
|
||||
|
||||
// Copy/pack A1, x1 (if needed).
|
||||
bli_packm_int( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
cntl_sub_packm_a( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntl_sub_packv_x( cntl ) );
|
||||
|
||||
@@ -90,7 +91,8 @@ void bli_ger_blk_var1( obj_t* alpha,
|
||||
|
||||
// Copy/unpack A1 (if A1 was packed).
|
||||
bli_unpackm_int( &a1_pack, &a1,
|
||||
cntl_sub_unpackm_a( cntl ) );
|
||||
cntl_sub_unpackm_a( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
|
||||
@@ -75,7 +75,8 @@ void bli_ger_blk_var2( obj_t* alpha,
|
||||
|
||||
// Copy/pack A1, y1 (if needed).
|
||||
bli_packm_int( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
cntl_sub_packm_a( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &y1, &y1_pack,
|
||||
cntl_sub_packv_y( cntl ) );
|
||||
|
||||
@@ -90,7 +91,8 @@ void bli_ger_blk_var2( obj_t* alpha,
|
||||
|
||||
// Copy/unpack A1 (if A1 was packed).
|
||||
bli_unpackm_int( &a1_pack, &a1,
|
||||
cntl_sub_unpackm_a( cntl ) );
|
||||
cntl_sub_unpackm_a( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
|
||||
@@ -106,7 +106,8 @@ void bli_hemv_blk_var1( conj_t conjh,
|
||||
|
||||
// Copy/pack A11, x1, y1 (if needed).
|
||||
bli_packm_int( &a11, &a11_pack,
|
||||
cntl_sub_packm_a11( cntl ) );
|
||||
cntl_sub_packm_a11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_int( &y1, &y1_pack,
|
||||
|
||||
@@ -109,7 +109,8 @@ void bli_hemv_blk_var2( conj_t conjh,
|
||||
|
||||
// Copy/pack A11, x1, y1 (if needed).
|
||||
bli_packm_int( &a11, &a11_pack,
|
||||
cntl_sub_packm_a11( cntl ) );
|
||||
cntl_sub_packm_a11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_int( &y1, &y1_pack,
|
||||
|
||||
@@ -106,7 +106,8 @@ void bli_hemv_blk_var3( conj_t conjh,
|
||||
|
||||
// Copy/pack A11, x1, y1 (if needed).
|
||||
bli_packm_int( &a11, &a11_pack,
|
||||
cntl_sub_packm_a11( cntl ) );
|
||||
cntl_sub_packm_a11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_int( &y1, &y1_pack,
|
||||
|
||||
@@ -109,7 +109,8 @@ void bli_hemv_blk_var4( conj_t conjh,
|
||||
|
||||
// Copy/pack A11, x1, y1 (if needed).
|
||||
bli_packm_int( &a11, &a11_pack,
|
||||
cntl_sub_packm_a11( cntl ) );
|
||||
cntl_sub_packm_a11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_int( &y1, &y1_pack,
|
||||
|
||||
@@ -90,7 +90,8 @@ void bli_her_blk_var1( conj_t conjh,
|
||||
|
||||
// Copy/pack C11, x1 (if needed).
|
||||
bli_packm_int( &c11, &c11_pack,
|
||||
cntl_sub_packm_c11( cntl ) );
|
||||
cntl_sub_packm_c11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
@@ -112,7 +113,8 @@ void bli_her_blk_var1( conj_t conjh,
|
||||
|
||||
// Copy/unpack C11 (if C11 was packed).
|
||||
bli_unpackm_int( &c11_pack, &c11,
|
||||
cntl_sub_unpackm_c11( cntl ) );
|
||||
cntl_sub_unpackm_c11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
|
||||
@@ -90,7 +90,8 @@ void bli_her_blk_var2( conj_t conjh,
|
||||
|
||||
// Copy/pack C11, x1 (if needed).
|
||||
bli_packm_int( &c11, &c11_pack,
|
||||
cntl_sub_packm_c11( cntl ) );
|
||||
cntl_sub_packm_c11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
@@ -112,7 +113,8 @@ void bli_her_blk_var2( conj_t conjh,
|
||||
|
||||
// Copy/unpack C11 (if C11 was packed).
|
||||
bli_unpackm_int( &c11_pack, &c11,
|
||||
cntl_sub_unpackm_c11( cntl ) );
|
||||
cntl_sub_unpackm_c11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
|
||||
@@ -101,7 +101,8 @@ void bli_her2_blk_var1( conj_t conjh,
|
||||
|
||||
// Copy/pack C11, x1, y1 (if needed).
|
||||
bli_packm_int( &c11, &c11_pack,
|
||||
cntl_sub_packm_c11( cntl ) );
|
||||
cntl_sub_packm_c11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_int( &y1, &y1_pack,
|
||||
@@ -136,7 +137,8 @@ void bli_her2_blk_var1( conj_t conjh,
|
||||
|
||||
// Copy/unpack C11 (if C11 was packed).
|
||||
bli_unpackm_int( &c11_pack, &c11,
|
||||
cntl_sub_unpackm_c11( cntl ) );
|
||||
cntl_sub_unpackm_c11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
|
||||
@@ -104,7 +104,8 @@ void bli_her2_blk_var2( conj_t conjh,
|
||||
|
||||
// Copy/pack C11, x1, y1 (if needed).
|
||||
bli_packm_int( &c11, &c11_pack,
|
||||
cntl_sub_packm_c11( cntl ) );
|
||||
cntl_sub_packm_c11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_int( &y1, &y1_pack,
|
||||
@@ -139,7 +140,8 @@ void bli_her2_blk_var2( conj_t conjh,
|
||||
|
||||
// Copy/unpack C11 (if C11 was packed).
|
||||
bli_unpackm_int( &c11_pack, &c11,
|
||||
cntl_sub_unpackm_c11( cntl ) );
|
||||
cntl_sub_unpackm_c11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
|
||||
@@ -104,7 +104,8 @@ void bli_her2_blk_var3( conj_t conjh,
|
||||
|
||||
// Copy/pack C11, x1, y1 (if needed).
|
||||
bli_packm_int( &c11, &c11_pack,
|
||||
cntl_sub_packm_c11( cntl ) );
|
||||
cntl_sub_packm_c11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_int( &y1, &y1_pack,
|
||||
@@ -139,7 +140,8 @@ void bli_her2_blk_var3( conj_t conjh,
|
||||
|
||||
// Copy/unpack C11 (if C11 was packed).
|
||||
bli_unpackm_int( &c11_pack, &c11,
|
||||
cntl_sub_unpackm_c11( cntl ) );
|
||||
cntl_sub_unpackm_c11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
|
||||
@@ -101,7 +101,8 @@ void bli_her2_blk_var4( conj_t conjh,
|
||||
|
||||
// Copy/pack C11, x1, y1 (if needed).
|
||||
bli_packm_int( &c11, &c11_pack,
|
||||
cntl_sub_packm_c11( cntl ) );
|
||||
cntl_sub_packm_c11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_int( &y1, &y1_pack,
|
||||
@@ -136,7 +137,8 @@ void bli_her2_blk_var4( conj_t conjh,
|
||||
|
||||
// Copy/unpack C11 (if C11 was packed).
|
||||
bli_unpackm_int( &c11_pack, &c11,
|
||||
cntl_sub_unpackm_c11( cntl ) );
|
||||
cntl_sub_unpackm_c11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
|
||||
@@ -80,7 +80,8 @@ void bli_trmv_l_blk_var1( obj_t* alpha,
|
||||
|
||||
// Copy/pack A11, x1 (if needed).
|
||||
bli_packm_int( &a11, &a11_pack,
|
||||
cntl_sub_packm_a11( cntl ) );
|
||||
cntl_sub_packm_a11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
|
||||
@@ -80,7 +80,8 @@ void bli_trmv_l_blk_var2( obj_t* alpha,
|
||||
|
||||
// Copy/pack A11, x1 (if needed).
|
||||
bli_packm_int( &a11, &a11_pack,
|
||||
cntl_sub_packm_a11( cntl ) );
|
||||
cntl_sub_packm_a11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
|
||||
@@ -80,7 +80,8 @@ void bli_trmv_u_blk_var1( obj_t* alpha,
|
||||
|
||||
// Copy/pack A11, x1 (if needed).
|
||||
bli_packm_int( &a11, &a11_pack,
|
||||
cntl_sub_packm_a11( cntl ) );
|
||||
cntl_sub_packm_a11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
|
||||
@@ -80,7 +80,8 @@ void bli_trmv_u_blk_var2( obj_t* alpha,
|
||||
|
||||
// Copy/pack A11, x1 (if needed).
|
||||
bli_packm_int( &a11, &a11_pack,
|
||||
cntl_sub_packm_a11( cntl ) );
|
||||
cntl_sub_packm_a11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
|
||||
@@ -85,7 +85,8 @@ void bli_trsv_l_blk_var1( obj_t* alpha,
|
||||
|
||||
// Copy/pack A11, x1 (if needed).
|
||||
bli_packm_int( &a11, &a11_pack,
|
||||
cntl_sub_packm_a11( cntl ) );
|
||||
cntl_sub_packm_a11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
|
||||
@@ -85,7 +85,8 @@ void bli_trsv_l_blk_var2( obj_t* alpha,
|
||||
|
||||
// Copy/pack A11, x1 (if needed).
|
||||
bli_packm_int( &a11, &a11_pack,
|
||||
cntl_sub_packm_a11( cntl ) );
|
||||
cntl_sub_packm_a11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
|
||||
@@ -85,7 +85,8 @@ void bli_trsv_u_blk_var1( obj_t* alpha,
|
||||
|
||||
// Copy/pack A11, x1 (if needed).
|
||||
bli_packm_int( &a11, &a11_pack,
|
||||
cntl_sub_packm_a11( cntl ) );
|
||||
cntl_sub_packm_a11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
|
||||
@@ -85,7 +85,8 @@ void bli_trsv_u_blk_var2( obj_t* alpha,
|
||||
|
||||
// Copy/pack A11, x1 (if needed).
|
||||
bli_packm_int( &a11, &a11_pack,
|
||||
cntl_sub_packm_a11( cntl ) );
|
||||
cntl_sub_packm_a11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
|
||||
@@ -50,7 +50,6 @@
|
||||
#include "bli_gemm4m.h"
|
||||
#include "bli_gemm3m.h"
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
//
|
||||
|
||||
@@ -37,45 +37,64 @@
|
||||
void bli_gemm_blk_var1f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
gemm_t* cntl )
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a1, a1_pack;
|
||||
obj_t b_pack;
|
||||
obj_t c1, c1_pack;
|
||||
// The "_s" suffix stands for "lives on the stack".
|
||||
obj_t b_pack_s;
|
||||
obj_t a1_pack_s, c1_pack_s;
|
||||
|
||||
obj_t a1, c1;
|
||||
obj_t* a1_pack = NULL;
|
||||
obj_t* b_pack = NULL;
|
||||
obj_t* c1_pack = NULL;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t m_trans;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bli_obj_init_pack( &a1_pack );
|
||||
bli_obj_init_pack( &b_pack );
|
||||
bli_obj_init_pack( &c1_pack );
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing B.
|
||||
bli_obj_init_pack( &b_pack_s );
|
||||
bli_packm_init( b, &b_pack_s,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
// Since scalm doesn't support multithreading yet, this must be done by the chief thread.
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
b_pack = thread_obroadcast( thread, &b_pack_s );
|
||||
|
||||
// Initialize objects passed into bli_packm_init for A and C
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &a1_pack_s );
|
||||
bli_obj_init_pack( &c1_pack_s );
|
||||
}
|
||||
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
|
||||
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
|
||||
|
||||
// Pack B (if instructed).
|
||||
bli_packm_int( b, b_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
gemm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
m_trans = bli_obj_length_after_trans( *a );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize object for packing B.
|
||||
bli_packm_init( b, &b_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Pack B (if instructed).
|
||||
bli_packm_int( b, &b_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
dim_t start, end;
|
||||
bli_get_range( thread, 0, m_trans,
|
||||
bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
|
||||
&start, &end );
|
||||
|
||||
// Partition along the m dimension.
|
||||
for ( i = 0; i < m_trans; i += b_alg )
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
// NOTE: Use of a (for execution datatype) is intentional!
|
||||
// This causes the right blocksize to be used if c and a are
|
||||
// complex and b is real.
|
||||
b_alg = bli_determine_blocksize_f( i, m_trans, a,
|
||||
b_alg = bli_determine_blocksize_f( i, end, a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1 and C1.
|
||||
@@ -83,38 +102,50 @@ void bli_gemm_blk_var1f( obj_t* a,
|
||||
i, b_alg, a, &a1 );
|
||||
bli_acquire_mpart_t2b( BLIS_SUBPART1,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Initialize objects for packing A1 and C1.
|
||||
bli_packm_init( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &c1, &c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Initialize objects for packing A1 and C1.
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &c1, c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Pack A1 (if instructed).
|
||||
bli_packm_int( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_int( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
gemm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Pack C1 (if instructed).
|
||||
bli_packm_int( &c1, &c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
bli_packm_int( &c1, c1_pack,
|
||||
cntl_sub_packm_c( cntl ),
|
||||
gemm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Perform gemm subproblem.
|
||||
bli_gemm_int( &BLIS_ONE,
|
||||
&a1_pack,
|
||||
&b_pack,
|
||||
a1_pack,
|
||||
b_pack,
|
||||
&BLIS_ONE,
|
||||
&c1_pack,
|
||||
cntl_sub_gemm( cntl ) );
|
||||
c1_pack,
|
||||
cntl_sub_gemm( cntl ),
|
||||
gemm_thread_sub_gemm( thread ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( &c1_pack, &c1,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
// Currently must be done by 1 thread
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
cntl_sub_unpackm_c( cntl ),
|
||||
gemm_thread_sub_ipackm( thread ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_obj_release_pack( &a1_pack );
|
||||
bli_obj_release_pack( &b_pack );
|
||||
bli_obj_release_pack( &c1_pack );
|
||||
thread_obarrier( thread );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_obj_release_pack( b_pack );
|
||||
if( thread_am_ichief( thread ) ){
|
||||
bli_obj_release_pack( a1_pack );
|
||||
bli_obj_release_pack( c1_pack );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -35,5 +35,6 @@
|
||||
void bli_gemm_blk_var1f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
gemm_t* cntl );
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -37,45 +37,63 @@
|
||||
void bli_gemm_blk_var2f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
gemm_t* cntl )
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a_pack;
|
||||
obj_t b1, b1_pack;
|
||||
obj_t c1, c1_pack;
|
||||
obj_t a_pack_s;
|
||||
obj_t b1_pack_s, c1_pack_s;
|
||||
|
||||
obj_t b1, c1;
|
||||
obj_t* a_pack = NULL;
|
||||
obj_t* b1_pack = NULL;
|
||||
obj_t* c1_pack = NULL;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t n_trans;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bli_obj_init_pack( &a_pack );
|
||||
bli_obj_init_pack( &b1_pack );
|
||||
bli_obj_init_pack( &c1_pack );
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing A
|
||||
bli_obj_init_pack( &a_pack_s );
|
||||
bli_packm_init( a, &a_pack_s,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
a_pack = thread_obroadcast( thread, &a_pack_s );
|
||||
|
||||
// Initialize pack objects for B and C that are passed into packm_init().
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &b1_pack_s );
|
||||
bli_obj_init_pack( &c1_pack_s );
|
||||
}
|
||||
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
|
||||
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
|
||||
|
||||
// Pack A (if instructed).
|
||||
bli_packm_int( a, a_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
gemm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
n_trans = bli_obj_width_after_trans( *b );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize object for packing A.
|
||||
bli_packm_init( a, &a_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Pack A (if instructed).
|
||||
bli_packm_int( a, &a_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
dim_t start, end;
|
||||
bli_get_range( thread, 0, n_trans,
|
||||
bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
|
||||
&start, &end );
|
||||
|
||||
// Partition along the n dimension.
|
||||
for ( i = 0; i < n_trans; i += b_alg )
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
// NOTE: Use of b (for execution datatype) is intentional!
|
||||
// This causes the right blocksize to be used if c and a are
|
||||
// complex and b is real.
|
||||
b_alg = bli_determine_blocksize_f( i, n_trans, b,
|
||||
b_alg = bli_determine_blocksize_f( i, end, b,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for B1 and C1.
|
||||
@@ -85,36 +103,48 @@ void bli_gemm_blk_var2f( obj_t* a,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Initialize objects for packing B1 and C1.
|
||||
bli_packm_init( &b1, &b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_init( &c1, &c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_init( &c1, c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Pack B1 (if instructed).
|
||||
bli_packm_int( &b1, &b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_int( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
gemm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Pack C1 (if instructed).
|
||||
bli_packm_int( &c1, &c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
bli_packm_int( &c1, c1_pack,
|
||||
cntl_sub_packm_c( cntl ),
|
||||
gemm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Perform gemm subproblem.
|
||||
bli_gemm_int( &BLIS_ONE,
|
||||
&a_pack,
|
||||
&b1_pack,
|
||||
a_pack,
|
||||
b1_pack,
|
||||
&BLIS_ONE,
|
||||
&c1_pack,
|
||||
cntl_sub_gemm( cntl ) );
|
||||
c1_pack,
|
||||
cntl_sub_gemm( cntl ),
|
||||
gemm_thread_sub_gemm( thread ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( &c1_pack, &c1,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
// Unpack C1 (if C1 was packed).
|
||||
// Currently must be done by 1 thread
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
cntl_sub_unpackm_c( cntl ),
|
||||
gemm_thread_sub_ipackm( thread ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_obj_release_pack( &a_pack );
|
||||
bli_obj_release_pack( &b1_pack );
|
||||
bli_obj_release_pack( &c1_pack );
|
||||
thread_obarrier( thread );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_obj_release_pack( a_pack );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_release_pack( b1_pack );
|
||||
bli_obj_release_pack( c1_pack );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -35,5 +35,6 @@
|
||||
void bli_gemm_blk_var2f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
gemm_t* cntl );
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -37,37 +37,50 @@
|
||||
void bli_gemm_blk_var3f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
gemm_t* cntl )
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a1, a1_pack;
|
||||
obj_t b1, b1_pack;
|
||||
obj_t c_pack;
|
||||
obj_t c_pack_s;
|
||||
obj_t a1_pack_s, b1_pack_s;
|
||||
|
||||
obj_t a1, b1;
|
||||
obj_t* a1_pack = NULL;
|
||||
obj_t* b1_pack = NULL;
|
||||
obj_t* c_pack = NULL;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t k_trans;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bli_obj_init_pack( &a1_pack );
|
||||
bli_obj_init_pack( &b1_pack );
|
||||
bli_obj_init_pack( &c_pack );
|
||||
if( thread_am_ochief( thread ) ){
|
||||
// Initialize object for packing C
|
||||
bli_obj_init_pack( &c_pack_s );
|
||||
bli_packm_init( c, &c_pack_s,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
c_pack = thread_obroadcast( thread, &c_pack_s );
|
||||
|
||||
// Initialize pack objects for A and B that are passed into packm_init().
|
||||
if( thread_am_ichief( thread ) ){
|
||||
bli_obj_init_pack( &a1_pack_s );
|
||||
bli_obj_init_pack( &b1_pack_s );
|
||||
}
|
||||
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
|
||||
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
|
||||
|
||||
// Pack C (if instructed).
|
||||
bli_packm_int( c, c_pack,
|
||||
cntl_sub_packm_c( cntl ),
|
||||
gemm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
k_trans = bli_obj_width_after_trans( *a );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize object for packing C.
|
||||
bli_packm_init( c, &c_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Pack C (if instructed).
|
||||
bli_packm_int( c, &c_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Partition along the k dimension.
|
||||
for ( i = 0; i < k_trans; i += b_alg )
|
||||
{
|
||||
@@ -85,26 +98,32 @@ void bli_gemm_blk_var3f( obj_t* a,
|
||||
i, b_alg, b, &b1 );
|
||||
|
||||
// Initialize objects for packing A1 and B1.
|
||||
bli_packm_init( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &b1, &b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Pack A1 (if instructed).
|
||||
bli_packm_int( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_int( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
gemm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Pack B1 (if instructed).
|
||||
bli_packm_int( &b1, &b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_int( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
gemm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Perform gemm subproblem.
|
||||
bli_gemm_int( &BLIS_ONE,
|
||||
&a1_pack,
|
||||
&b1_pack,
|
||||
a1_pack,
|
||||
b1_pack,
|
||||
&BLIS_ONE,
|
||||
&c_pack,
|
||||
cntl_sub_gemm( cntl ) );
|
||||
c_pack,
|
||||
cntl_sub_gemm( cntl ),
|
||||
gemm_thread_sub_gemm( thread) );
|
||||
|
||||
// This variant executes multiple rank-k updates. Therefore, if the
|
||||
// internal beta scalar on matrix C is non-zero, we must use it
|
||||
@@ -112,17 +131,25 @@ void bli_gemm_blk_var3f( obj_t* a,
|
||||
// And since c_pack is a local obj_t, we can simply overwrite the
|
||||
// internal beta scalar with BLIS_ONE once it has been used in the
|
||||
// first iteration.
|
||||
if ( i == 0 ) bli_obj_scalar_reset( &c_pack );
|
||||
if ( i == 0 ) thread_ibarrier( thread );
|
||||
if ( i == 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack );
|
||||
|
||||
}
|
||||
|
||||
thread_obarrier( thread );
|
||||
|
||||
// Unpack C (if C was packed).
|
||||
bli_unpackm_int( &c_pack, c,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
bli_unpackm_int( c_pack, c,
|
||||
cntl_sub_unpackm_c( cntl ),
|
||||
gemm_thread_sub_opackm( thread ) );
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_obj_release_pack( &a1_pack );
|
||||
bli_obj_release_pack( &b1_pack );
|
||||
bli_obj_release_pack( &c_pack );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_obj_release_pack( c_pack );
|
||||
if( thread_am_ichief( thread ) ){
|
||||
bli_obj_release_pack( a1_pack );
|
||||
bli_obj_release_pack( b1_pack );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -35,5 +35,6 @@
|
||||
void bli_gemm_blk_var3f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
gemm_t* cntl );
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -55,7 +55,6 @@ gemm_t* gemm_cntl_vl_mm;
|
||||
|
||||
gemm_t* gemm_cntl;
|
||||
|
||||
|
||||
void bli_gemm_cntl_init()
|
||||
{
|
||||
// Create blocksize objects for each dimension.
|
||||
|
||||
@@ -74,12 +74,20 @@ void bli_gemm_front( obj_t* alpha,
|
||||
bli_obj_induce_trans( c_local );
|
||||
}
|
||||
|
||||
gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths();
|
||||
dim_t n_threads = thread_num_threads( infos[0] );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_gemm_int( alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
beta,
|
||||
&c_local,
|
||||
cntl );
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
(level3_int_t) bli_gemm_int,
|
||||
alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
beta,
|
||||
&c_local,
|
||||
(void*) cntl,
|
||||
(void**) infos );
|
||||
|
||||
bli_gemm_thrinfo_free_paths( infos, n_threads );
|
||||
}
|
||||
|
||||
|
||||
@@ -39,7 +39,8 @@
|
||||
typedef void (*FUNCPTR_T)( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
gemm_t* cntl );
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread );
|
||||
|
||||
static FUNCPTR_T vars[6][3] =
|
||||
{
|
||||
@@ -57,7 +58,8 @@ void bli_gemm_int( obj_t* alpha,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
gemm_t* cntl )
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a_local;
|
||||
obj_t b_local;
|
||||
@@ -73,7 +75,9 @@ void bli_gemm_int( obj_t* alpha,
|
||||
if ( bli_obj_has_zero_dim( *a ) ||
|
||||
bli_obj_has_zero_dim( *b ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
thread_obarrier( thread );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -86,7 +90,9 @@ void bli_gemm_int( obj_t* alpha,
|
||||
if ( bli_obj_is_zeros( *a ) ||
|
||||
bli_obj_is_zeros( *b ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
thread_obarrier( thread );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -104,22 +110,24 @@ void bli_gemm_int( obj_t* alpha,
|
||||
// packed, this is our last chance to handle the transposition.
|
||||
if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
|
||||
{
|
||||
bli_obj_induce_trans( c_local );
|
||||
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
|
||||
//if( thread_am_ochief( thread ) ) {
|
||||
bli_obj_induce_trans( c_local );
|
||||
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
|
||||
// }
|
||||
}
|
||||
|
||||
// If alpha is non-unit, typecast and apply it to the scalar attached
|
||||
// to B.
|
||||
if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
|
||||
{
|
||||
bli_obj_scalar_apply_scalar( alpha, &b_local );
|
||||
bli_obj_scalar_apply_scalar( alpha, &b_local );
|
||||
}
|
||||
|
||||
// If beta is non-unit, typecast and apply it to the scalar attached
|
||||
// to C.
|
||||
if ( !bli_obj_equals( beta, &BLIS_ONE ) )
|
||||
{
|
||||
bli_obj_scalar_apply_scalar( beta, &c_local );
|
||||
bli_obj_scalar_apply_scalar( beta, &c_local );
|
||||
}
|
||||
|
||||
// Extract the variant number and implementation type.
|
||||
@@ -133,6 +141,7 @@ void bli_gemm_int( obj_t* alpha,
|
||||
f( &a_local,
|
||||
&b_local,
|
||||
&c_local,
|
||||
cntl );
|
||||
cntl,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -37,5 +37,6 @@ void bli_gemm_int( obj_t* alpha,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
gemm_t* cntl );
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -45,7 +45,8 @@ typedef void (*FUNCPTR_T)(
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* gemm_ukr
|
||||
void* gemm_ukr,
|
||||
gemm_thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2);
|
||||
@@ -54,7 +55,8 @@ static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2);
|
||||
void bli_gemm_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
gemm_t* cntl )
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
|
||||
@@ -117,7 +119,8 @@ void bli_gemm_ker_var2( obj_t* a,
|
||||
buf_b, rs_b, pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
gemm_ukr );
|
||||
gemm_ukr,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -133,7 +136,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
gemm_thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
/* Cast the micro-kernel address to its function pointer type. */ \
|
||||
@@ -213,18 +217,21 @@ void PASTEMAC(ch,varname)( \
|
||||
bli_auxinfo_set_ps_a( ps_a, aux ); \
|
||||
bli_auxinfo_set_ps_b( ps_b, aux ); \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
gemm_thrinfo_t* caucus = gemm_thread_sub_gemm( thread ); \
|
||||
dim_t jr_num_threads = thread_n_way( thread ); \
|
||||
dim_t jr_thread_id = thread_work_id( thread ); \
|
||||
dim_t ir_num_threads = thread_n_way( caucus ); \
|
||||
dim_t ir_thread_id = thread_work_id( caucus ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
\
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
@@ -232,18 +239,21 @@ void PASTEMAC(ch,varname)( \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
a1 = a_cast + i * rstep_a; \
|
||||
c11 = c1 + i * rstep_c; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
a2 = gemm_get_next_a_micropanel( caucus, a1, rstep_a ); \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
b2 = gemm_get_next_b_micropanel( thread, b1, cstep_b ); \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
@@ -282,13 +292,7 @@ void PASTEMAC(ch,varname)( \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
\
|
||||
b1 += cstep_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
\
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
|
||||
|
||||
@@ -39,7 +39,8 @@
|
||||
void bli_gemm_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
gemm_t* cntl );
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread );
|
||||
|
||||
|
||||
//
|
||||
@@ -57,7 +58,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
gemm_thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemm_ker_var2 )
|
||||
|
||||
@@ -54,7 +54,8 @@ static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var5);
|
||||
void bli_gemm_ker_var5( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
gemm_t* cntl )
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
|
||||
|
||||
@@ -39,7 +39,8 @@
|
||||
void bli_gemm_ker_var5( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
gemm_t* cntl );
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread );
|
||||
|
||||
|
||||
//
|
||||
|
||||
203
frame/3/gemm/bli_gemm_threading.c
Normal file
203
frame/3/gemm/bli_gemm_threading.c
Normal file
@@ -0,0 +1,203 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "assert.h"
|
||||
|
||||
void bli_setup_gemm_thrinfo_node( gemm_thrinfo_t* thread,
|
||||
thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
gemm_thrinfo_t* sub_gemm )
|
||||
{
|
||||
thread->ocomm = ocomm;
|
||||
thread->ocomm_id = ocomm_id;
|
||||
thread->icomm = icomm;
|
||||
thread->icomm_id = icomm_id;
|
||||
thread->n_way = n_way;
|
||||
thread->work_id = work_id;
|
||||
thread->opackm = opackm;
|
||||
thread->ipackm = ipackm;
|
||||
thread->sub_gemm = sub_gemm;
|
||||
}
|
||||
|
||||
// Initialize a node describing single-threaded execution: both
// communicators are BLIS_SINGLE_COMM with id 0, the loop is split one way
// with work id 0, and both packm nodes are BLIS_PACKM_SINGLE_THREADED.
//
// The node is its own sub_gemm, so following sub_gemm never reaches NULL.
// NOTE(review): bli_gemm_thrinfo_free() only stops on NULL or on
// &BLIS_GEMM_SINGLE_THREADED, so a node set up here should presumably
// never be handed to it unless it is that global -- confirm with callers.
void bli_setup_gemm_single_threaded_info( gemm_thrinfo_t* thread )
{
    bli_setup_gemm_thrinfo_node( thread,
                                 &BLIS_SINGLE_COMM, 0,
                                 &BLIS_SINGLE_COMM, 0,
                                 1, 0,
                                 &BLIS_PACKM_SINGLE_THREADED,
                                 &BLIS_PACKM_SINGLE_THREADED,
                                 thread );
}
|
||||
|
||||
gemm_thrinfo_t* bli_create_gemm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
gemm_thrinfo_t* sub_gemm )
|
||||
{
|
||||
gemm_thrinfo_t* thread = ( gemm_thrinfo_t* ) bli_malloc( sizeof( gemm_thrinfo_t ) );
|
||||
bli_setup_gemm_thrinfo_node( thread, ocomm, ocomm_id,
|
||||
icomm, icomm_id,
|
||||
n_way, work_id,
|
||||
opackm,
|
||||
ipackm,
|
||||
sub_gemm );
|
||||
return thread;
|
||||
}
|
||||
|
||||
// Release one thread's chain of gemm thread-info nodes, starting at
// 'thread' and following sub_gemm.
//
// NULL and the global BLIS_GEMM_SINGLE_THREADED node are ignored. For every
// other node this frees, in order: the outer communicator (only by the
// thread that is its chief, so a communicator shared by several threads is
// released once), the inner communicator of a leaf node, both packm nodes,
// the child chain, and finally the node itself.
void bli_gemm_thrinfo_free( gemm_thrinfo_t* thread )
{
    if ( thread == NULL || thread == &BLIS_GEMM_SINGLE_THREADED ) return;

    // Free Communicators
    if ( thread_am_ochief( thread ) )
        bli_free_communicator( thread->ocomm );

    // A node's inner communicator is normally released as the outer
    // communicator of its child. A leaf has no child, so its inner
    // communicator would otherwise never be released.
    if ( thread->sub_gemm == NULL &&
         thread->icomm    != NULL &&
         thread_am_ichief( thread ) )
        bli_free_communicator( thread->icomm );

    // Free Sub Thrinfos
    bli_packm_thrinfo_free( thread->opackm );
    bli_packm_thrinfo_free( thread->ipackm );
    bli_gemm_thrinfo_free( thread->sub_gemm );
    bli_free( thread );
}
|
||||
// Free an array of per-thread gemm thread-info paths.
//
//   threads   array of 'num' path heads; may be NULL, in which case
//             nothing is done
//   num       number of entries in 'threads'
//
// Each path is released with bli_gemm_thrinfo_free(), then the array
// itself is released with bli_free().
void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t** threads, dim_t num )
{
    if ( threads == NULL ) return;

    // The index has the same type as the count it is compared against.
    for ( dim_t i = 0; i < num; i++ )
        bli_gemm_thrinfo_free( threads[i] );

    bli_free( threads );
}
|
||||
|
||||
gemm_thrinfo_t** bli_create_gemm_thrinfo_paths( )
|
||||
{
|
||||
|
||||
#ifdef BLIS_ENABLE_MULTITHREADING
|
||||
dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" );
|
||||
// dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" );
|
||||
dim_t kc_way = 1;
|
||||
dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" );
|
||||
dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" );
|
||||
dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" );
|
||||
#else
|
||||
dim_t jc_way = 1;
|
||||
dim_t kc_way = 1;
|
||||
dim_t ic_way = 1;
|
||||
dim_t jr_way = 1;
|
||||
dim_t ir_way = 1;
|
||||
#endif
|
||||
|
||||
|
||||
dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
|
||||
assert( global_num_threads != 0 );
|
||||
|
||||
dim_t jc_nt = kc_way * ic_way * jr_way * ir_way;
|
||||
dim_t kc_nt = ic_way * jr_way * ir_way;
|
||||
dim_t ic_nt = jr_way * ir_way;
|
||||
dim_t jr_nt = ir_way;
|
||||
dim_t ir_nt = 1;
|
||||
|
||||
|
||||
gemm_thrinfo_t** paths = (gemm_thrinfo_t**) malloc( global_num_threads * sizeof( gemm_thrinfo_t* ) );
|
||||
|
||||
thread_comm_t* global_comm = bli_create_communicator( global_num_threads );
|
||||
for( int a = 0; a < jc_way; a++ )
|
||||
{
|
||||
thread_comm_t* jc_comm = bli_create_communicator( jc_nt );
|
||||
for( int b = 0; b < kc_way; b++ )
|
||||
{
|
||||
thread_comm_t* kc_comm = bli_create_communicator( kc_nt );
|
||||
for( int c = 0; c < ic_way; c++ )
|
||||
{
|
||||
thread_comm_t* ic_comm = bli_create_communicator( ic_nt );
|
||||
for( int d = 0; d < jr_way; d++ )
|
||||
{
|
||||
thread_comm_t* jr_comm = bli_create_communicator( jr_nt );
|
||||
for( int e = 0; e < ir_way; e++)
|
||||
{
|
||||
thread_comm_t* ir_comm = bli_create_communicator( ir_nt );
|
||||
dim_t ir_comm_id = 0;
|
||||
dim_t jr_comm_id = e*ir_nt + ir_comm_id;
|
||||
dim_t ic_comm_id = d*jr_nt + jr_comm_id;
|
||||
dim_t kc_comm_id = c*ic_nt + ic_comm_id;
|
||||
dim_t jc_comm_id = b*kc_nt + kc_comm_id;
|
||||
dim_t global_comm_id = a*jc_nt + jc_comm_id;
|
||||
|
||||
gemm_thrinfo_t* ir_info = bli_create_gemm_thrinfo_node( jr_comm, jr_comm_id,
|
||||
ir_comm, ir_comm_id,
|
||||
ir_way, e,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
gemm_thrinfo_t* jr_info = bli_create_gemm_thrinfo_node( ic_comm, ic_comm_id,
|
||||
jr_comm, jr_comm_id,
|
||||
jr_way, d,
|
||||
NULL, NULL, ir_info);
|
||||
|
||||
packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
kc_nt, kc_comm_id );
|
||||
|
||||
packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id,
|
||||
jr_comm, jr_comm_id,
|
||||
ic_nt, ic_comm_id );
|
||||
|
||||
gemm_thrinfo_t* ic_info = bli_create_gemm_thrinfo_node( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
ic_way, c,
|
||||
packb, packa, jr_info);
|
||||
|
||||
gemm_thrinfo_t* kc_info = bli_create_gemm_thrinfo_node( jc_comm, jc_comm_id,
|
||||
kc_comm, kc_comm_id,
|
||||
kc_way, b,
|
||||
NULL, NULL, ic_info);
|
||||
|
||||
gemm_thrinfo_t* jc_info = bli_create_gemm_thrinfo_node( global_comm, global_comm_id,
|
||||
jc_comm, jc_comm_id,
|
||||
jc_way, a,
|
||||
NULL, NULL, kc_info);
|
||||
paths[global_comm_id] = jc_info;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return paths;
|
||||
}
|
||||
78
frame/3/gemm/bli_gemm_threading.h
Normal file
78
frame/3/gemm/bli_gemm_threading.h
Normal file
@@ -0,0 +1,78 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
struct gemm_thrinfo_s //implements thrinfo_t
|
||||
{
|
||||
thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level
|
||||
dim_t ocomm_id; //Our thread id within that thread comm
|
||||
thread_comm_t* icomm; //The thread communicator for the other threads sharing the same work at this level
|
||||
dim_t icomm_id; //Our thread id within that thread comm
|
||||
|
||||
dim_t n_way; //Number of distinct caucuses used to parallelize the loop
|
||||
dim_t work_id; //What we're working on
|
||||
|
||||
packm_thrinfo_t* opackm;
|
||||
packm_thrinfo_t* ipackm;
|
||||
struct gemm_thrinfo_s* sub_gemm;
|
||||
};
|
||||
typedef struct gemm_thrinfo_s gemm_thrinfo_t;
|
||||
|
||||
// Accessors for the sub-nodes of a gemm thread-info node. The argument is
// parenthesized so that any pointer expression (e.g. "infos + 1") can be
// passed, not just a plain identifier.
#define gemm_thread_sub_gemm( thread )   ( (thread)->sub_gemm )
#define gemm_thread_sub_opackm( thread ) ( (thread)->opackm )
#define gemm_thread_sub_ipackm( thread ) ( (thread)->ipackm )
|
||||
|
||||
// For use in the gemm kernel loops: address of the micropanel this thread
// will visit next, i.e. the current address advanced by 'step' once for
// each of the thread->n_way ways the loop is split. All arguments are
// parenthesized so that compound expressions (e.g. a step written as
// "rs + cs") are scaled as a whole.
#define gemm_get_next_a_micropanel( thread, a1, step ) ( (a1) + (step) * (thread)->n_way )
#define gemm_get_next_b_micropanel( thread, b1, step ) ( (b1) + (step) * (thread)->n_way )
|
||||
|
||||
gemm_thrinfo_t** bli_create_gemm_thrinfo_paths( );
|
||||
void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t**, dim_t n_threads );
|
||||
|
||||
void bli_setup_gemm_thrinfo_node( gemm_thrinfo_t* thread,
|
||||
thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
gemm_thrinfo_t* sub_gemm );
|
||||
|
||||
gemm_thrinfo_t* bli_create_gemm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
gemm_thrinfo_t* sub_gemm );
|
||||
|
||||
void bli_setup_gemm_single_threaded_info( gemm_thrinfo_t* thread );
|
||||
@@ -80,12 +80,20 @@ void bli_hemm_front( side_t side,
|
||||
bli_obj_swap( a_local, b_local );
|
||||
}
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_gemm_int( alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
beta,
|
||||
&c_local,
|
||||
cntl );
|
||||
gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths();
|
||||
dim_t n_threads = thread_num_threads( infos[0] );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
(level3_int_t) bli_gemm_int,
|
||||
alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
beta,
|
||||
&c_local,
|
||||
(void*) cntl,
|
||||
(void**) infos );
|
||||
|
||||
bli_gemm_thrinfo_free_paths( infos, n_threads );
|
||||
}
|
||||
|
||||
|
||||
@@ -109,20 +109,34 @@ void bli_her2k_front( obj_t* alpha,
|
||||
&c_local,
|
||||
cntl );
|
||||
#else
|
||||
// Invoke herk twice, using beta only the first time.
|
||||
bli_herk_int( alpha,
|
||||
&a_local,
|
||||
&bh_local,
|
||||
beta,
|
||||
&c_local,
|
||||
cntl );
|
||||
|
||||
bli_herk_int( &alpha_conj,
|
||||
&b_local,
|
||||
&ah_local,
|
||||
&BLIS_ONE,
|
||||
&c_local,
|
||||
cntl );
|
||||
// Invoke herk twice, using beta only the first time.
|
||||
herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
|
||||
dim_t n_threads = thread_num_threads( infos[0] );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
(level3_int_t) bli_herk_int,
|
||||
alpha,
|
||||
&a_local,
|
||||
&bh_local,
|
||||
beta,
|
||||
&c_local,
|
||||
(void*) cntl,
|
||||
(void**) infos );
|
||||
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
(level3_int_t) bli_herk_int,
|
||||
&alpha_conj,
|
||||
&b_local,
|
||||
&ah_local,
|
||||
&BLIS_ONE,
|
||||
&c_local,
|
||||
(void*) cntl,
|
||||
(void**) infos );
|
||||
|
||||
bli_herk_thrinfo_free_paths( infos, n_threads );
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -37,42 +37,60 @@
|
||||
void bli_herk_blk_var1f( obj_t* a,
|
||||
obj_t* ah,
|
||||
obj_t* c,
|
||||
herk_t* cntl )
|
||||
herk_t* cntl,
|
||||
herk_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a1, a1_pack;
|
||||
obj_t ah_pack;
|
||||
obj_t c1, c1_pack;
|
||||
obj_t ah_pack_s;
|
||||
obj_t a1_pack_s, c1_pack_s;
|
||||
|
||||
obj_t a1, c1;
|
||||
obj_t* a1_pack;
|
||||
obj_t* c1_pack;
|
||||
obj_t* ah_pack;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t m_trans;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bli_obj_init_pack( &a1_pack );
|
||||
bli_obj_init_pack( &ah_pack );
|
||||
bli_obj_init_pack( &c1_pack );
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing A'.
|
||||
bli_obj_init_pack( &ah_pack_s );
|
||||
bli_packm_init( ah, &ah_pack_s,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
// Since scalm doesn't support multithreading yet, must be done by chief thread (ew)
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
ah_pack = thread_obroadcast( thread, &ah_pack_s );
|
||||
|
||||
// Initialize pack objects that are passed into packm_init() for A and C.
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &a1_pack_s );
|
||||
bli_obj_init_pack( &c1_pack_s );
|
||||
}
|
||||
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
|
||||
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
|
||||
|
||||
// Pack A' (if instructed).
|
||||
bli_packm_int( ah, ah_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
herk_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
m_trans = bli_obj_length_after_trans( *c );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize object for packing A'.
|
||||
bli_packm_init( ah, &ah_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Pack A' (if instructed).
|
||||
bli_packm_int( ah, &ah_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
dim_t start, end;
|
||||
bli_get_range_weighted( thread, 0, m_trans,
|
||||
bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
|
||||
bli_obj_is_upper( *c ), &start, &end );
|
||||
|
||||
// Partition along the m dimension.
|
||||
for ( i = 0; i < m_trans; i += b_alg )
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( i, m_trans, a,
|
||||
b_alg = bli_determine_blocksize_f( i, end, a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1 and C1.
|
||||
@@ -82,36 +100,47 @@ void bli_herk_blk_var1f( obj_t* a,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Initialize objects for packing A1 and C1.
|
||||
bli_packm_init( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &c1, &c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &c1, c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Pack A1 (if instructed).
|
||||
bli_packm_int( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_int( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
herk_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Pack C1 (if instructed).
|
||||
bli_packm_int( &c1, &c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
bli_packm_int( &c1, c1_pack,
|
||||
cntl_sub_packm_c( cntl ),
|
||||
herk_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Perform herk subproblem.
|
||||
bli_herk_int( &BLIS_ONE,
|
||||
&a1_pack,
|
||||
&ah_pack,
|
||||
a1_pack,
|
||||
ah_pack,
|
||||
&BLIS_ONE,
|
||||
&c1_pack,
|
||||
cntl_sub_herk( cntl ) );
|
||||
c1_pack,
|
||||
cntl_sub_herk( cntl ),
|
||||
herk_thread_sub_herk( thread ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( &c1_pack, &c1,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
cntl_sub_unpackm_c( cntl ),
|
||||
herk_thread_sub_ipackm( thread ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_obj_release_pack( &a1_pack );
|
||||
bli_obj_release_pack( &ah_pack );
|
||||
bli_obj_release_pack( &c1_pack );
|
||||
thread_obarrier( thread );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_obj_release_pack( ah_pack );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_release_pack( a1_pack );
|
||||
bli_obj_release_pack( c1_pack );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -35,5 +35,6 @@
|
||||
void bli_herk_blk_var1f( obj_t* a,
|
||||
obj_t* ah,
|
||||
obj_t* c,
|
||||
herk_t* cntl );
|
||||
herk_t* cntl,
|
||||
herk_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -37,49 +37,68 @@
|
||||
void bli_herk_blk_var2f( obj_t* a,
|
||||
obj_t* ah,
|
||||
obj_t* c,
|
||||
herk_t* cntl )
|
||||
herk_t* cntl,
|
||||
herk_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a_pack, aS_pack;
|
||||
obj_t ah1, ah1_pack;
|
||||
obj_t c1;
|
||||
obj_t c1S, c1S_pack;
|
||||
obj_t a_pack_s;
|
||||
obj_t ah1_pack_s, c1S_pack_s;
|
||||
|
||||
obj_t ah1, c1, c1S;
|
||||
obj_t aS_pack;
|
||||
obj_t* a_pack;
|
||||
obj_t* ah1_pack;
|
||||
obj_t* c1S_pack;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t n_trans;
|
||||
subpart_t stored_part;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bli_obj_init_pack( &a_pack );
|
||||
bli_obj_init_pack( &ah1_pack );
|
||||
bli_obj_init_pack( &c1S_pack );
|
||||
|
||||
// The upper and lower variants are identical, except for which
|
||||
// merged subpartition is acquired in the loop body.
|
||||
if ( bli_obj_is_lower( *c ) ) stored_part = BLIS_SUBPART1B;
|
||||
else stored_part = BLIS_SUBPART1T;
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
n_trans = bli_obj_width_after_trans( *c );
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing A
|
||||
bli_obj_init_pack( &a_pack_s );
|
||||
bli_packm_init( a, &a_pack_s,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
a_pack = thread_obroadcast( thread, &a_pack_s );
|
||||
|
||||
// Initialize object for packing A.
|
||||
bli_packm_init( a, &a_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
// Initialize pack objects for C and A' that are passed into packm_init().
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &ah1_pack_s );
|
||||
bli_obj_init_pack( &c1S_pack_s );
|
||||
}
|
||||
ah1_pack = thread_ibroadcast( thread, &ah1_pack_s );
|
||||
c1S_pack = thread_ibroadcast( thread, &c1S_pack_s );
|
||||
|
||||
// Pack A (if instructed).
|
||||
bli_packm_int( a, &a_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_int( a, a_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
herk_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
n_trans = bli_obj_width_after_trans( *c );
|
||||
dim_t start, end;
|
||||
|
||||
// A weighted range is needed here because only a triangle of C is stored.
|
||||
bli_get_range_weighted( thread, 0, n_trans,
|
||||
bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
|
||||
bli_obj_is_lower( *c ), &start, &end );
|
||||
|
||||
// Partition along the n dimension.
|
||||
for ( i = 0; i < n_trans; i += b_alg )
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( i, n_trans, a,
|
||||
b_alg = bli_determine_blocksize_f( i, end, a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1' and C1.
|
||||
@@ -90,42 +109,53 @@ void bli_herk_blk_var2f( obj_t* a,
|
||||
|
||||
// Partition off the stored region of C1 and the corresponding region
|
||||
// of A_pack.
|
||||
bli_acquire_mpart_t2b( stored_part,
|
||||
i, b_alg, &c1, &c1S );
|
||||
bli_acquire_mpart_t2b( stored_part,
|
||||
i, b_alg, &a_pack, &aS_pack );
|
||||
bli_acquire_mpart_t2b( stored_part,
|
||||
i, b_alg, &c1, &c1S );
|
||||
bli_acquire_mpart_t2b( stored_part,
|
||||
i, b_alg, a_pack, &aS_pack );
|
||||
|
||||
// Initialize objects for packing A1' and C1.
|
||||
bli_packm_init( &ah1, &ah1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_init( &c1S, &c1S_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &ah1, ah1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_init( &c1S, c1S_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread ) ;
|
||||
|
||||
// Pack A1' (if instructed).
|
||||
bli_packm_int( &ah1, &ah1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_int( &ah1, ah1_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
herk_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Pack C1 (if instructed).
|
||||
bli_packm_int( &c1S, &c1S_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
bli_packm_int( &c1S, c1S_pack,
|
||||
cntl_sub_packm_c( cntl ),
|
||||
herk_thread_sub_ipackm( thread ) ) ;
|
||||
|
||||
// Perform herk subproblem.
|
||||
bli_herk_int( &BLIS_ONE,
|
||||
&aS_pack,
|
||||
&ah1_pack,
|
||||
ah1_pack,
|
||||
&BLIS_ONE,
|
||||
&c1S_pack,
|
||||
cntl_sub_herk( cntl ) );
|
||||
c1S_pack,
|
||||
cntl_sub_herk( cntl ),
|
||||
herk_thread_sub_herk( thread ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( &c1S_pack, &c1S,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
bli_unpackm_int( c1S_pack, &c1S,
|
||||
cntl_sub_unpackm_c( cntl ),
|
||||
herk_thread_sub_ipackm( thread ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_obj_release_pack( &a_pack );
|
||||
bli_obj_release_pack( &ah1_pack );
|
||||
bli_obj_release_pack( &c1S_pack );
|
||||
thread_obarrier( thread );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_obj_release_pack( a_pack );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_release_pack( ah1_pack );
|
||||
bli_obj_release_pack( c1S_pack );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -35,5 +35,6 @@
|
||||
void bli_herk_blk_var2f( obj_t* a,
|
||||
obj_t* ah,
|
||||
obj_t* c,
|
||||
herk_t* cntl );
|
||||
herk_t* cntl,
|
||||
herk_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -37,37 +37,50 @@
|
||||
void bli_herk_blk_var3f( obj_t* a,
|
||||
obj_t* ah,
|
||||
obj_t* c,
|
||||
herk_t* cntl )
|
||||
herk_t* cntl,
|
||||
herk_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a1, a1_pack;
|
||||
obj_t ah1, ah1_pack;
|
||||
obj_t c_pack;
|
||||
obj_t c_pack_s;
|
||||
obj_t a1_pack_s, ah1_pack_s;
|
||||
|
||||
obj_t a1, ah1;
|
||||
obj_t* a1_pack = NULL;
|
||||
obj_t* ah1_pack = NULL;
|
||||
obj_t* c_pack = NULL;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t k_trans;
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing C.
|
||||
bli_obj_init_pack( &c_pack_s );
|
||||
bli_packm_init( c, &c_pack_s,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
c_pack = thread_obroadcast( thread, &c_pack_s );
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bli_obj_init_pack( &a1_pack );
|
||||
bli_obj_init_pack( &ah1_pack );
|
||||
bli_obj_init_pack( &c_pack );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &a1_pack_s );
|
||||
bli_obj_init_pack( &ah1_pack_s );
|
||||
}
|
||||
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
|
||||
ah1_pack = thread_ibroadcast( thread, &ah1_pack_s );
|
||||
|
||||
// Pack C (if instructed).
|
||||
bli_packm_int( c, c_pack,
|
||||
cntl_sub_packm_c( cntl ),
|
||||
herk_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
k_trans = bli_obj_width_after_trans( *a );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize object for packing C.
|
||||
bli_packm_init( c, &c_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Pack C (if instructed).
|
||||
bli_packm_int( c, &c_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Partition along the k dimension.
|
||||
for ( i = 0; i < k_trans; i += b_alg )
|
||||
{
|
||||
@@ -82,44 +95,59 @@ void bli_herk_blk_var3f( obj_t* a,
|
||||
i, b_alg, ah, &ah1 );
|
||||
|
||||
// Initialize objects for packing A1 and A1'.
|
||||
bli_packm_init( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &ah1, &ah1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &ah1, ah1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Pack A1 (if instructed).
|
||||
bli_packm_int( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_int( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
herk_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Pack B1 (if instructed).
|
||||
bli_packm_int( &ah1, &ah1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_int( &ah1, ah1_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
herk_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Perform herk subproblem.
|
||||
bli_herk_int( &BLIS_ONE,
|
||||
&a1_pack,
|
||||
&ah1_pack,
|
||||
a1_pack,
|
||||
ah1_pack,
|
||||
&BLIS_ONE,
|
||||
&c_pack,
|
||||
cntl_sub_herk( cntl ) );
|
||||
c_pack,
|
||||
cntl_sub_herk( cntl ),
|
||||
herk_thread_sub_herk( thread ) );
|
||||
|
||||
// This variant executes multiple rank-k updates. Therefore, if the
|
||||
// internal beta scalar on matrix C is non-zero, we must use it
|
||||
// only for the first iteration (and then BLIS_ONE for all others).
|
||||
// And since c_pack is a local obj_t, we can simply overwrite the
|
||||
// internal beta scalar with BLIS_ONE once it has been used in the
|
||||
// first iteration.
|
||||
if ( i == 0 ) thread_ibarrier( thread );
|
||||
if ( i == 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack );
|
||||
|
||||
// This variant executes multiple rank-k updates. Therefore, if the
|
||||
// internal beta scalar on matrix C is non-zero, we must use it
|
||||
// only for the first iteration (and then BLIS_ONE for all others).
|
||||
// And since c_pack is a local obj_t, we can simply overwrite the
|
||||
// internal beta scalar with BLIS_ONE once it has been used in the
|
||||
// first iteration.
|
||||
if ( i == 0 ) bli_obj_scalar_reset( &c_pack );
|
||||
}
|
||||
|
||||
thread_obarrier( thread );
|
||||
|
||||
// Unpack C (if C was packed).
|
||||
bli_unpackm_int( &c_pack, c,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
bli_unpackm_int( c_pack, c,
|
||||
cntl_sub_unpackm_c( cntl ),
|
||||
herk_thread_sub_opackm( thread ) );
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_obj_release_pack( &a1_pack );
|
||||
bli_obj_release_pack( &ah1_pack );
|
||||
bli_obj_release_pack( &c_pack );
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
bli_obj_release_pack( c_pack );
|
||||
}
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_release_pack( a1_pack );
|
||||
bli_obj_release_pack( ah1_pack );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -35,5 +35,6 @@
|
||||
void bli_herk_blk_var3f( obj_t* a,
|
||||
obj_t* ah,
|
||||
obj_t* c,
|
||||
herk_t* cntl );
|
||||
herk_t* cntl,
|
||||
herk_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -77,12 +77,20 @@ void bli_herk_front( obj_t* alpha,
|
||||
bli_obj_induce_trans( c_local );
|
||||
}
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_herk_int( alpha,
|
||||
&a_local,
|
||||
&ah_local,
|
||||
beta,
|
||||
&c_local,
|
||||
cntl );
|
||||
herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
|
||||
dim_t n_threads = thread_num_threads( infos[0] );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
(level3_int_t) bli_herk_int,
|
||||
alpha,
|
||||
&a_local,
|
||||
&ah_local,
|
||||
beta,
|
||||
&c_local,
|
||||
(void*) cntl,
|
||||
(void**) infos );
|
||||
|
||||
bli_herk_thrinfo_free_paths( infos, n_threads );
|
||||
}
|
||||
|
||||
|
||||
@@ -39,7 +39,8 @@
|
||||
typedef void (*FUNCPTR_T)( obj_t* a,
|
||||
obj_t* ah,
|
||||
obj_t* c,
|
||||
herk_t* cntl );
|
||||
herk_t* cntl,
|
||||
herk_thrinfo_t* thread );
|
||||
|
||||
static FUNCPTR_T vars[2][4][3] =
|
||||
{
|
||||
@@ -66,7 +67,8 @@ void bli_herk_int( obj_t* alpha,
|
||||
obj_t* ah,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
herk_t* cntl )
|
||||
herk_t* cntl,
|
||||
herk_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a_local;
|
||||
obj_t ah_local;
|
||||
@@ -83,7 +85,9 @@ void bli_herk_int( obj_t* alpha,
|
||||
if ( bli_obj_has_zero_dim( *a ) ||
|
||||
bli_obj_has_zero_dim( *ah ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
thread_obarrier( thread );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -105,22 +109,22 @@ void bli_herk_int( obj_t* alpha,
|
||||
// packed, this is our last chance to handle the transposition.
|
||||
if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
|
||||
{
|
||||
bli_obj_induce_trans( c_local );
|
||||
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
|
||||
bli_obj_induce_trans( c_local );
|
||||
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
|
||||
}
|
||||
|
||||
// If alpha is non-unit, typecast and apply it to the scalar
|
||||
// attached to A'.
|
||||
if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
|
||||
{
|
||||
bli_obj_scalar_apply_scalar( alpha, &ah_local );
|
||||
bli_obj_scalar_apply_scalar( alpha, &ah_local );
|
||||
}
|
||||
|
||||
// If beta is non-unit, typecast and apply it to the scalar
|
||||
// attached to C.
|
||||
if ( !bli_obj_equals( beta, &BLIS_ONE ) )
|
||||
{
|
||||
bli_obj_scalar_apply_scalar( beta, &c_local );
|
||||
bli_obj_scalar_apply_scalar( beta, &c_local );
|
||||
}
|
||||
|
||||
// Set a bool based on the uplo field of C's root object.
|
||||
@@ -138,6 +142,7 @@ void bli_herk_int( obj_t* alpha,
|
||||
f( &a_local,
|
||||
&ah_local,
|
||||
&c_local,
|
||||
cntl );
|
||||
cntl,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -37,5 +37,6 @@ void bli_herk_int( obj_t* alpha,
|
||||
obj_t* ah,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
herk_t* cntl );
|
||||
herk_t* cntl,
|
||||
herk_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)(
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* gemm_ukr
|
||||
void* gemm_ukr,
|
||||
herk_thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2);
|
||||
@@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2);
|
||||
void bli_herk_l_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
herk_t* cntl )
|
||||
herk_t* cntl,
|
||||
herk_thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
|
||||
@@ -121,7 +123,8 @@ void bli_herk_l_ker_var2( obj_t* a,
|
||||
buf_b, rs_b, pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
gemm_ukr );
|
||||
gemm_ukr,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -138,7 +141,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
herk_thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
/* Cast the micro-kernel address to its function pointer type. */ \
|
||||
@@ -246,16 +250,22 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
herk_thrinfo_t* caucus = herk_thread_sub_herk( thread ); \
|
||||
dim_t jr_num_threads = thread_n_way( thread ); \
|
||||
dim_t jr_thread_id = thread_work_id( thread ); \
|
||||
dim_t ir_num_threads = thread_n_way( caucus ); \
|
||||
dim_t ir_thread_id = thread_work_id( caucus ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
@@ -263,9 +273,12 @@ void PASTEMAC(ch,varname)( \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Interior loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
a1 = a_cast + i * rstep_a; \
|
||||
c11 = c1 + i * rstep_c; \
|
||||
\
|
||||
/* Compute the diagonal offset for the submatrix at (i,j). */ \
|
||||
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
|
||||
@@ -273,11 +286,11 @@ void PASTEMAC(ch,varname)( \
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
a2 = herk_get_next_a_micropanel( caucus, a1, rstep_a ); \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
b2 = herk_get_next_b_micropanel( thread, b1, cstep_b ); \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
@@ -344,13 +357,7 @@ void PASTEMAC(ch,varname)( \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
\
|
||||
b1 += cstep_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
@@ -39,7 +39,8 @@
|
||||
void bli_herk_l_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
herk_t* cntl );
|
||||
herk_t* cntl,
|
||||
herk_thrinfo_t* thread );
|
||||
|
||||
|
||||
//
|
||||
@@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
herk_thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( herk_l_ker_var2 )
|
||||
|
||||
203
frame/3/herk/bli_herk_threading.c
Normal file
203
frame/3/herk/bli_herk_threading.c
Normal file
@@ -0,0 +1,203 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "assert.h"
|
||||
|
||||
void bli_setup_herk_thrinfo_node( herk_thrinfo_t* thread,
|
||||
thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
herk_thrinfo_t* sub_herk )
|
||||
{
|
||||
thread->ocomm = ocomm;
|
||||
thread->ocomm_id = ocomm_id;
|
||||
thread->icomm = icomm;
|
||||
thread->icomm_id = icomm_id;
|
||||
thread->n_way = n_way;
|
||||
thread->work_id = work_id;
|
||||
thread->opackm = opackm;
|
||||
thread->ipackm = ipackm;
|
||||
thread->sub_herk = sub_herk;
|
||||
}
|
||||
|
||||
// Initialize *thread as the thread-info node for a single-threaded run.
//
// Both communicators are BLIS_SINGLE_COMM with id 0, the loop is split
// one way with work id 0, both packm sub-infos are
// BLIS_PACKM_SINGLE_THREADED, and sub_herk points back at the node itself
// so the same node serves at every level of the algorithm.
void bli_setup_herk_single_threaded_info( herk_thrinfo_t* thread )
{
	bli_setup_herk_thrinfo_node( thread,
	                             &BLIS_SINGLE_COMM, 0,
	                             &BLIS_SINGLE_COMM, 0,
	                             1, 0,
	                             &BLIS_PACKM_SINGLE_THREADED,
	                             &BLIS_PACKM_SINGLE_THREADED,
	                             thread );
}
|
||||
|
||||
herk_thrinfo_t* bli_create_herk_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
herk_thrinfo_t* sub_herk )
|
||||
{
|
||||
herk_thrinfo_t* thread = ( herk_thrinfo_t* ) bli_malloc( sizeof( herk_thrinfo_t ) );
|
||||
bli_setup_herk_thrinfo_node( thread, ocomm, ocomm_id,
|
||||
icomm, icomm_id,
|
||||
n_way, work_id,
|
||||
opackm,
|
||||
ipackm,
|
||||
sub_herk );
|
||||
return thread;
|
||||
}
|
||||
|
||||
// Release a herk thread-info node and everything reachable through it.
//
// A NULL node is ignored. Otherwise the outer communicator is released
// only when thread_am_ochief( thread ) holds, both packm sub-infos are
// released, the sub_herk chain is released recursively, and finally the
// node itself is freed. The inner communicator (icomm) is not released
// here.
void bli_herk_thrinfo_free( herk_thrinfo_t* thread)
{
	if ( thread != NULL )
	{
		// Free Communicators
		if ( thread_am_ochief( thread ) )
		{
			bli_free_communicator( thread->ocomm );
		}

		// Free Sub Thrinfos
		bli_packm_thrinfo_free( thread->opackm );
		bli_packm_thrinfo_free( thread->ipackm );
		bli_herk_thrinfo_free( thread->sub_herk );

		// The node goes last: its fields were still needed above.
		bli_free( thread );
	}
}
|
||||
// Release an array of per-thread herk thread-info paths.
//
// threads: array of num path heads; each is released with
//          bli_herk_thrinfo_free(), then the array itself is released with
//          bli_free(). A NULL array is ignored.
// num:     number of entries in threads.
void bli_herk_thrinfo_free_paths( herk_thrinfo_t** threads, dim_t num )
{
	if ( threads == NULL ) return;

	// Index with dim_t so the counter has the same type as the bound.
	for ( dim_t i = 0; i < num; i++ )
	{
		bli_herk_thrinfo_free( threads[i] );
	}

	bli_free( threads );
}
|
||||
|
||||
herk_thrinfo_t** bli_create_herk_thrinfo_paths( )
|
||||
{
|
||||
|
||||
#ifdef BLIS_ENABLE_MULTITHREADING
|
||||
dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" );
|
||||
// dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" );
|
||||
dim_t kc_way = 1;
|
||||
dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" );
|
||||
dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" );
|
||||
dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" );
|
||||
#else
|
||||
dim_t jc_way = 1;
|
||||
dim_t kc_way = 1;
|
||||
dim_t ic_way = 1;
|
||||
dim_t jr_way = 1;
|
||||
dim_t ir_way = 1;
|
||||
#endif
|
||||
|
||||
dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
|
||||
assert( global_num_threads != 0 );
|
||||
|
||||
dim_t jc_nt = kc_way * ic_way * jr_way * ir_way;
|
||||
dim_t kc_nt = ic_way * jr_way * ir_way;
|
||||
dim_t ic_nt = jr_way * ir_way;
|
||||
dim_t jr_nt = ir_way;
|
||||
dim_t ir_nt = 1;
|
||||
|
||||
|
||||
herk_thrinfo_t** paths = (herk_thrinfo_t**) malloc( global_num_threads * sizeof( herk_thrinfo_t* ) );
|
||||
|
||||
thread_comm_t* global_comm = bli_create_communicator( global_num_threads );
|
||||
for( int a = 0; a < jc_way; a++ )
|
||||
{
|
||||
thread_comm_t* jc_comm = bli_create_communicator( jc_nt );
|
||||
for( int b = 0; b < kc_way; b++ )
|
||||
{
|
||||
thread_comm_t* kc_comm = bli_create_communicator( kc_nt );
|
||||
for( int c = 0; c < ic_way; c++ )
|
||||
{
|
||||
thread_comm_t* ic_comm = bli_create_communicator( ic_nt );
|
||||
for( int d = 0; d < jr_way; d++ )
|
||||
{
|
||||
thread_comm_t* jr_comm = bli_create_communicator( jr_nt );
|
||||
for( int e = 0; e < ir_way; e++)
|
||||
{
|
||||
thread_comm_t* ir_comm = bli_create_communicator( ir_nt );
|
||||
dim_t ir_comm_id = 0;
|
||||
dim_t jr_comm_id = e*ir_nt + ir_comm_id;
|
||||
dim_t ic_comm_id = d*jr_nt + jr_comm_id;
|
||||
dim_t kc_comm_id = c*ic_nt + ic_comm_id;
|
||||
dim_t jc_comm_id = b*kc_nt + kc_comm_id;
|
||||
dim_t global_comm_id = a*jc_nt + jc_comm_id;
|
||||
|
||||
herk_thrinfo_t* ir_info = bli_create_herk_thrinfo_node( jr_comm, jr_comm_id,
|
||||
ir_comm, ir_comm_id,
|
||||
ir_way, e,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
herk_thrinfo_t* jr_info = bli_create_herk_thrinfo_node( ic_comm, ic_comm_id,
|
||||
jr_comm, jr_comm_id,
|
||||
jr_way, d,
|
||||
NULL, NULL, ir_info);
|
||||
|
||||
packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
kc_nt, kc_comm_id );
|
||||
|
||||
packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id,
|
||||
jr_comm, jr_comm_id,
|
||||
ic_nt, ic_comm_id );
|
||||
|
||||
herk_thrinfo_t* ic_info = bli_create_herk_thrinfo_node( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
ic_way, c,
|
||||
packb, packa, jr_info);
|
||||
|
||||
herk_thrinfo_t* kc_info = bli_create_herk_thrinfo_node( jc_comm, jc_comm_id,
|
||||
kc_comm, kc_comm_id,
|
||||
kc_way, b,
|
||||
NULL, NULL, ic_info);
|
||||
|
||||
herk_thrinfo_t* jc_info = bli_create_herk_thrinfo_node( global_comm, global_comm_id,
|
||||
jc_comm, jc_comm_id,
|
||||
jc_way, a,
|
||||
NULL, NULL, kc_info);
|
||||
|
||||
paths[global_comm_id] = jc_info;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return paths;
|
||||
}
|
||||
79
frame/3/herk/bli_herk_threading.h
Normal file
79
frame/3/herk/bli_herk_threading.h
Normal file
@@ -0,0 +1,79 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
// One node of a thread's herk thread-info path. Nodes are chained through
// sub_herk; bli_create_herk_thrinfo_paths() builds one chain per thread.
struct herk_thrinfo_s //implements thrinfo_t
|
||||
{
|
||||
thread_comm_t* ocomm; //Outer communicator: the other threads sharing the same work at this level
|
||||
dim_t ocomm_id; //Our thread id within ocomm
|
||||
thread_comm_t* icomm; //Inner communicator; bli_create_herk_thrinfo_paths() passes the same communicator as ocomm of the sub_herk node
|
||||
dim_t icomm_id; //Our thread id within icomm
|
||||
|
||||
dim_t n_way; //Number of distinct caucuses used to parallelize the loop
|
||||
dim_t work_id; //What we're working on (which of the n_way caucuses this thread is in)
|
||||
|
||||
packm_thrinfo_t* opackm; //Thread info handed to packm/unpackm for the operand shared across ocomm (may be NULL)
|
||||
packm_thrinfo_t* ipackm; //Thread info handed to packm/unpackm for the operand shared across icomm (may be NULL)
|
||||
struct herk_thrinfo_s* sub_herk; //Node for the next loop level (NULL at the end of the chain)
|
||||
};
|
||||
typedef struct herk_thrinfo_s herk_thrinfo_t;
|
||||
|
||||
// Accessors for the child nodes of a herk_thrinfo_t. The argument and the
// whole expansion are parenthesized so that any pointer-valued expression
// (e.g. &node or nodes + i) can be passed safely.
#define herk_thread_sub_herk( thread )   ( ( thread )->sub_herk )
#define herk_thread_sub_opackm( thread ) ( ( thread )->opackm )
#define herk_thread_sub_ipackm( thread ) ( ( thread )->ipackm )

// For use in herk micro-kernel
// Address of the next micro-panel this thread will visit: the current
// panel address advanced by step * n_way elements.
#define herk_get_next_a_micropanel( thread, a1, step ) ( ( a1 ) + ( step ) * ( thread )->n_way )
#define herk_get_next_b_micropanel( thread, b1, step ) ( ( b1 ) + ( step ) * ( thread )->n_way )
|
||||
|
||||
|
||||
herk_thrinfo_t** bli_create_herk_thrinfo_paths( );
|
||||
void bli_herk_thrinfo_free_paths( herk_thrinfo_t** paths, dim_t n_threads );
|
||||
|
||||
void bli_setup_herk_thrinfo_node( herk_thrinfo_t* thread,
|
||||
thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
herk_thrinfo_t* sub_herk );
|
||||
|
||||
herk_thrinfo_t* bli_create_herk_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
herk_thrinfo_t* sub_herk );
|
||||
|
||||
void bli_setup_herk_single_threaded_info( herk_thrinfo_t* thread );
|
||||
@@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)(
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* gemm_ukr
|
||||
void* gemm_ukr,
|
||||
herk_thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2);
|
||||
@@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2);
|
||||
void bli_herk_u_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
herk_t* cntl )
|
||||
herk_t* cntl,
|
||||
herk_thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
|
||||
@@ -121,7 +123,8 @@ void bli_herk_u_ker_var2( obj_t* a,
|
||||
buf_b, rs_b, pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
gemm_ukr );
|
||||
gemm_ukr,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -138,7 +141,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
herk_thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
/* Cast the micro-kernel address to its function pointer type. */ \
|
||||
@@ -246,16 +250,22 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
herk_thrinfo_t* caucus = herk_thread_sub_herk( thread ); \
|
||||
dim_t jr_num_threads = thread_n_way( thread ); \
|
||||
dim_t jr_thread_id = thread_work_id( thread ); \
|
||||
dim_t ir_num_threads = thread_n_way( caucus ); \
|
||||
dim_t ir_thread_id = thread_work_id( caucus ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
@@ -263,9 +273,12 @@ void PASTEMAC(ch,varname)( \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Interior loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
a1 = a_cast + i * rstep_a; \
|
||||
c11 = c1 + i * rstep_c; \
|
||||
\
|
||||
/* Compute the diagonal offset for the submatrix at (i,j). */ \
|
||||
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
|
||||
@@ -273,11 +286,11 @@ void PASTEMAC(ch,varname)( \
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
a2 = herk_get_next_a_micropanel( caucus, a1, rstep_a ); \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
b2 = herk_get_next_b_micropanel( thread, b1, cstep_b ); \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
@@ -344,13 +357,7 @@ void PASTEMAC(ch,varname)( \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
\
|
||||
b1 += cstep_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
@@ -39,7 +39,8 @@
|
||||
void bli_herk_u_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
herk_t* cntl );
|
||||
herk_t* cntl,
|
||||
herk_thrinfo_t* thread );
|
||||
|
||||
|
||||
//
|
||||
@@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
herk_thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( herk_u_ker_var2 )
|
||||
|
||||
@@ -79,12 +79,20 @@ void bli_symm_front( side_t side,
|
||||
bli_obj_swap( a_local, b_local );
|
||||
}
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_gemm_int( alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
beta,
|
||||
&c_local,
|
||||
cntl );
|
||||
gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths();
|
||||
dim_t n_threads = thread_num_threads( infos[0] );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
(level3_int_t) bli_gemm_int,
|
||||
alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
beta,
|
||||
&c_local,
|
||||
(void*) cntl,
|
||||
(void**) infos );
|
||||
|
||||
bli_gemm_thrinfo_free_paths( infos, n_threads );
|
||||
}
|
||||
|
||||
|
||||
@@ -93,19 +93,31 @@ void bli_syr2k_front( obj_t* alpha,
|
||||
cntl );
|
||||
#else
|
||||
// Invoke herk twice, using beta only the first time.
|
||||
bli_herk_int( alpha,
|
||||
&a_local,
|
||||
&bt_local,
|
||||
beta,
|
||||
&c_local,
|
||||
cntl );
|
||||
herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
|
||||
dim_t n_threads = thread_num_threads( infos[0] );
|
||||
|
||||
bli_herk_int( alpha,
|
||||
&b_local,
|
||||
&at_local,
|
||||
&BLIS_ONE,
|
||||
&c_local,
|
||||
cntl );
|
||||
// Invoke the internal back-end.
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
(level3_int_t) bli_herk_int,
|
||||
alpha,
|
||||
&a_local,
|
||||
&bt_local,
|
||||
beta,
|
||||
&c_local,
|
||||
(void*) cntl,
|
||||
(void**) infos );
|
||||
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
(level3_int_t) bli_herk_int,
|
||||
alpha,
|
||||
&b_local,
|
||||
&at_local,
|
||||
&BLIS_ONE,
|
||||
&c_local,
|
||||
(void*) cntl,
|
||||
(void**) infos );
|
||||
|
||||
bli_herk_thrinfo_free_paths( infos, n_threads );
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -72,13 +72,21 @@ void bli_syrk_front( obj_t* alpha,
|
||||
{
|
||||
bli_obj_induce_trans( c_local );
|
||||
}
|
||||
|
||||
herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
|
||||
dim_t n_threads = thread_num_threads( infos[0] );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_herk_int( alpha,
|
||||
&a_local,
|
||||
&at_local,
|
||||
beta,
|
||||
&c_local,
|
||||
cntl );
|
||||
// Invoke the internal back-end.
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
(level3_int_t) bli_herk_int,
|
||||
alpha,
|
||||
&a_local,
|
||||
&at_local,
|
||||
beta,
|
||||
&c_local,
|
||||
(void*) cntl,
|
||||
(void**) infos );
|
||||
|
||||
bli_herk_thrinfo_free_paths( infos, n_threads );
|
||||
}
|
||||
|
||||
|
||||
@@ -37,21 +37,48 @@
|
||||
void bli_trmm_blk_var1f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl )
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a1, a1_pack;
|
||||
obj_t b_pack;
|
||||
obj_t c1, c1_pack;
|
||||
obj_t b_pack_s;
|
||||
obj_t a1_pack_s, c1_pack_s;
|
||||
|
||||
obj_t a1, c1;
|
||||
obj_t* a1_pack = NULL;
|
||||
obj_t* b_pack = NULL;
|
||||
obj_t* c1_pack = NULL;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t m_trans;
|
||||
dim_t offA;
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing B.
|
||||
bli_obj_init_pack( &b_pack_s );
|
||||
bli_packm_init( b, &b_pack_s,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
// Since scalm doesn't support multithreading yet, must be done by chief thread (ew)
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
b_pack = thread_obroadcast( thread, &b_pack_s );
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bli_obj_init_pack( &a1_pack );
|
||||
bli_obj_init_pack( &b_pack );
|
||||
bli_obj_init_pack( &c1_pack );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &a1_pack_s );
|
||||
bli_obj_init_pack( &c1_pack_s );
|
||||
}
|
||||
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
|
||||
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
|
||||
|
||||
// Pack B (if instructed).
|
||||
bli_packm_int( b, b_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
trmm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Set the default length of and offset to the non-zero part of A.
|
||||
m_trans = bli_obj_length_after_trans( *a );
|
||||
@@ -66,24 +93,16 @@ void bli_trmm_blk_var1f( obj_t* a,
|
||||
m_trans = bli_abs( bli_obj_diag_offset_after_trans( *a ) ) +
|
||||
bli_obj_width_after_trans( *a );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize object for packing B.
|
||||
bli_packm_init( b, &b_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Pack B (if instructed).
|
||||
bli_packm_int( b, &b_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
dim_t start, end;
|
||||
bli_get_range_weighted( thread, offA, m_trans,
|
||||
bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
|
||||
bli_obj_is_upper( *c ), &start, &end );
|
||||
|
||||
// Partition along the m dimension.
|
||||
for ( i = offA; i < m_trans; i += b_alg )
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( i, m_trans, a,
|
||||
b_alg = bli_determine_blocksize_f( i, end, a,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for A1 and C1.
|
||||
@@ -93,36 +112,47 @@ void bli_trmm_blk_var1f( obj_t* a,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Initialize objects for packing A1 and C1.
|
||||
bli_packm_init( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &c1, &c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &c1, c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Pack A1 (if instructed).
|
||||
bli_packm_int( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_int( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Pack C1 (if instructed).
|
||||
bli_packm_int( &c1, &c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
bli_packm_int( &c1, c1_pack,
|
||||
cntl_sub_packm_c( cntl ),
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Perform trmm subproblem.
|
||||
bli_trmm_int( &BLIS_ONE,
|
||||
&a1_pack,
|
||||
&b_pack,
|
||||
a1_pack,
|
||||
b_pack,
|
||||
&BLIS_ONE,
|
||||
&c1_pack,
|
||||
cntl_sub_trmm( cntl ) );
|
||||
c1_pack,
|
||||
cntl_sub_trmm( cntl ),
|
||||
trmm_thread_sub_trmm( thread ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( &c1_pack, &c1,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
cntl_sub_unpackm_c( cntl ),
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_obj_release_pack( &a1_pack );
|
||||
bli_obj_release_pack( &b_pack );
|
||||
bli_obj_release_pack( &c1_pack );
|
||||
thread_obarrier( thread );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_obj_release_pack( b_pack );
|
||||
if( thread_am_ichief( thread ) ){
|
||||
bli_obj_release_pack( a1_pack );
|
||||
bli_obj_release_pack( c1_pack );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -35,5 +35,6 @@
|
||||
void bli_trmm_blk_var1f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl );
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -37,42 +37,60 @@
|
||||
void bli_trmm_blk_var2b( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl )
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread)
|
||||
{
|
||||
obj_t a_pack;
|
||||
obj_t b1, b1_pack;
|
||||
obj_t c1, c1_pack;
|
||||
obj_t a_pack_s;
|
||||
obj_t b1_pack_s, c1_pack_s;
|
||||
|
||||
obj_t b1, c1;
|
||||
obj_t* a_pack = NULL;
|
||||
obj_t* b1_pack = NULL;
|
||||
obj_t* c1_pack = NULL;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t n_trans;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bli_obj_init_pack( &a_pack );
|
||||
bli_obj_init_pack( &b1_pack );
|
||||
bli_obj_init_pack( &c1_pack );
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing A
|
||||
bli_obj_init_pack( &a_pack_s );
|
||||
bli_packm_init( a, &a_pack_s,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
a_pack = thread_obroadcast( thread, &a_pack_s );
|
||||
|
||||
// Initialize pack objects for B and C that are passed into packm_init().
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &b1_pack_s );
|
||||
bli_obj_init_pack( &c1_pack_s );
|
||||
}
|
||||
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
|
||||
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
|
||||
|
||||
// Pack A (if instructed).
|
||||
bli_packm_int( a, a_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
trmm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
n_trans = bli_obj_width_after_trans( *b );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize object for packing A.
|
||||
bli_packm_init( a, &a_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Pack A (if instructed).
|
||||
bli_packm_int( a, &a_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
dim_t start, end;
|
||||
bli_get_range_weighted( thread, 0, n_trans,
|
||||
bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
|
||||
bli_obj_is_upper( *c ), &start, &end );
|
||||
|
||||
// Partition along the n dimension.
|
||||
for ( i = 0; i < n_trans; i += b_alg )
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_b( i, n_trans, b,
|
||||
b_alg = bli_determine_blocksize_b( i, end, b,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for B1 and C1.
|
||||
@@ -82,36 +100,47 @@ void bli_trmm_blk_var2b( obj_t* a,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Initialize objects for packing B1 and C1.
|
||||
bli_packm_init( &b1, &b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_init( &c1, &c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_init( &c1, c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Pack B1 (if instructed).
|
||||
bli_packm_int( &b1, &b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_int( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Pack C1 (if instructed).
|
||||
bli_packm_int( &c1, &c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
bli_packm_int( &c1, c1_pack,
|
||||
cntl_sub_packm_c( cntl ),
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Perform trmm subproblem.
|
||||
bli_trmm_int( &BLIS_ONE,
|
||||
&a_pack,
|
||||
&b1_pack,
|
||||
a_pack,
|
||||
b1_pack,
|
||||
&BLIS_ONE,
|
||||
&c1_pack,
|
||||
cntl_sub_trmm( cntl ) );
|
||||
c1_pack,
|
||||
cntl_sub_trmm( cntl ),
|
||||
trmm_thread_sub_trmm( thread ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( &c1_pack, &c1,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
cntl_sub_unpackm_c( cntl ),
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_obj_release_pack( &a_pack );
|
||||
bli_obj_release_pack( &b1_pack );
|
||||
bli_obj_release_pack( &c1_pack );
|
||||
thread_obarrier( thread );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_obj_release_pack( a_pack );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_release_pack( b1_pack );
|
||||
bli_obj_release_pack( c1_pack );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -35,5 +35,6 @@
|
||||
void bli_trmm_blk_var2b( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl );
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -37,42 +37,60 @@
|
||||
void bli_trmm_blk_var2f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl )
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread)
|
||||
{
|
||||
obj_t a_pack;
|
||||
obj_t b1, b1_pack;
|
||||
obj_t c1, c1_pack;
|
||||
obj_t a_pack_s;
|
||||
obj_t b1_pack_s, c1_pack_s;
|
||||
|
||||
obj_t b1, c1;
|
||||
obj_t* a_pack = NULL;
|
||||
obj_t* b1_pack = NULL;
|
||||
obj_t* c1_pack = NULL;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t n_trans;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bli_obj_init_pack( &a_pack );
|
||||
bli_obj_init_pack( &b1_pack );
|
||||
bli_obj_init_pack( &c1_pack );
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing A
|
||||
bli_obj_init_pack( &a_pack_s );
|
||||
bli_packm_init( a, &a_pack_s,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
a_pack = thread_obroadcast( thread, &a_pack_s );
|
||||
|
||||
// Initialize pack objects for B and C that are passed into packm_init().
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &b1_pack_s );
|
||||
bli_obj_init_pack( &c1_pack_s );
|
||||
}
|
||||
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
|
||||
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
|
||||
|
||||
// Pack A (if instructed).
|
||||
bli_packm_int( a, a_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
trmm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
n_trans = bli_obj_width_after_trans( *b );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize object for packing A.
|
||||
bli_packm_init( a, &a_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Pack A (if instructed).
|
||||
bli_packm_int( a, &a_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
dim_t start, end;
|
||||
bli_get_range_weighted( thread, 0, n_trans,
|
||||
bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
|
||||
bli_obj_is_lower( *c ), &start, &end );
|
||||
|
||||
// Partition along the n dimension.
|
||||
for ( i = 0; i < n_trans; i += b_alg )
|
||||
for ( i = start; i < end; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( i, n_trans, b,
|
||||
b_alg = bli_determine_blocksize_f( i, end, b,
|
||||
cntl_blocksize( cntl ) );
|
||||
|
||||
// Acquire partitions for B1 and C1.
|
||||
@@ -82,36 +100,47 @@ void bli_trmm_blk_var2f( obj_t* a,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Initialize objects for packing A1 and B1.
|
||||
bli_packm_init( &b1, &b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_init( &c1, &c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_init( &c1, c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Pack B1 (if instructed).
|
||||
bli_packm_int( &b1, &b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_int( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Pack C1 (if instructed).
|
||||
bli_packm_int( &c1, &c1_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
bli_packm_int( &c1, c1_pack,
|
||||
cntl_sub_packm_c( cntl ),
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Perform trmm subproblem.
|
||||
bli_trmm_int( &BLIS_ONE,
|
||||
&a_pack,
|
||||
&b1_pack,
|
||||
a_pack,
|
||||
b1_pack,
|
||||
&BLIS_ONE,
|
||||
&c1_pack,
|
||||
cntl_sub_trmm( cntl ) );
|
||||
c1_pack,
|
||||
cntl_sub_trmm( cntl ),
|
||||
trmm_thread_sub_trmm( thread ) );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( &c1_pack, &c1,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
cntl_sub_unpackm_c( cntl ),
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_obj_release_pack( &a_pack );
|
||||
bli_obj_release_pack( &b1_pack );
|
||||
bli_obj_release_pack( &c1_pack );
|
||||
thread_obarrier( thread );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_obj_release_pack( a_pack );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_obj_release_pack( b1_pack );
|
||||
bli_obj_release_pack( c1_pack );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -35,5 +35,6 @@
|
||||
void bli_trmm_blk_var2f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl );
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -37,37 +37,50 @@
|
||||
void bli_trmm_blk_var3b( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl )
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a1, a1_pack;
|
||||
obj_t b1, b1_pack;
|
||||
obj_t c_pack;
|
||||
obj_t c_pack_s;
|
||||
obj_t a1_pack_s, b1_pack_s;
|
||||
|
||||
obj_t a1, b1;
|
||||
obj_t* a1_pack = NULL;
|
||||
obj_t* b1_pack = NULL;
|
||||
obj_t* c_pack = NULL;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t k_trans;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bli_obj_init_pack( &a1_pack );
|
||||
bli_obj_init_pack( &b1_pack );
|
||||
bli_obj_init_pack( &c_pack );
|
||||
if( thread_am_ochief( thread ) ){
|
||||
// Initialize object for packing C
|
||||
bli_obj_init_pack( &c_pack_s );
|
||||
bli_packm_init( c, &c_pack_s,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
c_pack = thread_obroadcast( thread, &c_pack_s );
|
||||
|
||||
// Initialize pack objects for A and B that are passed into packm_init().
|
||||
if( thread_am_ichief( thread ) ){
|
||||
bli_obj_init_pack( &a1_pack_s );
|
||||
bli_obj_init_pack( &b1_pack_s );
|
||||
}
|
||||
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
|
||||
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
|
||||
|
||||
// Pack C (if instructed).
|
||||
bli_packm_int( c, c_pack,
|
||||
cntl_sub_packm_c( cntl ),
|
||||
trmm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
k_trans = bli_obj_width_after_trans( *a );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize object for packing C.
|
||||
bli_packm_init( c, &c_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Pack C (if instructed).
|
||||
bli_packm_int( c, &c_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Partition along the k dimension.
|
||||
for ( i = 0; i < k_trans; i += b_alg )
|
||||
{
|
||||
@@ -82,36 +95,49 @@ void bli_trmm_blk_var3b( obj_t* a,
|
||||
i, b_alg, b, &b1 );
|
||||
|
||||
// Initialize objects for packing A1 and B1.
|
||||
bli_packm_init( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &b1, &b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Pack A1 (if instructed).
|
||||
bli_packm_int( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_int( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Pack B1 (if instructed).
|
||||
bli_packm_int( &b1, &b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_int( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Perform trmm subproblem.
|
||||
bli_trmm_int( &BLIS_ONE,
|
||||
&a1_pack,
|
||||
&b1_pack,
|
||||
a1_pack,
|
||||
b1_pack,
|
||||
&BLIS_ONE,
|
||||
&c_pack,
|
||||
cntl_sub_trmm( cntl ) );
|
||||
c_pack,
|
||||
cntl_sub_trmm( cntl ),
|
||||
trmm_thread_sub_trmm( thread ) );
|
||||
}
|
||||
|
||||
// Unpack C (if C was packed).
|
||||
bli_unpackm_int( &c_pack, c,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
thread_obarrier( thread );
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_obj_release_pack( &a1_pack );
|
||||
bli_obj_release_pack( &b1_pack );
|
||||
bli_obj_release_pack( &c_pack );
|
||||
// Unpack C (if C was packed).
|
||||
bli_unpackm_int( c_pack, c,
|
||||
cntl_sub_unpackm_c( cntl ),
|
||||
trmm_thread_sub_opackm( thread ) );
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
if( thread_am_ochief( thread ) ){
|
||||
bli_obj_release_pack( c_pack );
|
||||
}
|
||||
if( thread_am_ichief( thread ) ){
|
||||
bli_obj_release_pack( a1_pack );
|
||||
bli_obj_release_pack( b1_pack );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -35,5 +35,6 @@
|
||||
void bli_trmm_blk_var3b( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl );
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -37,37 +37,50 @@
|
||||
void bli_trmm_blk_var3f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl )
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a1, a1_pack;
|
||||
obj_t b1, b1_pack;
|
||||
obj_t c_pack;
|
||||
obj_t c_pack_s;
|
||||
obj_t a1_pack_s, b1_pack_s;
|
||||
|
||||
obj_t a1, b1;
|
||||
obj_t* a1_pack = NULL;
|
||||
obj_t* b1_pack = NULL;
|
||||
obj_t* c_pack = NULL;
|
||||
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
dim_t k_trans;
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
bli_obj_init_pack( &a1_pack );
|
||||
bli_obj_init_pack( &b1_pack );
|
||||
bli_obj_init_pack( &c_pack );
|
||||
if( thread_am_ochief( thread ) ){
|
||||
// Initialize object for packing C
|
||||
bli_obj_init_pack( &c_pack_s );
|
||||
bli_packm_init( c, &c_pack_s,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
c_pack = thread_obroadcast( thread, &c_pack_s );
|
||||
|
||||
// Initialize pack objects for A and B that are passed into packm_init().
|
||||
if( thread_am_ichief( thread ) ){
|
||||
bli_obj_init_pack( &a1_pack_s );
|
||||
bli_obj_init_pack( &b1_pack_s );
|
||||
}
|
||||
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
|
||||
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
|
||||
|
||||
// Pack C (if instructed).
|
||||
bli_packm_int( c, c_pack,
|
||||
cntl_sub_packm_c( cntl ),
|
||||
trmm_thread_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
k_trans = bli_obj_width_after_trans( *a );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntl_sub_scalm( cntl ) );
|
||||
|
||||
// Initialize object for packing C.
|
||||
bli_packm_init( c, &c_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Pack C (if instructed).
|
||||
bli_packm_int( c, &c_pack,
|
||||
cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Partition along the k dimension.
|
||||
for ( i = 0; i < k_trans; i += b_alg )
|
||||
{
|
||||
@@ -82,36 +95,49 @@ void bli_trmm_blk_var3f( obj_t* a,
|
||||
i, b_alg, b, &b1 );
|
||||
|
||||
// Initialize objects for packing A1 and B1.
|
||||
bli_packm_init( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &b1, &b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
|
||||
// Pack A1 (if instructed).
|
||||
bli_packm_int( &a1, &a1_pack,
|
||||
cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_int( &a1, a1_pack,
|
||||
cntl_sub_packm_a( cntl ),
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Pack B1 (if instructed).
|
||||
bli_packm_int( &b1, &b1_pack,
|
||||
cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_int( &b1, b1_pack,
|
||||
cntl_sub_packm_b( cntl ),
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
|
||||
// Perform trmm subproblem.
|
||||
bli_trmm_int( &BLIS_ONE,
|
||||
&a1_pack,
|
||||
&b1_pack,
|
||||
a1_pack,
|
||||
b1_pack,
|
||||
&BLIS_ONE,
|
||||
&c_pack,
|
||||
cntl_sub_trmm( cntl ) );
|
||||
c_pack,
|
||||
cntl_sub_trmm( cntl ),
|
||||
trmm_thread_sub_trmm( thread ) );
|
||||
}
|
||||
|
||||
// Unpack C (if C was packed).
|
||||
bli_unpackm_int( &c_pack, c,
|
||||
cntl_sub_unpackm_c( cntl ) );
|
||||
thread_obarrier( thread );
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_obj_release_pack( &a1_pack );
|
||||
bli_obj_release_pack( &b1_pack );
|
||||
bli_obj_release_pack( &c_pack );
|
||||
// Unpack C (if C was packed).
|
||||
bli_unpackm_int( c_pack, c,
|
||||
cntl_sub_unpackm_c( cntl ),
|
||||
trmm_thread_sub_opackm( thread ) );
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
if( thread_am_ochief( thread ) ){
|
||||
bli_obj_release_pack( c_pack );
|
||||
}
|
||||
if( thread_am_ichief( thread ) ){
|
||||
bli_obj_release_pack( a1_pack );
|
||||
bli_obj_release_pack( b1_pack );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -35,5 +35,6 @@
|
||||
void bli_trmm_blk_var3f( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl );
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -125,12 +125,20 @@ void bli_trmm_front( side_t side,
|
||||
if ( bli_is_left( side ) ) cntl = l_cntl;
|
||||
else cntl = r_cntl;
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_trmm_int( alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
&BLIS_ZERO,
|
||||
&c_local,
|
||||
cntl );
|
||||
trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( !bli_is_left( side ) );
|
||||
dim_t n_threads = thread_num_threads( infos[0] );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
(level3_int_t) bli_trmm_int,
|
||||
alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
&BLIS_ZERO,
|
||||
&c_local,
|
||||
(void*) cntl,
|
||||
(void**) infos );
|
||||
|
||||
bli_trmm_thrinfo_free_paths( infos, n_threads );
|
||||
}
|
||||
|
||||
|
||||
@@ -39,7 +39,8 @@
|
||||
typedef void (*FUNCPTR_T)( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl );
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
static FUNCPTR_T vars[2][2][4][3] =
|
||||
{
|
||||
@@ -88,7 +89,8 @@ void bli_trmm_int( obj_t* alpha,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
trmm_t* cntl )
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
obj_t a_local;
|
||||
obj_t b_local;
|
||||
@@ -105,7 +107,9 @@ void bli_trmm_int( obj_t* alpha,
|
||||
if ( bli_obj_has_zero_dim( *a ) ||
|
||||
bli_obj_has_zero_dim( *b ) )
|
||||
{
|
||||
bli_scalm( beta, c );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
thread_obarrier( thread );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -127,22 +131,22 @@ void bli_trmm_int( obj_t* alpha,
|
||||
// packed, this is our last chance to handle the transposition.
|
||||
if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
|
||||
{
|
||||
bli_obj_induce_trans( c_local );
|
||||
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
|
||||
bli_obj_induce_trans( c_local );
|
||||
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
|
||||
}
|
||||
|
||||
// If alpha is non-unit, typecast and apply it to the scalar attached
|
||||
// to B.
|
||||
if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
|
||||
{
|
||||
bli_obj_scalar_apply_scalar( alpha, &b_local );
|
||||
bli_obj_scalar_apply_scalar( alpha, &b_local );
|
||||
}
|
||||
|
||||
// If beta is non-unit, typecast and apply it to the scalar attached
|
||||
// to C.
|
||||
if ( !bli_obj_equals( beta, &BLIS_ONE ) )
|
||||
{
|
||||
bli_obj_scalar_apply_scalar( beta, &c_local );
|
||||
bli_obj_scalar_apply_scalar( beta, &c_local );
|
||||
}
|
||||
|
||||
// Set two bools: one based on the implied side parameter (the structure
|
||||
@@ -173,6 +177,7 @@ void bli_trmm_int( obj_t* alpha,
|
||||
f( &a_local,
|
||||
&b_local,
|
||||
&c_local,
|
||||
cntl );
|
||||
cntl,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -37,4 +37,5 @@ void bli_trmm_int( obj_t* alpha,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
trmm_t* cntl );
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
@@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)(
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* gemm_ukr
|
||||
void* gemm_ukr,
|
||||
trmm_thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2);
|
||||
@@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2);
|
||||
void bli_trmm_ll_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl )
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
|
||||
@@ -131,7 +133,8 @@ void bli_trmm_ll_ker_var2( obj_t* a,
|
||||
buf_b, rs_b, pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
gemm_ukr );
|
||||
gemm_ukr,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
trmm_thrinfo_t* jr_thread \
|
||||
) \
|
||||
{ \
|
||||
/* Cast the micro-kernel address to its function pointer type. */ \
|
||||
@@ -271,9 +275,12 @@ void PASTEMAC(ch,varname)( \
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread );\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
{ \
|
||||
for ( j = 0; j < n_iter; ++j ) { \
|
||||
\
|
||||
if( trmm_l_jr_my_iter( j, jr_thread ) ) { \
|
||||
\
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
@@ -308,121 +315,124 @@ void PASTEMAC(ch,varname)( \
|
||||
off_a1011 = 0; \
|
||||
k_a1011 = diagoffa_i + MR; \
|
||||
\
|
||||
b1_i = b1 + off_a1011 * PACKNR; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + k_a1011 * ss_a; \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, aux ); \
|
||||
bli_auxinfo_set_next_b( b2, aux ); \
|
||||
\
|
||||
/* Save the panel stride of the current panel of A to the
|
||||
auxinfo_t object. */ \
|
||||
bli_auxinfo_set_ps_a( k_a1011 * ss_a, aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr_cast( k_a1011, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1_i, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr_cast( k_a1011, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1_i, \
|
||||
beta_cast, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux ); \
|
||||
\
|
||||
/* Copy the result to the edge of C. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
if( trmm_l_ir_my_iter( i, ir_thread ) ) \
|
||||
{ \
|
||||
b1_i = b1 + off_a1011 * PACKNR; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, aux ); \
|
||||
bli_auxinfo_set_next_b( b2, aux ); \
|
||||
\
|
||||
/* Save the panel stride of the current panel of A to the
|
||||
auxinfo_t object. */ \
|
||||
bli_auxinfo_set_ps_a( k_a1011 * ss_a, aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr_cast( k_a1011, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1_i, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr_cast( k_a1011, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1_i, \
|
||||
beta_cast, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux ); \
|
||||
\
|
||||
/* Copy the result to the edge of C. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
a1 += k_a1011 * ss_a; \
|
||||
} \
|
||||
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
if( trmm_l_ir_my_iter( i, ir_thread ) ) \
|
||||
{ \
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, aux ); \
|
||||
bli_auxinfo_set_next_b( b2, aux ); \
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, aux ); \
|
||||
bli_auxinfo_set_next_b( b2, aux ); \
|
||||
\
|
||||
/* Save the panel stride of the current panel of A to the
|
||||
auxinfo_t object. */ \
|
||||
bli_auxinfo_set_ps_a( rstep_a, aux ); \
|
||||
/* Save the panel stride of the current panel of A to the
|
||||
auxinfo_t object. */ \
|
||||
bli_auxinfo_set_ps_a( rstep_a, aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr_cast( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
one, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr_cast( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux ); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr_cast( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
one, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr_cast( k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux ); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
a1 += rstep_a; \
|
||||
} \
|
||||
\
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
\
|
||||
} \
|
||||
b1 += cstep_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
|
||||
@@ -39,7 +39,8 @@
|
||||
void bli_trmm_ll_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl );
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
//
|
||||
@@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
trmm_thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trmm_ll_ker_var2 )
|
||||
|
||||
@@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)(
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* gemm_ukr
|
||||
void* gemm_ukr,
|
||||
trmm_thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2);
|
||||
@@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2);
|
||||
void bli_trmm_lu_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl )
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
|
||||
@@ -131,7 +133,8 @@ void bli_trmm_lu_ker_var2( obj_t* a,
|
||||
buf_b, rs_b, pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
gemm_ukr );
|
||||
gemm_ukr,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
trmm_thrinfo_t* jr_thread \
|
||||
) \
|
||||
{ \
|
||||
/* Cast the micro-kernel address to its function pointer type. */ \
|
||||
@@ -277,6 +281,8 @@ void PASTEMAC(ch,varname)( \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread );\
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
@@ -294,7 +300,7 @@ void PASTEMAC(ch,varname)( \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
for ( i = 0; i < m_iter; ++i ) if( trmm_l_jr_my_iter( j, jr_thread ) ) { \
|
||||
{ \
|
||||
diagoffa_i = diagoffa + ( doff_t )i*MR; \
|
||||
\
|
||||
@@ -315,14 +321,15 @@ void PASTEMAC(ch,varname)( \
|
||||
off_a1112 = diagoffa_i; \
|
||||
k_a1112 = k - off_a1112; \
|
||||
\
|
||||
if( trmm_l_ir_my_iter( i, ir_thread ) ) { \
|
||||
b1_i = b1 + off_a1112 * PACKNR; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + k_a1112 * ss_a; \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
@@ -369,19 +376,20 @@ void PASTEMAC(ch,varname)( \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
} \
|
||||
a1 += k_a1112 * ss_a; \
|
||||
} \
|
||||
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
if( trmm_l_ir_my_iter( i, ir_thread ) ) { \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
@@ -423,13 +431,13 @@ void PASTEMAC(ch,varname)( \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
} \
|
||||
a1 += rstep_a; \
|
||||
} \
|
||||
\
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
\
|
||||
} \
|
||||
b1 += cstep_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
|
||||
@@ -39,7 +39,8 @@
|
||||
void bli_trmm_lu_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl );
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
//
|
||||
@@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
trmm_thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trmm_lu_ker_var2 )
|
||||
|
||||
@@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)(
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* gemm_ukr
|
||||
void* gemm_ukr,
|
||||
trmm_thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2);
|
||||
@@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2);
|
||||
void bli_trmm_rl_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl )
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
|
||||
@@ -131,7 +133,8 @@ void bli_trmm_rl_ker_var2( obj_t* a,
|
||||
buf_b, rs_b, pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
gemm_ukr );
|
||||
gemm_ukr,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
trmm_thrinfo_t* jr_thread \
|
||||
) \
|
||||
{ \
|
||||
/* Cast the micro-kernel address to its function pointer type. */ \
|
||||
@@ -186,7 +190,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t off_b1121; \
|
||||
dim_t i, j; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
/*inc_t cstep_b; */\
|
||||
inc_t rstep_c, cstep_c; \
|
||||
inc_t ss_b; \
|
||||
auxinfo_t aux; \
|
||||
@@ -267,7 +271,7 @@ void PASTEMAC(ch,varname)( \
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
/*cstep_b = ps_b; */\
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
@@ -278,6 +282,7 @@ void PASTEMAC(ch,varname)( \
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread );\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
{ \
|
||||
@@ -296,6 +301,8 @@ void PASTEMAC(ch,varname)( \
|
||||
in A. Then compute the length of that panel. */ \
|
||||
off_b1121 = bli_max( -diagoffb_j, 0 ); \
|
||||
k_b1121 = k - off_b1121; \
|
||||
\
|
||||
if( trmm_r_jr_my_iter( j, jr_thread ) ) { \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
@@ -313,6 +320,7 @@ void PASTEMAC(ch,varname)( \
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
if( trmm_r_ir_my_iter( i, ir_thread ) ) { \
|
||||
ctype* restrict a1_i; \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
@@ -321,11 +329,11 @@ void PASTEMAC(ch,varname)( \
|
||||
a1_i = a1 + off_b1121 * PACKMR; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + k_b1121 * ss_b; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
@@ -368,7 +376,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
} \
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
@@ -378,16 +386,17 @@ void PASTEMAC(ch,varname)( \
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
if( trmm_r_ir_my_iter( i, ir_thread ) ) { \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
@@ -425,12 +434,12 @@ void PASTEMAC(ch,varname)( \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
} \
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
} \
|
||||
b1 += k_b1121 * ss_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
|
||||
@@ -39,7 +39,8 @@
|
||||
void bli_trmm_rl_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl );
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
//
|
||||
@@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
trmm_thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trmm_rl_ker_var2 )
|
||||
|
||||
@@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)(
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* gemm_ukr
|
||||
void* gemm_ukr,
|
||||
trmm_thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2);
|
||||
@@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2);
|
||||
void bli_trmm_ru_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl )
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
|
||||
@@ -131,7 +133,8 @@ void bli_trmm_ru_ker_var2( obj_t* a,
|
||||
buf_b, rs_b, pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
gemm_ukr );
|
||||
gemm_ukr,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
trmm_thrinfo_t* jr_thread \
|
||||
) \
|
||||
{ \
|
||||
/* Cast the micro-kernel address to its function pointer type. */ \
|
||||
@@ -186,7 +190,7 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t off_b0111; \
|
||||
dim_t i, j; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
/*inc_t cstep_b; */\
|
||||
inc_t rstep_c, cstep_c; \
|
||||
inc_t ss_b; \
|
||||
auxinfo_t aux; \
|
||||
@@ -268,7 +272,7 @@ void PASTEMAC(ch,varname)( \
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
/*cstep_b = ps_b; */\
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
@@ -279,6 +283,7 @@ void PASTEMAC(ch,varname)( \
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread ); \
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
{ \
|
||||
@@ -296,6 +301,8 @@ void PASTEMAC(ch,varname)( \
|
||||
so we can index into the corresponding location in A. */ \
|
||||
off_b0111 = 0; \
|
||||
k_b0111 = bli_min( k, -diagoffb_j + NR ); \
|
||||
\
|
||||
if( trmm_r_jr_my_iter( j, jr_thread ) ) { \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
@@ -313,6 +320,7 @@ void PASTEMAC(ch,varname)( \
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
if( trmm_r_ir_my_iter( i, ir_thread ) ) { \
|
||||
ctype* restrict a1_i; \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
@@ -321,11 +329,11 @@ void PASTEMAC(ch,varname)( \
|
||||
a1_i = a1 + off_b0111 * PACKMR; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + k_b0111 * ss_b; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
@@ -368,7 +376,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
} \
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
@@ -378,16 +386,17 @@ void PASTEMAC(ch,varname)( \
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
if( trmm_r_ir_my_iter( i, ir_thread ) ) { \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
@@ -425,12 +434,12 @@ void PASTEMAC(ch,varname)( \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
} \
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
} \
|
||||
b1 += k_b0111 * ss_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
|
||||
@@ -39,7 +39,8 @@
|
||||
void bli_trmm_ru_ker_var2( obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
trmm_t* cntl );
|
||||
trmm_t* cntl,
|
||||
trmm_thrinfo_t* thread );
|
||||
|
||||
|
||||
//
|
||||
@@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \
|
||||
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* gemm_ukr \
|
||||
void* gemm_ukr, \
|
||||
trmm_thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trmm_ru_ker_var2 )
|
||||
|
||||
207
frame/3/trmm/bli_trmm_threading.c
Normal file
207
frame/3/trmm/bli_trmm_threading.c
Normal file
@@ -0,0 +1,207 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "assert.h"
|
||||
|
||||
void bli_setup_trmm_thrinfo_node( trmm_thrinfo_t* thread,
|
||||
thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
trmm_thrinfo_t* sub_trmm )
|
||||
{
|
||||
thread->ocomm = ocomm;
|
||||
thread->ocomm_id = ocomm_id;
|
||||
thread->icomm = icomm;
|
||||
thread->icomm_id = icomm_id;
|
||||
thread->n_way = n_way;
|
||||
thread->work_id = work_id;
|
||||
thread->opackm = opackm;
|
||||
thread->ipackm = ipackm;
|
||||
thread->sub_trmm = sub_trmm;
|
||||
}
|
||||
|
||||
/*
   Initialize `thread` as the node used when exactly one thread does all the
   work: both communicators are BLIS_SINGLE_COMM with id 0, the loop is
   split one way with work id 0, and both packm infos are
   BLIS_PACKM_SINGLE_THREADED.

   The node is installed as its own sub_trmm, so trmm_thread_sub_trmm() on
   it yields the same node again.
*/
void bli_setup_trmm_single_threaded_info( trmm_thrinfo_t* thread )
{
    bli_setup_trmm_thrinfo_node( thread,
                                 &BLIS_SINGLE_COMM, 0,
                                 &BLIS_SINGLE_COMM, 0,
                                 1, 0,
                                 &BLIS_PACKM_SINGLE_THREADED,
                                 &BLIS_PACKM_SINGLE_THREADED,
                                 thread );
}
|
||||
|
||||
trmm_thrinfo_t* bli_create_trmm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
trmm_thrinfo_t* sub_trmm )
|
||||
{
|
||||
trmm_thrinfo_t* thread = ( trmm_thrinfo_t* ) bli_malloc( sizeof( trmm_thrinfo_t ) );
|
||||
bli_setup_trmm_thrinfo_node( thread, ocomm, ocomm_id,
|
||||
icomm, icomm_id,
|
||||
n_way, work_id,
|
||||
opackm,
|
||||
ipackm,
|
||||
sub_trmm );
|
||||
return thread;
|
||||
}
|
||||
|
||||
/*
   Release a thread-info node together with everything hanging off it.
   A NULL argument is accepted and ignored.

   For a non-NULL node, in order:
     - the outer communicator is freed, but only when thread_am_ochief()
       holds for this node;
     - both packm infos are handed to bli_packm_thrinfo_free();
     - the sub_trmm node is freed by recursing into this function;
     - the node itself is passed to bli_free().

   NOTE(review): the inner communicator (icomm) is not released here.
   NOTE(review): a node whose sub_trmm points back at itself (as set up by
   bli_setup_trmm_single_threaded_info) would recurse without end; this
   function appears intended only for nodes built by
   bli_create_trmm_thrinfo_node() -- confirm against callers.
*/
void bli_trmm_thrinfo_free( trmm_thrinfo_t* thread)
{
    if ( thread != NULL )
    {
        /* Communicator shared with the enclosing level. */
        if ( thread_am_ochief( thread ) )
        {
            bli_free_communicator( thread->ocomm );
        }

        /* Children first, then the node that points at them. */
        bli_packm_thrinfo_free( thread->opackm );
        bli_packm_thrinfo_free( thread->ipackm );
        bli_trmm_thrinfo_free( thread->sub_trmm );
        bli_free( thread );
    }
}
|
||||
|
||||
/*
   Free an array of per-thread info chains.

   threads - array of `num` root nodes; each element is released with
             bli_trmm_thrinfo_free() and then the array itself is passed
             to bli_free()
   num     - number of elements in `threads`
*/
void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** threads, dim_t num )
{
    /* The counter has the same type as the bound, so the comparison
       involves no int/dim_t conversion and cannot overflow before
       reaching `num`. */
    for ( dim_t i = 0; i < num; i++ )
    {
        bli_trmm_thrinfo_free( threads[i] );
    }

    bli_free( threads );
}
|
||||
|
||||
/*
   Build one thread-info chain per thread for a trmm operation.

   With BLIS_ENABLE_MULTITHREADING defined, the ways of parallelism for the
   ic, ir, jc and jr loops are read with bli_read_nway_from_env() from
   BLIS_IC_NT, BLIS_IR_NT, BLIS_JC_NT and BLIS_JR_NT; the kc loop is always
   one way. Without it, every loop is one way.

   jc_dependency - when true (multithreaded builds only), the jc ways are
                   folded into the jr loop and the jc loop becomes one way.

   Returns an array with one entry per thread (jc*kc*ic*jr*ir entries),
   indexed by the thread's id in the global communicator. Each entry is the
   jc-level node of the chain  jc -> kc -> ic -> jr -> ir  linked through
   sub_trmm. The ic-level node carries the two packm infos (packb as
   opackm, packa as ipackm). The array is meant to be released with
   bli_trmm_thrinfo_free_paths().
*/
trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( bool_t jc_dependency )
{
#ifdef BLIS_ENABLE_MULTITHREADING
    /* The kc loop is not parallelized: its way count is fixed at 1 rather
       than being read from the environment. */
    dim_t kc_way = 1;
    dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" );
    dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" );
    dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" );
    dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" );

    if ( jc_dependency )
    {
        /* Move the requested jc parallelism into the jr loop. */
        jr_way *= jc_way;
        jc_way = 1;
    }
#else
    dim_t jc_way = 1;
    dim_t kc_way = 1;
    dim_t ic_way = 1;
    dim_t jr_way = 1;
    dim_t ir_way = 1;
#endif

    dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
    assert( global_num_threads != 0 );

    /* Number of threads inside one iteration group of each loop. */
    dim_t jc_nt = kc_way * ic_way * jr_way * ir_way;
    dim_t kc_nt = ic_way * jr_way * ir_way;
    dim_t ic_nt = jr_way * ir_way;
    dim_t jr_nt = ir_way;
    dim_t ir_nt = 1;

    /* bli_trmm_thrinfo_free_paths() releases this array with bli_free(),
       so obtain it from the matching allocator (the one also used for the
       nodes) instead of plain malloc(). */
    trmm_thrinfo_t** paths =
        ( trmm_thrinfo_t** ) bli_malloc( global_num_threads * sizeof( *paths ) );

    thread_comm_t* global_comm = bli_create_communicator( global_num_threads );

    for ( dim_t a = 0; a < jc_way; a++ )
    {
        thread_comm_t* jc_comm = bli_create_communicator( jc_nt );

        for ( dim_t b = 0; b < kc_way; b++ )
        {
            thread_comm_t* kc_comm = bli_create_communicator( kc_nt );

            for ( dim_t c = 0; c < ic_way; c++ )
            {
                thread_comm_t* ic_comm = bli_create_communicator( ic_nt );

                for ( dim_t d = 0; d < jr_way; d++ )
                {
                    thread_comm_t* jr_comm = bli_create_communicator( jr_nt );

                    for ( dim_t e = 0; e < ir_way; e++ )
                    {
                        thread_comm_t* ir_comm = bli_create_communicator( ir_nt );

                        /* This thread's id inside each communicator, built
                           up from the innermost level outwards. */
                        dim_t ir_comm_id     = 0;
                        dim_t jr_comm_id     = e*ir_nt + ir_comm_id;
                        dim_t ic_comm_id     = d*jr_nt + jr_comm_id;
                        dim_t kc_comm_id     = c*ic_nt + ic_comm_id;
                        dim_t jc_comm_id     = b*kc_nt + kc_comm_id;
                        dim_t global_comm_id = a*jc_nt + jc_comm_id;

                        /* Nodes are created leaf first so that each parent
                           can be handed its child. */
                        trmm_thrinfo_t* ir_info =
                            bli_create_trmm_thrinfo_node( jr_comm, jr_comm_id,
                                                          ir_comm, ir_comm_id,
                                                          ir_way, e,
                                                          NULL, NULL, NULL );

                        trmm_thrinfo_t* jr_info =
                            bli_create_trmm_thrinfo_node( ic_comm, ic_comm_id,
                                                          jr_comm, jr_comm_id,
                                                          jr_way, d,
                                                          NULL, NULL, ir_info );

                        packm_thrinfo_t* packb =
                            bli_create_packm_thread_info( kc_comm, kc_comm_id,
                                                          ic_comm, ic_comm_id,
                                                          kc_nt, kc_comm_id );

                        packm_thrinfo_t* packa =
                            bli_create_packm_thread_info( ic_comm, ic_comm_id,
                                                          jr_comm, jr_comm_id,
                                                          ic_nt, ic_comm_id );

                        trmm_thrinfo_t* ic_info =
                            bli_create_trmm_thrinfo_node( kc_comm, kc_comm_id,
                                                          ic_comm, ic_comm_id,
                                                          ic_way, c,
                                                          packb, packa, jr_info );

                        trmm_thrinfo_t* kc_info =
                            bli_create_trmm_thrinfo_node( jc_comm, jc_comm_id,
                                                          kc_comm, kc_comm_id,
                                                          kc_way, b,
                                                          NULL, NULL, ic_info );

                        trmm_thrinfo_t* jc_info =
                            bli_create_trmm_thrinfo_node( global_comm, global_comm_id,
                                                          jc_comm, jc_comm_id,
                                                          jc_way, a,
                                                          NULL, NULL, kc_info );

                        paths[global_comm_id] = jc_info;
                    }
                }
            }
        }
    }

    return paths;
}
|
||||
79
frame/3/trmm/bli_trmm_threading.h
Normal file
79
frame/3/trmm/bli_trmm_threading.h
Normal file
@@ -0,0 +1,79 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
struct trmm_thrinfo_s //implements thrinfo_t
|
||||
{
|
||||
thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level
|
||||
dim_t ocomm_id; //Our thread id within that thread comm
|
||||
thread_comm_t* icomm; //The thread communicator for the other threads sharing the same work at this level
|
||||
dim_t icomm_id; //Our thread id within that thread comm
|
||||
|
||||
dim_t n_way; //Number of distinct caucuses used to parallelize the loop
|
||||
dim_t work_id; //What we're working on
|
||||
|
||||
packm_thrinfo_t* opackm;
|
||||
packm_thrinfo_t* ipackm;
|
||||
struct trmm_thrinfo_s* sub_trmm;
|
||||
};
|
||||
typedef struct trmm_thrinfo_s trmm_thrinfo_t;
|
||||
|
||||
// Accessors for the children of a trmm thread-info node. `thread` is a
// pointer to the node. Arguments and expansions are fully parenthesized so
// that any pointer expression (e.g. &info) can be passed safely.
#define trmm_thread_sub_trmm( thread )   ( (thread)->sub_trmm )
#define trmm_thread_sub_opackm( thread ) ( (thread)->opackm )
#define trmm_thread_sub_ipackm( thread ) ( (thread)->ipackm )

// Work-assignment predicates: nonzero when loop iteration `index` belongs
// to `thread`, i.e. when index and the thread's work_id are congruent
// modulo the thread's n_way. All four variants (right/left side, ir/jr
// loop) currently use the same rule. `index` is parenthesized so that an
// expression such as `i + 1` is reduced modulo n_way as a whole; note that
// `thread` is evaluated more than once.
#define trmm_r_ir_my_iter( index, thread ) \
    ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way )
#define trmm_r_jr_my_iter( index, thread ) \
    ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way )
#define trmm_l_ir_my_iter( index, thread ) \
    ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way )
#define trmm_l_jr_my_iter( index, thread ) \
    ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way )
|
||||
|
||||
// Builds one thread-info chain per thread and returns them as an array;
// jc_dependency asks for the jc parallelism to be folded into the jr loop.
trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( bool_t jc_dependency );
|
||||
// Frees the n_threads chains in info and then the array itself.
void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** info, dim_t n_threads );
|
||||
|
||||
// Stores the given communicators, ids, work split, packm infos and child
// node into an existing node (thread).
void bli_setup_trmm_thrinfo_node( trmm_thrinfo_t* thread,
|
||||
thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
trmm_thrinfo_t* sub_trmm );
|
||||
|
||||
// Allocates a new node, fills it in from the arguments, and returns it.
trmm_thrinfo_t* bli_create_trmm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
trmm_thrinfo_t* sub_trmm );
|
||||
|
||||
// Fills thread in as a one-way, single-thread node that is its own child.
void bli_setup_trmm_single_threaded_info( trmm_thrinfo_t* thread );
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user