Merge pull request #8 from tlrmchlsmth/master

Added multithreading to most level-3 operations.
This commit is contained in:
Field G. Van Zee
2014-05-20 09:53:19 -05:00
146 changed files with 5669 additions and 1356 deletions

View File

@@ -111,16 +111,16 @@
// Alignment size used when allocating memory dynamically from the operating
// system (eg: posix_memalign()). To disable heap alignment and just use
// malloc() instead, set this to 1.
#define BLIS_HEAP_ADDR_ALIGN_SIZE 32
#define BLIS_HEAP_ADDR_ALIGN_SIZE 64
// Alignment size used when sizing leading dimensions of dynamically
// allocated memory.
#define BLIS_HEAP_STRIDE_ALIGN_SIZE 32
#define BLIS_HEAP_STRIDE_ALIGN_SIZE 64
// Alignment size used when allocating entire blocks of contiguous memory
// from the contiguous memory allocator.
#define BLIS_CONTIG_ADDR_ALIGN_SIZE BLIS_PAGE_SIZE
#define BLIS_CONTIG_STRIDE_ALIGN_SIZE 32
// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
@@ -154,12 +154,13 @@
// while 64 results in 64-bit integers. Any other value results in use of the
// C99 type "long int". Note that this ONLY affects integers used within the
// BLAS compatibility layer.
#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 64
#define BLIS_BLAS2BLIS_INT_TYPE_SIZE 32
// Fortran-77 name-mangling macros.
#define PASTEF770(name) name ## _
#define PASTEF77(ch1,name) ch1 ## name ## _
#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _
// Underscore is left out to work on BGQ systems
#define PASTEF770(name) name //## _
#define PASTEF77(ch1,name) ch1 ## name //## _
#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name //## _

View File

@@ -54,27 +54,22 @@
// (b) NR (for triangular operations such as trmm and trsm).
//
#define BLIS_DEFAULT_MC_S 256
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_MC_S 1024
#define BLIS_DEFAULT_KC_S 2048
#define BLIS_DEFAULT_NC_S 8192
// 16 MPI RANKS CASE:
//#define BLIS_DEFAULT_MC_D 256//1024
//#define BLIS_DEFAULT_KC_D 512//2048
//
// 1 MPI RANK CASE:
#define BLIS_DEFAULT_MC_D 1008
#define BLIS_DEFAULT_KC_D 2016
#define BLIS_DEFAULT_NC_D 20480
#define BLIS_DEFAULT_MC_D 1024
#define BLIS_DEFAULT_KC_D 2048
#define BLIS_DEFAULT_NC_D 10240
#define BLIS_DEFAULT_MC_C 128
#define BLIS_DEFAULT_KC_C 256
#define BLIS_DEFAULT_NC_C 4096
#define BLIS_DEFAULT_MC_C 1024
#define BLIS_DEFAULT_KC_C 2048
#define BLIS_DEFAULT_NC_C 8192
#define BLIS_DEFAULT_MC_Z 64
#define BLIS_DEFAULT_KC_Z 256
#define BLIS_DEFAULT_NC_Z 2048
#define BLIS_DEFAULT_MC_Z 768
#define BLIS_DEFAULT_KC_Z 1536
#define BLIS_DEFAULT_NC_Z 10240
// -- Register blocksizes --
@@ -87,7 +82,7 @@
#define BLIS_DEFAULT_MR_C 8
#define BLIS_DEFAULT_NR_C 4
#define BLIS_DEFAULT_MR_Z 8
#define BLIS_DEFAULT_MR_Z 4
#define BLIS_DEFAULT_NR_Z 4
// NOTE: If the micro-kernel, which is typically unrolled to a factor
@@ -153,7 +148,7 @@
// -- Default fusing factors for level-1f operations --
#define BLIS_L1F_FUSE_FAC_S 8
#define BLIS_L1F_FUSE_FAC_D 4
#define BLIS_L1F_FUSE_FAC_D 8
#define BLIS_L1F_FUSE_FAC_C 4
#define BLIS_L1F_FUSE_FAC_Z 2
@@ -182,7 +177,7 @@
#include "bli_gemm_8x8.h"
#define BLIS_DGEMM_UKERNEL bli_dgemm_8x8
#define BLIS_DGEMM_UKERNEL_MT bli_dgemm_8x8_mt
#define BLIS_ZGEMM_UKERNEL bli_zgemm_8x8
// -- trsm-related --

View File

@@ -36,6 +36,9 @@
#define BLIS_CONFIG_H
#define BLIS_TREE_BARRIER
#define BLIS_TREE_BARRIER_ARITY 4
// -- OPERATING SYSTEM ---------------------------------------------------------

View File

@@ -78,7 +78,7 @@ GIT_LOG := $(GIT) log --decorate
# --- Determine the C compiler and related flags ---
CC := icc
CPPROCFLAGS :=
CMISCFLAGS := -mmic -fasm-blocks -std=c99
CMISCFLAGS := -mmic -fasm-blocks -std=c99 -openmp
CDBGFLAGS :=
CWARNFLAGS := -Wall
COPTFLAGS := -O3
@@ -98,7 +98,7 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
LDFLAGS := -mmic -lm
LDFLAGS := -mmic -lm -openmp

View File

@@ -52,14 +52,16 @@ typedef void (*FUNCPTR_T)(
void* kappa,
void* c, inc_t rs_c, inc_t cs_c,
void* p, inc_t rs_p, inc_t cs_p,
dim_t pd_p, inc_t ps_p
dim_t pd_p, inc_t ps_p,
packm_thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1);
void bli_packm_blk_var1( obj_t* c,
obj_t* p )
obj_t* p,
packm_thrinfo_t* t )
{
num_t dt_cp = bli_obj_datatype( *c );
@@ -117,31 +119,33 @@ void bli_packm_blk_var1( obj_t* c,
buf_kappa,
buf_c, rs_c, cs_c,
buf_p, rs_p, cs_p,
pd_p, ps_p );
pd_p, ps_p,
t );
}
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
trans_t transc, \
bool_t invdiag, \
bool_t revifup, \
bool_t reviflo, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
dim_t pd_p, inc_t ps_p \
) \
void PASTEMAC(ch,varname )( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
trans_t transc, \
bool_t invdiag, \
bool_t revifup, \
bool_t reviflo, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
dim_t pd_p, inc_t ps_p, \
packm_thrinfo_t* thread \
) \
{ \
ctype* restrict kappa_cast = kappa; \
ctype* restrict c_cast = c; \
@@ -260,7 +264,7 @@ void PASTEMAC(ch,varname)( \
\
p_begin = p_cast; \
\
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
{ \
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
@@ -315,6 +319,8 @@ void PASTEMAC(ch,varname)( \
c_use = c_begin + (panel_off_i )*ldc; \
p_use = p_begin; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_tri_cxk)( strucc, \
diagoffp_i, \
diagc, \
@@ -328,6 +334,7 @@ void PASTEMAC(ch,varname)( \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p ); \
}\
\
\
p_inc = ldp * panel_len_max_i; \
@@ -341,6 +348,8 @@ void PASTEMAC(ch,varname)( \
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_herm_cxk)( strucc, \
diagoffc_i, \
uploc, \
@@ -352,6 +361,7 @@ void PASTEMAC(ch,varname)( \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ldp * panel_len_max_i; \
@@ -365,6 +375,8 @@ void PASTEMAC(ch,varname)( \
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_gen_cxk)( BLIS_GENERAL, \
0, \
BLIS_DENSE, \
@@ -376,13 +388,13 @@ void PASTEMAC(ch,varname)( \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
\
} \
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ldp * panel_len_max_i; \
} \
} \
\
\
p_begin += p_inc; \
p_begin += p_inc; \
} \
\
\

View File

@@ -33,7 +33,8 @@
*/
void bli_packm_blk_var1( obj_t* c,
obj_t* p );
obj_t* p,
packm_thrinfo_t* t );
#undef GENTPROT
@@ -55,7 +56,8 @@ void PASTEMAC(ch,varname)( \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
dim_t pd_p, inc_t ps_p \
dim_t pd_p, inc_t ps_p, \
packm_thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC( packm_blk_var1 )

View File

@@ -52,14 +52,16 @@ typedef void (*FUNCPTR_T)(
void* kappa,
void* c, inc_t rs_c, inc_t cs_c,
void* p, inc_t rs_p, inc_t cs_p,
dim_t pd_p, inc_t ps_p
dim_t pd_p, inc_t ps_p,
packm_thrinfo_t* thread
);
//static FUNCPTR_T GENARRAY(ftypes,packm_blk_var3);
void bli_packm_blk_var3( obj_t* c,
obj_t* p )
obj_t* p,
packm_thrinfo_t* t )
{
num_t dt_cp = bli_obj_datatype( *c );
@@ -98,7 +100,7 @@ void bli_packm_blk_var3( obj_t* c,
// in the real domain.
if ( bli_is_real( dt_cp ) )
{
bli_packm_blk_var1( c, p );
bli_packm_blk_var1( c, p, t );
return;
}
@@ -109,23 +111,26 @@ void bli_packm_blk_var3( obj_t* c,
// real domain counterparts. (In the aforementioned situation,
// applying a real scalar is easy, but applying a complex one is
// harder, so we avoid the need altogether with the code below.)
if ( bli_obj_scalar_has_nonzero_imag( p ) )
{
// Detach the scalar.
bli_obj_scalar_detach( p, &kappa );
if ( thread_am_ochief( t ) ) {
if ( bli_obj_scalar_has_nonzero_imag( p ) )
{
// Detach the scalar.
bli_obj_scalar_detach( p, &kappa );
// Reset the attached scalar (to 1.0).
bli_obj_scalar_reset( p );
// Reset the attached scalar (to 1.0).
bli_obj_scalar_reset( p );
kappa_p = &kappa;
}
else
{
// If the internal scalar of A has only a real component, then
// we will apply it later (in the micro-kernel), and so we will
// use BLIS_ONE to indicate no scaling during packing.
kappa_p = &BLIS_ONE;
}
kappa_p = &kappa;
}
else
{
// If the internal scalar of A has only a real component, then
// we will apply it later (in the micro-kernel), and so we will
// use BLIS_ONE to indicate no scaling during packing.
kappa_p = &BLIS_ONE;
}
}
kappa_p = thread_obroadcast( t, kappa_p );
// Acquire the buffer to the kappa chosen above.
@@ -154,7 +159,8 @@ void bli_packm_blk_var3( obj_t* c,
buf_kappa,
buf_c, rs_c, cs_c,
buf_p, rs_p, cs_p,
pd_p, ps_p );
pd_p, ps_p,
t );
}
@@ -177,7 +183,8 @@ void PASTEMAC(ch,varname)( \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
dim_t pd_p, inc_t ps_p \
dim_t pd_p, inc_t ps_p, \
packm_thrinfo_t* thread \
) \
{ \
ctype* restrict kappa_cast = kappa; \
@@ -297,8 +304,8 @@ void PASTEMAC(ch,varname)( \
\
p_begin = p_cast; \
\
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
{ \
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
@@ -352,6 +359,8 @@ void PASTEMAC(ch,varname)( \
c_use = c_begin + (panel_off_i )*ldc; \
p_use = p_begin; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_tri_cxk_ri3)( strucc, \
diagoffp_i, \
diagc, \
@@ -365,6 +374,7 @@ void PASTEMAC(ch,varname)( \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p ); \
} \
\
\
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
@@ -388,6 +398,8 @@ void PASTEMAC(ch,varname)( \
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_herm_cxk_ri3)( strucc, \
diagoffc_i, \
uploc, \
@@ -400,6 +412,7 @@ void PASTEMAC(ch,varname)( \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
\
} \
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
} \
@@ -412,6 +425,8 @@ void PASTEMAC(ch,varname)( \
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_gen_cxk_ri3)( BLIS_GENERAL, \
0, \
BLIS_DENSE, \
@@ -423,6 +438,7 @@ void PASTEMAC(ch,varname)( \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
@@ -438,7 +454,7 @@ void PASTEMAC(ch,varname)( \
\
} \
\
p_begin += p_inc; \
p_begin += p_inc; \
} \
}

View File

@@ -33,7 +33,8 @@
*/
void bli_packm_blk_var3( obj_t* c,
obj_t* p );
obj_t* p,
packm_thrinfo_t* t );
#undef GENTPROTCO
@@ -55,7 +56,8 @@ void PASTEMAC(ch,varname)( \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
dim_t pd_p, inc_t ps_p \
dim_t pd_p, inc_t ps_p, \
packm_thrinfo_t* thread \
);
INSERT_GENTPROTCO_BASIC( packm_blk_var3 )

View File

@@ -52,14 +52,16 @@ typedef void (*FUNCPTR_T)(
void* kappa,
void* c, inc_t rs_c, inc_t cs_c,
void* p, inc_t rs_p, inc_t cs_p,
dim_t pd_p, inc_t ps_p
dim_t pd_p, inc_t ps_p,
packm_thrinfo_t* thread
);
//static FUNCPTR_T GENARRAY(ftypes,packm_blk_var4);
void bli_packm_blk_var4( obj_t* c,
obj_t* p )
obj_t* p,
packm_thrinfo_t* t )
{
num_t dt_cp = bli_obj_datatype( *c );
@@ -98,7 +100,7 @@ void bli_packm_blk_var4( obj_t* c,
// in the real domain.
if ( bli_is_real( dt_cp ) )
{
bli_packm_blk_var1( c, p );
bli_packm_blk_var1( c, p, t );
return;
}
@@ -109,23 +111,26 @@ void bli_packm_blk_var4( obj_t* c,
// real domain counterparts. (In the aforementioned situation,
// applying a real scalar is easy, but applying a complex one is
// harder, so we avoid the need altogether with the code below.)
if ( bli_obj_scalar_has_nonzero_imag( p ) )
{
// Detach the scalar.
bli_obj_scalar_detach( p, &kappa );
if( thread_am_ochief( t ) ) {
if ( bli_obj_scalar_has_nonzero_imag( p ) )
{
// Detach the scalar.
bli_obj_scalar_detach( p, &kappa );
// Reset the attached scalar (to 1.0).
bli_obj_scalar_reset( p );
// Reset the attached scalar (to 1.0).
bli_obj_scalar_reset( p );
kappa_p = &kappa;
}
else
{
// If the internal scalar of A has only a real component, then
// we will apply it later (in the micro-kernel), and so we will
// use BLIS_ONE to indicate no scaling during packing.
kappa_p = &BLIS_ONE;
}
kappa_p = &kappa;
}
else
{
// If the internal scalar of A has only a real component, then
// we will apply it later (in the micro-kernel), and so we will
// use BLIS_ONE to indicate no scaling during packing.
kappa_p = &BLIS_ONE;
}
}
kappa_p = thread_obroadcast( t, kappa_p );
// Acquire the buffer to the kappa chosen above.
@@ -154,7 +159,8 @@ void bli_packm_blk_var4( obj_t* c,
buf_kappa,
buf_c, rs_c, cs_c,
buf_p, rs_p, cs_p,
pd_p, ps_p );
pd_p, ps_p,
t );
}
@@ -177,7 +183,8 @@ void PASTEMAC(ch,varname)( \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
dim_t pd_p, inc_t ps_p \
dim_t pd_p, inc_t ps_p, \
packm_thrinfo_t* thread \
) \
{ \
ctype* restrict kappa_cast = kappa; \
@@ -297,8 +304,8 @@ void PASTEMAC(ch,varname)( \
\
p_begin = p_cast; \
\
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
{ \
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
@@ -352,6 +359,8 @@ void PASTEMAC(ch,varname)( \
c_use = c_begin + (panel_off_i )*ldc; \
p_use = p_begin; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_tri_cxk_ri)( strucc, \
diagoffp_i, \
diagc, \
@@ -365,6 +374,7 @@ void PASTEMAC(ch,varname)( \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p ); \
} \
\
p_inc = ldp * panel_len_max_i; \
\
@@ -395,6 +405,8 @@ void PASTEMAC(ch,varname)( \
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_herm_cxk_ri)( strucc, \
diagoffc_i, \
uploc, \
@@ -406,6 +418,7 @@ void PASTEMAC(ch,varname)( \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ldp * panel_len_max_i; \
@@ -419,6 +432,8 @@ void PASTEMAC(ch,varname)( \
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
if( packm_thread_my_iter( it, thread ) ) \
{ \
PASTEMAC(ch,packm_gen_cxk_ri)( BLIS_GENERAL, \
0, \
BLIS_DENSE, \
@@ -430,6 +445,7 @@ void PASTEMAC(ch,varname)( \
kappa_cast, \
c_begin, rs_c, cs_c, \
p_begin, rs_p, cs_p ); \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ldp * panel_len_max_i; \
@@ -453,7 +469,7 @@ void PASTEMAC(ch,varname)( \
\
} \
\
p_begin += p_inc; \
p_begin += p_inc; \
} \
}

View File

@@ -33,7 +33,8 @@
*/
void bli_packm_blk_var4( obj_t* c,
obj_t* p );
obj_t* p,
packm_thrinfo_t* t );
#undef GENTPROTCO
@@ -55,7 +56,8 @@ void PASTEMAC(ch,varname)( \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
dim_t pd_p, inc_t ps_p \
dim_t pd_p, inc_t ps_p, \
packm_thrinfo_t* t \
);
INSERT_GENTPROTCO_BASIC( packm_blk_var4 )

View File

@@ -37,7 +37,8 @@
#define FUNCPTR_T packm_fp
typedef void (*FUNCPTR_T)( obj_t* a,
obj_t* p );
obj_t* p,
packm_thrinfo_t* t );
static FUNCPTR_T vars[6][3] =
{
@@ -52,7 +53,8 @@ static FUNCPTR_T vars[6][3] =
void bli_packm_int( obj_t* a,
obj_t* p,
packm_t* cntl )
packm_t* cntl,
packm_thrinfo_t* thread )
{
varnum_t n;
impl_t i;
@@ -119,6 +121,10 @@ void bli_packm_int( obj_t* a,
// Invoke the variant with kappa_use.
f( a,
p );
p,
thread );
// Barrier so that packing is done before computation
thread_obarrier( thread );
}

View File

@@ -34,5 +34,6 @@
void bli_packm_int( obj_t* a,
obj_t* p,
packm_t* cntl );
packm_t* cntl,
packm_thrinfo_t* thread );

View File

@@ -0,0 +1,64 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
 * Release a packm thread-info structure by handing it to bli_free().
 *
 * Only the structure itself is released here. The communicators it refers
 * to (ocomm and icomm) are assumed to be owned, and freed, by other code.
 */
void bli_packm_thrinfo_free( packm_thrinfo_t* thread )
{
	bli_free( thread );
}
/*
 * Create a packm thread-info structure.
 *
 * Every argument is forwarded unchanged to bli_create_thread_info(); the
 * pointer that call returns is cast to packm_thrinfo_t* and handed back to
 * the caller.
 */
packm_thrinfo_t* bli_create_packm_thread_info( thread_comm_t* ocomm, dim_t ocomm_id,
                                               thread_comm_t* icomm, dim_t icomm_id,
                                               dim_t n_way, dim_t work_id )
{
	return ( packm_thrinfo_t* )
	       bli_create_thread_info( ocomm, ocomm_id,
	                               icomm, icomm_id,
	                               n_way, work_id );
}
/*
 * Fill in an existing packm thread-info structure.
 *
 * The structure is viewed as a generic thrinfo_t and, together with all
 * remaining arguments, passed straight through to bli_setup_thread_info().
 */
void bli_setup_packm_thread_info( packm_thrinfo_t* thread,
                                  thread_comm_t* ocomm, dim_t ocomm_id,
                                  thread_comm_t* icomm, dim_t icomm_id,
                                  dim_t n_way, dim_t work_id )
{
	bli_setup_thread_info( ( thrinfo_t* )thread,
	                       ocomm, ocomm_id,
	                       icomm, icomm_id,
	                       n_way, work_id );
}
/*
 * Configure a packm thread-info structure for single-threaded use.
 *
 * Both communicators are pointed at BLIS_SINGLE_COMM with id 0, the loop is
 * split one way (n_way = 1), and the work id is 0.
 */
void bli_setup_packm_single_threaded_info( packm_thrinfo_t* thread )
{
	/* All six members of the structure are assigned in one step. */
	*thread = ( packm_thrinfo_t )
	{
		.ocomm    = &BLIS_SINGLE_COMM,
		.ocomm_id = 0,
		.icomm    = &BLIS_SINGLE_COMM,
		.icomm_id = 0,
		.n_way    = 1,
		.work_id  = 0
	};
}

View File

@@ -0,0 +1,54 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Per-thread bookkeeping handed to the packm routines.
// The original note says this "implements thrinfo_t", and pointers to this
// struct are cast to and from thrinfo_t* by the create/setup helpers, so the
// member order presumably has to mirror thrinfo_t -- confirm against that
// definition before reordering or adding members.
struct packm_thrinfo_s //implements thrinfo_t
{
thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level
dim_t ocomm_id; //Our thread id within the ocomm communicator
thread_comm_t* icomm; //Second communicator. NOTE(review): the original comment was a verbatim copy of ocomm's; presumably this is the "inner" group -- confirm
dim_t icomm_id; //Our thread id within the icomm communicator
dim_t n_way; //Number of distinct caucuses used to parallelize the loop; used as the modulus in packm_thread_my_iter
dim_t work_id; //What we're working on; compared modulo n_way against the iteration index in packm_thread_my_iter
};
typedef struct packm_thrinfo_s packm_thrinfo_t;
// Evaluates to nonzero when loop iteration `index` is assigned to `thread`,
// i.e. when index and thread->work_id are congruent modulo thread->n_way.
//
// Every use of an argument, and the expansion as a whole, is parenthesized
// so that callers may pass expressions such as `it + 1` or `&info` without
// operator-precedence surprises.
// NOTE: `thread` is evaluated more than once, so it must not have side
// effects, and thread->n_way must be nonzero (it is used as a divisor).
#define packm_thread_my_iter( index, thread ) \
	( ( ( index ) % ( thread )->n_way ) == ( ( thread )->work_id % ( thread )->n_way ) )
// Releases the thread-info structure itself via bli_free(); the communicators
// it points to are assumed to be freed elsewhere.
void bli_packm_thrinfo_free( packm_thrinfo_t* thread );
// Forwards all arguments to bli_create_thread_info() and returns the result
// cast to packm_thrinfo_t*.
packm_thrinfo_t* bli_create_packm_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
dim_t n_way, dim_t work_id );
// Forwards `thread` (cast to thrinfo_t*) and the remaining arguments to
// bli_setup_thread_info().
void bli_setup_packm_thread_info( packm_thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
dim_t n_way, dim_t work_id );
// Sets both communicators to BLIS_SINGLE_COMM with id 0, n_way to 1 and
// work_id to 0.
void bli_setup_packm_single_threaded_info( packm_thrinfo_t* thread );

View File

@@ -56,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,packm_unb_var1);
void bli_packm_unb_var1( obj_t* c,
obj_t* p )
obj_t* p,
packm_thrinfo_t* thread )
{
num_t dt_cp = bli_obj_datatype( *c );
@@ -98,20 +99,22 @@ void bli_packm_unb_var1( obj_t* c,
// function pointer.
f = ftypes[dt_cp];
// Invoke the function.
f( strucc,
diagoffc,
diagc,
uploc,
transc,
densify,
m_p,
n_p,
m_max_p,
n_max_p,
buf_kappa,
buf_c, rs_c, cs_c,
buf_p, rs_p, cs_p );
if( thread_am_ochief( thread ) ) {
// Invoke the function.
f( strucc,
diagoffc,
diagc,
uploc,
transc,
densify,
m_p,
n_p,
m_max_p,
n_max_p,
buf_kappa,
buf_c, rs_c, cs_c,
buf_p, rs_p, cs_p );
}
}

View File

@@ -33,7 +33,8 @@
*/
void bli_packm_unb_var1( obj_t* c,
obj_t* p );
obj_t* p,
packm_thrinfo_t* thread );
#undef GENTPROT

View File

@@ -49,7 +49,8 @@ static FUNCPTR_T vars[2][3] =
void bli_unpackm_int( obj_t* p,
obj_t* a,
unpackm_t* cntl )
unpackm_t* cntl,
packm_thrinfo_t* thread )
{
// The unpackm operation consists of an optional post-process: castm.
// (This post-process is analogous to the castm pre-process in packm.)
@@ -122,9 +123,12 @@ void bli_unpackm_int( obj_t* p,
f = vars[n][i];
// Invoke the variant.
f( p,
&c,
cntl );
if( thread_am_ochief( thread ) ) {
f( p,
&c,
cntl );
}
thread_obarrier( thread );
// Now, if necessary, we cast the contents of c to matrix a. If casting
// was not necessary, then we are done because the call to the unpackm

View File

@@ -34,7 +34,8 @@
void bli_unpackm_int( obj_t* p,
obj_t* a,
unpackm_t* cntl );
unpackm_t* cntl,
packm_thrinfo_t* thread );
/*
void bli_unpackm_init_cast( obj_t* p,

View File

@@ -76,7 +76,8 @@ void bli_gemv_blk_var1( obj_t* alpha,
// Copy/pack A1, y1 (if needed).
bli_packm_int( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
cntl_sub_packm_a( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &y1, &y1_pack,
cntl_sub_packv_y( cntl ) );

View File

@@ -81,7 +81,8 @@ void bli_gemv_blk_var2( obj_t* alpha,
// Copy/pack A1, x1 (if needed).
bli_packm_int( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
cntl_sub_packm_a( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntl_sub_packv_x( cntl ) );

View File

@@ -75,7 +75,8 @@ void bli_ger_blk_var1( obj_t* alpha,
// Copy/pack A1, x1 (if needed).
bli_packm_int( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
cntl_sub_packm_a( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntl_sub_packv_x( cntl ) );
@@ -90,7 +91,8 @@ void bli_ger_blk_var1( obj_t* alpha,
// Copy/unpack A1 (if A1 was packed).
bli_unpackm_int( &a1_pack, &a1,
cntl_sub_unpackm_a( cntl ) );
cntl_sub_unpackm_a( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
}
// If any packing buffers were acquired within packm, release them back

View File

@@ -75,7 +75,8 @@ void bli_ger_blk_var2( obj_t* alpha,
// Copy/pack A1, y1 (if needed).
bli_packm_int( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
cntl_sub_packm_a( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &y1, &y1_pack,
cntl_sub_packv_y( cntl ) );
@@ -90,7 +91,8 @@ void bli_ger_blk_var2( obj_t* alpha,
// Copy/unpack A1 (if A1 was packed).
bli_unpackm_int( &a1_pack, &a1,
cntl_sub_unpackm_a( cntl ) );
cntl_sub_unpackm_a( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
}
// If any packing buffers were acquired within packm, release them back

View File

@@ -106,7 +106,8 @@ void bli_hemv_blk_var1( conj_t conjh,
// Copy/pack A11, x1, y1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntl_sub_packm_a11( cntl ) );
cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntl_sub_packv_x1( cntl ) );
bli_packv_int( &y1, &y1_pack,

View File

@@ -109,7 +109,8 @@ void bli_hemv_blk_var2( conj_t conjh,
// Copy/pack A11, x1, y1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntl_sub_packm_a11( cntl ) );
cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntl_sub_packv_x1( cntl ) );
bli_packv_int( &y1, &y1_pack,

View File

@@ -106,7 +106,8 @@ void bli_hemv_blk_var3( conj_t conjh,
// Copy/pack A11, x1, y1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntl_sub_packm_a11( cntl ) );
cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntl_sub_packv_x1( cntl ) );
bli_packv_int( &y1, &y1_pack,

View File

@@ -109,7 +109,8 @@ void bli_hemv_blk_var4( conj_t conjh,
// Copy/pack A11, x1, y1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntl_sub_packm_a11( cntl ) );
cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntl_sub_packv_x1( cntl ) );
bli_packv_int( &y1, &y1_pack,

View File

@@ -90,7 +90,8 @@ void bli_her_blk_var1( conj_t conjh,
// Copy/pack C11, x1 (if needed).
bli_packm_int( &c11, &c11_pack,
cntl_sub_packm_c11( cntl ) );
cntl_sub_packm_c11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntl_sub_packv_x1( cntl ) );
@@ -112,7 +113,8 @@ void bli_her_blk_var1( conj_t conjh,
// Copy/unpack C11 (if C11 was packed).
bli_unpackm_int( &c11_pack, &c11,
cntl_sub_unpackm_c11( cntl ) );
cntl_sub_unpackm_c11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
}
// If any packing buffers were acquired within packm, release them back

View File

@@ -90,7 +90,8 @@ void bli_her_blk_var2( conj_t conjh,
// Copy/pack C11, x1 (if needed).
bli_packm_int( &c11, &c11_pack,
cntl_sub_packm_c11( cntl ) );
cntl_sub_packm_c11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntl_sub_packv_x1( cntl ) );
@@ -112,7 +113,8 @@ void bli_her_blk_var2( conj_t conjh,
// Copy/unpack C11 (if C11 was packed).
bli_unpackm_int( &c11_pack, &c11,
cntl_sub_unpackm_c11( cntl ) );
cntl_sub_unpackm_c11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
}
// If any packing buffers were acquired within packm, release them back

View File

@@ -101,7 +101,8 @@ void bli_her2_blk_var1( conj_t conjh,
// Copy/pack C11, x1, y1 (if needed).
bli_packm_int( &c11, &c11_pack,
cntl_sub_packm_c11( cntl ) );
cntl_sub_packm_c11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntl_sub_packv_x1( cntl ) );
bli_packv_int( &y1, &y1_pack,
@@ -136,7 +137,8 @@ void bli_her2_blk_var1( conj_t conjh,
// Copy/unpack C11 (if C11 was packed).
bli_unpackm_int( &c11_pack, &c11,
cntl_sub_unpackm_c11( cntl ) );
cntl_sub_unpackm_c11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
}
// If any packing buffers were acquired within packm, release them back

View File

@@ -104,7 +104,8 @@ void bli_her2_blk_var2( conj_t conjh,
// Copy/pack C11, x1, y1 (if needed).
bli_packm_int( &c11, &c11_pack,
cntl_sub_packm_c11( cntl ) );
cntl_sub_packm_c11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntl_sub_packv_x1( cntl ) );
bli_packv_int( &y1, &y1_pack,
@@ -139,7 +140,8 @@ void bli_her2_blk_var2( conj_t conjh,
// Copy/unpack C11 (if C11 was packed).
bli_unpackm_int( &c11_pack, &c11,
cntl_sub_unpackm_c11( cntl ) );
cntl_sub_unpackm_c11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
}
// If any packing buffers were acquired within packm, release them back

View File

@@ -104,7 +104,8 @@ void bli_her2_blk_var3( conj_t conjh,
// Copy/pack C11, x1, y1 (if needed).
bli_packm_int( &c11, &c11_pack,
cntl_sub_packm_c11( cntl ) );
cntl_sub_packm_c11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntl_sub_packv_x1( cntl ) );
bli_packv_int( &y1, &y1_pack,
@@ -139,7 +140,8 @@ void bli_her2_blk_var3( conj_t conjh,
// Copy/unpack C11 (if C11 was packed).
bli_unpackm_int( &c11_pack, &c11,
cntl_sub_unpackm_c11( cntl ) );
cntl_sub_unpackm_c11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
}
// If any packing buffers were acquired within packm, release them back

View File

@@ -101,7 +101,8 @@ void bli_her2_blk_var4( conj_t conjh,
// Copy/pack C11, x1, y1 (if needed).
bli_packm_int( &c11, &c11_pack,
cntl_sub_packm_c11( cntl ) );
cntl_sub_packm_c11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntl_sub_packv_x1( cntl ) );
bli_packv_int( &y1, &y1_pack,
@@ -136,7 +137,8 @@ void bli_her2_blk_var4( conj_t conjh,
// Copy/unpack C11 (if C11 was packed).
bli_unpackm_int( &c11_pack, &c11,
cntl_sub_unpackm_c11( cntl ) );
cntl_sub_unpackm_c11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
}
// If any packing buffers were acquired within packm, release them back

View File

@@ -80,7 +80,8 @@ void bli_trmv_l_blk_var1( obj_t* alpha,
// Copy/pack A11, x1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntl_sub_packm_a11( cntl ) );
cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntl_sub_packv_x1( cntl ) );

View File

@@ -80,7 +80,8 @@ void bli_trmv_l_blk_var2( obj_t* alpha,
// Copy/pack A11, x1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntl_sub_packm_a11( cntl ) );
cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntl_sub_packv_x1( cntl ) );

View File

@@ -80,7 +80,8 @@ void bli_trmv_u_blk_var1( obj_t* alpha,
// Copy/pack A11, x1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntl_sub_packm_a11( cntl ) );
cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntl_sub_packv_x1( cntl ) );

View File

@@ -80,7 +80,8 @@ void bli_trmv_u_blk_var2( obj_t* alpha,
// Copy/pack A11, x1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntl_sub_packm_a11( cntl ) );
cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntl_sub_packv_x1( cntl ) );

View File

@@ -85,7 +85,8 @@ void bli_trsv_l_blk_var1( obj_t* alpha,
// Copy/pack A11, x1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntl_sub_packm_a11( cntl ) );
cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntl_sub_packv_x1( cntl ) );

View File

@@ -85,7 +85,8 @@ void bli_trsv_l_blk_var2( obj_t* alpha,
// Copy/pack A11, x1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntl_sub_packm_a11( cntl ) );
cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntl_sub_packv_x1( cntl ) );

View File

@@ -85,7 +85,8 @@ void bli_trsv_u_blk_var1( obj_t* alpha,
// Copy/pack A11, x1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntl_sub_packm_a11( cntl ) );
cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntl_sub_packv_x1( cntl ) );

View File

@@ -85,7 +85,8 @@ void bli_trsv_u_blk_var2( obj_t* alpha,
// Copy/pack A11, x1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntl_sub_packm_a11( cntl ) );
cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntl_sub_packv_x1( cntl ) );

View File

@@ -50,7 +50,6 @@
#include "bli_gemm4m.h"
#include "bli_gemm3m.h"
//
// Prototype object-based interface.
//

View File

@@ -37,45 +37,64 @@
void bli_gemm_blk_var1f( obj_t* a,
obj_t* b,
obj_t* c,
gemm_t* cntl )
gemm_t* cntl,
gemm_thrinfo_t* thread )
{
obj_t a1, a1_pack;
obj_t b_pack;
obj_t c1, c1_pack;
//The s is for "lives on the stack"
obj_t b_pack_s;
obj_t a1_pack_s, c1_pack_s;
obj_t a1, c1;
obj_t* a1_pack = NULL;
obj_t* b_pack = NULL;
obj_t* c1_pack = NULL;
dim_t i;
dim_t b_alg;
dim_t m_trans;
// Initialize all pack objects that are passed into packm_init().
bli_obj_init_pack( &a1_pack );
bli_obj_init_pack( &b_pack );
bli_obj_init_pack( &c1_pack );
if( thread_am_ochief( thread ) ) {
// Initialize object for packing B.
bli_obj_init_pack( &b_pack_s );
bli_packm_init( b, &b_pack_s,
cntl_sub_packm_b( cntl ) );
// Scale C by beta (if instructed).
// Since scalm doesn't support multithreading yet, must be done by chief thread (ew)
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
}
b_pack = thread_obroadcast( thread, &b_pack_s );
// Initialize objects passed into bli_packm_init for A and C
if( thread_am_ichief( thread ) ) {
bli_obj_init_pack( &a1_pack_s );
bli_obj_init_pack( &c1_pack_s );
}
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
// Pack B (if instructed).
bli_packm_int( b, b_pack,
cntl_sub_packm_b( cntl ),
gemm_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
m_trans = bli_obj_length_after_trans( *a );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
// Initialize object for packing B.
bli_packm_init( b, &b_pack,
cntl_sub_packm_b( cntl ) );
// Pack B (if instructed).
bli_packm_int( b, &b_pack,
cntl_sub_packm_b( cntl ) );
dim_t start, end;
bli_get_range( thread, 0, m_trans,
bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
&start, &end );
// Partition along the m dimension.
for ( i = 0; i < m_trans; i += b_alg )
for ( i = start; i < end; i += b_alg )
{
// Determine the current algorithmic blocksize.
// NOTE: Use of a (for execution datatype) is intentional!
// This causes the right blocksize to be used if c and a are
// complex and b is real.
b_alg = bli_determine_blocksize_f( i, m_trans, a,
b_alg = bli_determine_blocksize_f( i, end, a,
cntl_blocksize( cntl ) );
// Acquire partitions for A1 and C1.
@@ -83,38 +102,50 @@ void bli_gemm_blk_var1f( obj_t* a,
i, b_alg, a, &a1 );
bli_acquire_mpart_t2b( BLIS_SUBPART1,
i, b_alg, c, &c1 );
// Initialize objects for packing A1 and C1.
bli_packm_init( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_init( &c1, &c1_pack,
cntl_sub_packm_c( cntl ) );
// Initialize objects for packing A1 and C1.
if( thread_am_ichief( thread ) ) {
bli_packm_init( &a1, a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_init( &c1, c1_pack,
cntl_sub_packm_c( cntl ) );
}
thread_ibarrier( thread );
// Pack A1 (if instructed).
bli_packm_int( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_int( &a1, a1_pack,
cntl_sub_packm_a( cntl ),
gemm_thread_sub_ipackm( thread ) );
// Pack C1 (if instructed).
bli_packm_int( &c1, &c1_pack,
cntl_sub_packm_c( cntl ) );
bli_packm_int( &c1, c1_pack,
cntl_sub_packm_c( cntl ),
gemm_thread_sub_ipackm( thread ) );
// Perform gemm subproblem.
bli_gemm_int( &BLIS_ONE,
&a1_pack,
&b_pack,
a1_pack,
b_pack,
&BLIS_ONE,
&c1_pack,
cntl_sub_gemm( cntl ) );
c1_pack,
cntl_sub_gemm( cntl ),
gemm_thread_sub_gemm( thread ) );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( &c1_pack, &c1,
cntl_sub_unpackm_c( cntl ) );
// Currently must be done by 1 thread
bli_unpackm_int( c1_pack, &c1,
cntl_sub_unpackm_c( cntl ),
gemm_thread_sub_ipackm( thread ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_obj_release_pack( &a1_pack );
bli_obj_release_pack( &b_pack );
bli_obj_release_pack( &c1_pack );
thread_obarrier( thread );
if( thread_am_ochief( thread ) )
bli_obj_release_pack( b_pack );
if( thread_am_ichief( thread ) ){
bli_obj_release_pack( a1_pack );
bli_obj_release_pack( c1_pack );
}
}

View File

@@ -35,5 +35,6 @@
void bli_gemm_blk_var1f( obj_t* a,
obj_t* b,
obj_t* c,
gemm_t* cntl );
gemm_t* cntl,
gemm_thrinfo_t* thread );

View File

@@ -37,45 +37,63 @@
void bli_gemm_blk_var2f( obj_t* a,
obj_t* b,
obj_t* c,
gemm_t* cntl )
gemm_t* cntl,
gemm_thrinfo_t* thread )
{
obj_t a_pack;
obj_t b1, b1_pack;
obj_t c1, c1_pack;
obj_t a_pack_s;
obj_t b1_pack_s, c1_pack_s;
obj_t b1, c1;
obj_t* a_pack = NULL;
obj_t* b1_pack = NULL;
obj_t* c1_pack = NULL;
dim_t i;
dim_t b_alg;
dim_t n_trans;
// Initialize all pack objects that are passed into packm_init().
bli_obj_init_pack( &a_pack );
bli_obj_init_pack( &b1_pack );
bli_obj_init_pack( &c1_pack );
if( thread_am_ochief( thread ) ) {
// Initialize object for packing A
bli_obj_init_pack( &a_pack_s );
bli_packm_init( a, &a_pack_s,
cntl_sub_packm_a( cntl ) );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
}
a_pack = thread_obroadcast( thread, &a_pack_s );
// Initialize pack objects for B and C that are passed into packm_init().
if( thread_am_ichief( thread ) ) {
bli_obj_init_pack( &b1_pack_s );
bli_obj_init_pack( &c1_pack_s );
}
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
// Pack A (if instructed).
bli_packm_int( a, a_pack,
cntl_sub_packm_a( cntl ),
gemm_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
// Initialize object for packing A.
bli_packm_init( a, &a_pack,
cntl_sub_packm_a( cntl ) );
// Pack A (if instructed).
bli_packm_int( a, &a_pack,
cntl_sub_packm_a( cntl ) );
dim_t start, end;
bli_get_range( thread, 0, n_trans,
bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
&start, &end );
// Partition along the n dimension.
for ( i = 0; i < n_trans; i += b_alg )
for ( i = start; i < end; i += b_alg )
{
// Determine the current algorithmic blocksize.
// NOTE: Use of b (for execution datatype) is intentional!
// This causes the right blocksize to be used if c and a are
// complex and b is real.
b_alg = bli_determine_blocksize_f( i, n_trans, b,
b_alg = bli_determine_blocksize_f( i, end, b,
cntl_blocksize( cntl ) );
// Acquire partitions for B1 and C1.
@@ -85,36 +103,48 @@ void bli_gemm_blk_var2f( obj_t* a,
i, b_alg, c, &c1 );
// Initialize objects for packing B1 and C1.
bli_packm_init( &b1, &b1_pack,
cntl_sub_packm_b( cntl ) );
bli_packm_init( &c1, &c1_pack,
cntl_sub_packm_c( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_init( &b1, b1_pack,
cntl_sub_packm_b( cntl ) );
bli_packm_init( &c1, c1_pack,
cntl_sub_packm_c( cntl ) );
}
thread_ibarrier( thread );
// Pack B1 (if instructed).
bli_packm_int( &b1, &b1_pack,
cntl_sub_packm_b( cntl ) );
bli_packm_int( &b1, b1_pack,
cntl_sub_packm_b( cntl ),
gemm_thread_sub_ipackm( thread ) );
// Pack C1 (if instructed).
bli_packm_int( &c1, &c1_pack,
cntl_sub_packm_c( cntl ) );
bli_packm_int( &c1, c1_pack,
cntl_sub_packm_c( cntl ),
gemm_thread_sub_ipackm( thread ) );
// Perform gemm subproblem.
bli_gemm_int( &BLIS_ONE,
&a_pack,
&b1_pack,
a_pack,
b1_pack,
&BLIS_ONE,
&c1_pack,
cntl_sub_gemm( cntl ) );
c1_pack,
cntl_sub_gemm( cntl ),
gemm_thread_sub_gemm( thread ) );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( &c1_pack, &c1,
cntl_sub_unpackm_c( cntl ) );
// Unpack C1 (if C1 was packed).
// Currently must be done by 1 thread
bli_unpackm_int( c1_pack, &c1,
cntl_sub_unpackm_c( cntl ),
gemm_thread_sub_ipackm( thread ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_obj_release_pack( &a_pack );
bli_obj_release_pack( &b1_pack );
bli_obj_release_pack( &c1_pack );
thread_obarrier( thread );
if( thread_am_ochief( thread ) )
bli_obj_release_pack( a_pack );
if( thread_am_ichief( thread ) ) {
bli_obj_release_pack( b1_pack );
bli_obj_release_pack( c1_pack );
}
}

View File

@@ -35,5 +35,6 @@
void bli_gemm_blk_var2f( obj_t* a,
obj_t* b,
obj_t* c,
gemm_t* cntl );
gemm_t* cntl,
gemm_thrinfo_t* thread );

View File

@@ -37,37 +37,50 @@
void bli_gemm_blk_var3f( obj_t* a,
obj_t* b,
obj_t* c,
gemm_t* cntl )
gemm_t* cntl,
gemm_thrinfo_t* thread )
{
obj_t a1, a1_pack;
obj_t b1, b1_pack;
obj_t c_pack;
obj_t c_pack_s;
obj_t a1_pack_s, b1_pack_s;
obj_t a1, b1;
obj_t* a1_pack = NULL;
obj_t* b1_pack = NULL;
obj_t* c_pack = NULL;
dim_t i;
dim_t b_alg;
dim_t k_trans;
// Initialize all pack objects that are passed into packm_init().
bli_obj_init_pack( &a1_pack );
bli_obj_init_pack( &b1_pack );
bli_obj_init_pack( &c_pack );
if( thread_am_ochief( thread ) ){
// Initialize object for packing C
bli_obj_init_pack( &c_pack_s );
bli_packm_init( c, &c_pack_s,
cntl_sub_packm_c( cntl ) );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
}
c_pack = thread_obroadcast( thread, &c_pack_s );
// Initialize pack objects for A and B that are passed into packm_init().
if( thread_am_ichief( thread ) ){
bli_obj_init_pack( &a1_pack_s );
bli_obj_init_pack( &b1_pack_s );
}
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
// Pack C (if instructed).
bli_packm_int( c, c_pack,
cntl_sub_packm_c( cntl ),
gemm_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
k_trans = bli_obj_width_after_trans( *a );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
// Initialize object for packing C.
bli_packm_init( c, &c_pack,
cntl_sub_packm_c( cntl ) );
// Pack C (if instructed).
bli_packm_int( c, &c_pack,
cntl_sub_packm_c( cntl ) );
// Partition along the k dimension.
for ( i = 0; i < k_trans; i += b_alg )
{
@@ -85,26 +98,32 @@ void bli_gemm_blk_var3f( obj_t* a,
i, b_alg, b, &b1 );
// Initialize objects for packing A1 and B1.
bli_packm_init( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_init( &b1, &b1_pack,
cntl_sub_packm_b( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_init( &a1, a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_init( &b1, b1_pack,
cntl_sub_packm_b( cntl ) );
}
thread_ibarrier( thread );
// Pack A1 (if instructed).
bli_packm_int( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_int( &a1, a1_pack,
cntl_sub_packm_a( cntl ),
gemm_thread_sub_ipackm( thread ) );
// Pack B1 (if instructed).
bli_packm_int( &b1, &b1_pack,
cntl_sub_packm_b( cntl ) );
bli_packm_int( &b1, b1_pack,
cntl_sub_packm_b( cntl ),
gemm_thread_sub_ipackm( thread ) );
// Perform gemm subproblem.
bli_gemm_int( &BLIS_ONE,
&a1_pack,
&b1_pack,
a1_pack,
b1_pack,
&BLIS_ONE,
&c_pack,
cntl_sub_gemm( cntl ) );
c_pack,
cntl_sub_gemm( cntl ),
gemm_thread_sub_gemm( thread) );
// This variant executes multiple rank-k updates. Therefore, if the
// internal beta scalar on matrix C is non-zero, we must use it
@@ -112,17 +131,25 @@ void bli_gemm_blk_var3f( obj_t* a,
// And since c_pack is a local obj_t, we can simply overwrite the
// internal beta scalar with BLIS_ONE once it has been used in the
// first iteration.
if ( i == 0 ) bli_obj_scalar_reset( &c_pack );
if ( i == 0 ) thread_ibarrier( thread );
if ( i == 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack );
}
thread_obarrier( thread );
// Unpack C (if C was packed).
bli_unpackm_int( &c_pack, c,
cntl_sub_unpackm_c( cntl ) );
bli_unpackm_int( c_pack, c,
cntl_sub_unpackm_c( cntl ),
gemm_thread_sub_opackm( thread ) );
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_obj_release_pack( &a1_pack );
bli_obj_release_pack( &b1_pack );
bli_obj_release_pack( &c_pack );
if( thread_am_ochief( thread ) )
bli_obj_release_pack( c_pack );
if( thread_am_ichief( thread ) ){
bli_obj_release_pack( a1_pack );
bli_obj_release_pack( b1_pack );
}
}

View File

@@ -35,5 +35,6 @@
void bli_gemm_blk_var3f( obj_t* a,
obj_t* b,
obj_t* c,
gemm_t* cntl );
gemm_t* cntl,
gemm_thrinfo_t* thread );

View File

@@ -55,7 +55,6 @@ gemm_t* gemm_cntl_vl_mm;
gemm_t* gemm_cntl;
void bli_gemm_cntl_init()
{
// Create blocksize objects for each dimension.

View File

@@ -74,12 +74,20 @@ void bli_gemm_front( obj_t* alpha,
bli_obj_induce_trans( c_local );
}
gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
// Invoke the internal back-end.
bli_gemm_int( alpha,
&a_local,
&b_local,
beta,
&c_local,
cntl );
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_gemm_int,
alpha,
&a_local,
&b_local,
beta,
&c_local,
(void*) cntl,
(void**) infos );
bli_gemm_thrinfo_free_paths( infos, n_threads );
}

View File

@@ -39,7 +39,8 @@
typedef void (*FUNCPTR_T)( obj_t* a,
obj_t* b,
obj_t* c,
gemm_t* cntl );
gemm_t* cntl,
gemm_thrinfo_t* thread );
static FUNCPTR_T vars[6][3] =
{
@@ -57,7 +58,8 @@ void bli_gemm_int( obj_t* alpha,
obj_t* b,
obj_t* beta,
obj_t* c,
gemm_t* cntl )
gemm_t* cntl,
gemm_thrinfo_t* thread )
{
obj_t a_local;
obj_t b_local;
@@ -73,7 +75,9 @@ void bli_gemm_int( obj_t* alpha,
if ( bli_obj_has_zero_dim( *a ) ||
bli_obj_has_zero_dim( *b ) )
{
bli_scalm( beta, c );
if( thread_am_ochief( thread ) )
bli_scalm( beta, c );
thread_obarrier( thread );
return;
}
@@ -86,7 +90,9 @@ void bli_gemm_int( obj_t* alpha,
if ( bli_obj_is_zeros( *a ) ||
bli_obj_is_zeros( *b ) )
{
bli_scalm( beta, c );
if( thread_am_ochief( thread ) )
bli_scalm( beta, c );
thread_obarrier( thread );
return;
}
@@ -104,22 +110,24 @@ void bli_gemm_int( obj_t* alpha,
// packed, this is our last chance to handle the transposition.
if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
{
bli_obj_induce_trans( c_local );
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
//if( thread_am_ochief( thread ) ) {
bli_obj_induce_trans( c_local );
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
// }
}
// If alpha is non-unit, typecast and apply it to the scalar attached
// to B.
if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
{
bli_obj_scalar_apply_scalar( alpha, &b_local );
bli_obj_scalar_apply_scalar( alpha, &b_local );
}
// If beta is non-unit, typecast and apply it to the scalar attached
// to C.
if ( !bli_obj_equals( beta, &BLIS_ONE ) )
{
bli_obj_scalar_apply_scalar( beta, &c_local );
bli_obj_scalar_apply_scalar( beta, &c_local );
}
// Extract the variant number and implementation type.
@@ -133,6 +141,7 @@ void bli_gemm_int( obj_t* alpha,
f( &a_local,
&b_local,
&c_local,
cntl );
cntl,
thread );
}

View File

@@ -37,5 +37,6 @@ void bli_gemm_int( obj_t* alpha,
obj_t* b,
obj_t* beta,
obj_t* c,
gemm_t* cntl );
gemm_t* cntl,
gemm_thrinfo_t* thread );

View File

@@ -45,7 +45,8 @@ typedef void (*FUNCPTR_T)(
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
void* gemm_ukr
void* gemm_ukr,
gemm_thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2);
@@ -54,7 +55,8 @@ static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2);
void bli_gemm_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
gemm_t* cntl )
gemm_t* cntl,
gemm_thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );
@@ -117,7 +119,8 @@ void bli_gemm_ker_var2( obj_t* a,
buf_b, rs_b, pd_b, ps_b,
buf_beta,
buf_c, rs_c, cs_c,
gemm_ukr );
gemm_ukr,
thread );
}
@@ -133,7 +136,8 @@ void PASTEMAC(ch,varname)( \
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemm_ukr \
void* gemm_ukr, \
gemm_thrinfo_t* thread \
) \
{ \
/* Cast the micro-kernel address to its function pointer type. */ \
@@ -213,18 +217,21 @@ void PASTEMAC(ch,varname)( \
bli_auxinfo_set_ps_a( ps_a, aux ); \
bli_auxinfo_set_ps_b( ps_b, aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
gemm_thrinfo_t* caucus = gemm_thread_sub_gemm( thread ); \
dim_t jr_num_threads = thread_n_way( thread ); \
dim_t jr_thread_id = thread_work_id( thread ); \
dim_t ir_num_threads = thread_n_way( caucus ); \
dim_t ir_thread_id = thread_work_id( caucus ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
a1 = a_cast; \
c11 = c1; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
@@ -232,18 +239,21 @@ void PASTEMAC(ch,varname)( \
b2 = b1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
a2 = gemm_get_next_a_micropanel( caucus, a1, rstep_a ); \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
b2 = gemm_get_next_b_micropanel( thread, b1, cstep_b ); \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \
@@ -282,13 +292,7 @@ void PASTEMAC(ch,varname)( \
beta_cast, \
c11, rs_c, cs_c ); \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \

View File

@@ -39,7 +39,8 @@
void bli_gemm_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
gemm_t* cntl );
gemm_t* cntl,
gemm_thrinfo_t* thread );
//
@@ -57,7 +58,8 @@ void PASTEMAC(ch,varname)( \
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemm_ukr \
void* gemm_ukr, \
gemm_thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC( gemm_ker_var2 )

View File

@@ -54,7 +54,8 @@ static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var5);
void bli_gemm_ker_var5( obj_t* a,
obj_t* b,
obj_t* c,
gemm_t* cntl )
gemm_t* cntl,
gemm_thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );

View File

@@ -39,7 +39,8 @@
void bli_gemm_ker_var5( obj_t* a,
obj_t* b,
obj_t* c,
gemm_t* cntl );
gemm_t* cntl,
gemm_thrinfo_t* thread );
//

View File

@@ -0,0 +1,203 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "assert.h"
// Fills in an already-allocated gemm thrinfo node.
//
// thread            - node to initialise (must not be NULL).
// ocomm / ocomm_id  - outer communicator and this thread's id within it.
// icomm / icomm_id  - inner communicator and this thread's id within it.
// n_way / work_id   - number of ways the loop at this level is split, and
//                     which of those pieces this thread is assigned.
// opackm / ipackm   - packm thrinfos handed to the packing routines at this
//                     level (may be NULL).
// sub_gemm          - node describing the next loop level (may be NULL).
//
// The pointers are stored as given; nothing is copied or allocated here.
void bli_setup_gemm_thrinfo_node( gemm_thrinfo_t* thread,
                                  thread_comm_t* ocomm, dim_t ocomm_id,
                                  thread_comm_t* icomm, dim_t icomm_id,
                                  dim_t n_way, dim_t work_id,
                                  packm_thrinfo_t* opackm,
                                  packm_thrinfo_t* ipackm,
                                  gemm_thrinfo_t* sub_gemm )
{
    // Communicators.
    thread->ocomm    = ocomm;
    thread->icomm    = icomm;

    // Ids within those communicators.
    thread->ocomm_id = ocomm_id;
    thread->icomm_id = icomm_id;

    // Work partitioning for the loop at this level.
    thread->n_way    = n_way;
    thread->work_id  = work_id;

    // Child thrinfo objects.
    thread->opackm   = opackm;
    thread->ipackm   = ipackm;
    thread->sub_gemm = sub_gemm;
}
// Initialises `thread` as the thrinfo of a single-threaded execution: both
// communicators are BLIS_SINGLE_COMM with id 0, the loop is split one way
// with work id 0, both packm thrinfos are BLIS_PACKM_SINGLE_THREADED, and
// the node is its own sub_gemm so the same object serves every loop level.
void bli_setup_gemm_single_threaded_info( gemm_thrinfo_t* thread )
{
    bli_setup_gemm_thrinfo_node( thread,
                                 &BLIS_SINGLE_COMM, 0,
                                 &BLIS_SINGLE_COMM, 0,
                                 1, 0,
                                 &BLIS_PACKM_SINGLE_THREADED,
                                 &BLIS_PACKM_SINGLE_THREADED,
                                 thread );
}
// Allocates a gemm thrinfo node with bli_malloc() and initialises it from
// the arguments (see bli_setup_gemm_thrinfo_node() for their meaning).
// The caller owns the returned node; bli_gemm_thrinfo_free() releases it.
gemm_thrinfo_t* bli_create_gemm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
                                              thread_comm_t* icomm, dim_t icomm_id,
                                              dim_t n_way, dim_t work_id,
                                              packm_thrinfo_t* opackm,
                                              packm_thrinfo_t* ipackm,
                                              gemm_thrinfo_t* sub_gemm )
{
    gemm_thrinfo_t* node;

    node = ( gemm_thrinfo_t* ) bli_malloc( sizeof( *node ) );

    bli_setup_gemm_thrinfo_node( node,
                                 ocomm, ocomm_id,
                                 icomm, icomm_id,
                                 n_way, work_id,
                                 opackm, ipackm,
                                 sub_gemm );

    return node;
}
// Recursively releases a gemm thrinfo node: its communicators, its packm
// thrinfos, the chain of sub_gemm nodes below it, and the node itself.
//
// NULL and the statically allocated BLIS_GEMM_SINGLE_THREADED object are
// ignored, so the function can be called unconditionally.
void bli_gemm_thrinfo_free( gemm_thrinfo_t* thread)
{
    if( thread == NULL || thread == &BLIS_GEMM_SINGLE_THREADED ) return;

    // Free Communicators
    // An outer communicator is shared by several paths, so only its chief
    // thread releases it.
    if( thread_am_ochief( thread ) )
        bli_free_communicator( thread->ocomm );

    // A node's inner communicator is normally released as the outer
    // communicator of its sub_gemm node. A leaf node has no sub_gemm, so
    // its inner communicator must be released here or it is leaked.
    if( thread->sub_gemm == NULL && thread_am_ichief( thread ) )
        bli_free_communicator( thread->icomm );

    // Free Sub Thrinfos
    bli_packm_thrinfo_free( thread->opackm );
    bli_packm_thrinfo_free( thread->ipackm );
    bli_gemm_thrinfo_free( thread->sub_gemm );

    bli_free( thread );

    return;
}
// Releases every thrinfo path in `threads` (there are `num` of them) with
// bli_gemm_thrinfo_free(), then releases the array itself.
void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t** threads, dim_t num )
{
    int i = 0;

    while ( i < num )
    {
        bli_gemm_thrinfo_free( threads[i] );
        ++i;
    }

    bli_free( threads );
}
// Builds the thread-info "paths" for a multithreaded gemm.
//
// One path is created per thread. A path is a chain of five gemm_thrinfo_t
// nodes linked through sub_gemm, one per loop level, from outermost to
// innermost: jc -> kc -> ic -> jr -> ir. The number of ways each loop is
// split is read from the environment (BLIS_JC_NT, BLIS_IC_NT, BLIS_JR_NT,
// BLIS_IR_NT) when BLIS_ENABLE_MULTITHREADING is defined, and is 1 otherwise.
//
// Returns an array of jc_way * kc_way * ic_way * jr_way * ir_way path
// pointers, indexed by each thread's id in the global communicator. The
// caller releases it with bli_gemm_thrinfo_free_paths().
gemm_thrinfo_t** bli_create_gemm_thrinfo_paths( )
{
#ifdef BLIS_ENABLE_MULTITHREADING
    dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" );
    // Parallelism over the kc loop is not enabled: BLIS_KC_NT is not read
    // and the loop is always split one way.
    dim_t kc_way = 1;
    dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" );
    dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" );
    dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" );
#else
    dim_t jc_way = 1;
    dim_t kc_way = 1;
    dim_t ic_way = 1;
    dim_t jr_way = 1;
    dim_t ir_way = 1;
#endif

    dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
    assert( global_num_threads != 0 );

    // Number of threads that share one iteration of each loop, i.e. the
    // size of the communicator created at that level.
    dim_t jc_nt = kc_way * ic_way * jr_way * ir_way;
    dim_t kc_nt = ic_way * jr_way * ir_way;
    dim_t ic_nt = jr_way * ir_way;
    dim_t jr_nt = ir_way;
    dim_t ir_nt = 1;

    // Allocate with bli_malloc() so that the array pairs with the bli_free()
    // call in bli_gemm_thrinfo_free_paths().
    gemm_thrinfo_t** paths = (gemm_thrinfo_t**) bli_malloc( global_num_threads * sizeof( gemm_thrinfo_t* ) );

    thread_comm_t* global_comm = bli_create_communicator( global_num_threads );
    for( int a = 0; a < jc_way; a++ )
    {
        thread_comm_t* jc_comm = bli_create_communicator( jc_nt );
        for( int b = 0; b < kc_way; b++ )
        {
            thread_comm_t* kc_comm = bli_create_communicator( kc_nt );
            for( int c = 0; c < ic_way; c++ )
            {
                thread_comm_t* ic_comm = bli_create_communicator( ic_nt );
                for( int d = 0; d < jr_way; d++ )
                {
                    thread_comm_t* jr_comm = bli_create_communicator( jr_nt );
                    for( int e = 0; e < ir_way; e++)
                    {
                        thread_comm_t* ir_comm = bli_create_communicator( ir_nt );

                        // Id of this thread within each communicator: the
                        // loop index at the next level in, scaled by that
                        // level's group size, plus the id within that group.
                        dim_t ir_comm_id     = 0;
                        dim_t jr_comm_id     = e*ir_nt + ir_comm_id;
                        dim_t ic_comm_id     = d*jr_nt + jr_comm_id;
                        dim_t kc_comm_id     = c*ic_nt + ic_comm_id;
                        dim_t jc_comm_id     = b*kc_nt + kc_comm_id;
                        dim_t global_comm_id = a*jc_nt + jc_comm_id;

                        // Build the chain from the innermost level outward
                        // so that each node can point at its sub_gemm node.
                        gemm_thrinfo_t* ir_info = bli_create_gemm_thrinfo_node( jr_comm, jr_comm_id,
                                                                                ir_comm, ir_comm_id,
                                                                                ir_way, e,
                                                                                NULL, NULL, NULL);

                        gemm_thrinfo_t* jr_info = bli_create_gemm_thrinfo_node( ic_comm, ic_comm_id,
                                                                                jr_comm, jr_comm_id,
                                                                                jr_way, d,
                                                                                NULL, NULL, ir_info);

                        // Packm thrinfos attached to the ic node as its
                        // outer (packb) and inner (packa) packing infos.
                        packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id,
                                                                               ic_comm, ic_comm_id,
                                                                               kc_nt, kc_comm_id );

                        packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id,
                                                                               jr_comm, jr_comm_id,
                                                                               ic_nt, ic_comm_id );

                        gemm_thrinfo_t* ic_info = bli_create_gemm_thrinfo_node( kc_comm, kc_comm_id,
                                                                                ic_comm, ic_comm_id,
                                                                                ic_way, c,
                                                                                packb, packa, jr_info);

                        gemm_thrinfo_t* kc_info = bli_create_gemm_thrinfo_node( jc_comm, jc_comm_id,
                                                                                kc_comm, kc_comm_id,
                                                                                kc_way, b,
                                                                                NULL, NULL, ic_info);

                        gemm_thrinfo_t* jc_info = bli_create_gemm_thrinfo_node( global_comm, global_comm_id,
                                                                                jc_comm, jc_comm_id,
                                                                                jc_way, a,
                                                                                NULL, NULL, kc_info);

                        paths[global_comm_id] = jc_info;
                    }
                }
            }
        }
    }

    return paths;
}

View File

@@ -0,0 +1,78 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Thread information for one loop level of a multithreaded gemm. Nodes are
// chained through sub_gemm, one node per loop level, to form a per-thread
// path (see bli_create_gemm_thrinfo_paths()).
struct gemm_thrinfo_s //implements thrinfo_t
{
thread_comm_t* ocomm; //Outer communicator: in bli_create_gemm_thrinfo_paths() this is the communicator created one loop level further out
dim_t ocomm_id; //Our thread id within ocomm
thread_comm_t* icomm; //Inner communicator: in bli_create_gemm_thrinfo_paths() this is the communicator created at this node's own loop level
dim_t icomm_id; //Our thread id within icomm
dim_t n_way; //Number of distinct caucuses used to parallelize the loop
dim_t work_id; //Which of the n_way pieces of the loop we are working on
// packm thrinfos passed to the packing routines at this level (may be NULL)
packm_thrinfo_t* opackm;
packm_thrinfo_t* ipackm;
// Node for the next loop level (NULL at the end of the chain)
struct gemm_thrinfo_s* sub_gemm;
};
typedef struct gemm_thrinfo_s gemm_thrinfo_t;
// Accessors for the child thrinfo objects of a gemm thrinfo node.
// Arguments are parenthesised so that any pointer expression (e.g. &info)
// can be passed safely.
#define gemm_thread_sub_gemm( thread ) ( (thread)->sub_gemm )
#define gemm_thread_sub_opackm( thread ) ( (thread)->opackm )
#define gemm_thread_sub_ipackm( thread ) ( (thread)->ipackm )
// For use in the gemm macro-kernel loops: address of the next micropanel
// this thread will process, i.e. the current one advanced by `step` once
// for each of the n_way threads sharing the loop.
#define gemm_get_next_a_micropanel( thread, a1, step ) ( (a1) + (step) * (thread)->n_way )
#define gemm_get_next_b_micropanel( thread, b1, step ) ( (b1) + (step) * (thread)->n_way )
// Builds one thrinfo path per thread and returns them as an array.
gemm_thrinfo_t** bli_create_gemm_thrinfo_paths( );
// Frees n_threads paths and the array that holds them.
void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t**, dim_t n_threads );
// Initialises an existing thrinfo node from the given values.
void bli_setup_gemm_thrinfo_node( gemm_thrinfo_t* thread,
thread_comm_t* ocomm, dim_t ocomm_id,
thread_comm_t* icomm, dim_t icomm_id,
dim_t n_way, dim_t work_id,
packm_thrinfo_t* opackm,
packm_thrinfo_t* ipackm,
gemm_thrinfo_t* sub_gemm );
// Allocates a thrinfo node and initialises it from the given values.
gemm_thrinfo_t* bli_create_gemm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
thread_comm_t* icomm, dim_t icomm_id,
dim_t n_way, dim_t work_id,
packm_thrinfo_t* opackm,
packm_thrinfo_t* ipackm,
gemm_thrinfo_t* sub_gemm );
// Initialises an existing thrinfo node for single-threaded execution.
void bli_setup_gemm_single_threaded_info( gemm_thrinfo_t* thread );

View File

@@ -80,12 +80,20 @@ void bli_hemm_front( side_t side,
bli_obj_swap( a_local, b_local );
}
// Invoke the internal back-end.
bli_gemm_int( alpha,
&a_local,
&b_local,
beta,
&c_local,
cntl );
gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_gemm_int,
alpha,
&a_local,
&b_local,
beta,
&c_local,
(void*) cntl,
(void**) infos );
bli_gemm_thrinfo_free_paths( infos, n_threads );
}

View File

@@ -109,20 +109,34 @@ void bli_her2k_front( obj_t* alpha,
&c_local,
cntl );
#else
// Invoke herk twice, using beta only the first time.
bli_herk_int( alpha,
&a_local,
&bh_local,
beta,
&c_local,
cntl );
bli_herk_int( &alpha_conj,
&b_local,
&ah_local,
&BLIS_ONE,
&c_local,
cntl );
// Invoke herk twice, using beta only the first time.
herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_herk_int,
alpha,
&a_local,
&bh_local,
beta,
&c_local,
(void*) cntl,
(void**) infos );
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_herk_int,
&alpha_conj,
&b_local,
&ah_local,
&BLIS_ONE,
&c_local,
(void*) cntl,
(void**) infos );
bli_herk_thrinfo_free_paths( infos, n_threads );
#endif
}

View File

@@ -37,42 +37,60 @@
void bli_herk_blk_var1f( obj_t* a,
obj_t* ah,
obj_t* c,
herk_t* cntl )
herk_t* cntl,
herk_thrinfo_t* thread )
{
obj_t a1, a1_pack;
obj_t ah_pack;
obj_t c1, c1_pack;
obj_t ah_pack_s;
obj_t a1_pack_s, c1_pack_s;
obj_t a1, c1;
obj_t* a1_pack;
obj_t* c1_pack;
obj_t* ah_pack;
dim_t i;
dim_t b_alg;
dim_t m_trans;
// Initialize all pack objects that are passed into packm_init().
bli_obj_init_pack( &a1_pack );
bli_obj_init_pack( &ah_pack );
bli_obj_init_pack( &c1_pack );
if( thread_am_ochief( thread ) ) {
// Initialize object for packing A'.
bli_obj_init_pack( &ah_pack_s );
bli_packm_init( ah, &ah_pack_s,
cntl_sub_packm_b( cntl ) );
// Scale C by beta (if instructed).
// Since scalm doesn't support multithreading yet, must be done by chief thread (ew)
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
}
ah_pack = thread_obroadcast( thread, &ah_pack_s );
// Initialize pack objects that are passed into packm_init() for A and C.
if( thread_am_ichief( thread ) ) {
bli_obj_init_pack( &a1_pack_s );
bli_obj_init_pack( &c1_pack_s );
}
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
// Pack A' (if instructed).
bli_packm_int( ah, ah_pack,
cntl_sub_packm_b( cntl ),
herk_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
m_trans = bli_obj_length_after_trans( *c );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
// Initialize object for packing A'.
bli_packm_init( ah, &ah_pack,
cntl_sub_packm_b( cntl ) );
// Pack A' (if instructed).
bli_packm_int( ah, &ah_pack,
cntl_sub_packm_b( cntl ) );
dim_t start, end;
bli_get_range_weighted( thread, 0, m_trans,
bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
bli_obj_is_upper( *c ), &start, &end );
// Partition along the m dimension.
for ( i = 0; i < m_trans; i += b_alg )
for ( i = start; i < end; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( i, m_trans, a,
b_alg = bli_determine_blocksize_f( i, end, a,
cntl_blocksize( cntl ) );
// Acquire partitions for A1 and C1.
@@ -82,36 +100,47 @@ void bli_herk_blk_var1f( obj_t* a,
i, b_alg, c, &c1 );
// Initialize objects for packing A1 and C1.
bli_packm_init( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_init( &c1, &c1_pack,
cntl_sub_packm_c( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_init( &a1, a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_init( &c1, c1_pack,
cntl_sub_packm_c( cntl ) );
}
thread_ibarrier( thread );
// Pack A1 (if instructed).
bli_packm_int( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_int( &a1, a1_pack,
cntl_sub_packm_a( cntl ),
herk_thread_sub_ipackm( thread ) );
// Pack C1 (if instructed).
bli_packm_int( &c1, &c1_pack,
cntl_sub_packm_c( cntl ) );
bli_packm_int( &c1, c1_pack,
cntl_sub_packm_c( cntl ),
herk_thread_sub_ipackm( thread ) );
// Perform herk subproblem.
bli_herk_int( &BLIS_ONE,
&a1_pack,
&ah_pack,
a1_pack,
ah_pack,
&BLIS_ONE,
&c1_pack,
cntl_sub_herk( cntl ) );
c1_pack,
cntl_sub_herk( cntl ),
herk_thread_sub_herk( thread ) );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( &c1_pack, &c1,
cntl_sub_unpackm_c( cntl ) );
bli_unpackm_int( c1_pack, &c1,
cntl_sub_unpackm_c( cntl ),
herk_thread_sub_ipackm( thread ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_obj_release_pack( &a1_pack );
bli_obj_release_pack( &ah_pack );
bli_obj_release_pack( &c1_pack );
thread_obarrier( thread );
if( thread_am_ochief( thread ) )
bli_obj_release_pack( ah_pack );
if( thread_am_ichief( thread ) ) {
bli_obj_release_pack( a1_pack );
bli_obj_release_pack( c1_pack );
}
}

View File

@@ -35,5 +35,6 @@
void bli_herk_blk_var1f( obj_t* a,
obj_t* ah,
obj_t* c,
herk_t* cntl );
herk_t* cntl,
herk_thrinfo_t* thread );

View File

@@ -37,49 +37,68 @@
void bli_herk_blk_var2f( obj_t* a,
obj_t* ah,
obj_t* c,
herk_t* cntl )
herk_t* cntl,
herk_thrinfo_t* thread )
{
obj_t a_pack, aS_pack;
obj_t ah1, ah1_pack;
obj_t c1;
obj_t c1S, c1S_pack;
obj_t a_pack_s;
obj_t ah1_pack_s, c1S_pack_s;
obj_t ah1, c1, c1S;
obj_t aS_pack;
obj_t* a_pack;
obj_t* ah1_pack;
obj_t* c1S_pack;
dim_t i;
dim_t b_alg;
dim_t n_trans;
subpart_t stored_part;
// Initialize all pack objects that are passed into packm_init().
bli_obj_init_pack( &a_pack );
bli_obj_init_pack( &ah1_pack );
bli_obj_init_pack( &c1S_pack );
// The upper and lower variants are identical, except for which
// merged subpartition is acquired in the loop body.
if ( bli_obj_is_lower( *c ) ) stored_part = BLIS_SUBPART1B;
else stored_part = BLIS_SUBPART1T;
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *c );
if( thread_am_ochief( thread ) ) {
// Initialize object for packing A
bli_obj_init_pack( &a_pack_s );
bli_packm_init( a, &a_pack_s,
cntl_sub_packm_a( cntl ) );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
}
a_pack = thread_obroadcast( thread, &a_pack_s );
// Initialize object for packing A.
bli_packm_init( a, &a_pack,
cntl_sub_packm_a( cntl ) );
// Initialize pack objects for C and A' that are passed into packm_init().
if( thread_am_ichief( thread ) ) {
bli_obj_init_pack( &ah1_pack_s );
bli_obj_init_pack( &c1S_pack_s );
}
ah1_pack = thread_ibroadcast( thread, &ah1_pack_s );
c1S_pack = thread_ibroadcast( thread, &c1S_pack_s );
// Pack A (if instructed).
bli_packm_int( a, &a_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_int( a, a_pack,
cntl_sub_packm_a( cntl ),
herk_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *c );
dim_t start, end;
// Use a weighted range: only one triangle of C is stored, so equal-width ranges would not carry equal work.
bli_get_range_weighted( thread, 0, n_trans,
bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
bli_obj_is_lower( *c ), &start, &end );
// Partition along the n dimension.
for ( i = 0; i < n_trans; i += b_alg )
for ( i = start; i < end; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( i, n_trans, a,
b_alg = bli_determine_blocksize_f( i, end, a,
cntl_blocksize( cntl ) );
// Acquire partitions for A1' and C1.
@@ -90,42 +109,53 @@ void bli_herk_blk_var2f( obj_t* a,
// Partition off the stored region of C1 and the corresponding region
// of A_pack.
bli_acquire_mpart_t2b( stored_part,
i, b_alg, &c1, &c1S );
bli_acquire_mpart_t2b( stored_part,
i, b_alg, &a_pack, &aS_pack );
bli_acquire_mpart_t2b( stored_part,
i, b_alg, &c1, &c1S );
bli_acquire_mpart_t2b( stored_part,
i, b_alg, a_pack, &aS_pack );
// Initialize objects for packing A1' and C1.
bli_packm_init( &ah1, &ah1_pack,
cntl_sub_packm_b( cntl ) );
bli_packm_init( &c1S, &c1S_pack,
cntl_sub_packm_c( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_init( &ah1, ah1_pack,
cntl_sub_packm_b( cntl ) );
bli_packm_init( &c1S, c1S_pack,
cntl_sub_packm_c( cntl ) );
}
thread_ibarrier( thread ) ;
// Pack A1' (if instructed).
bli_packm_int( &ah1, &ah1_pack,
cntl_sub_packm_b( cntl ) );
bli_packm_int( &ah1, ah1_pack,
cntl_sub_packm_b( cntl ),
herk_thread_sub_ipackm( thread ) );
// Pack C1 (if instructed).
bli_packm_int( &c1S, &c1S_pack,
cntl_sub_packm_c( cntl ) );
bli_packm_int( &c1S, c1S_pack,
cntl_sub_packm_c( cntl ),
herk_thread_sub_ipackm( thread ) ) ;
// Perform herk subproblem.
bli_herk_int( &BLIS_ONE,
&aS_pack,
&ah1_pack,
ah1_pack,
&BLIS_ONE,
&c1S_pack,
cntl_sub_herk( cntl ) );
c1S_pack,
cntl_sub_herk( cntl ),
herk_thread_sub_herk( thread ) );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( &c1S_pack, &c1S,
cntl_sub_unpackm_c( cntl ) );
bli_unpackm_int( c1S_pack, &c1S,
cntl_sub_unpackm_c( cntl ),
herk_thread_sub_ipackm( thread ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_obj_release_pack( &a_pack );
bli_obj_release_pack( &ah1_pack );
bli_obj_release_pack( &c1S_pack );
thread_obarrier( thread );
if( thread_am_ochief( thread ) )
bli_obj_release_pack( a_pack );
if( thread_am_ichief( thread ) ) {
bli_obj_release_pack( ah1_pack );
bli_obj_release_pack( c1S_pack );
}
}

View File

@@ -35,5 +35,6 @@
void bli_herk_blk_var2f( obj_t* a,
obj_t* ah,
obj_t* c,
herk_t* cntl );
herk_t* cntl,
herk_thrinfo_t* thread );

View File

@@ -37,37 +37,50 @@
void bli_herk_blk_var3f( obj_t* a,
obj_t* ah,
obj_t* c,
herk_t* cntl )
herk_t* cntl,
herk_thrinfo_t* thread )
{
obj_t a1, a1_pack;
obj_t ah1, ah1_pack;
obj_t c_pack;
obj_t c_pack_s;
obj_t a1_pack_s, ah1_pack_s;
obj_t a1, ah1;
obj_t* a1_pack = NULL;
obj_t* ah1_pack = NULL;
obj_t* c_pack = NULL;
dim_t i;
dim_t b_alg;
dim_t k_trans;
if( thread_am_ochief( thread ) ) {
// Initialize object for packing C.
bli_obj_init_pack( &c_pack_s );
bli_packm_init( c, &c_pack_s,
cntl_sub_packm_c( cntl ) );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
}
c_pack = thread_obroadcast( thread, &c_pack_s );
// Initialize all pack objects that are passed into packm_init().
bli_obj_init_pack( &a1_pack );
bli_obj_init_pack( &ah1_pack );
bli_obj_init_pack( &c_pack );
if( thread_am_ichief( thread ) ) {
bli_obj_init_pack( &a1_pack_s );
bli_obj_init_pack( &ah1_pack_s );
}
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
ah1_pack = thread_ibroadcast( thread, &ah1_pack_s );
// Pack C (if instructed).
bli_packm_int( c, c_pack,
cntl_sub_packm_c( cntl ),
herk_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
k_trans = bli_obj_width_after_trans( *a );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
// Initialize object for packing C.
bli_packm_init( c, &c_pack,
cntl_sub_packm_c( cntl ) );
// Pack C (if instructed).
bli_packm_int( c, &c_pack,
cntl_sub_packm_c( cntl ) );
// Partition along the k dimension.
for ( i = 0; i < k_trans; i += b_alg )
{
@@ -82,44 +95,59 @@ void bli_herk_blk_var3f( obj_t* a,
i, b_alg, ah, &ah1 );
// Initialize objects for packing A1 and A1'.
bli_packm_init( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_init( &ah1, &ah1_pack,
cntl_sub_packm_b( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_init( &a1, a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_init( &ah1, ah1_pack,
cntl_sub_packm_b( cntl ) );
}
thread_ibarrier( thread );
// Pack A1 (if instructed).
bli_packm_int( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_int( &a1, a1_pack,
cntl_sub_packm_a( cntl ),
herk_thread_sub_ipackm( thread ) );
// Pack A1' (if instructed).
bli_packm_int( &ah1, &ah1_pack,
cntl_sub_packm_b( cntl ) );
bli_packm_int( &ah1, ah1_pack,
cntl_sub_packm_b( cntl ),
herk_thread_sub_ipackm( thread ) );
// Perform herk subproblem.
bli_herk_int( &BLIS_ONE,
&a1_pack,
&ah1_pack,
a1_pack,
ah1_pack,
&BLIS_ONE,
&c_pack,
cntl_sub_herk( cntl ) );
c_pack,
cntl_sub_herk( cntl ),
herk_thread_sub_herk( thread ) );
// This variant executes multiple rank-k updates. Therefore, if the
// internal beta scalar on matrix C is non-zero, we must use it
// only for the first iteration (and then BLIS_ONE for all others).
// And since c_pack is a local obj_t, we can simply overwrite the
// internal beta scalar with BLIS_ONE once it has been used in the
// first iteration.
if ( i == 0 ) thread_ibarrier( thread );
if ( i == 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack );
// This variant executes multiple rank-k updates. Therefore, if the
// internal beta scalar on matrix C is non-zero, we must use it
// only for the first iteration (and then BLIS_ONE for all others).
// And since c_pack is a local obj_t, we can simply overwrite the
// internal beta scalar with BLIS_ONE once it has been used in the
// first iteration.
if ( i == 0 ) bli_obj_scalar_reset( &c_pack );
}
thread_obarrier( thread );
// Unpack C (if C was packed).
bli_unpackm_int( &c_pack, c,
cntl_sub_unpackm_c( cntl ) );
bli_unpackm_int( c_pack, c,
cntl_sub_unpackm_c( cntl ),
herk_thread_sub_opackm( thread ) );
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_obj_release_pack( &a1_pack );
bli_obj_release_pack( &ah1_pack );
bli_obj_release_pack( &c_pack );
if( thread_am_ochief( thread ) ) {
bli_obj_release_pack( c_pack );
}
if( thread_am_ichief( thread ) ) {
bli_obj_release_pack( a1_pack );
bli_obj_release_pack( ah1_pack );
}
}

View File

@@ -35,5 +35,6 @@
void bli_herk_blk_var3f( obj_t* a,
obj_t* ah,
obj_t* c,
herk_t* cntl );
herk_t* cntl,
herk_thrinfo_t* thread );

View File

@@ -77,12 +77,20 @@ void bli_herk_front( obj_t* alpha,
bli_obj_induce_trans( c_local );
}
// Invoke the internal back-end.
bli_herk_int( alpha,
&a_local,
&ah_local,
beta,
&c_local,
cntl );
herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_herk_int,
alpha,
&a_local,
&ah_local,
beta,
&c_local,
(void*) cntl,
(void**) infos );
bli_herk_thrinfo_free_paths( infos, n_threads );
}

View File

@@ -39,7 +39,8 @@
typedef void (*FUNCPTR_T)( obj_t* a,
obj_t* ah,
obj_t* c,
herk_t* cntl );
herk_t* cntl,
herk_thrinfo_t* thread );
static FUNCPTR_T vars[2][4][3] =
{
@@ -66,7 +67,8 @@ void bli_herk_int( obj_t* alpha,
obj_t* ah,
obj_t* beta,
obj_t* c,
herk_t* cntl )
herk_t* cntl,
herk_thrinfo_t* thread )
{
obj_t a_local;
obj_t ah_local;
@@ -83,7 +85,9 @@ void bli_herk_int( obj_t* alpha,
if ( bli_obj_has_zero_dim( *a ) ||
bli_obj_has_zero_dim( *ah ) )
{
bli_scalm( beta, c );
if( thread_am_ochief( thread ) )
bli_scalm( beta, c );
thread_obarrier( thread );
return;
}
@@ -105,22 +109,22 @@ void bli_herk_int( obj_t* alpha,
// packed, this is our last chance to handle the transposition.
if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
{
bli_obj_induce_trans( c_local );
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
bli_obj_induce_trans( c_local );
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
}
// If alpha is non-unit, typecast and apply it to the scalar
// attached to A'.
if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
{
bli_obj_scalar_apply_scalar( alpha, &ah_local );
bli_obj_scalar_apply_scalar( alpha, &ah_local );
}
// If beta is non-unit, typecast and apply it to the scalar
// attached to C.
if ( !bli_obj_equals( beta, &BLIS_ONE ) )
{
bli_obj_scalar_apply_scalar( beta, &c_local );
bli_obj_scalar_apply_scalar( beta, &c_local );
}
// Set a bool based on the uplo field of C's root object.
@@ -138,6 +142,7 @@ void bli_herk_int( obj_t* alpha,
f( &a_local,
&ah_local,
&c_local,
cntl );
cntl,
thread );
}

View File

@@ -37,5 +37,6 @@ void bli_herk_int( obj_t* alpha,
obj_t* ah,
obj_t* beta,
obj_t* c,
herk_t* cntl );
herk_t* cntl,
herk_thrinfo_t* thread );

View File

@@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)(
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
void* gemm_ukr
void* gemm_ukr,
herk_thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2);
@@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2);
void bli_herk_l_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
herk_t* cntl )
herk_t* cntl,
herk_thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );
@@ -121,7 +123,8 @@ void bli_herk_l_ker_var2( obj_t* a,
buf_b, rs_b, pd_b, ps_b,
buf_beta,
buf_c, rs_c, cs_c,
gemm_ukr );
gemm_ukr,
thread );
}
@@ -138,7 +141,8 @@ void PASTEMAC(ch,varname)( \
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemm_ukr \
void* gemm_ukr, \
herk_thrinfo_t* thread \
) \
{ \
/* Cast the micro-kernel address to its function pointer type. */ \
@@ -246,16 +250,22 @@ void PASTEMAC(ch,varname)( \
\
b1 = b_cast; \
c1 = c_cast; \
\
herk_thrinfo_t* caucus = herk_thread_sub_herk( thread ); \
dim_t jr_num_threads = thread_n_way( thread ); \
dim_t jr_thread_id = thread_work_id( thread ); \
dim_t ir_num_threads = thread_n_way( caucus ); \
dim_t ir_thread_id = thread_work_id( caucus ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
a1 = a_cast; \
c11 = c1; \
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
@@ -263,9 +273,12 @@ void PASTEMAC(ch,varname)( \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* Compute the diagonal offset for the submatrix at (i,j). */ \
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
@@ -273,11 +286,11 @@ void PASTEMAC(ch,varname)( \
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
a2 = herk_get_next_a_micropanel( caucus, a1, rstep_a ); \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
b2 = herk_get_next_b_micropanel( thread, b1, cstep_b ); \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \
@@ -344,13 +357,7 @@ void PASTEMAC(ch,varname)( \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
}

View File

@@ -39,7 +39,8 @@
void bli_herk_l_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
herk_t* cntl );
herk_t* cntl,
herk_thrinfo_t* thread );
//
@@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemm_ukr \
void* gemm_ukr, \
herk_thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC( herk_l_ker_var2 )

View File

@@ -0,0 +1,203 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "assert.h"
// Fill in a caller-provided herk thread-info node.
//
// Every field of *thread is overwritten with the matching argument. Nothing
// is allocated here and none of the pointer arguments are dereferenced.
void bli_setup_herk_thrinfo_node( herk_thrinfo_t* thread,
                                  thread_comm_t* ocomm, dim_t ocomm_id,
                                  thread_comm_t* icomm, dim_t icomm_id,
                                  dim_t n_way, dim_t work_id,
                                  packm_thrinfo_t* opackm,
                                  packm_thrinfo_t* ipackm,
                                  herk_thrinfo_t* sub_herk )
{
    // Build the complete node value first, then store it in one assignment.
    const herk_thrinfo_t node =
    {
        .ocomm    = ocomm,
        .ocomm_id = ocomm_id,
        .icomm    = icomm,
        .icomm_id = icomm_id,
        .n_way    = n_way,
        .work_id  = work_id,
        .opackm   = opackm,
        .ipackm   = ipackm,
        .sub_herk = sub_herk,
    };

    *thread = node;
}
// Configure *thread as the thread-info node for single-threaded execution.
//
// Both communicators are the global single-thread communicator with id 0,
// the loop is split one way with work id 0, both packm infos are the
// single-threaded packm info, and the node is its own sub-node.
void bli_setup_herk_single_threaded_info( herk_thrinfo_t* thread )
{
    bli_setup_herk_thrinfo_node( thread,
                                 &BLIS_SINGLE_COMM, 0,
                                 &BLIS_SINGLE_COMM, 0,
                                 1, 0,
                                 &BLIS_PACKM_SINGLE_THREADED,
                                 &BLIS_PACKM_SINGLE_THREADED,
                                 thread );
}
// Allocate a herk thread-info node with bli_malloc() and initialize every
// field from the arguments (see bli_setup_herk_thrinfo_node()).
//
// Returns the new node.
herk_thrinfo_t* bli_create_herk_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
                                              thread_comm_t* icomm, dim_t icomm_id,
                                              dim_t n_way, dim_t work_id,
                                              packm_thrinfo_t* opackm,
                                              packm_thrinfo_t* ipackm,
                                              herk_thrinfo_t* sub_herk )
{
    herk_thrinfo_t* node = bli_malloc( sizeof( *node ) );

    bli_setup_herk_thrinfo_node( node,
                                 ocomm, ocomm_id,
                                 icomm, icomm_id,
                                 n_way, work_id,
                                 opackm, ipackm,
                                 sub_herk );

    return node;
}
// Recursively release a herk thread-info chain: the node itself, its packm
// thread infos, the communicators it is responsible for, and its sub-node.
//
// thread may be NULL, in which case nothing is done.
void bli_herk_thrinfo_free( herk_thrinfo_t* thread)
{
    if( thread == NULL ) return;

    // Free communicators.
    // The outer communicator is shared by several threads; only its chief
    // releases it so that it is freed exactly once.
    if( thread_am_ochief( thread ) )
        bli_free_communicator( thread->ocomm );

    // A node's inner communicator is the outer communicator of its sub-node
    // and is released when the sub-node is freed. A leaf has no sub-node, so
    // its inner communicator would otherwise never be released.
    if( thread->sub_herk == NULL &&
        thread->icomm != NULL &&
        thread_am_ichief( thread ) )
        bli_free_communicator( thread->icomm );

    // Free sub thrinfos.
    bli_packm_thrinfo_free( thread->opackm );
    bli_packm_thrinfo_free( thread->ipackm );
    bli_herk_thrinfo_free( thread->sub_herk );

    bli_free( thread );
    return;
}
// Release an array of per-thread herk thread-info chains.
//
// threads - array of num chain heads; each is released with
//           bli_herk_thrinfo_free().
// num     - number of entries in threads.
//
// The array itself is released with bli_free() afterwards.
void bli_herk_thrinfo_free_paths( herk_thrinfo_t** threads, dim_t num )
{
    dim_t t;

    for( t = 0; t < num; ++t )
    {
        bli_herk_thrinfo_free( threads[t] );
    }

    bli_free( threads );
}
// Build one thread-info chain per thread for a multithreaded herk.
//
// The parallelism of each loop (jc, kc, ic, jr, ir) is read from the
// BLIS_JC_NT, BLIS_IC_NT, BLIS_JR_NT and BLIS_IR_NT environment variables
// when BLIS_ENABLE_MULTITHREADING is defined, and is 1 otherwise. The kc
// loop is always 1-way; BLIS_KC_NT is not consulted.
//
// Returns an array with one entry per thread (the product of all the "way"
// values), indexed by the thread's id in the global communicator. Each entry
// heads a chain jc -> kc -> ic -> jr -> ir linked through sub_herk. The array
// and everything it references are released with
// bli_herk_thrinfo_free_paths().
herk_thrinfo_t** bli_create_herk_thrinfo_paths( )
{
#ifdef BLIS_ENABLE_MULTITHREADING
    dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" );
    dim_t kc_way = 1;
    dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" );
    dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" );
    dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" );
#else
    dim_t jc_way = 1;
    dim_t kc_way = 1;
    dim_t ic_way = 1;
    dim_t jr_way = 1;
    dim_t ir_way = 1;
#endif

    dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
    assert( global_num_threads != 0 );

    // Number of threads that share one iteration of each loop, i.e. the size
    // of the communicator created inside that loop level.
    dim_t jc_nt = kc_way * ic_way * jr_way * ir_way;
    dim_t kc_nt = ic_way * jr_way * ir_way;
    dim_t ic_nt = jr_way * ir_way;
    dim_t jr_nt = ir_way;
    dim_t ir_nt = 1;

    // Allocate with bli_malloc() so that the array matches the bli_free()
    // call in bli_herk_thrinfo_free_paths(); bli_free() is not guaranteed to
    // be interchangeable with free() for memory obtained from malloc().
    herk_thrinfo_t** paths = bli_malloc( global_num_threads * sizeof( herk_thrinfo_t* ) );

    thread_comm_t* global_comm = bli_create_communicator( global_num_threads );

    for( int a = 0; a < jc_way; a++ )
    {
        thread_comm_t* jc_comm = bli_create_communicator( jc_nt );

        for( int b = 0; b < kc_way; b++ )
        {
            thread_comm_t* kc_comm = bli_create_communicator( kc_nt );

            for( int c = 0; c < ic_way; c++ )
            {
                thread_comm_t* ic_comm = bli_create_communicator( ic_nt );

                for( int d = 0; d < jr_way; d++ )
                {
                    thread_comm_t* jr_comm = bli_create_communicator( jr_nt );

                    for( int e = 0; e < ir_way; e++)
                    {
                        thread_comm_t* ir_comm = bli_create_communicator( ir_nt );

                        // This thread's id within each communicator, built up
                        // from the innermost level outwards.
                        dim_t ir_comm_id     = 0;
                        dim_t jr_comm_id     = e*ir_nt + ir_comm_id;
                        dim_t ic_comm_id     = d*jr_nt + jr_comm_id;
                        dim_t kc_comm_id     = c*ic_nt + ic_comm_id;
                        dim_t jc_comm_id     = b*kc_nt + kc_comm_id;
                        dim_t global_comm_id = a*jc_nt + jc_comm_id;

                        // Create the chain from the innermost node outwards so
                        // that each node can point at its sub-node.
                        herk_thrinfo_t* ir_info = bli_create_herk_thrinfo_node( jr_comm, jr_comm_id,
                                                                                ir_comm, ir_comm_id,
                                                                                ir_way, e,
                                                                                NULL, NULL, NULL);

                        herk_thrinfo_t* jr_info = bli_create_herk_thrinfo_node( ic_comm, ic_comm_id,
                                                                                jr_comm, jr_comm_id,
                                                                                jr_way, d,
                                                                                NULL, NULL, ir_info);

                        // Only the ic-level node carries packm thread infos.
                        packm_thrinfo_t* packb = bli_create_packm_thread_info( kc_comm, kc_comm_id,
                                                                               ic_comm, ic_comm_id,
                                                                               kc_nt, kc_comm_id );

                        packm_thrinfo_t* packa = bli_create_packm_thread_info( ic_comm, ic_comm_id,
                                                                               jr_comm, jr_comm_id,
                                                                               ic_nt, ic_comm_id );

                        herk_thrinfo_t* ic_info = bli_create_herk_thrinfo_node( kc_comm, kc_comm_id,
                                                                                ic_comm, ic_comm_id,
                                                                                ic_way, c,
                                                                                packb, packa, jr_info);

                        herk_thrinfo_t* kc_info = bli_create_herk_thrinfo_node( jc_comm, jc_comm_id,
                                                                                kc_comm, kc_comm_id,
                                                                                kc_way, b,
                                                                                NULL, NULL, ic_info);

                        herk_thrinfo_t* jc_info = bli_create_herk_thrinfo_node( global_comm, global_comm_id,
                                                                                jc_comm, jc_comm_id,
                                                                                jc_way, a,
                                                                                NULL, NULL, kc_info);

                        paths[global_comm_id] = jc_info;
                    }
                }
            }
        }
    }

    return paths;
}

View File

@@ -0,0 +1,79 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Per-thread, per-loop-level bookkeeping for a multithreaded herk.
// One node exists for each thread at each level of the loop nest; the nodes
// of one thread are chained from the outermost level inwards via sub_herk.
struct herk_thrinfo_s //implements thrinfo_t
{
// Outer communicator: the one consulted by thread_am_ochief(),
// thread_obroadcast() and thread_obarrier() in the blocked variants.
thread_comm_t* ocomm;
// This thread's id within ocomm.
dim_t ocomm_id;
// Inner communicator: the one consulted by thread_am_ichief(),
// thread_ibroadcast() and thread_ibarrier(). In
// bli_create_herk_thrinfo_paths() it is the ocomm of the sub_herk node.
thread_comm_t* icomm;
// This thread's id within icomm.
dim_t icomm_id;
// Number of ways the loop at this level is split among threads.
dim_t n_way;
// Which of the n_way shares of the loop this thread works on.
dim_t work_id;
// Thread info passed to bli_packm_int() for the operand packed once,
// outside the loop (see herk_thread_sub_opackm).
packm_thrinfo_t* opackm;
// Thread info passed to bli_packm_int() / bli_unpackm_int() for operands
// packed inside the loop (see herk_thread_sub_ipackm).
packm_thrinfo_t* ipackm;
// Node for the next (inner) level; NULL at the innermost level of the chains
// built by bli_create_herk_thrinfo_paths().
struct herk_thrinfo_s* sub_herk;
};
typedef struct herk_thrinfo_s herk_thrinfo_t;
// Accessors for the sub-structures of a herk thread-info node.
// Macro arguments are parenthesized so that any pointer expression
// (e.g. "infos + 1") can be passed safely.
#define herk_thread_sub_herk( thread ) ( (thread)->sub_herk )
#define herk_thread_sub_opackm( thread ) ( (thread)->opackm )
#define herk_thread_sub_ipackm( thread ) ( (thread)->ipackm )
// For use in the herk kernel variants: address of the next micro-panel this
// thread will visit, given that it advances by n_way micro-panels of size
// step per loop iteration.
#define herk_get_next_a_micropanel( thread, a1, step ) ( (a1) + (step) * (thread)->n_way )
#define herk_get_next_b_micropanel( thread, b1, step ) ( (b1) + (step) * (thread)->n_way )
// Build one thread-info chain per thread; returns an array of chain heads.
// Release the result with bli_herk_thrinfo_free_paths().
herk_thrinfo_t** bli_create_herk_thrinfo_paths( );
// Free the n_threads chains in paths and then the paths array itself.
void bli_herk_thrinfo_free_paths( herk_thrinfo_t** paths, dim_t n_threads );
// Initialize every field of an existing node from the arguments.
void bli_setup_herk_thrinfo_node( herk_thrinfo_t* thread,
thread_comm_t* ocomm, dim_t ocomm_id,
thread_comm_t* icomm, dim_t icomm_id,
dim_t n_way, dim_t work_id,
packm_thrinfo_t* opackm,
packm_thrinfo_t* ipackm,
herk_thrinfo_t* sub_herk );
// Allocate a new node and initialize every field from the arguments.
herk_thrinfo_t* bli_create_herk_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
thread_comm_t* icomm, dim_t icomm_id,
dim_t n_way, dim_t work_id,
packm_thrinfo_t* opackm,
packm_thrinfo_t* ipackm,
herk_thrinfo_t* sub_herk );
// Initialize an existing node for single-threaded execution (1-way, id 0,
// single-thread communicators, node is its own sub-node).
void bli_setup_herk_single_threaded_info( herk_thrinfo_t* thread );

View File

@@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)(
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
void* gemm_ukr
void* gemm_ukr,
herk_thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2);
@@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2);
void bli_herk_u_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
herk_t* cntl )
herk_t* cntl,
herk_thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );
@@ -121,7 +123,8 @@ void bli_herk_u_ker_var2( obj_t* a,
buf_b, rs_b, pd_b, ps_b,
buf_beta,
buf_c, rs_c, cs_c,
gemm_ukr );
gemm_ukr,
thread );
}
@@ -138,7 +141,8 @@ void PASTEMAC(ch,varname)( \
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemm_ukr \
void* gemm_ukr, \
herk_thrinfo_t* thread \
) \
{ \
/* Cast the micro-kernel address to its function pointer type. */ \
@@ -246,16 +250,22 @@ void PASTEMAC(ch,varname)( \
\
b1 = b_cast; \
c1 = c_cast; \
\
herk_thrinfo_t* caucus = herk_thread_sub_herk( thread ); \
dim_t jr_num_threads = thread_n_way( thread ); \
dim_t jr_thread_id = thread_work_id( thread ); \
dim_t ir_num_threads = thread_n_way( caucus ); \
dim_t ir_thread_id = thread_work_id( caucus ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
a1 = a_cast; \
c11 = c1; \
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
@@ -263,9 +273,12 @@ void PASTEMAC(ch,varname)( \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* Compute the diagonal offset for the submatrix at (i,j). */ \
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
@@ -273,11 +286,11 @@ void PASTEMAC(ch,varname)( \
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
a2 = herk_get_next_a_micropanel( caucus, a1, rstep_a ); \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
b2 = herk_get_next_b_micropanel( thread, b1, cstep_b ); \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \
@@ -344,13 +357,7 @@ void PASTEMAC(ch,varname)( \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
}

View File

@@ -39,7 +39,8 @@
void bli_herk_u_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
herk_t* cntl );
herk_t* cntl,
herk_thrinfo_t* thread );
//
@@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemm_ukr \
void* gemm_ukr, \
herk_thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC( herk_u_ker_var2 )

View File

@@ -79,12 +79,20 @@ void bli_symm_front( side_t side,
bli_obj_swap( a_local, b_local );
}
// Invoke the internal back-end.
bli_gemm_int( alpha,
&a_local,
&b_local,
beta,
&c_local,
cntl );
gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_gemm_int,
alpha,
&a_local,
&b_local,
beta,
&c_local,
(void*) cntl,
(void**) infos );
bli_gemm_thrinfo_free_paths( infos, n_threads );
}

View File

@@ -93,19 +93,31 @@ void bli_syr2k_front( obj_t* alpha,
cntl );
#else
// Invoke herk twice, using beta only the first time.
bli_herk_int( alpha,
&a_local,
&bt_local,
beta,
&c_local,
cntl );
herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
bli_herk_int( alpha,
&b_local,
&at_local,
&BLIS_ONE,
&c_local,
cntl );
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_herk_int,
alpha,
&a_local,
&bt_local,
beta,
&c_local,
(void*) cntl,
(void**) infos );
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_herk_int,
alpha,
&b_local,
&at_local,
&BLIS_ONE,
&c_local,
(void*) cntl,
(void**) infos );
bli_herk_thrinfo_free_paths( infos, n_threads );
#endif
}

View File

@@ -72,13 +72,21 @@ void bli_syrk_front( obj_t* alpha,
{
bli_obj_induce_trans( c_local );
}
herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
// Invoke the internal back-end.
bli_herk_int( alpha,
&a_local,
&at_local,
beta,
&c_local,
cntl );
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_herk_int,
alpha,
&a_local,
&at_local,
beta,
&c_local,
(void*) cntl,
(void**) infos );
bli_herk_thrinfo_free_paths( infos, n_threads );
}

View File

@@ -37,21 +37,48 @@
void bli_trmm_blk_var1f( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl )
trmm_t* cntl,
trmm_thrinfo_t* thread )
{
obj_t a1, a1_pack;
obj_t b_pack;
obj_t c1, c1_pack;
obj_t b_pack_s;
obj_t a1_pack_s, c1_pack_s;
obj_t a1, c1;
obj_t* a1_pack = NULL;
obj_t* b_pack = NULL;
obj_t* c1_pack = NULL;
dim_t i;
dim_t b_alg;
dim_t m_trans;
dim_t offA;
if( thread_am_ochief( thread ) ) {
// Initialize object for packing B.
bli_obj_init_pack( &b_pack_s );
bli_packm_init( b, &b_pack_s,
cntl_sub_packm_b( cntl ) );
// Scale C by beta (if instructed).
// scalm does not support multithreading yet, so the scaling is performed by the chief thread only.
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
}
b_pack = thread_obroadcast( thread, &b_pack_s );
// Initialize all pack objects that are passed into packm_init().
bli_obj_init_pack( &a1_pack );
bli_obj_init_pack( &b_pack );
bli_obj_init_pack( &c1_pack );
if( thread_am_ichief( thread ) ) {
bli_obj_init_pack( &a1_pack_s );
bli_obj_init_pack( &c1_pack_s );
}
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
// Pack B (if instructed).
bli_packm_int( b, b_pack,
cntl_sub_packm_b( cntl ),
trmm_thread_sub_opackm( thread ) );
// Set the default length of and offset to the non-zero part of A.
m_trans = bli_obj_length_after_trans( *a );
@@ -66,24 +93,16 @@ void bli_trmm_blk_var1f( obj_t* a,
m_trans = bli_abs( bli_obj_diag_offset_after_trans( *a ) ) +
bli_obj_width_after_trans( *a );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
// Initialize object for packing B.
bli_packm_init( b, &b_pack,
cntl_sub_packm_b( cntl ) );
// Pack B (if instructed).
bli_packm_int( b, &b_pack,
cntl_sub_packm_b( cntl ) );
dim_t start, end;
bli_get_range_weighted( thread, offA, m_trans,
bli_determine_reg_blocksize( a, cntl_blocksize( cntl ) ),
bli_obj_is_upper( *c ), &start, &end );
// Partition along the m dimension.
for ( i = offA; i < m_trans; i += b_alg )
for ( i = start; i < end; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( i, m_trans, a,
b_alg = bli_determine_blocksize_f( i, end, a,
cntl_blocksize( cntl ) );
// Acquire partitions for A1 and C1.
@@ -93,36 +112,47 @@ void bli_trmm_blk_var1f( obj_t* a,
i, b_alg, c, &c1 );
// Initialize objects for packing A1 and C1.
bli_packm_init( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_init( &c1, &c1_pack,
cntl_sub_packm_c( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_init( &a1, a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_init( &c1, c1_pack,
cntl_sub_packm_c( cntl ) );
}
thread_ibarrier( thread );
// Pack A1 (if instructed).
bli_packm_int( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_int( &a1, a1_pack,
cntl_sub_packm_a( cntl ),
trmm_thread_sub_ipackm( thread ) );
// Pack C1 (if instructed).
bli_packm_int( &c1, &c1_pack,
cntl_sub_packm_c( cntl ) );
bli_packm_int( &c1, c1_pack,
cntl_sub_packm_c( cntl ),
trmm_thread_sub_ipackm( thread ) );
// Perform trmm subproblem.
bli_trmm_int( &BLIS_ONE,
&a1_pack,
&b_pack,
a1_pack,
b_pack,
&BLIS_ONE,
&c1_pack,
cntl_sub_trmm( cntl ) );
c1_pack,
cntl_sub_trmm( cntl ),
trmm_thread_sub_trmm( thread ) );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( &c1_pack, &c1,
cntl_sub_unpackm_c( cntl ) );
bli_unpackm_int( c1_pack, &c1,
cntl_sub_unpackm_c( cntl ),
trmm_thread_sub_ipackm( thread ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_obj_release_pack( &a1_pack );
bli_obj_release_pack( &b_pack );
bli_obj_release_pack( &c1_pack );
thread_obarrier( thread );
if( thread_am_ochief( thread ) )
bli_obj_release_pack( b_pack );
if( thread_am_ichief( thread ) ){
bli_obj_release_pack( a1_pack );
bli_obj_release_pack( c1_pack );
}
}

View File

@@ -35,5 +35,6 @@
void bli_trmm_blk_var1f( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl );
trmm_t* cntl,
trmm_thrinfo_t* thread );

View File

@@ -37,42 +37,60 @@
void bli_trmm_blk_var2b( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl )
trmm_t* cntl,
trmm_thrinfo_t* thread)
{
obj_t a_pack;
obj_t b1, b1_pack;
obj_t c1, c1_pack;
obj_t a_pack_s;
obj_t b1_pack_s, c1_pack_s;
obj_t b1, c1;
obj_t* a_pack = NULL;
obj_t* b1_pack = NULL;
obj_t* c1_pack = NULL;
dim_t i;
dim_t b_alg;
dim_t n_trans;
// Initialize all pack objects that are passed into packm_init().
bli_obj_init_pack( &a_pack );
bli_obj_init_pack( &b1_pack );
bli_obj_init_pack( &c1_pack );
if( thread_am_ochief( thread ) ) {
// Initialize object for packing A
bli_obj_init_pack( &a_pack_s );
bli_packm_init( a, &a_pack_s,
cntl_sub_packm_a( cntl ) );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
}
a_pack = thread_obroadcast( thread, &a_pack_s );
// Initialize pack objects for B and C that are passed into packm_init().
if( thread_am_ichief( thread ) ) {
bli_obj_init_pack( &b1_pack_s );
bli_obj_init_pack( &c1_pack_s );
}
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
// Pack A (if instructed).
bli_packm_int( a, a_pack,
cntl_sub_packm_a( cntl ),
trmm_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
// Initialize object for packing A.
bli_packm_init( a, &a_pack,
cntl_sub_packm_a( cntl ) );
// Pack A (if instructed).
bli_packm_int( a, &a_pack,
cntl_sub_packm_a( cntl ) );
dim_t start, end;
bli_get_range_weighted( thread, 0, n_trans,
bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
bli_obj_is_upper( *c ), &start, &end );
// Partition along the n dimension.
for ( i = 0; i < n_trans; i += b_alg )
for ( i = start; i < end; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_b( i, n_trans, b,
b_alg = bli_determine_blocksize_b( i, end, b,
cntl_blocksize( cntl ) );
// Acquire partitions for B1 and C1.
@@ -82,36 +100,47 @@ void bli_trmm_blk_var2b( obj_t* a,
i, b_alg, c, &c1 );
// Initialize objects for packing B1 and C1.
bli_packm_init( &b1, &b1_pack,
cntl_sub_packm_b( cntl ) );
bli_packm_init( &c1, &c1_pack,
cntl_sub_packm_c( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_init( &b1, b1_pack,
cntl_sub_packm_b( cntl ) );
bli_packm_init( &c1, c1_pack,
cntl_sub_packm_c( cntl ) );
}
thread_ibarrier( thread );
// Pack B1 (if instructed).
bli_packm_int( &b1, &b1_pack,
cntl_sub_packm_b( cntl ) );
bli_packm_int( &b1, b1_pack,
cntl_sub_packm_b( cntl ),
trmm_thread_sub_ipackm( thread ) );
// Pack C1 (if instructed).
bli_packm_int( &c1, &c1_pack,
cntl_sub_packm_c( cntl ) );
bli_packm_int( &c1, c1_pack,
cntl_sub_packm_c( cntl ),
trmm_thread_sub_ipackm( thread ) );
// Perform trmm subproblem.
bli_trmm_int( &BLIS_ONE,
&a_pack,
&b1_pack,
a_pack,
b1_pack,
&BLIS_ONE,
&c1_pack,
cntl_sub_trmm( cntl ) );
c1_pack,
cntl_sub_trmm( cntl ),
trmm_thread_sub_trmm( thread ) );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( &c1_pack, &c1,
cntl_sub_unpackm_c( cntl ) );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( c1_pack, &c1,
cntl_sub_unpackm_c( cntl ),
trmm_thread_sub_ipackm( thread ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_obj_release_pack( &a_pack );
bli_obj_release_pack( &b1_pack );
bli_obj_release_pack( &c1_pack );
thread_obarrier( thread );
if( thread_am_ochief( thread ) )
bli_obj_release_pack( a_pack );
if( thread_am_ichief( thread ) ) {
bli_obj_release_pack( b1_pack );
bli_obj_release_pack( c1_pack );
}
}

View File

@@ -35,5 +35,6 @@
void bli_trmm_blk_var2b( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl );
trmm_t* cntl,
trmm_thrinfo_t* thread );

View File

@@ -37,42 +37,60 @@
void bli_trmm_blk_var2f( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl )
trmm_t* cntl,
trmm_thrinfo_t* thread)
{
obj_t a_pack;
obj_t b1, b1_pack;
obj_t c1, c1_pack;
obj_t a_pack_s;
obj_t b1_pack_s, c1_pack_s;
obj_t b1, c1;
obj_t* a_pack = NULL;
obj_t* b1_pack = NULL;
obj_t* c1_pack = NULL;
dim_t i;
dim_t b_alg;
dim_t n_trans;
// Initialize all pack objects that are passed into packm_init().
bli_obj_init_pack( &a_pack );
bli_obj_init_pack( &b1_pack );
bli_obj_init_pack( &c1_pack );
if( thread_am_ochief( thread ) ) {
// Initialize object for packing A
bli_obj_init_pack( &a_pack_s );
bli_packm_init( a, &a_pack_s,
cntl_sub_packm_a( cntl ) );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
}
a_pack = thread_obroadcast( thread, &a_pack_s );
// Initialize pack objects for B and C that are passed into packm_init().
if( thread_am_ichief( thread ) ) {
bli_obj_init_pack( &b1_pack_s );
bli_obj_init_pack( &c1_pack_s );
}
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
// Pack A (if instructed).
bli_packm_int( a, a_pack,
cntl_sub_packm_a( cntl ),
trmm_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
n_trans = bli_obj_width_after_trans( *b );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
// Initialize object for packing A.
bli_packm_init( a, &a_pack,
cntl_sub_packm_a( cntl ) );
// Pack A (if instructed).
bli_packm_int( a, &a_pack,
cntl_sub_packm_a( cntl ) );
dim_t start, end;
bli_get_range_weighted( thread, 0, n_trans,
bli_determine_reg_blocksize( b, cntl_blocksize( cntl ) ),
bli_obj_is_lower( *c ), &start, &end );
// Partition along the n dimension.
for ( i = 0; i < n_trans; i += b_alg )
for ( i = start; i < end; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( i, n_trans, b,
b_alg = bli_determine_blocksize_f( i, end, b,
cntl_blocksize( cntl ) );
// Acquire partitions for B1 and C1.
@@ -82,36 +100,47 @@ void bli_trmm_blk_var2f( obj_t* a,
i, b_alg, c, &c1 );
// Initialize objects for packing B1 and C1.
bli_packm_init( &b1, &b1_pack,
cntl_sub_packm_b( cntl ) );
bli_packm_init( &c1, &c1_pack,
cntl_sub_packm_c( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_init( &b1, b1_pack,
cntl_sub_packm_b( cntl ) );
bli_packm_init( &c1, c1_pack,
cntl_sub_packm_c( cntl ) );
}
thread_ibarrier( thread );
// Pack B1 (if instructed).
bli_packm_int( &b1, &b1_pack,
cntl_sub_packm_b( cntl ) );
bli_packm_int( &b1, b1_pack,
cntl_sub_packm_b( cntl ),
trmm_thread_sub_ipackm( thread ) );
// Pack C1 (if instructed).
bli_packm_int( &c1, &c1_pack,
cntl_sub_packm_c( cntl ) );
bli_packm_int( &c1, c1_pack,
cntl_sub_packm_c( cntl ),
trmm_thread_sub_ipackm( thread ) );
// Perform trmm subproblem.
bli_trmm_int( &BLIS_ONE,
&a_pack,
&b1_pack,
a_pack,
b1_pack,
&BLIS_ONE,
&c1_pack,
cntl_sub_trmm( cntl ) );
c1_pack,
cntl_sub_trmm( cntl ),
trmm_thread_sub_trmm( thread ) );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( &c1_pack, &c1,
cntl_sub_unpackm_c( cntl ) );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( c1_pack, &c1,
cntl_sub_unpackm_c( cntl ),
trmm_thread_sub_ipackm( thread ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_obj_release_pack( &a_pack );
bli_obj_release_pack( &b1_pack );
bli_obj_release_pack( &c1_pack );
thread_obarrier( thread );
if( thread_am_ochief( thread ) )
bli_obj_release_pack( a_pack );
if( thread_am_ichief( thread ) ) {
bli_obj_release_pack( b1_pack );
bli_obj_release_pack( c1_pack );
}
}

View File

@@ -35,5 +35,6 @@
void bli_trmm_blk_var2f( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl );
trmm_t* cntl,
trmm_thrinfo_t* thread );

View File

@@ -37,37 +37,50 @@
void bli_trmm_blk_var3b( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl )
trmm_t* cntl,
trmm_thrinfo_t* thread )
{
obj_t a1, a1_pack;
obj_t b1, b1_pack;
obj_t c_pack;
obj_t c_pack_s;
obj_t a1_pack_s, b1_pack_s;
obj_t a1, b1;
obj_t* a1_pack = NULL;
obj_t* b1_pack = NULL;
obj_t* c_pack = NULL;
dim_t i;
dim_t b_alg;
dim_t k_trans;
// Initialize all pack objects that are passed into packm_init().
bli_obj_init_pack( &a1_pack );
bli_obj_init_pack( &b1_pack );
bli_obj_init_pack( &c_pack );
if( thread_am_ochief( thread ) ){
// Initialize object for packing C
bli_obj_init_pack( &c_pack_s );
bli_packm_init( c, &c_pack_s,
cntl_sub_packm_c( cntl ) );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
}
c_pack = thread_obroadcast( thread, &c_pack_s );
// Initialize pack objects for A and B that are passed into packm_init().
if( thread_am_ichief( thread ) ){
bli_obj_init_pack( &a1_pack_s );
bli_obj_init_pack( &b1_pack_s );
}
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
// Pack C (if instructed).
bli_packm_int( c, c_pack,
cntl_sub_packm_c( cntl ),
trmm_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
k_trans = bli_obj_width_after_trans( *a );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
// Initialize object for packing C.
bli_packm_init( c, &c_pack,
cntl_sub_packm_c( cntl ) );
// Pack C (if instructed).
bli_packm_int( c, &c_pack,
cntl_sub_packm_c( cntl ) );
// Partition along the k dimension.
for ( i = 0; i < k_trans; i += b_alg )
{
@@ -82,36 +95,49 @@ void bli_trmm_blk_var3b( obj_t* a,
i, b_alg, b, &b1 );
// Initialize objects for packing A1 and B1.
bli_packm_init( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_init( &b1, &b1_pack,
cntl_sub_packm_b( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_init( &a1, a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_init( &b1, b1_pack,
cntl_sub_packm_b( cntl ) );
}
thread_ibarrier( thread );
// Pack A1 (if instructed).
bli_packm_int( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_int( &a1, a1_pack,
cntl_sub_packm_a( cntl ),
trmm_thread_sub_ipackm( thread ) );
// Pack B1 (if instructed).
bli_packm_int( &b1, &b1_pack,
cntl_sub_packm_b( cntl ) );
bli_packm_int( &b1, b1_pack,
cntl_sub_packm_b( cntl ),
trmm_thread_sub_ipackm( thread ) );
// Perform trmm subproblem.
bli_trmm_int( &BLIS_ONE,
&a1_pack,
&b1_pack,
a1_pack,
b1_pack,
&BLIS_ONE,
&c_pack,
cntl_sub_trmm( cntl ) );
c_pack,
cntl_sub_trmm( cntl ),
trmm_thread_sub_trmm( thread ) );
}
// Unpack C (if C was packed).
bli_unpackm_int( &c_pack, c,
cntl_sub_unpackm_c( cntl ) );
thread_obarrier( thread );
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_obj_release_pack( &a1_pack );
bli_obj_release_pack( &b1_pack );
bli_obj_release_pack( &c_pack );
// Unpack C (if C was packed).
bli_unpackm_int( c_pack, c,
cntl_sub_unpackm_c( cntl ),
trmm_thread_sub_opackm( thread ) );
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
if( thread_am_ochief( thread ) ){
bli_obj_release_pack( c_pack );
}
if( thread_am_ichief( thread ) ){
bli_obj_release_pack( a1_pack );
bli_obj_release_pack( b1_pack );
}
}

View File

@@ -35,5 +35,6 @@
void bli_trmm_blk_var3b( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl );
trmm_t* cntl,
trmm_thrinfo_t* thread );

View File

@@ -37,37 +37,50 @@
void bli_trmm_blk_var3f( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl )
trmm_t* cntl,
trmm_thrinfo_t* thread )
{
obj_t a1, a1_pack;
obj_t b1, b1_pack;
obj_t c_pack;
obj_t c_pack_s;
obj_t a1_pack_s, b1_pack_s;
obj_t a1, b1;
obj_t* a1_pack = NULL;
obj_t* b1_pack = NULL;
obj_t* c_pack = NULL;
dim_t i;
dim_t b_alg;
dim_t k_trans;
// Initialize all pack objects that are passed into packm_init().
bli_obj_init_pack( &a1_pack );
bli_obj_init_pack( &b1_pack );
bli_obj_init_pack( &c_pack );
if( thread_am_ochief( thread ) ){
// Initialize object for packing C
bli_obj_init_pack( &c_pack_s );
bli_packm_init( c, &c_pack_s,
cntl_sub_packm_c( cntl ) );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
}
c_pack = thread_obroadcast( thread, &c_pack_s );
// Initialize pack objects for A and B that are passed into packm_init().
if( thread_am_ichief( thread ) ){
bli_obj_init_pack( &a1_pack_s );
bli_obj_init_pack( &b1_pack_s );
}
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
// Pack C (if instructed).
bli_packm_int( c, c_pack,
cntl_sub_packm_c( cntl ),
trmm_thread_sub_opackm( thread ) );
// Query dimension in partitioning direction.
k_trans = bli_obj_width_after_trans( *a );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntl_sub_scalm( cntl ) );
// Initialize object for packing C.
bli_packm_init( c, &c_pack,
cntl_sub_packm_c( cntl ) );
// Pack C (if instructed).
bli_packm_int( c, &c_pack,
cntl_sub_packm_c( cntl ) );
// Partition along the k dimension.
for ( i = 0; i < k_trans; i += b_alg )
{
@@ -82,36 +95,49 @@ void bli_trmm_blk_var3f( obj_t* a,
i, b_alg, b, &b1 );
// Initialize objects for packing A1 and B1.
bli_packm_init( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_init( &b1, &b1_pack,
cntl_sub_packm_b( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_init( &a1, a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_init( &b1, b1_pack,
cntl_sub_packm_b( cntl ) );
}
thread_ibarrier( thread );
// Pack A1 (if instructed).
bli_packm_int( &a1, &a1_pack,
cntl_sub_packm_a( cntl ) );
bli_packm_int( &a1, a1_pack,
cntl_sub_packm_a( cntl ),
trmm_thread_sub_ipackm( thread ) );
// Pack B1 (if instructed).
bli_packm_int( &b1, &b1_pack,
cntl_sub_packm_b( cntl ) );
bli_packm_int( &b1, b1_pack,
cntl_sub_packm_b( cntl ),
trmm_thread_sub_ipackm( thread ) );
// Perform trmm subproblem.
bli_trmm_int( &BLIS_ONE,
&a1_pack,
&b1_pack,
a1_pack,
b1_pack,
&BLIS_ONE,
&c_pack,
cntl_sub_trmm( cntl ) );
c_pack,
cntl_sub_trmm( cntl ),
trmm_thread_sub_trmm( thread ) );
}
// Unpack C (if C was packed).
bli_unpackm_int( &c_pack, c,
cntl_sub_unpackm_c( cntl ) );
thread_obarrier( thread );
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_obj_release_pack( &a1_pack );
bli_obj_release_pack( &b1_pack );
bli_obj_release_pack( &c_pack );
// Unpack C (if C was packed).
bli_unpackm_int( c_pack, c,
cntl_sub_unpackm_c( cntl ),
trmm_thread_sub_opackm( thread ) );
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
if( thread_am_ochief( thread ) ){
bli_obj_release_pack( c_pack );
}
if( thread_am_ichief( thread ) ){
bli_obj_release_pack( a1_pack );
bli_obj_release_pack( b1_pack );
}
}

View File

@@ -35,5 +35,6 @@
void bli_trmm_blk_var3f( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl );
trmm_t* cntl,
trmm_thrinfo_t* thread );

View File

@@ -125,12 +125,20 @@ void bli_trmm_front( side_t side,
if ( bli_is_left( side ) ) cntl = l_cntl;
else cntl = r_cntl;
// Invoke the internal back-end.
bli_trmm_int( alpha,
&a_local,
&b_local,
&BLIS_ZERO,
&c_local,
cntl );
trmm_thrinfo_t** infos = bli_create_trmm_thrinfo_paths( !bli_is_left( side ) );
dim_t n_threads = thread_num_threads( infos[0] );
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
(level3_int_t) bli_trmm_int,
alpha,
&a_local,
&b_local,
&BLIS_ZERO,
&c_local,
(void*) cntl,
(void**) infos );
bli_trmm_thrinfo_free_paths( infos, n_threads );
}

View File

@@ -39,7 +39,8 @@
typedef void (*FUNCPTR_T)( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl );
trmm_t* cntl,
trmm_thrinfo_t* thread );
static FUNCPTR_T vars[2][2][4][3] =
{
@@ -88,7 +89,8 @@ void bli_trmm_int( obj_t* alpha,
obj_t* b,
obj_t* beta,
obj_t* c,
trmm_t* cntl )
trmm_t* cntl,
trmm_thrinfo_t* thread )
{
obj_t a_local;
obj_t b_local;
@@ -105,7 +107,9 @@ void bli_trmm_int( obj_t* alpha,
if ( bli_obj_has_zero_dim( *a ) ||
bli_obj_has_zero_dim( *b ) )
{
bli_scalm( beta, c );
if( thread_am_ochief( thread ) )
bli_scalm( beta, c );
thread_obarrier( thread );
return;
}
@@ -127,22 +131,22 @@ void bli_trmm_int( obj_t* alpha,
// packed, this is our last chance to handle the transposition.
if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
{
bli_obj_induce_trans( c_local );
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
bli_obj_induce_trans( c_local );
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
}
// If alpha is non-unit, typecast and apply it to the scalar attached
// to B.
if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
{
bli_obj_scalar_apply_scalar( alpha, &b_local );
bli_obj_scalar_apply_scalar( alpha, &b_local );
}
// If beta is non-unit, typecast and apply it to the scalar attached
// to C.
if ( !bli_obj_equals( beta, &BLIS_ONE ) )
{
bli_obj_scalar_apply_scalar( beta, &c_local );
bli_obj_scalar_apply_scalar( beta, &c_local );
}
// Set two bools: one based on the implied side parameter (the structure
@@ -173,6 +177,7 @@ void bli_trmm_int( obj_t* alpha,
f( &a_local,
&b_local,
&c_local,
cntl );
cntl,
thread );
}

View File

@@ -37,4 +37,5 @@ void bli_trmm_int( obj_t* alpha,
obj_t* b,
obj_t* beta,
obj_t* c,
trmm_t* cntl );
trmm_t* cntl,
trmm_thrinfo_t* thread );

View File

@@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)(
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
void* gemm_ukr
void* gemm_ukr,
trmm_thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2);
@@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2);
void bli_trmm_ll_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl )
trmm_t* cntl,
trmm_thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );
@@ -131,7 +133,8 @@ void bli_trmm_ll_ker_var2( obj_t* a,
buf_b, rs_b, pd_b, ps_b,
buf_beta,
buf_c, rs_c, cs_c,
gemm_ukr );
gemm_ukr,
thread );
}
@@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemm_ukr \
void* gemm_ukr, \
trmm_thrinfo_t* jr_thread \
) \
{ \
/* Cast the micro-kernel address to its function pointer type. */ \
@@ -271,9 +275,12 @@ void PASTEMAC(ch,varname)( \
b1 = b_cast; \
c1 = c_cast; \
\
trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread );\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
for ( j = 0; j < n_iter; ++j ) { \
\
if( trmm_l_jr_my_iter( j, jr_thread ) ) { \
\
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
@@ -308,121 +315,124 @@ void PASTEMAC(ch,varname)( \
off_a1011 = 0; \
k_a1011 = diagoffa_i + MR; \
\
b1_i = b1 + off_a1011 * PACKNR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + k_a1011 * ss_a; \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, aux ); \
bli_auxinfo_set_next_b( b2, aux ); \
\
/* Save the panel stride of the current panel of A to the
auxinfo_t object. */ \
bli_auxinfo_set_ps_a( k_a1011 * ss_a, aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k_a1011, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
c11, rs_c, cs_c, \
&aux ); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k_a1011, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux ); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
if( trmm_l_ir_my_iter( i, ir_thread ) ) \
{ \
b1_i = b1 + off_a1011 * PACKNR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, aux ); \
bli_auxinfo_set_next_b( b2, aux ); \
\
/* Save the panel stride of the current panel of A to the
auxinfo_t object. */ \
bli_auxinfo_set_ps_a( k_a1011 * ss_a, aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k_a1011, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
c11, rs_c, cs_c, \
&aux ); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k_a1011, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux ); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
a1 += k_a1011 * ss_a; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \
if( trmm_l_ir_my_iter( i, ir_thread ) ) \
{ \
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, aux ); \
bli_auxinfo_set_next_b( b2, aux ); \
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, aux ); \
bli_auxinfo_set_next_b( b2, aux ); \
\
/* Save the panel stride of the current panel of A to the
auxinfo_t object. */ \
bli_auxinfo_set_ps_a( rstep_a, aux ); \
/* Save the panel stride of the current panel of A to the
auxinfo_t object. */ \
bli_auxinfo_set_ps_a( rstep_a, aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux ); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux ); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux ); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux ); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
\
} \
b1 += cstep_b; \
c1 += cstep_c; \
} \

View File

@@ -39,7 +39,8 @@
void bli_trmm_ll_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl );
trmm_t* cntl,
trmm_thrinfo_t* thread );
//
@@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemm_ukr \
void* gemm_ukr, \
trmm_thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC( trmm_ll_ker_var2 )

View File

@@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)(
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
void* gemm_ukr
void* gemm_ukr,
trmm_thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2);
@@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2);
void bli_trmm_lu_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl )
trmm_t* cntl,
trmm_thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );
@@ -131,7 +133,8 @@ void bli_trmm_lu_ker_var2( obj_t* a,
buf_b, rs_b, pd_b, ps_b,
buf_beta,
buf_c, rs_c, cs_c,
gemm_ukr );
gemm_ukr,
thread );
}
@@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemm_ukr \
void* gemm_ukr, \
trmm_thrinfo_t* jr_thread \
) \
{ \
/* Cast the micro-kernel address to its function pointer type. */ \
@@ -277,6 +281,8 @@ void PASTEMAC(ch,varname)( \
\
b1 = b_cast; \
c1 = c_cast; \
\
trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread );\
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
@@ -294,7 +300,7 @@ void PASTEMAC(ch,varname)( \
b2 = b1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
for ( i = 0; i < m_iter; ++i ) if( trmm_l_jr_my_iter( j, jr_thread ) ) { \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
@@ -315,14 +321,15 @@ void PASTEMAC(ch,varname)( \
off_a1112 = diagoffa_i; \
k_a1112 = k - off_a1112; \
\
if( trmm_l_ir_my_iter( i, ir_thread ) ) { \
b1_i = b1 + off_a1112 * PACKNR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + k_a1112 * ss_a; \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \
@@ -369,19 +376,20 @@ void PASTEMAC(ch,varname)( \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
} \
a1 += k_a1112 * ss_a; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \
if( trmm_l_ir_my_iter( i, ir_thread ) ) { \
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \
@@ -423,13 +431,13 @@ void PASTEMAC(ch,varname)( \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
} \
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
\
} \
b1 += cstep_b; \
c1 += cstep_c; \
} \

View File

@@ -39,7 +39,8 @@
void bli_trmm_lu_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl );
trmm_t* cntl,
trmm_thrinfo_t* thread );
//
@@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemm_ukr \
void* gemm_ukr, \
trmm_thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC( trmm_lu_ker_var2 )

View File

@@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)(
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
void* gemm_ukr
void* gemm_ukr,
trmm_thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2);
@@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2);
void bli_trmm_rl_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl )
trmm_t* cntl,
trmm_thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );
@@ -131,7 +133,8 @@ void bli_trmm_rl_ker_var2( obj_t* a,
buf_b, rs_b, pd_b, ps_b,
buf_beta,
buf_c, rs_c, cs_c,
gemm_ukr );
gemm_ukr,
thread );
}
@@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemm_ukr \
void* gemm_ukr, \
trmm_thrinfo_t* jr_thread \
) \
{ \
/* Cast the micro-kernel address to its function pointer type. */ \
@@ -186,7 +190,7 @@ void PASTEMAC(ch,varname)( \
dim_t off_b1121; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
/*inc_t cstep_b; */\
inc_t rstep_c, cstep_c; \
inc_t ss_b; \
auxinfo_t aux; \
@@ -267,7 +271,7 @@ void PASTEMAC(ch,varname)( \
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
/*cstep_b = ps_b; */\
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
@@ -278,6 +282,7 @@ void PASTEMAC(ch,varname)( \
b1 = b_cast; \
c1 = c_cast; \
\
trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread );\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
@@ -296,6 +301,8 @@ void PASTEMAC(ch,varname)( \
in A. Then compute the length of that panel. */ \
off_b1121 = bli_max( -diagoffb_j, 0 ); \
k_b1121 = k - off_b1121; \
\
if( trmm_r_jr_my_iter( j, jr_thread ) ) { \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
@@ -313,6 +320,7 @@ void PASTEMAC(ch,varname)( \
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( trmm_r_ir_my_iter( i, ir_thread ) ) { \
ctype* restrict a1_i; \
ctype* restrict a2; \
\
@@ -321,11 +329,11 @@ void PASTEMAC(ch,varname)( \
a1_i = a1 + off_b1121 * PACKMR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + k_b1121 * ss_b; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \
@@ -368,7 +376,7 @@ void PASTEMAC(ch,varname)( \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
} \
a1 += rstep_a; \
c11 += rstep_c; \
} \
@@ -378,16 +386,17 @@ void PASTEMAC(ch,varname)( \
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( trmm_r_ir_my_iter( i, ir_thread ) ) { \
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \
@@ -425,12 +434,12 @@ void PASTEMAC(ch,varname)( \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
} \
a1 += rstep_a; \
c11 += rstep_c; \
} \
} \
\
} \
b1 += k_b1121 * ss_b; \
c1 += cstep_c; \
} \

View File

@@ -39,7 +39,8 @@
void bli_trmm_rl_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl );
trmm_t* cntl,
trmm_thrinfo_t* thread );
//
@@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemm_ukr \
void* gemm_ukr, \
trmm_thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC( trmm_rl_ker_var2 )

View File

@@ -46,7 +46,8 @@ typedef void (*FUNCPTR_T)(
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
void* gemm_ukr
void* gemm_ukr,
trmm_thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2);
@@ -55,7 +56,8 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2);
void bli_trmm_ru_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl )
trmm_t* cntl,
trmm_thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );
@@ -131,7 +133,8 @@ void bli_trmm_ru_ker_var2( obj_t* a,
buf_b, rs_b, pd_b, ps_b,
buf_beta,
buf_c, rs_c, cs_c,
gemm_ukr );
gemm_ukr,
thread );
}
@@ -148,7 +151,8 @@ void PASTEMAC(ch,varname)( \
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemm_ukr \
void* gemm_ukr, \
trmm_thrinfo_t* jr_thread \
) \
{ \
/* Cast the micro-kernel address to its function pointer type. */ \
@@ -186,7 +190,7 @@ void PASTEMAC(ch,varname)( \
dim_t off_b0111; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
/*inc_t cstep_b; */\
inc_t rstep_c, cstep_c; \
inc_t ss_b; \
auxinfo_t aux; \
@@ -268,7 +272,7 @@ void PASTEMAC(ch,varname)( \
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
/*cstep_b = ps_b; */\
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
@@ -279,6 +283,7 @@ void PASTEMAC(ch,varname)( \
b1 = b_cast; \
c1 = c_cast; \
\
trmm_thrinfo_t* ir_thread = trmm_thread_sub_trmm( jr_thread ); \
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
@@ -296,6 +301,8 @@ void PASTEMAC(ch,varname)( \
so we can index into the corresponding location in A. */ \
off_b0111 = 0; \
k_b0111 = bli_min( k, -diagoffb_j + NR ); \
\
if( trmm_r_jr_my_iter( j, jr_thread ) ) { \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
@@ -313,6 +320,7 @@ void PASTEMAC(ch,varname)( \
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( trmm_r_ir_my_iter( i, ir_thread ) ) { \
ctype* restrict a1_i; \
ctype* restrict a2; \
\
@@ -321,11 +329,11 @@ void PASTEMAC(ch,varname)( \
a1_i = a1 + off_b0111 * PACKMR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + k_b0111 * ss_b; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \
@@ -368,7 +376,7 @@ void PASTEMAC(ch,varname)( \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
} \
a1 += rstep_a; \
c11 += rstep_c; \
} \
@@ -378,16 +386,17 @@ void PASTEMAC(ch,varname)( \
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( trmm_r_ir_my_iter( i, ir_thread ) ) { \
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter ) ) \
b2 = b_cast; \
} \
@@ -425,12 +434,12 @@ void PASTEMAC(ch,varname)( \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
} \
a1 += rstep_a; \
c11 += rstep_c; \
} \
} \
\
} \
b1 += k_b0111 * ss_b; \
c1 += cstep_c; \
} \

View File

@@ -39,7 +39,8 @@
void bli_trmm_ru_ker_var2( obj_t* a,
obj_t* b,
obj_t* c,
trmm_t* cntl );
trmm_t* cntl,
trmm_thrinfo_t* thread );
//
@@ -58,7 +59,8 @@ void PASTEMAC(ch,varname)( \
void* b, inc_t rs_b, inc_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void* gemm_ukr \
void* gemm_ukr, \
trmm_thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC( trmm_ru_ker_var2 )

View File

@@ -0,0 +1,207 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "assert.h"
// Fill in an existing trmm_thrinfo_t node. Each argument is stored in the
// field of the same name; the function allocates nothing and returns nothing.
//
//   thread             - node to initialize (written through, not read).
//   ocomm / ocomm_id   - outer communicator and this thread's id within it.
//   icomm / icomm_id   - inner communicator and this thread's id within it.
//   n_way / work_id    - partitioning factor and this thread's share.
//   opackm / ipackm    - packing thread-infos attached to this node (may be NULL).
//   sub_trmm           - next node down the chain (may be NULL).
void bli_setup_trmm_thrinfo_node( trmm_thrinfo_t* thread,
                                  thread_comm_t* ocomm, dim_t ocomm_id,
                                  thread_comm_t* icomm, dim_t icomm_id,
                                  dim_t n_way, dim_t work_id,
                                  packm_thrinfo_t* opackm,
                                  packm_thrinfo_t* ipackm,
                                  trmm_thrinfo_t* sub_trmm )
{
    // Links to child structures.
    thread->sub_trmm = sub_trmm;
    thread->ipackm   = ipackm;
    thread->opackm   = opackm;

    // Work partitioning for the loop this node describes.
    thread->work_id = work_id;
    thread->n_way   = n_way;

    // Communicators and this thread's ids within them.
    thread->icomm_id = icomm_id;
    thread->icomm    = icomm;
    thread->ocomm_id = ocomm_id;
    thread->ocomm    = ocomm;
}
// Initialize `thread` for single-threaded execution: both communicators are
// BLIS_SINGLE_COMM with id 0, the loop is split one way with work id 0, both
// packing infos are BLIS_PACKM_SINGLE_THREADED, and the node is its own
// sub-node.
//
// NOTE: because sub_trmm points back at `thread`, the recursion in
// bli_trmm_thrinfo_free() would never reach a NULL sub-node; a node set up
// here must not be passed to that function.
void bli_setup_trmm_single_threaded_info( trmm_thrinfo_t* thread )
{
    thread_comm_t* const   solo_comm  = &BLIS_SINGLE_COMM;
    packm_thrinfo_t* const solo_packm = &BLIS_PACKM_SINGLE_THREADED;

    // Self-reference: every level of the loop nest reuses this same node.
    thread->sub_trmm = thread;

    thread->opackm = solo_packm;
    thread->ipackm = solo_packm;

    thread->n_way   = 1;
    thread->work_id = 0;

    thread->ocomm    = solo_comm;
    thread->icomm    = solo_comm;
    thread->ocomm_id = 0;
    thread->icomm_id = 0;
}
// Allocate a trmm_thrinfo_t with bli_malloc() and initialize it from the
// arguments via bli_setup_trmm_thrinfo_node(). Returns the new node; it is
// released by bli_trmm_thrinfo_free().
trmm_thrinfo_t* bli_create_trmm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
                                              thread_comm_t* icomm, dim_t icomm_id,
                                              dim_t n_way, dim_t work_id,
                                              packm_thrinfo_t* opackm,
                                              packm_thrinfo_t* ipackm,
                                              trmm_thrinfo_t* sub_trmm )
{
    trmm_thrinfo_t* node = ( trmm_thrinfo_t* ) bli_malloc( sizeof( *node ) );

    bli_setup_trmm_thrinfo_node( node,
                                 ocomm, ocomm_id,
                                 icomm, icomm_id,
                                 n_way, work_id,
                                 opackm, ipackm,
                                 sub_trmm );

    return node;
}
// Recursively release a trmm_thrinfo_t node, the packing infos attached to
// it, the chain of sub-nodes below it, and the communicators it is
// responsible for. A NULL argument is a no-op.
//
// Communicator ownership:
//  - The outer communicator is shared by several nodes; only the node for
//    which thread_am_ochief() holds releases it.
//  - In the chains built by bli_create_trmm_thrinfo_paths(), every node's
//    inner communicator is also the outer communicator of its sub-node and
//    is released there. The leaf (sub_trmm == NULL) has no sub-node, so its
//    inner communicator is released here; previously it was never freed.
//
// Must not be called on a node initialized by
// bli_setup_trmm_single_threaded_info(): that node is its own sub-node, so
// the recursion below would not terminate.
void bli_trmm_thrinfo_free( trmm_thrinfo_t* thread )
{
    if ( thread == NULL ) return;

    // Free communicators.
    if ( thread_am_ochief( thread ) )
        bli_free_communicator( thread->ocomm );

    // Leaf only: release the inner communicator exactly once (by the thread
    // with inner id 0), and never when it is the same object as the outer
    // communicator, whose release is decided above.
    if ( thread->sub_trmm == NULL &&
         thread->icomm_id == 0 &&
         thread->icomm != thread->ocomm )
        bli_free_communicator( thread->icomm );

    // Free sub thrinfos.
    bli_packm_thrinfo_free( thread->opackm );
    bli_packm_thrinfo_free( thread->ipackm );
    bli_trmm_thrinfo_free( thread->sub_trmm );

    bli_free( thread );
}
// Release `num` thread-info chains and then the array that holds them.
// Each threads[i] is handed to bli_trmm_thrinfo_free(); the array itself is
// released with bli_free().
void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** threads, dim_t num )
{
    int idx = 0;

    while ( idx < num )
    {
        bli_trmm_thrinfo_free( threads[idx] );
        ++idx;
    }

    bli_free( threads );
}
// Build one chain of trmm_thrinfo_t nodes ("path") per thread and return
// them in an array indexed by the thread's id in the global communicator.
//
// Each path has five nodes, outermost first: jc -> kc -> ic -> jr -> ir.
// A node's inner communicator is the one created at its own loop level and
// its outer communicator is the one created at the enclosing level. The ic
// node additionally carries two packm thread-infos (passed as opackm and
// ipackm, in that order).
//
// When BLIS_ENABLE_MULTITHREADING is defined, the ways of parallelism are
// read from the environment variables BLIS_JC_NT, BLIS_IC_NT, BLIS_JR_NT and
// BLIS_IR_NT; BLIS_KC_NT is not read and kc_way is fixed at 1. If
// jc_dependency is set, the jc parallelism is folded into the jr loop
// (jr_way *= jc_way; jc_way = 1). Without multithreading every way is 1.
//
// The array holds jc_way * kc_way * ic_way * jr_way * ir_way entries and is
// released with bli_trmm_thrinfo_free_paths().
trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( bool_t jc_dependency )
{
#ifdef BLIS_ENABLE_MULTITHREADING
    dim_t kc_way = 1;
    dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" );
    dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" );
    dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" );
    dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" );

    if ( jc_dependency )
    {
        jr_way *= jc_way;
        jc_way  = 1;
    }
#else
    dim_t jc_way = 1;
    dim_t kc_way = 1;
    dim_t ic_way = 1;
    dim_t jr_way = 1;
    dim_t ir_way = 1;
#endif

    dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
    assert( global_num_threads != 0 );

    // Number of threads inside one instance of each loop level.
    dim_t jc_nt = kc_way * ic_way * jr_way * ir_way;
    dim_t kc_nt = ic_way * jr_way * ir_way;
    dim_t ic_nt = jr_way * ir_way;
    dim_t jr_nt = ir_way;
    dim_t ir_nt = 1;

    // Allocate with bli_malloc() so the array matches the bli_free() call in
    // bli_trmm_thrinfo_free_paths(); raw malloc() must not be paired with
    // bli_free() when the BLIS heap allocator is an aligned one.
    trmm_thrinfo_t** paths = ( trmm_thrinfo_t** )
        bli_malloc( global_num_threads * sizeof( *paths ) );

    thread_comm_t* global_comm = bli_create_communicator( global_num_threads );

    for ( int jc = 0; jc < jc_way; jc++ )
    {
        thread_comm_t* jc_comm = bli_create_communicator( jc_nt );

        for ( int kc = 0; kc < kc_way; kc++ )
        {
            thread_comm_t* kc_comm = bli_create_communicator( kc_nt );

            for ( int ic = 0; ic < ic_way; ic++ )
            {
                thread_comm_t* ic_comm = bli_create_communicator( ic_nt );

                for ( int jr = 0; jr < jr_way; jr++ )
                {
                    thread_comm_t* jr_comm = bli_create_communicator( jr_nt );

                    for ( int ir = 0; ir < ir_way; ir++ )
                    {
                        thread_comm_t* ir_comm = bli_create_communicator( ir_nt );

                        // This thread's id within each communicator, built
                        // from the innermost level outwards.
                        dim_t ir_comm_id     = 0;
                        dim_t jr_comm_id     = ir*ir_nt + ir_comm_id;
                        dim_t ic_comm_id     = jr*jr_nt + jr_comm_id;
                        dim_t kc_comm_id     = ic*ic_nt + ic_comm_id;
                        dim_t jc_comm_id     = kc*kc_nt + kc_comm_id;
                        dim_t global_comm_id = jc*jc_nt + jc_comm_id;

                        // Build the chain from the leaf (ir) up to the root (jc).
                        trmm_thrinfo_t* ir_info =
                            bli_create_trmm_thrinfo_node( jr_comm, jr_comm_id,
                                                          ir_comm, ir_comm_id,
                                                          ir_way, ir,
                                                          NULL, NULL, NULL );

                        trmm_thrinfo_t* jr_info =
                            bli_create_trmm_thrinfo_node( ic_comm, ic_comm_id,
                                                          jr_comm, jr_comm_id,
                                                          jr_way, jr,
                                                          NULL, NULL, ir_info );

                        packm_thrinfo_t* packb =
                            bli_create_packm_thread_info( kc_comm, kc_comm_id,
                                                          ic_comm, ic_comm_id,
                                                          kc_nt, kc_comm_id );

                        packm_thrinfo_t* packa =
                            bli_create_packm_thread_info( ic_comm, ic_comm_id,
                                                          jr_comm, jr_comm_id,
                                                          ic_nt, ic_comm_id );

                        trmm_thrinfo_t* ic_info =
                            bli_create_trmm_thrinfo_node( kc_comm, kc_comm_id,
                                                          ic_comm, ic_comm_id,
                                                          ic_way, ic,
                                                          packb, packa, jr_info );

                        trmm_thrinfo_t* kc_info =
                            bli_create_trmm_thrinfo_node( jc_comm, jc_comm_id,
                                                          kc_comm, kc_comm_id,
                                                          kc_way, kc,
                                                          NULL, NULL, ic_info );

                        trmm_thrinfo_t* jc_info =
                            bli_create_trmm_thrinfo_node( global_comm, global_comm_id,
                                                          jc_comm, jc_comm_id,
                                                          jc_way, jc,
                                                          NULL, NULL, kc_info );

                        paths[global_comm_id] = jc_info;
                    }
                }
            }
        }
    }

    return paths;
}

View File

@@ -0,0 +1,79 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Per-thread description of one loop level of a multithreaded trmm. Nodes
// are chained through sub_trmm, one node per loop level.
struct trmm_thrinfo_s //implements thrinfo_t
{
// Outer communicator. In the chains built by bli_create_trmm_thrinfo_paths()
// this is the communicator created at the enclosing loop level.
thread_comm_t* ocomm;
// Our thread id within ocomm.
dim_t ocomm_id;
// Inner communicator. In the chains built by bli_create_trmm_thrinfo_paths()
// this is the communicator created at this node's own loop level.
thread_comm_t* icomm;
// Our thread id within icomm.
dim_t icomm_id;
// Number of distinct caucuses used to parallelize the loop.
dim_t n_way;
// What we're working on: the *_my_iter macros give this thread iteration
// `index` when index % n_way == work_id % n_way.
dim_t work_id;
// Packing thread-infos attached to this node; NULL on nodes that carry none.
packm_thrinfo_t* opackm;
packm_thrinfo_t* ipackm;
// Node for the next loop level down; NULL at the innermost node of a chain
// built by bli_create_trmm_thrinfo_paths(), or the node itself after
// bli_setup_trmm_single_threaded_info().
struct trmm_thrinfo_s* sub_trmm;
};
typedef struct trmm_thrinfo_s trmm_thrinfo_t;
// Accessors for the structures hanging off a trmm_thrinfo_t node. `thread`
// may be any expression of pointer type; it is parenthesized before `->` is
// applied and the whole expansion is parenthesized.
#define trmm_thread_sub_trmm( thread )   ( (thread)->sub_trmm )
#define trmm_thread_sub_opackm( thread ) ( (thread)->opackm )
#define trmm_thread_sub_ipackm( thread ) ( (thread)->ipackm )

// Iteration-ownership tests. Each yields nonzero when iteration `index` is
// assigned to `thread`, i.e. when index and the thread's work_id are
// congruent modulo n_way. All four variants currently use the same rule.
// Both arguments are parenthesized so that expressions such as `i + 1` are
// reduced modulo n_way as a whole. Note that `thread` is evaluated more than
// once, so it must not have side effects.
#define trmm_r_ir_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way )
#define trmm_r_jr_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way )
#define trmm_l_ir_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way )
#define trmm_l_jr_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way )
// Build one chain of trmm_thrinfo_t nodes per thread and return the chains
// in an array indexed by global thread id. When jc_dependency is set (and
// multithreading is enabled), the jc-loop parallelism is folded into the jr
// loop instead.
trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( bool_t jc_dependency );
// Free n_threads chains via bli_trmm_thrinfo_free() and then the array
// itself.
void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** info, dim_t n_threads );
// Store each argument in the field of the same name of an existing node;
// nothing is allocated.
void bli_setup_trmm_thrinfo_node( trmm_thrinfo_t* thread,
thread_comm_t* ocomm, dim_t ocomm_id,
thread_comm_t* icomm, dim_t icomm_id,
dim_t n_way, dim_t work_id,
packm_thrinfo_t* opackm,
packm_thrinfo_t* ipackm,
trmm_thrinfo_t* sub_trmm );
// Allocate a node with bli_malloc(), initialize it with
// bli_setup_trmm_thrinfo_node(), and return it.
trmm_thrinfo_t* bli_create_trmm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
thread_comm_t* icomm, dim_t icomm_id,
dim_t n_way, dim_t work_id,
packm_thrinfo_t* opackm,
packm_thrinfo_t* ipackm,
trmm_thrinfo_t* sub_trmm );
// Initialize a node for single-threaded use: BLIS_SINGLE_COMM for both
// communicators, n_way 1, work_id 0, BLIS_PACKM_SINGLE_THREADED for both
// packing infos, and the node as its own sub_trmm.
void bli_setup_trmm_single_threaded_info( trmm_thrinfo_t* thread );

Some files were not shown because too many files have changed in this diff Show More