mirror of
https://github.com/amd/blis.git
synced 2026-04-20 07:38:53 +00:00
Reorganized code, APIs related to multithreading.
Details:
- Reorganized code and renamed files defining APIs related to multithreading.
All code that is not specific to a particular operation is now located in a
new directory: frame/thread. Code is now organized, roughly, by the
namespace to which it belongs (see below).
- Consolidated all operation-specific *_thrinfo_t object types into a single
thrinfo_t object type. Operation-specific level-3 *_thrinfo_t APIs were
also consolidated, leaving bli_l3_thrinfo_*() and bli_packm_thrinfo_*()
functions (aside from a few general purpose bli_thrinfo_*() functions).
- Renamed thread_comm_t object type to thrcomm_t.
- Renamed many of the routines and functions (and macros) for multithreading.
We now have the following API namespaces:
- bli_thrinfo_*(): functions related to thrinfo_t objects
- bli_thrcomm_*(): functions related to thrcomm_t objects.
- bli_thread_*(): general-purpose functions, such as initialization,
finalization, and computing ranges. (For now, some macros, such as
bli_thread_[io]broadcast() and bli_thread_[io]barrier() use the
bli_thread_ namespace prefix, even though bli_thrinfo_ may be more
appropriate.)
- Renamed thread-related macros so that they use a bli_ prefix.
- Renamed control tree-related macros so that they use a bli_ prefix (to be
consistent with the thread-related macros that were also renamed).
- Removed #undef BLIS_SIMD_ALIGN_SIZE from dunnington's bli_kernel.h. This
#undef was a temporary workaround for macro defaults being applied in the
wrong order, a problem that was recently fixed.
This commit is contained in:
@@ -38,7 +38,6 @@
|
||||
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
#undef BLIS_SIMD_ALIGN_SIZE
|
||||
#define BLIS_SIMD_ALIGN_SIZE 16
|
||||
|
||||
// -- Cache blocksizes --
|
||||
|
||||
@@ -43,11 +43,11 @@ typedef struct packv_s packv_t;
|
||||
|
||||
#define cntl_bmid( cntl ) cntl->bmid
|
||||
|
||||
#define cntl_sub_packv( cntl ) cntl->sub_packv
|
||||
#define cntl_sub_packv_x( cntl ) cntl->sub_packv_x
|
||||
#define cntl_sub_packv_x1( cntl ) cntl->sub_packv_x1
|
||||
#define cntl_sub_packv_y( cntl ) cntl->sub_packv_y
|
||||
#define cntl_sub_packv_y1( cntl ) cntl->sub_packv_y1
|
||||
#define bli_cntl_sub_packv( cntl ) cntl->sub_packv
|
||||
#define bli_cntl_sub_packv_x( cntl ) cntl->sub_packv_x
|
||||
#define bli_cntl_sub_packv_x1( cntl ) cntl->sub_packv_x1
|
||||
#define bli_cntl_sub_packv_y( cntl ) cntl->sub_packv_y
|
||||
#define bli_cntl_sub_packv_y1( cntl ) cntl->sub_packv_y1
|
||||
|
||||
void bli_packv_cntl_init( void );
|
||||
void bli_packv_cntl_finalize( void );
|
||||
|
||||
@@ -58,7 +58,7 @@ void bli_packv_init
|
||||
|
||||
// First check if we are to skip this operation because the control tree
|
||||
// is NULL, and if so, simply alias the object to its packed counterpart.
|
||||
if ( cntl_is_noop( cntl ) )
|
||||
if ( bli_cntl_is_noop( cntl ) )
|
||||
{
|
||||
bli_obj_alias_to( *a, *p );
|
||||
return;
|
||||
@@ -217,7 +217,7 @@ void bli_packv_release
|
||||
packv_t* cntl
|
||||
)
|
||||
{
|
||||
if ( !cntl_is_noop( cntl ) )
|
||||
if ( !bli_cntl_is_noop( cntl ) )
|
||||
bli_obj_release_pack( p );
|
||||
}
|
||||
|
||||
|
||||
@@ -80,7 +80,7 @@ void bli_packv_int( obj_t* a,
|
||||
// First check if we are to skip this operation because the control tree
|
||||
// is NULL. We return without taking any action because a was already
|
||||
// aliased to p in packv_init().
|
||||
if ( cntl_is_noop( cntl ) )
|
||||
if ( bli_cntl_is_noop( cntl ) )
|
||||
{
|
||||
return;
|
||||
}
|
||||
@@ -114,8 +114,8 @@ void bli_packv_int( obj_t* a,
|
||||
}
|
||||
|
||||
// Extract the variant number and implementation type.
|
||||
n = cntl_var_num( cntl );
|
||||
i = cntl_impl_type( cntl );
|
||||
n = bli_cntl_var_num( cntl );
|
||||
i = bli_cntl_impl_type( cntl );
|
||||
|
||||
// Index into the variant array to extract the correct function pointer.
|
||||
f = vars[n][i];
|
||||
|
||||
@@ -39,7 +39,7 @@ struct scalv_s
|
||||
};
|
||||
typedef struct scalv_s scalv_t;
|
||||
|
||||
#define cntl_sub_scalv( cntl ) cntl->sub_scalv
|
||||
#define bli_cntl_sub_scalv( cntl ) cntl->sub_scalv
|
||||
|
||||
void bli_scalv_cntl_init( void );
|
||||
void bli_scalv_cntl_finalize( void );
|
||||
|
||||
@@ -61,14 +61,14 @@ void bli_scalv_int( obj_t* alpha,
|
||||
bli_scalv_check( alpha, x );
|
||||
|
||||
// First check if we are to skip this operation.
|
||||
if ( cntl_is_noop( cntl ) ) return;
|
||||
if ( bli_cntl_is_noop( cntl ) ) return;
|
||||
|
||||
// Return early if the alpha scalar equals one.
|
||||
if ( bli_obj_equals( alpha, &BLIS_ONE ) ) return;
|
||||
|
||||
// Extract the variant number and implementation type.
|
||||
n = cntl_var_num( cntl );
|
||||
i = cntl_impl_type( cntl );
|
||||
n = bli_cntl_var_num( cntl );
|
||||
i = bli_cntl_impl_type( cntl );
|
||||
|
||||
// Index into the variant array to extract the correct function pointer.
|
||||
f = vars[n][i];
|
||||
|
||||
@@ -39,11 +39,11 @@ struct unpackv_s
|
||||
};
|
||||
typedef struct unpackv_s unpackv_t;
|
||||
|
||||
#define cntl_sub_unpackv( cntl ) cntl->sub_unpackv
|
||||
#define cntl_sub_unpackv_x( cntl ) cntl->sub_unpackv_x
|
||||
#define cntl_sub_unpackv_x1( cntl ) cntl->sub_unpackv_x1
|
||||
#define cntl_sub_unpackv_y( cntl ) cntl->sub_unpackv_y
|
||||
#define cntl_sub_unpackv_y1( cntl ) cntl->sub_unpackv_y1
|
||||
#define bli_cntl_sub_unpackv( cntl ) cntl->sub_unpackv
|
||||
#define bli_cntl_sub_unpackv_x( cntl ) cntl->sub_unpackv_x
|
||||
#define bli_cntl_sub_unpackv_x1( cntl ) cntl->sub_unpackv_x1
|
||||
#define bli_cntl_sub_unpackv_y( cntl ) cntl->sub_unpackv_y
|
||||
#define bli_cntl_sub_unpackv_y1( cntl ) cntl->sub_unpackv_y1
|
||||
|
||||
void bli_unpackv_cntl_init( void );
|
||||
void bli_unpackv_cntl_finalize( void );
|
||||
|
||||
@@ -79,7 +79,7 @@ void bli_unpackv_int( obj_t* p,
|
||||
|
||||
// First check if we are to skip this operation because the control tree
|
||||
// is NULL, and if so, simply return.
|
||||
if ( cntl_is_noop( cntl ) )
|
||||
if ( bli_cntl_is_noop( cntl ) )
|
||||
{
|
||||
return;
|
||||
}
|
||||
@@ -116,8 +116,8 @@ void bli_unpackv_int( obj_t* p,
|
||||
// Now we are ready to proceed with the unpacking.
|
||||
|
||||
// Extract the variant number and implementation type.
|
||||
n = cntl_var_num( cntl );
|
||||
i = cntl_impl_type( cntl );
|
||||
n = bli_cntl_var_num( cntl );
|
||||
i = bli_cntl_impl_type( cntl );
|
||||
|
||||
// Index into the variant array to extract the correct function pointer.
|
||||
f = vars[n][i];
|
||||
|
||||
@@ -57,7 +57,7 @@ typedef void (*FUNCPTR_T)(
|
||||
dim_t pd_p, inc_t ps_p,
|
||||
void* packm_ker,
|
||||
cntx_t* cntx,
|
||||
packm_thrinfo_t* thread
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1);
|
||||
@@ -96,7 +96,7 @@ static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
|
||||
void bli_packm_blk_var1( obj_t* c,
|
||||
obj_t* p,
|
||||
cntx_t* cntx,
|
||||
packm_thrinfo_t* t )
|
||||
thrinfo_t* t )
|
||||
{
|
||||
num_t dt_cp = bli_obj_datatype( *c );
|
||||
|
||||
@@ -156,7 +156,7 @@ void bli_packm_blk_var1( obj_t* c,
|
||||
// real domain micro-kernels. (In the aforementioned situation,
|
||||
// applying a real scalar is easy, but applying a complex one is
|
||||
// harder, so we avoid the need altogether with the code below.)
|
||||
if( thread_am_ochief( t ) )
|
||||
if( bli_thread_am_ochief( t ) )
|
||||
{
|
||||
if ( bli_obj_scalar_has_nonzero_imag( p ) )
|
||||
{
|
||||
@@ -177,7 +177,7 @@ void bli_packm_blk_var1( obj_t* c,
|
||||
kappa_p = &BLIS_ONE;
|
||||
}
|
||||
}
|
||||
kappa_p = thread_obroadcast( t, kappa_p );
|
||||
kappa_p = bli_thread_obroadcast( t, kappa_p );
|
||||
|
||||
// Acquire the buffer to the kappa chosen above.
|
||||
buf_kappa = bli_obj_buffer_for_1x1( dt_cp, *kappa_p );
|
||||
@@ -280,7 +280,7 @@ void PASTEMAC(ch,varname) \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
void* packm_ker, \
|
||||
cntx_t* cntx, \
|
||||
packm_thrinfo_t* thread \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
PASTECH2(ch,opname,_ft) packm_ker_cast = packm_ker; \
|
||||
|
||||
@@ -35,7 +35,7 @@
|
||||
void bli_packm_blk_var1( obj_t* c,
|
||||
obj_t* p,
|
||||
cntx_t* cntx,
|
||||
packm_thrinfo_t* t );
|
||||
thrinfo_t* t );
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
@@ -63,7 +63,7 @@ void PASTEMAC(ch,varname) \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
void* packm_ker, \
|
||||
cntx_t* cntx, \
|
||||
packm_thrinfo_t* thread \
|
||||
thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_blk_var1 )
|
||||
|
||||
@@ -55,13 +55,13 @@ typedef struct packm_s packm_t;
|
||||
#define cntl_pack_schema( cntl ) cntl->pack_schema
|
||||
#define cntl_pack_buf_type( cntl ) cntl->pack_buf_type
|
||||
|
||||
#define cntl_sub_packm( cntl ) cntl->sub_packm
|
||||
#define cntl_sub_packm_a( cntl ) cntl->sub_packm_a
|
||||
#define cntl_sub_packm_a11( cntl ) cntl->sub_packm_a11
|
||||
#define cntl_sub_packm_b( cntl ) cntl->sub_packm_b
|
||||
#define cntl_sub_packm_b11( cntl ) cntl->sub_packm_b11
|
||||
#define cntl_sub_packm_c( cntl ) cntl->sub_packm_c
|
||||
#define cntl_sub_packm_c11( cntl ) cntl->sub_packm_c11
|
||||
#define bli_cntl_sub_packm( cntl ) cntl->sub_packm
|
||||
#define bli_cntl_sub_packm_a( cntl ) cntl->sub_packm_a
|
||||
#define bli_cntl_sub_packm_a11( cntl ) cntl->sub_packm_a11
|
||||
#define bli_cntl_sub_packm_b( cntl ) cntl->sub_packm_b
|
||||
#define bli_cntl_sub_packm_b11( cntl ) cntl->sub_packm_b11
|
||||
#define bli_cntl_sub_packm_c( cntl ) cntl->sub_packm_c
|
||||
#define bli_cntl_sub_packm_c11( cntl ) cntl->sub_packm_c11
|
||||
|
||||
void bli_packm_cntl_init( void );
|
||||
void bli_packm_cntl_finalize( void );
|
||||
|
||||
@@ -60,7 +60,7 @@ void bli_packm_init( obj_t* a,
|
||||
|
||||
// First check if we are to skip this operation because the control tree
|
||||
// is NULL, and if so, simply alias the object to its packed counterpart.
|
||||
if ( cntl_is_noop( cntl ) )
|
||||
if ( bli_cntl_is_noop( cntl ) )
|
||||
{
|
||||
bli_obj_alias_to( *a, *p );
|
||||
return;
|
||||
@@ -581,7 +581,7 @@ void bli_packm_init_pack( invdiag_t invert_diag,
|
||||
void bli_packm_release( obj_t* p,
|
||||
packm_t* cntl )
|
||||
{
|
||||
if ( !cntl_is_noop( cntl ) )
|
||||
if ( !bli_cntl_is_noop( cntl ) )
|
||||
bli_obj_release_pack( p );
|
||||
}
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@
|
||||
typedef void (*FUNCPTR_T)( obj_t* a,
|
||||
obj_t* p,
|
||||
cntx_t* cntx,
|
||||
packm_thrinfo_t* t );
|
||||
thrinfo_t* t );
|
||||
|
||||
static FUNCPTR_T vars[6][3] =
|
||||
{
|
||||
@@ -56,7 +56,7 @@ void bli_packm_int( obj_t* a,
|
||||
obj_t* p,
|
||||
cntx_t* cntx,
|
||||
packm_t* cntl,
|
||||
packm_thrinfo_t* thread )
|
||||
thrinfo_t* thread )
|
||||
{
|
||||
varnum_t n;
|
||||
impl_t i;
|
||||
@@ -73,7 +73,7 @@ void bli_packm_int( obj_t* a,
|
||||
// First check if we are to skip this operation because the control tree
|
||||
// is NULL. We return without taking any action because a was already
|
||||
// aliased to p in packm_init().
|
||||
if ( cntl_is_noop( cntl ) )
|
||||
if ( bli_cntl_is_noop( cntl ) )
|
||||
{
|
||||
return;
|
||||
}
|
||||
@@ -115,8 +115,8 @@ void bli_packm_int( obj_t* a,
|
||||
|
||||
|
||||
// Extract the variant number and implementation type.
|
||||
n = cntl_var_num( cntl );
|
||||
i = cntl_impl_type( cntl );
|
||||
n = bli_cntl_var_num( cntl );
|
||||
i = bli_cntl_impl_type( cntl );
|
||||
|
||||
// Index into the variant array to extract the correct function pointer.
|
||||
f = vars[n][i];
|
||||
@@ -128,6 +128,6 @@ void bli_packm_int( obj_t* a,
|
||||
thread );
|
||||
|
||||
// Barrier so that packing is done before computation
|
||||
thread_obarrier( thread );
|
||||
bli_thread_obarrier( thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -36,5 +36,5 @@ void bli_packm_int( obj_t* a,
|
||||
obj_t* p,
|
||||
cntx_t* cntx,
|
||||
packm_t* cntl,
|
||||
packm_thrinfo_t* thread );
|
||||
thrinfo_t* thread );
|
||||
|
||||
|
||||
111
frame/1m/packm/bli_packm_thrinfo.c
Normal file
111
frame/1m/packm/bli_packm_thrinfo.c
Normal file
@@ -0,0 +1,111 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// Allocates a new thrinfo_t with bli_malloc_intl(), initializes it through
// bli_thrinfo_init() using the supplied communicators, communicator ids,
// n_way and work_id, and returns the pointer to the caller.
// The three trailing arguments to bli_thrinfo_init() are always NULL here;
// presumably they are fields that packm does not use -- TODO confirm against
// the bli_thrinfo_init() prototype.
// NOTE(review): the result of bli_malloc_intl() is not checked in this
// function; confirm that the allocator itself handles allocation failure.
thrinfo_t* bli_packm_thrinfo_create
|
||||
(
|
||||
thrcomm_t* ocomm,
|
||||
dim_t ocomm_id,
|
||||
thrcomm_t* icomm,
|
||||
dim_t icomm_id,
|
||||
dim_t n_way,
|
||||
dim_t work_id
|
||||
)
|
||||
{
|
||||
thrinfo_t* thread = bli_malloc_intl( sizeof( thrinfo_t ) );
|
||||
|
||||
// Fill in the freshly allocated object.
bli_thrinfo_init
|
||||
(
|
||||
thread,
|
||||
ocomm, ocomm_id,
|
||||
icomm, icomm_id,
|
||||
n_way,
|
||||
work_id,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
|
||||
return thread;
|
||||
}
|
||||
|
||||
// Initializes an existing thrinfo_t in place by forwarding the supplied
// communicators, communicator ids, n_way and work_id to bli_thrinfo_init().
// No allocation is performed here; the caller provides the object.
// The three trailing arguments to bli_thrinfo_init() are always NULL here;
// presumably they are fields that packm does not use -- TODO confirm against
// the bli_thrinfo_init() prototype.
void bli_packm_thrinfo_init
|
||||
(
|
||||
thrinfo_t* thread,
|
||||
thrcomm_t* ocomm,
|
||||
dim_t ocomm_id,
|
||||
thrcomm_t* icomm,
|
||||
dim_t icomm_id,
|
||||
dim_t n_way,
|
||||
dim_t work_id
|
||||
)
|
||||
{
|
||||
bli_thrinfo_init
|
||||
(
|
||||
thread,
|
||||
ocomm, ocomm_id,
|
||||
icomm, icomm_id,
|
||||
n_way, work_id,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
|
||||
// Initializes an existing thrinfo_t with fixed values via
// bli_packm_thrinfo_init(): both the outer and inner communicators are set
// to &BLIS_SINGLE_COMM with id 0, n_way is 1, and work_id is 0.
// Judging by the name, this is the single-threaded configuration.
void bli_packm_thrinfo_init_single
|
||||
(
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
bli_packm_thrinfo_init
|
||||
(
|
||||
thread,
|
||||
&BLIS_SINGLE_COMM, 0,
|
||||
&BLIS_SINGLE_COMM, 0,
|
||||
1,
|
||||
0
|
||||
);
|
||||
}
|
||||
|
||||
// Releases a thrinfo_t with bli_free_intl(). Nothing is done when thread is
// NULL or when it is the address of the global BLIS_PACKM_SINGLE_THREADED
// object.
void bli_packm_thrinfo_free
|
||||
(
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
// NOTE(review): BLIS_PACKM_SINGLE_THREADED is presumably not obtained from
// bli_malloc_intl(), which would be why it must never be freed -- confirm.
if ( thread != NULL &&
|
||||
thread != &BLIS_PACKM_SINGLE_THREADED )
|
||||
bli_free_intl( thread );
|
||||
}
|
||||
|
||||
@@ -32,23 +32,44 @@
|
||||
|
||||
*/
|
||||
|
||||
struct packm_thrinfo_s //implements thrinfo_t
|
||||
{
|
||||
thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level
|
||||
dim_t ocomm_id; //Our thread id within that thread comm
|
||||
thread_comm_t* icomm; //The thread communicator for the other threads sharing the same work at this level
|
||||
dim_t icomm_id; //Our thread id within that thread comm
|
||||
|
||||
dim_t n_way; //Number of distinct caucuses used to parallelize the loop
|
||||
dim_t work_id; //What we're working on
|
||||
};
|
||||
typedef struct packm_thrinfo_s packm_thrinfo_t;
|
||||
//
|
||||
// thrinfo_t macros specific to packm.
|
||||
//
|
||||
|
||||
// Evaluates to nonzero when iteration `index` is assigned to `thread`, i.e.
// when index and thread->work_id are congruent modulo thread->n_way.
// Both arguments are parenthesized in the expansion so that expression
// arguments (e.g. `i + 1` or `&info`) expand with the intended precedence.
// Note that `thread` is evaluated more than once; avoid side effects in it.
#define packm_thread_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way )
|
||||
|
||||
void bli_packm_thrinfo_free( packm_thrinfo_t* thread );
|
||||
packm_thrinfo_t* bli_create_packm_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id );
|
||||
void bli_setup_packm_thread_info( packm_thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id );
|
||||
void bli_setup_packm_single_threaded_info( packm_thrinfo_t* thread );
|
||||
//
|
||||
// thrinfo_t APIs specific to packm.
|
||||
//
|
||||
|
||||
thrinfo_t* bli_packm_thrinfo_create
|
||||
(
|
||||
thrcomm_t* ocomm,
|
||||
dim_t ocomm_id,
|
||||
thrcomm_t* icomm,
|
||||
dim_t icomm_id,
|
||||
dim_t n_way,
|
||||
dim_t work_id
|
||||
);
|
||||
|
||||
void bli_packm_thrinfo_init
|
||||
(
|
||||
thrinfo_t* thread,
|
||||
thrcomm_t* ocomm,
|
||||
dim_t ocomm_id,
|
||||
thrcomm_t* icomm,
|
||||
dim_t icomm_id,
|
||||
dim_t n_way,
|
||||
dim_t work_id
|
||||
);
|
||||
|
||||
void bli_packm_thrinfo_init_single
|
||||
(
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
void bli_packm_thrinfo_free
|
||||
(
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
@@ -58,7 +58,7 @@ static FUNCPTR_T GENARRAY(ftypes,packm_unb_var1);
|
||||
void bli_packm_unb_var1( obj_t* c,
|
||||
obj_t* p,
|
||||
cntx_t* cntx,
|
||||
packm_thrinfo_t* thread )
|
||||
thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_cp = bli_obj_datatype( *c );
|
||||
|
||||
@@ -96,7 +96,7 @@ void bli_packm_unb_var1( obj_t* c,
|
||||
// function pointer.
|
||||
f = ftypes[dt_cp];
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
if( bli_thread_am_ochief( thread ) ) {
|
||||
// Invoke the function.
|
||||
f
|
||||
(
|
||||
|
||||
@@ -35,7 +35,7 @@
|
||||
void bli_packm_unb_var1( obj_t* c,
|
||||
obj_t* p,
|
||||
cntx_t* cntx,
|
||||
packm_thrinfo_t* thread );
|
||||
thrinfo_t* thread );
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
|
||||
@@ -39,7 +39,7 @@ struct scalm_s
|
||||
};
|
||||
typedef struct scalm_s scalm_t;
|
||||
|
||||
#define cntl_sub_scalm( cntl ) cntl->sub_scalm
|
||||
#define bli_cntl_sub_scalm( cntl ) cntl->sub_scalm
|
||||
|
||||
void bli_scalm_cntl_init( void );
|
||||
void bli_scalm_cntl_finalize( void );
|
||||
|
||||
@@ -64,7 +64,7 @@ void bli_scalm_int( obj_t* alpha,
|
||||
bli_scalm_check( alpha, x );
|
||||
|
||||
// First check if we are to skip this operation.
|
||||
if ( cntl_is_noop( cntl ) ) return;
|
||||
if ( bli_cntl_is_noop( cntl ) ) return;
|
||||
|
||||
// Return early if both alpha and the scalar attached to x are unit.
|
||||
if ( bli_obj_equals( alpha, &BLIS_ONE ) &&
|
||||
@@ -85,8 +85,8 @@ void bli_scalm_int( obj_t* alpha,
|
||||
//}
|
||||
|
||||
// Extract the variant number and implementation type.
|
||||
n = cntl_var_num( cntl );
|
||||
i = cntl_impl_type( cntl );
|
||||
n = bli_cntl_var_num( cntl );
|
||||
i = bli_cntl_impl_type( cntl );
|
||||
|
||||
// Index into the variant array to extract the correct function pointer.
|
||||
f = vars[n][i];
|
||||
|
||||
@@ -40,13 +40,13 @@ struct unpackm_s
|
||||
};
|
||||
typedef struct unpackm_s unpackm_t;
|
||||
|
||||
#define cntl_sub_unpackm( cntl ) cntl->sub_unpackm
|
||||
#define cntl_sub_unpackm_a( cntl ) cntl->sub_unpackm_a
|
||||
#define cntl_sub_unpackm_a11( cntl ) cntl->sub_unpackm_a11
|
||||
#define cntl_sub_unpackm_b( cntl ) cntl->sub_unpackm_b
|
||||
#define cntl_sub_unpackm_b11( cntl ) cntl->sub_unpackm_b11
|
||||
#define cntl_sub_unpackm_c( cntl ) cntl->sub_unpackm_c
|
||||
#define cntl_sub_unpackm_c11( cntl ) cntl->sub_unpackm_c11
|
||||
#define bli_cntl_sub_unpackm( cntl ) cntl->sub_unpackm
|
||||
#define bli_cntl_sub_unpackm_a( cntl ) cntl->sub_unpackm_a
|
||||
#define bli_cntl_sub_unpackm_a11( cntl ) cntl->sub_unpackm_a11
|
||||
#define bli_cntl_sub_unpackm_b( cntl ) cntl->sub_unpackm_b
|
||||
#define bli_cntl_sub_unpackm_b11( cntl ) cntl->sub_unpackm_b11
|
||||
#define bli_cntl_sub_unpackm_c( cntl ) cntl->sub_unpackm_c
|
||||
#define bli_cntl_sub_unpackm_c11( cntl ) cntl->sub_unpackm_c11
|
||||
|
||||
void bli_unpackm_cntl_init( void );
|
||||
void bli_unpackm_cntl_finalize( void );
|
||||
|
||||
@@ -52,7 +52,7 @@ void bli_unpackm_int( obj_t* p,
|
||||
obj_t* a,
|
||||
cntx_t* cntx,
|
||||
unpackm_t* cntl,
|
||||
packm_thrinfo_t* thread )
|
||||
thrinfo_t* thread )
|
||||
{
|
||||
// The unpackm operation consists of an optional post-process: castm.
|
||||
// (This post-process is analogous to the castm pre-process in packm.)
|
||||
@@ -77,7 +77,7 @@ void bli_unpackm_int( obj_t* p,
|
||||
|
||||
// First check if we are to skip this operation because the control tree
|
||||
// is NULL, and if so, simply return.
|
||||
if ( cntl_is_noop( cntl ) )
|
||||
if ( bli_cntl_is_noop( cntl ) )
|
||||
{
|
||||
return;
|
||||
}
|
||||
@@ -118,20 +118,20 @@ void bli_unpackm_int( obj_t* p,
|
||||
// Now we are ready to proceed with the unpacking.
|
||||
|
||||
// Extract the variant number and implementation type.
|
||||
n = cntl_var_num( cntl );
|
||||
i = cntl_impl_type( cntl );
|
||||
n = bli_cntl_var_num( cntl );
|
||||
i = bli_cntl_impl_type( cntl );
|
||||
|
||||
// Index into the variant array to extract the correct function pointer.
|
||||
f = vars[n][i];
|
||||
|
||||
// Invoke the variant.
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
if( bli_thread_am_ochief( thread ) ) {
|
||||
f( p,
|
||||
&c,
|
||||
cntx,
|
||||
cntl );
|
||||
}
|
||||
thread_obarrier( thread );
|
||||
bli_thread_obarrier( thread );
|
||||
|
||||
// Now, if necessary, we cast the contents of c to matrix a. If casting
|
||||
// was not necessary, then we are done because the call to the unpackm
|
||||
|
||||
@@ -36,7 +36,7 @@ void bli_unpackm_int( obj_t* p,
|
||||
obj_t* a,
|
||||
cntx_t* cntx,
|
||||
unpackm_t* cntl,
|
||||
packm_thrinfo_t* thread );
|
||||
thrinfo_t* thread );
|
||||
|
||||
/*
|
||||
void bli_unpackm_init_cast( obj_t* p,
|
||||
|
||||
@@ -61,7 +61,7 @@ void bli_gemv_blk_var1( obj_t* alpha,
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( i, m_trans, a,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for A1 and y1.
|
||||
bli_acquire_mpart_t2b( BLIS_SUBPART1,
|
||||
@@ -71,16 +71,16 @@ void bli_gemv_blk_var1( obj_t* alpha,
|
||||
|
||||
// Initialize objects for packing A1 and y1 (if needed).
|
||||
bli_packm_init( &a1, &a1_pack,
|
||||
cntx, cntl_sub_packm_a( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_a( cntl ) );
|
||||
bli_packv_init( &y1, &y1_pack,
|
||||
cntx, cntl_sub_packv_y( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_y( cntl ) );
|
||||
|
||||
// Copy/pack A1, y1 (if needed).
|
||||
bli_packm_int( &a1, &a1_pack,
|
||||
cntx, cntl_sub_packm_a( cntl ),
|
||||
cntx, bli_cntl_sub_packm_a( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &y1, &y1_pack,
|
||||
cntx, cntl_sub_packv_y( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_y( cntl ) );
|
||||
|
||||
// y1 = beta * y1 + alpha * A1 * x;
|
||||
bli_gemv_int( BLIS_NO_TRANSPOSE,
|
||||
@@ -91,16 +91,16 @@ void bli_gemv_blk_var1( obj_t* alpha,
|
||||
beta,
|
||||
&y1_pack,
|
||||
cntx,
|
||||
cntl_sub_gemv( cntl ) );
|
||||
bli_cntl_sub_gemv( cntl ) );
|
||||
|
||||
// Copy/unpack y1 (if y1 was packed).
|
||||
bli_unpackv_int( &y1_pack, &y1,
|
||||
cntx, cntl_sub_unpackv_y( cntl ) );
|
||||
cntx, bli_cntl_sub_unpackv_y( cntl ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_packm_release( &a1_pack, cntl_sub_packm_a( cntl ) );
|
||||
bli_packv_release( &y1_pack, cntl_sub_packv_y( cntl ) );
|
||||
bli_packm_release( &a1_pack, bli_cntl_sub_packm_a( cntl ) );
|
||||
bli_packv_release( &y1_pack, bli_cntl_sub_packv_y( cntl ) );
|
||||
}
|
||||
|
||||
|
||||
@@ -59,14 +59,14 @@ void bli_gemv_blk_var2( obj_t* alpha,
|
||||
// y = beta * y;
|
||||
bli_scalv_int( beta,
|
||||
y,
|
||||
cntx, cntl_sub_scalv( cntl ) );
|
||||
cntx, bli_cntl_sub_scalv( cntl ) );
|
||||
|
||||
// Partition along the "k" dimension (n dimension of A).
|
||||
for ( i = 0; i < n_trans; i += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( i, n_trans, a,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for A1 and x1.
|
||||
bli_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
@@ -76,16 +76,16 @@ void bli_gemv_blk_var2( obj_t* alpha,
|
||||
|
||||
// Initialize objects for packing A1 and x1 (if needed).
|
||||
bli_packm_init( &a1, &a1_pack,
|
||||
cntx, cntl_sub_packm_a( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_a( cntl ) );
|
||||
bli_packv_init( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x( cntl ) );
|
||||
|
||||
// Copy/pack A1, x1 (if needed).
|
||||
bli_packm_int( &a1, &a1_pack,
|
||||
cntx, cntl_sub_packm_a( cntl ),
|
||||
cntx, bli_cntl_sub_packm_a( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x( cntl ) );
|
||||
|
||||
// y = y + alpha * A1 * x1;
|
||||
bli_gemv_int( BLIS_NO_TRANSPOSE,
|
||||
@@ -96,12 +96,12 @@ void bli_gemv_blk_var2( obj_t* alpha,
|
||||
&BLIS_ONE,
|
||||
y,
|
||||
cntx,
|
||||
cntl_sub_gemv( cntl ) );
|
||||
bli_cntl_sub_gemv( cntl ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_packm_release( &a1_pack, cntl_sub_packm_a( cntl ) );
|
||||
bli_packv_release( &x1_pack, cntl_sub_packv_x( cntl ) );
|
||||
bli_packm_release( &a1_pack, bli_cntl_sub_packm_a( cntl ) );
|
||||
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x( cntl ) );
|
||||
}
|
||||
|
||||
|
||||
@@ -46,13 +46,13 @@ struct gemv_s
|
||||
};
|
||||
typedef struct gemv_s gemv_t;
|
||||
|
||||
#define cntl_sub_gemv( cntl ) cntl->sub_gemv
|
||||
#define cntl_sub_gemv_rp( cntl ) cntl->sub_gemv_rp
|
||||
#define cntl_sub_gemv_cp( cntl ) cntl->sub_gemv_cp
|
||||
#define cntl_sub_gemv_n_rp( cntl ) cntl->sub_gemv_n_rp
|
||||
#define cntl_sub_gemv_n_cp( cntl ) cntl->sub_gemv_n_cp
|
||||
#define cntl_sub_gemv_t_rp( cntl ) cntl->sub_gemv_t_rp
|
||||
#define cntl_sub_gemv_t_cp( cntl ) cntl->sub_gemv_t_cp
|
||||
#define bli_cntl_sub_gemv( cntl ) cntl->sub_gemv
|
||||
#define bli_cntl_sub_gemv_rp( cntl ) cntl->sub_gemv_rp
|
||||
#define bli_cntl_sub_gemv_cp( cntl ) cntl->sub_gemv_cp
|
||||
#define bli_cntl_sub_gemv_n_rp( cntl ) cntl->sub_gemv_n_rp
|
||||
#define bli_cntl_sub_gemv_n_cp( cntl ) cntl->sub_gemv_n_cp
|
||||
#define bli_cntl_sub_gemv_t_rp( cntl ) cntl->sub_gemv_t_rp
|
||||
#define bli_cntl_sub_gemv_t_cp( cntl ) cntl->sub_gemv_t_cp
|
||||
|
||||
void bli_gemv_cntl_init( void );
|
||||
void bli_gemv_cntl_finalize( void );
|
||||
|
||||
@@ -88,8 +88,8 @@ void bli_gemv_int( trans_t transa,
|
||||
}
|
||||
|
||||
// Extract the variant number and implementation type.
|
||||
n = cntl_var_num( cntl );
|
||||
i = cntl_impl_type( cntl );
|
||||
n = bli_cntl_var_num( cntl );
|
||||
i = bli_cntl_impl_type( cntl );
|
||||
|
||||
// Index into the variant array to extract the correct function pointer.
|
||||
f = vars[n][i];
|
||||
|
||||
@@ -60,7 +60,7 @@ void bli_ger_blk_var1( obj_t* alpha,
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( i, m_trans, a,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for A1 and x1.
|
||||
bli_acquire_mpart_t2b( BLIS_SUBPART1,
|
||||
@@ -70,16 +70,16 @@ void bli_ger_blk_var1( obj_t* alpha,
|
||||
|
||||
// Initialize objects for packing A1 and x1 (if needed).
|
||||
bli_packm_init( &a1, &a1_pack,
|
||||
cntx, cntl_sub_packm_a( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_a( cntl ) );
|
||||
bli_packv_init( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x( cntl ) );
|
||||
|
||||
// Copy/pack A1, x1 (if needed).
|
||||
bli_packm_int( &a1, &a1_pack,
|
||||
cntx, cntl_sub_packm_a( cntl ),
|
||||
cntx, bli_cntl_sub_packm_a( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x( cntl ) );
|
||||
|
||||
// A1 = A1 + alpha * x1 * y;
|
||||
bli_ger_int( BLIS_NO_CONJUGATE,
|
||||
@@ -89,17 +89,17 @@ void bli_ger_blk_var1( obj_t* alpha,
|
||||
y,
|
||||
&a1_pack,
|
||||
cntx,
|
||||
cntl_sub_ger( cntl ) );
|
||||
bli_cntl_sub_ger( cntl ) );
|
||||
|
||||
// Copy/unpack A1 (if A1 was packed).
|
||||
bli_unpackm_int( &a1_pack, &a1,
|
||||
cntx, cntl_sub_unpackm_a( cntl ),
|
||||
cntx, bli_cntl_sub_unpackm_a( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_packm_release( &a1_pack, cntl_sub_packm_a( cntl ) );
|
||||
bli_packv_release( &x1_pack, cntl_sub_packv_x( cntl ) );
|
||||
bli_packm_release( &a1_pack, bli_cntl_sub_packm_a( cntl ) );
|
||||
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x( cntl ) );
|
||||
}
|
||||
|
||||
|
||||
@@ -60,7 +60,7 @@ void bli_ger_blk_var2( obj_t* alpha,
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( i, n_trans, a,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for A1 and y1.
|
||||
bli_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
@@ -70,16 +70,16 @@ void bli_ger_blk_var2( obj_t* alpha,
|
||||
|
||||
// Initialize objects for packing A1 and y1 (if needed).
|
||||
bli_packm_init( &a1, &a1_pack,
|
||||
cntx, cntl_sub_packm_a( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_a( cntl ) );
|
||||
bli_packv_init( &y1, &y1_pack,
|
||||
cntx, cntl_sub_packv_y( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_y( cntl ) );
|
||||
|
||||
// Copy/pack A1, y1 (if needed).
|
||||
bli_packm_int( &a1, &a1_pack,
|
||||
cntx, cntl_sub_packm_a( cntl ),
|
||||
cntx, bli_cntl_sub_packm_a( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &y1, &y1_pack,
|
||||
cntx, cntl_sub_packv_y( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_y( cntl ) );
|
||||
|
||||
// A1 = A1 + alpha * x * y1;
|
||||
bli_ger_int( BLIS_NO_CONJUGATE,
|
||||
@@ -89,17 +89,17 @@ void bli_ger_blk_var2( obj_t* alpha,
|
||||
&y1_pack,
|
||||
&a1_pack,
|
||||
cntx,
|
||||
cntl_sub_ger( cntl ) );
|
||||
bli_cntl_sub_ger( cntl ) );
|
||||
|
||||
// Copy/unpack A1 (if A1 was packed).
|
||||
bli_unpackm_int( &a1_pack, &a1,
|
||||
cntx, cntl_sub_unpackm_a( cntl ),
|
||||
cntx, bli_cntl_sub_unpackm_a( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_packm_release( &a1_pack, cntl_sub_packm_a( cntl ) );
|
||||
bli_packv_release( &y1_pack, cntl_sub_packv_y( cntl ) );
|
||||
bli_packm_release( &a1_pack, bli_cntl_sub_packm_a( cntl ) );
|
||||
bli_packv_release( &y1_pack, bli_cntl_sub_packv_y( cntl ) );
|
||||
}
|
||||
|
||||
|
||||
@@ -45,9 +45,9 @@ struct ger_s
|
||||
};
|
||||
typedef struct ger_s ger_t;
|
||||
|
||||
#define cntl_sub_ger( cntl ) cntl->sub_ger
|
||||
#define cntl_sub_ger_rp( cntl ) cntl->sub_ger_rp
|
||||
#define cntl_sub_ger_cp( cntl ) cntl->sub_ger_cp
|
||||
#define bli_cntl_sub_ger( cntl ) cntl->sub_ger
|
||||
#define bli_cntl_sub_ger_rp( cntl ) cntl->sub_ger_rp
|
||||
#define bli_cntl_sub_ger_cp( cntl ) cntl->sub_ger_cp
|
||||
|
||||
void bli_ger_cntl_init( void );
|
||||
void bli_ger_cntl_finalize( void );
|
||||
|
||||
@@ -107,15 +107,15 @@ void bli_ger_int( conj_t conjx,
|
||||
// If we are about to call a leaf-level implementation, and matrix A
|
||||
// still needs a transposition, then we must induce one by swapping the
|
||||
// strides and dimensions.
|
||||
if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( a_local ) )
|
||||
if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( a_local ) )
|
||||
{
|
||||
bli_obj_induce_trans( a_local );
|
||||
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, a_local );
|
||||
}
|
||||
|
||||
// Extract the variant number and implementation type.
|
||||
n = cntl_var_num( cntl );
|
||||
i = cntl_impl_type( cntl );
|
||||
n = bli_cntl_var_num( cntl );
|
||||
i = bli_cntl_impl_type( cntl );
|
||||
|
||||
// Index into the variant array to extract the correct function pointer.
|
||||
f = vars[n][i];
|
||||
|
||||
@@ -74,14 +74,14 @@ void bli_hemv_blk_var1( conj_t conjh,
|
||||
// y = beta * y;
|
||||
bli_scalv_int( beta,
|
||||
y,
|
||||
cntx, cntl_sub_scalv( cntl ) );
|
||||
cntx, bli_cntl_sub_scalv( cntl ) );
|
||||
|
||||
// Partition diagonally.
|
||||
for ( ij = 0; ij < mn; ij += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( ij, mn, a,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for A11, A10, x1, x0, y1, and y0.
|
||||
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
|
||||
@@ -99,20 +99,20 @@ void bli_hemv_blk_var1( conj_t conjh,
|
||||
|
||||
// Initialize objects for packing A11, x1, and y1 (if needed).
|
||||
bli_packm_init( &a11, &a11_pack,
|
||||
cntx, cntl_sub_packm_a11( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_init( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_init( &y1, &y1_pack,
|
||||
cntx, cntl_sub_packv_y1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_y1( cntl ) );
|
||||
|
||||
// Copy/pack A11, x1, y1 (if needed).
|
||||
bli_packm_int( &a11, &a11_pack,
|
||||
cntx, cntl_sub_packm_a11( cntl ),
|
||||
cntx, bli_cntl_sub_packm_a11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_int( &y1, &y1_pack,
|
||||
cntx, cntl_sub_packv_y1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_y1( cntl ) );
|
||||
|
||||
// y0 = y0 + alpha * A10' * x1;
|
||||
bli_gemv_int( bli_apply_conj( conjh, BLIS_TRANSPOSE ),
|
||||
@@ -123,7 +123,7 @@ void bli_hemv_blk_var1( conj_t conjh,
|
||||
&BLIS_ONE,
|
||||
&y0,
|
||||
cntx,
|
||||
cntl_sub_gemv_t_rp( cntl ) );
|
||||
bli_cntl_sub_gemv_t_rp( cntl ) );
|
||||
|
||||
// y1 = y1 + alpha * A11 * x1;
|
||||
bli_hemv_int( conjh,
|
||||
@@ -133,7 +133,7 @@ void bli_hemv_blk_var1( conj_t conjh,
|
||||
&BLIS_ONE,
|
||||
&y1_pack,
|
||||
cntx,
|
||||
cntl_sub_hemv( cntl ) );
|
||||
bli_cntl_sub_hemv( cntl ) );
|
||||
|
||||
// y1 = y1 + alpha * A10 * x0;
|
||||
bli_gemv_int( BLIS_NO_TRANSPOSE,
|
||||
@@ -144,17 +144,17 @@ void bli_hemv_blk_var1( conj_t conjh,
|
||||
&BLIS_ONE,
|
||||
&y1_pack,
|
||||
cntx,
|
||||
cntl_sub_gemv_n_rp( cntl ) );
|
||||
bli_cntl_sub_gemv_n_rp( cntl ) );
|
||||
|
||||
// Copy/unpack y1 (if y1 was packed).
|
||||
bli_unpackv_int( &y1_pack, &y1,
|
||||
cntx, cntl_sub_unpackv_y1( cntl ) );
|
||||
cntx, bli_cntl_sub_unpackv_y1( cntl ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_packm_release( &a11_pack, cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_release( &y1_pack, cntl_sub_packv_y1( cntl ) );
|
||||
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) );
|
||||
}
|
||||
|
||||
|
||||
@@ -75,14 +75,14 @@ void bli_hemv_blk_var2( conj_t conjh,
|
||||
// y = beta * y;
|
||||
bli_scalv_int( beta,
|
||||
y,
|
||||
cntx, cntl_sub_scalv( cntl ) );
|
||||
cntx, bli_cntl_sub_scalv( cntl ) );
|
||||
|
||||
// Partition diagonally.
|
||||
for ( ij = 0; ij < mn; ij += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( ij, mn, a,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for A11, A10, A21, x1, x0, x2, y1, and y0.
|
||||
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
|
||||
@@ -102,20 +102,20 @@ void bli_hemv_blk_var2( conj_t conjh,
|
||||
|
||||
// Initialize objects for packing A11, x1, and y1 (if needed).
|
||||
bli_packm_init( &a11, &a11_pack,
|
||||
cntx, cntl_sub_packm_a11( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_init( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_init( &y1, &y1_pack,
|
||||
cntx, cntl_sub_packv_y1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_y1( cntl ) );
|
||||
|
||||
// Copy/pack A11, x1, y1 (if needed).
|
||||
bli_packm_int( &a11, &a11_pack,
|
||||
cntx, cntl_sub_packm_a11( cntl ),
|
||||
cntx, bli_cntl_sub_packm_a11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_int( &y1, &y1_pack,
|
||||
cntx, cntl_sub_packv_y1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_y1( cntl ) );
|
||||
|
||||
// y1 = y1 + alpha * A10 * x0;
|
||||
bli_gemv_int( BLIS_NO_TRANSPOSE,
|
||||
@@ -126,7 +126,7 @@ void bli_hemv_blk_var2( conj_t conjh,
|
||||
&BLIS_ONE,
|
||||
&y1_pack,
|
||||
cntx,
|
||||
cntl_sub_gemv_n_rp( cntl ) );
|
||||
bli_cntl_sub_gemv_n_rp( cntl ) );
|
||||
|
||||
// y1 = y1 + alpha * A11 * x1;
|
||||
bli_hemv_int( conjh,
|
||||
@@ -136,7 +136,7 @@ void bli_hemv_blk_var2( conj_t conjh,
|
||||
&BLIS_ONE,
|
||||
&y1_pack,
|
||||
cntx,
|
||||
cntl_sub_hemv( cntl ) );
|
||||
bli_cntl_sub_hemv( cntl ) );
|
||||
|
||||
// y1 = y1 + alpha * A21' * x2;
|
||||
bli_gemv_int( bli_apply_conj( conjh, BLIS_TRANSPOSE ),
|
||||
@@ -147,17 +147,17 @@ void bli_hemv_blk_var2( conj_t conjh,
|
||||
&BLIS_ONE,
|
||||
&y1_pack,
|
||||
cntx,
|
||||
cntl_sub_gemv_t_cp( cntl ) );
|
||||
bli_cntl_sub_gemv_t_cp( cntl ) );
|
||||
|
||||
// Copy/unpack y1 (if y1 was packed).
|
||||
bli_unpackv_int( &y1_pack, &y1,
|
||||
cntx, cntl_sub_unpackv_y1( cntl ) );
|
||||
cntx, bli_cntl_sub_unpackv_y1( cntl ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_packm_release( &a11_pack, cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_release( &y1_pack, cntl_sub_packv_y1( cntl ) );
|
||||
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) );
|
||||
}
|
||||
|
||||
|
||||
@@ -74,14 +74,14 @@ void bli_hemv_blk_var3( conj_t conjh,
|
||||
// y = beta * y;
|
||||
bli_scalv_int( beta,
|
||||
y,
|
||||
cntx, cntl_sub_scalv( cntl ) );
|
||||
cntx, bli_cntl_sub_scalv( cntl ) );
|
||||
|
||||
// Partition diagonally.
|
||||
for ( ij = 0; ij < mn; ij += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( ij, mn, a,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for A11, A21, x1, x2, y1, and y2.
|
||||
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
|
||||
@@ -99,20 +99,20 @@ void bli_hemv_blk_var3( conj_t conjh,
|
||||
|
||||
// Initialize objects for packing A11, x1, and y1 (if needed).
|
||||
bli_packm_init( &a11, &a11_pack,
|
||||
cntx, cntl_sub_packm_a11( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_init( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_init( &y1, &y1_pack,
|
||||
cntx, cntl_sub_packv_y1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_y1( cntl ) );
|
||||
|
||||
// Copy/pack A11, x1, y1 (if needed).
|
||||
bli_packm_int( &a11, &a11_pack,
|
||||
cntx, cntl_sub_packm_a11( cntl ),
|
||||
cntx, bli_cntl_sub_packm_a11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_int( &y1, &y1_pack,
|
||||
cntx, cntl_sub_packv_y1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_y1( cntl ) );
|
||||
|
||||
// y1 = y1 + alpha * A21' * x2;
|
||||
bli_gemv_int( bli_apply_conj( conjh, BLIS_TRANSPOSE ),
|
||||
@@ -123,7 +123,7 @@ void bli_hemv_blk_var3( conj_t conjh,
|
||||
&BLIS_ONE,
|
||||
&y1_pack,
|
||||
cntx,
|
||||
cntl_sub_gemv_t_cp( cntl ) );
|
||||
bli_cntl_sub_gemv_t_cp( cntl ) );
|
||||
|
||||
// y1 = y1 + alpha * A11 * x1;
|
||||
bli_hemv_int( conjh,
|
||||
@@ -133,7 +133,7 @@ void bli_hemv_blk_var3( conj_t conjh,
|
||||
&BLIS_ONE,
|
||||
&y1_pack,
|
||||
cntx,
|
||||
cntl_sub_hemv( cntl ) );
|
||||
bli_cntl_sub_hemv( cntl ) );
|
||||
|
||||
// y2 = y2 + alpha * A21 * x1;
|
||||
bli_gemv_int( BLIS_NO_TRANSPOSE,
|
||||
@@ -144,17 +144,17 @@ void bli_hemv_blk_var3( conj_t conjh,
|
||||
&BLIS_ONE,
|
||||
&y2,
|
||||
cntx,
|
||||
cntl_sub_gemv_n_cp( cntl ) );
|
||||
bli_cntl_sub_gemv_n_cp( cntl ) );
|
||||
|
||||
// Copy/unpack y1 (if y1 was packed).
|
||||
bli_unpackv_int( &y1_pack, &y1,
|
||||
cntx, cntl_sub_unpackv_y1( cntl ) );
|
||||
cntx, bli_cntl_sub_unpackv_y1( cntl ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_packm_release( &a11_pack, cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_release( &y1_pack, cntl_sub_packv_y1( cntl ) );
|
||||
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) );
|
||||
}
|
||||
|
||||
|
||||
@@ -75,14 +75,14 @@ void bli_hemv_blk_var4( conj_t conjh,
|
||||
// y = beta * y;
|
||||
bli_scalv_int( beta,
|
||||
y,
|
||||
cntx, cntl_sub_scalv( cntl ) );
|
||||
cntx, bli_cntl_sub_scalv( cntl ) );
|
||||
|
||||
// Partition diagonally.
|
||||
for ( ij = 0; ij < mn; ij += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( ij, mn, a,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for A11, A10, A21, x1, y1, y0, and y2.
|
||||
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
|
||||
@@ -102,20 +102,20 @@ void bli_hemv_blk_var4( conj_t conjh,
|
||||
|
||||
// Initialize objects for packing A11, x1, and y1 (if needed).
|
||||
bli_packm_init( &a11, &a11_pack,
|
||||
cntx, cntl_sub_packm_a11( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_init( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_init( &y1, &y1_pack,
|
||||
cntx, cntl_sub_packv_y1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_y1( cntl ) );
|
||||
|
||||
// Copy/pack A11, x1, y1 (if needed).
|
||||
bli_packm_int( &a11, &a11_pack,
|
||||
cntx, cntl_sub_packm_a11( cntl ),
|
||||
cntx, bli_cntl_sub_packm_a11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_int( &y1, &y1_pack,
|
||||
cntx, cntl_sub_packv_y1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_y1( cntl ) );
|
||||
|
||||
// y0 = y0 + alpha * A10' * x1;
|
||||
bli_gemv_int( bli_apply_conj( conjh, BLIS_TRANSPOSE ),
|
||||
@@ -126,7 +126,7 @@ void bli_hemv_blk_var4( conj_t conjh,
|
||||
&BLIS_ONE,
|
||||
&y0,
|
||||
cntx,
|
||||
cntl_sub_gemv_t_rp( cntl ) );
|
||||
bli_cntl_sub_gemv_t_rp( cntl ) );
|
||||
|
||||
// y1 = y1 + alpha * A11 * x1;
|
||||
bli_hemv_int( conjh,
|
||||
@@ -136,7 +136,7 @@ void bli_hemv_blk_var4( conj_t conjh,
|
||||
&BLIS_ONE,
|
||||
&y1_pack,
|
||||
cntx,
|
||||
cntl_sub_hemv( cntl ) );
|
||||
bli_cntl_sub_hemv( cntl ) );
|
||||
|
||||
// y2 = y2 + alpha * A21 * x1;
|
||||
bli_gemv_int( BLIS_NO_TRANSPOSE,
|
||||
@@ -147,17 +147,17 @@ void bli_hemv_blk_var4( conj_t conjh,
|
||||
&BLIS_ONE,
|
||||
&y2,
|
||||
cntx,
|
||||
cntl_sub_gemv_n_cp( cntl ) );
|
||||
bli_cntl_sub_gemv_n_cp( cntl ) );
|
||||
|
||||
// Copy/unpack y1 (if y1 was packed).
|
||||
bli_unpackv_int( &y1_pack, &y1,
|
||||
cntx, cntl_sub_unpackv_y1( cntl ) );
|
||||
cntx, bli_cntl_sub_unpackv_y1( cntl ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_packm_release( &a11_pack, cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_release( &y1_pack, cntl_sub_packv_y1( cntl ) );
|
||||
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) );
|
||||
}
|
||||
|
||||
|
||||
@@ -50,7 +50,7 @@ struct hemv_s
|
||||
};
|
||||
typedef struct hemv_s hemv_t;
|
||||
|
||||
#define cntl_sub_hemv( cntl ) cntl->sub_hemv
|
||||
#define bli_cntl_sub_hemv( cntl ) cntl->sub_hemv
|
||||
|
||||
void bli_hemv_cntl_init( void );
|
||||
void bli_hemv_cntl_finalize( void );
|
||||
|
||||
@@ -96,7 +96,7 @@ void bli_hemv_int( conj_t conjh,
|
||||
// triangular case. But we only need to do this for blocked algorithms,
|
||||
// since unblocked algorithms are responsible for handling the upper case
|
||||
// explicitly (and they should not be inspecting the transposition bit anyway).
|
||||
if ( cntl_is_blocked( cntl ) && bli_obj_is_upper( *a ) )
|
||||
if ( bli_cntl_is_blocked( cntl ) && bli_obj_is_upper( *a ) )
|
||||
{
|
||||
bli_obj_toggle_conj( a_local );
|
||||
bli_obj_toggle_trans( a_local );
|
||||
@@ -104,8 +104,8 @@ void bli_hemv_int( conj_t conjh,
|
||||
*/
|
||||
|
||||
// Extract the variant number and implementation type.
|
||||
n = cntl_var_num( cntl );
|
||||
i = cntl_impl_type( cntl );
|
||||
n = bli_cntl_var_num( cntl );
|
||||
i = bli_cntl_impl_type( cntl );
|
||||
|
||||
// Index into the variant array to extract the correct function pointer.
|
||||
f = vars[n][i];
|
||||
|
||||
@@ -71,7 +71,7 @@ void bli_her_blk_var1( conj_t conjh,
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( ij, mn, c,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for C11, C10, x1, and x0.
|
||||
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
|
||||
@@ -85,16 +85,16 @@ void bli_her_blk_var1( conj_t conjh,
|
||||
|
||||
// Initialize objects for packing C11 and x1 (if needed).
|
||||
bli_packm_init( &c11, &c11_pack,
|
||||
cntx, cntl_sub_packm_c11( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_c11( cntl ) );
|
||||
bli_packv_init( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
// Copy/pack C11, x1 (if needed).
|
||||
bli_packm_int( &c11, &c11_pack,
|
||||
cntx, cntl_sub_packm_c11( cntl ),
|
||||
cntx, bli_cntl_sub_packm_c11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
// C10 = C10 + alpha * x1 * x0';
|
||||
bli_ger_int( BLIS_NO_CONJUGATE,
|
||||
@@ -104,7 +104,7 @@ void bli_her_blk_var1( conj_t conjh,
|
||||
&x0,
|
||||
&c10,
|
||||
cntx,
|
||||
cntl_sub_ger( cntl ) );
|
||||
bli_cntl_sub_ger( cntl ) );
|
||||
|
||||
// C11 = C11 + alpha * x1 * x1';
|
||||
bli_her_int( conjh,
|
||||
@@ -112,17 +112,17 @@ void bli_her_blk_var1( conj_t conjh,
|
||||
&x1_pack,
|
||||
&c11_pack,
|
||||
cntx,
|
||||
cntl_sub_her( cntl ) );
|
||||
bli_cntl_sub_her( cntl ) );
|
||||
|
||||
// Copy/unpack C11 (if C11 was packed).
|
||||
bli_unpackm_int( &c11_pack, &c11,
|
||||
cntx, cntl_sub_unpackm_c11( cntl ),
|
||||
cntx, bli_cntl_sub_unpackm_c11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_packm_release( &c11_pack, cntl_sub_packm_c11( cntl ) );
|
||||
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
|
||||
bli_packm_release( &c11_pack, bli_cntl_sub_packm_c11( cntl ) );
|
||||
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
|
||||
}
|
||||
|
||||
|
||||
@@ -71,7 +71,7 @@ void bli_her_blk_var2( conj_t conjh,
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( ij, mn, c,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for C11, C21, x1, and x2.
|
||||
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
|
||||
@@ -85,16 +85,16 @@ void bli_her_blk_var2( conj_t conjh,
|
||||
|
||||
// Initialize objects for packing C11 and x1 (if needed).
|
||||
bli_packm_init( &c11, &c11_pack,
|
||||
cntx, cntl_sub_packm_c11( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_c11( cntl ) );
|
||||
bli_packv_init( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
// Copy/pack C11, x1 (if needed).
|
||||
bli_packm_int( &c11, &c11_pack,
|
||||
cntx, cntl_sub_packm_c11( cntl ),
|
||||
cntx, bli_cntl_sub_packm_c11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
// C21 = C21 + alpha * x2 * x1';
|
||||
bli_ger_int( BLIS_NO_CONJUGATE,
|
||||
@@ -104,7 +104,7 @@ void bli_her_blk_var2( conj_t conjh,
|
||||
&x1_pack,
|
||||
&c21,
|
||||
cntx,
|
||||
cntl_sub_ger( cntl ) );
|
||||
bli_cntl_sub_ger( cntl ) );
|
||||
|
||||
// C11 = C11 + alpha * x1 * x1';
|
||||
bli_her_int( conjh,
|
||||
@@ -112,17 +112,17 @@ void bli_her_blk_var2( conj_t conjh,
|
||||
&x1_pack,
|
||||
&c11_pack,
|
||||
cntx,
|
||||
cntl_sub_her( cntl ) );
|
||||
bli_cntl_sub_her( cntl ) );
|
||||
|
||||
// Copy/unpack C11 (if C11 was packed).
|
||||
bli_unpackm_int( &c11_pack, &c11,
|
||||
cntx, cntl_sub_unpackm_c11( cntl ),
|
||||
cntx, bli_cntl_sub_unpackm_c11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_packm_release( &c11_pack, cntl_sub_packm_c11( cntl ) );
|
||||
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
|
||||
bli_packm_release( &c11_pack, bli_cntl_sub_packm_c11( cntl ) );
|
||||
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
|
||||
}
|
||||
|
||||
|
||||
@@ -45,7 +45,7 @@ struct her_s
|
||||
};
|
||||
typedef struct her_s her_t;
|
||||
|
||||
#define cntl_sub_her( cntl ) cntl->sub_her
|
||||
#define bli_cntl_sub_her( cntl ) cntl->sub_her
|
||||
|
||||
void bli_her_cntl_init( void );
|
||||
void bli_her_cntl_finalize( void );
|
||||
|
||||
@@ -92,8 +92,8 @@ void bli_her_int( conj_t conjh,
|
||||
}
|
||||
|
||||
// Extract the variant number and implementation type.
|
||||
n = cntl_var_num( cntl );
|
||||
i = cntl_impl_type( cntl );
|
||||
n = bli_cntl_var_num( cntl );
|
||||
i = bli_cntl_impl_type( cntl );
|
||||
|
||||
// Index into the variant array to extract the correct function pointer.
|
||||
f = vars[n][i];
|
||||
|
||||
@@ -76,7 +76,7 @@ void bli_her2_blk_var1( conj_t conjh,
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( ij, mn, c,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for C11, C10, x1, x0, y1, and y0.
|
||||
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
|
||||
@@ -94,20 +94,20 @@ void bli_her2_blk_var1( conj_t conjh,
|
||||
|
||||
// Initialize objects for packing C11, x1, and y1 (if needed).
|
||||
bli_packm_init( &c11, &c11_pack,
|
||||
cntx, cntl_sub_packm_c11( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_c11( cntl ) );
|
||||
bli_packv_init( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_init( &y1, &y1_pack,
|
||||
cntx, cntl_sub_packv_y1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_y1( cntl ) );
|
||||
|
||||
// Copy/pack C11, x1, y1 (if needed).
|
||||
bli_packm_int( &c11, &c11_pack,
|
||||
cntx, cntl_sub_packm_c11( cntl ),
|
||||
cntx, bli_cntl_sub_packm_c11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_int( &y1, &y1_pack,
|
||||
cntx, cntl_sub_packv_y1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_y1( cntl ) );
|
||||
|
||||
// C10 = C10 + alpha * x1 * y0';
|
||||
bli_ger_int( BLIS_NO_CONJUGATE,
|
||||
@@ -117,7 +117,7 @@ void bli_her2_blk_var1( conj_t conjh,
|
||||
&y0,
|
||||
&c10,
|
||||
cntx,
|
||||
cntl_sub_ger_rp( cntl ) );
|
||||
bli_cntl_sub_ger_rp( cntl ) );
|
||||
|
||||
// C10 = C10 + conj(alpha) * y1 * x0';
|
||||
bli_ger_int( BLIS_NO_CONJUGATE,
|
||||
@@ -127,7 +127,7 @@ void bli_her2_blk_var1( conj_t conjh,
|
||||
&x0,
|
||||
&c10,
|
||||
cntx,
|
||||
cntl_sub_ger_rp( cntl ) );
|
||||
bli_cntl_sub_ger_rp( cntl ) );
|
||||
|
||||
// C11 = C11 + alpha * x1 * y1' + conj(alpha) * y1 * x1';
|
||||
bli_her2_int( conjh,
|
||||
@@ -137,18 +137,18 @@ void bli_her2_blk_var1( conj_t conjh,
|
||||
&y1_pack,
|
||||
&c11_pack,
|
||||
cntx,
|
||||
cntl_sub_her2( cntl ) );
|
||||
bli_cntl_sub_her2( cntl ) );
|
||||
|
||||
// Copy/unpack C11 (if C11 was packed).
|
||||
bli_unpackm_int( &c11_pack, &c11,
|
||||
cntx, cntl_sub_unpackm_c11( cntl ),
|
||||
cntx, bli_cntl_sub_unpackm_c11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_packm_release( &c11_pack, cntl_sub_packm_c11( cntl ) );
|
||||
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_release( &y1_pack, cntl_sub_packv_y1( cntl ) );
|
||||
bli_packm_release( &c11_pack, bli_cntl_sub_packm_c11( cntl ) );
|
||||
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) );
|
||||
}
|
||||
|
||||
|
||||
@@ -77,7 +77,7 @@ void bli_her2_blk_var2( conj_t conjh,
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( ij, mn, c,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for C11, C10, C21, x1, x0, x2, and y1.
|
||||
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
|
||||
@@ -97,20 +97,20 @@ void bli_her2_blk_var2( conj_t conjh,
|
||||
|
||||
// Initialize objects for packing C11, x1, and y1 (if needed).
|
||||
bli_packm_init( &c11, &c11_pack,
|
||||
cntx, cntl_sub_packm_c11( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_c11( cntl ) );
|
||||
bli_packv_init( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_init( &y1, &y1_pack,
|
||||
cntx, cntl_sub_packv_y1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_y1( cntl ) );
|
||||
|
||||
// Copy/pack C11, x1, y1 (if needed).
|
||||
bli_packm_int( &c11, &c11_pack,
|
||||
cntx, cntl_sub_packm_c11( cntl ),
|
||||
cntx, bli_cntl_sub_packm_c11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_int( &y1, &y1_pack,
|
||||
cntx, cntl_sub_packv_y1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_y1( cntl ) );
|
||||
|
||||
// C10 = C10 + conj(alpha) * y1 * x0';
|
||||
bli_ger_int( BLIS_NO_CONJUGATE,
|
||||
@@ -120,7 +120,7 @@ void bli_her2_blk_var2( conj_t conjh,
|
||||
&x0,
|
||||
&c10,
|
||||
cntx,
|
||||
cntl_sub_ger_rp( cntl ) );
|
||||
bli_cntl_sub_ger_rp( cntl ) );
|
||||
|
||||
// C21 = C21 + alpha * x2 * y1';
|
||||
bli_ger_int( BLIS_NO_CONJUGATE,
|
||||
@@ -130,7 +130,7 @@ void bli_her2_blk_var2( conj_t conjh,
|
||||
&y1_pack,
|
||||
&c21,
|
||||
cntx,
|
||||
cntl_sub_ger_cp( cntl ) );
|
||||
bli_cntl_sub_ger_cp( cntl ) );
|
||||
|
||||
// C11 = C11 + alpha * x1 * y1' + conj(alpha) * y1 * x1';
|
||||
bli_her2_int( conjh,
|
||||
@@ -140,18 +140,18 @@ void bli_her2_blk_var2( conj_t conjh,
|
||||
&y1_pack,
|
||||
&c11_pack,
|
||||
cntx,
|
||||
cntl_sub_her2( cntl ) );
|
||||
bli_cntl_sub_her2( cntl ) );
|
||||
|
||||
// Copy/unpack C11 (if C11 was packed).
|
||||
bli_unpackm_int( &c11_pack, &c11,
|
||||
cntx, cntl_sub_unpackm_c11( cntl ),
|
||||
cntx, bli_cntl_sub_unpackm_c11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_packm_release( &c11_pack, cntl_sub_packm_c11( cntl ) );
|
||||
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_release( &y1_pack, cntl_sub_packv_y1( cntl ) );
|
||||
bli_packm_release( &c11_pack, bli_cntl_sub_packm_c11( cntl ) );
|
||||
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) );
|
||||
}
|
||||
|
||||
|
||||
@@ -77,7 +77,7 @@ void bli_her2_blk_var3( conj_t conjh,
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( ij, mn, c,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for C11, C10, C21, x1, y1, y0, and y2.
|
||||
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
|
||||
@@ -97,20 +97,20 @@ void bli_her2_blk_var3( conj_t conjh,
|
||||
|
||||
// Initialize objects for packing C11, x1, and y1 (if needed).
|
||||
bli_packm_init( &c11, &c11_pack,
|
||||
cntx, cntl_sub_packm_c11( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_c11( cntl ) );
|
||||
bli_packv_init( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_init( &y1, &y1_pack,
|
||||
cntx, cntl_sub_packv_y1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_y1( cntl ) );
|
||||
|
||||
// Copy/pack C11, x1, y1 (if needed).
|
||||
bli_packm_int( &c11, &c11_pack,
|
||||
cntx, cntl_sub_packm_c11( cntl ),
|
||||
cntx, bli_cntl_sub_packm_c11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_int( &y1, &y1_pack,
|
||||
cntx, cntl_sub_packv_y1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_y1( cntl ) );
|
||||
|
||||
// C10 = C10 + alpha * x1 * y0';
|
||||
bli_ger_int( BLIS_NO_CONJUGATE,
|
||||
@@ -120,7 +120,7 @@ void bli_her2_blk_var3( conj_t conjh,
|
||||
&y0,
|
||||
&c10,
|
||||
cntx,
|
||||
cntl_sub_ger_rp( cntl ) );
|
||||
bli_cntl_sub_ger_rp( cntl ) );
|
||||
|
||||
// C21 = C21 + conj(alpha) * y2 * x1';
|
||||
bli_ger_int( BLIS_NO_CONJUGATE,
|
||||
@@ -130,7 +130,7 @@ void bli_her2_blk_var3( conj_t conjh,
|
||||
&x1_pack,
|
||||
&c21,
|
||||
cntx,
|
||||
cntl_sub_ger_cp( cntl ) );
|
||||
bli_cntl_sub_ger_cp( cntl ) );
|
||||
|
||||
// C11 = C11 + alpha * x1 * y1' + conj(alpha) * y1 * x1';
|
||||
bli_her2_int( conjh,
|
||||
@@ -140,18 +140,18 @@ void bli_her2_blk_var3( conj_t conjh,
|
||||
&y1_pack,
|
||||
&c11_pack,
|
||||
cntx,
|
||||
cntl_sub_her2( cntl ) );
|
||||
bli_cntl_sub_her2( cntl ) );
|
||||
|
||||
// Copy/unpack C11 (if C11 was packed).
|
||||
bli_unpackm_int( &c11_pack, &c11,
|
||||
cntx, cntl_sub_unpackm_c11( cntl ),
|
||||
cntx, bli_cntl_sub_unpackm_c11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_packm_release( &c11_pack, cntl_sub_packm_c11( cntl ) );
|
||||
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_release( &y1_pack, cntl_sub_packv_y1( cntl ) );
|
||||
bli_packm_release( &c11_pack, bli_cntl_sub_packm_c11( cntl ) );
|
||||
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) );
|
||||
}
|
||||
|
||||
|
||||
@@ -76,7 +76,7 @@ void bli_her2_blk_var4( conj_t conjh,
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( ij, mn, c,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for C11, C21, x1, x2, y1, and y2.
|
||||
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
|
||||
@@ -94,20 +94,20 @@ void bli_her2_blk_var4( conj_t conjh,
|
||||
|
||||
// Initialize objects for packing C11, x1, and y1 (if needed).
|
||||
bli_packm_init( &c11, &c11_pack,
|
||||
cntx, cntl_sub_packm_c11( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_c11( cntl ) );
|
||||
bli_packv_init( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_init( &y1, &y1_pack,
|
||||
cntx, cntl_sub_packv_y1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_y1( cntl ) );
|
||||
|
||||
// Copy/pack C11, x1, y1 (if needed).
|
||||
bli_packm_int( &c11, &c11_pack,
|
||||
cntx, cntl_sub_packm_c11( cntl ),
|
||||
cntx, bli_cntl_sub_packm_c11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_int( &y1, &y1_pack,
|
||||
cntx, cntl_sub_packv_y1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_y1( cntl ) );
|
||||
|
||||
// C21 = C21 + alpha * x2 * y1';
|
||||
bli_ger_int( BLIS_NO_CONJUGATE,
|
||||
@@ -117,7 +117,7 @@ void bli_her2_blk_var4( conj_t conjh,
|
||||
&y1_pack,
|
||||
&c21,
|
||||
cntx,
|
||||
cntl_sub_ger_cp( cntl ) );
|
||||
bli_cntl_sub_ger_cp( cntl ) );
|
||||
|
||||
// C21 = C21 + conj(alpha) * y2 * x1';
|
||||
bli_ger_int( BLIS_NO_CONJUGATE,
|
||||
@@ -127,7 +127,7 @@ void bli_her2_blk_var4( conj_t conjh,
|
||||
&x1_pack,
|
||||
&c21,
|
||||
cntx,
|
||||
cntl_sub_ger_cp( cntl ) );
|
||||
bli_cntl_sub_ger_cp( cntl ) );
|
||||
|
||||
// C11 = C11 + alpha * x1 * y1' + conj(alpha) * y1 * x1';
|
||||
bli_her2_int( conjh,
|
||||
@@ -137,18 +137,18 @@ void bli_her2_blk_var4( conj_t conjh,
|
||||
&y1_pack,
|
||||
&c11_pack,
|
||||
cntx,
|
||||
cntl_sub_her2( cntl ) );
|
||||
bli_cntl_sub_her2( cntl ) );
|
||||
|
||||
// Copy/unpack C11 (if C11 was packed).
|
||||
bli_unpackm_int( &c11_pack, &c11,
|
||||
cntx, cntl_sub_unpackm_c11( cntl ),
|
||||
cntx, bli_cntl_sub_unpackm_c11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_packm_release( &c11_pack, cntl_sub_packm_c11( cntl ) );
|
||||
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_release( &y1_pack, cntl_sub_packv_y1( cntl ) );
|
||||
bli_packm_release( &c11_pack, bli_cntl_sub_packm_c11( cntl ) );
|
||||
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
|
||||
bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) );
|
||||
}
|
||||
|
||||
|
||||
@@ -47,7 +47,7 @@ struct her2_s
|
||||
};
|
||||
typedef struct her2_s her2_t;
|
||||
|
||||
#define cntl_sub_her2( cntl ) cntl->sub_her2
|
||||
#define bli_cntl_sub_her2( cntl ) cntl->sub_her2
|
||||
|
||||
void bli_her2_cntl_init( void );
|
||||
void bli_her2_cntl_finalize( void );
|
||||
|
||||
@@ -115,8 +115,8 @@ void bli_her2_int( conj_t conjh,
|
||||
|
||||
|
||||
// Extract the variant number and implementation type.
|
||||
n = cntl_var_num( cntl );
|
||||
i = cntl_impl_type( cntl );
|
||||
n = bli_cntl_var_num( cntl );
|
||||
i = bli_cntl_impl_type( cntl );
|
||||
|
||||
// Index into the variant array to extract the correct function pointer.
|
||||
f = vars[n][i];
|
||||
|
||||
@@ -46,7 +46,7 @@ struct trmv_s
|
||||
};
|
||||
typedef struct trmv_s trmv_t;
|
||||
|
||||
#define cntl_sub_trmv( cntl ) cntl->sub_trmv
|
||||
#define bli_cntl_sub_trmv( cntl ) cntl->sub_trmv
|
||||
|
||||
void bli_trmv_cntl_init( void );
|
||||
void bli_trmv_cntl_finalize( void );
|
||||
|
||||
@@ -115,8 +115,8 @@ void bli_trmv_int( obj_t* alpha,
|
||||
}
|
||||
|
||||
// Extract the variant number and implementation type.
|
||||
n = cntl_var_num( cntl );
|
||||
i = cntl_impl_type( cntl );
|
||||
n = bli_cntl_var_num( cntl );
|
||||
i = bli_cntl_impl_type( cntl );
|
||||
|
||||
// Index into the variant array to extract the correct function pointer.
|
||||
f = vars[uplo][n][i];
|
||||
|
||||
@@ -61,7 +61,7 @@ void bli_trmv_l_blk_var1( obj_t* alpha,
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_b( ij, mn, a,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for A11, A10, x1, and x0.
|
||||
bli_acquire_mpart_br2tl( BLIS_SUBPART11,
|
||||
@@ -75,23 +75,23 @@ void bli_trmv_l_blk_var1( obj_t* alpha,
|
||||
|
||||
// Initialize objects for packing A11 and x1 (if needed).
|
||||
bli_packm_init( &a11, &a11_pack,
|
||||
cntx, cntl_sub_packm_a11( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_init( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
// Copy/pack A11, x1 (if needed).
|
||||
bli_packm_int( &a11, &a11_pack,
|
||||
cntx, cntl_sub_packm_a11( cntl ),
|
||||
cntx, bli_cntl_sub_packm_a11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
// x1 = alpha * tril( A11 ) * x1;
|
||||
bli_trmv_int( alpha,
|
||||
&a11_pack,
|
||||
&x1_pack,
|
||||
cntx,
|
||||
cntl_sub_trmv( cntl ) );
|
||||
bli_cntl_sub_trmv( cntl ) );
|
||||
|
||||
// x1 = x1 + alpha * A10 * x0;
|
||||
bli_gemv_int( BLIS_NO_TRANSPOSE,
|
||||
@@ -102,16 +102,16 @@ void bli_trmv_l_blk_var1( obj_t* alpha,
|
||||
&BLIS_ONE,
|
||||
&x1_pack,
|
||||
cntx,
|
||||
cntl_sub_gemv_rp( cntl ) );
|
||||
bli_cntl_sub_gemv_rp( cntl ) );
|
||||
|
||||
// Copy/unpack x1 (if x1 was packed).
|
||||
bli_unpackv_int( &x1_pack, &x1,
|
||||
cntx, cntl_sub_unpackv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_unpackv_x1( cntl ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_packm_release( &a11_pack, cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
|
||||
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
|
||||
}
|
||||
|
||||
|
||||
@@ -61,7 +61,7 @@ void bli_trmv_l_blk_var2( obj_t* alpha,
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_b( ij, mn, a,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for A11, A21, x1, and x2.
|
||||
bli_acquire_mpart_br2tl( BLIS_SUBPART11,
|
||||
@@ -75,16 +75,16 @@ void bli_trmv_l_blk_var2( obj_t* alpha,
|
||||
|
||||
// Initialize objects for packing A11 and x1 (if needed).
|
||||
bli_packm_init( &a11, &a11_pack,
|
||||
cntx, cntl_sub_packm_a11( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_init( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
// Copy/pack A11, x1 (if needed).
|
||||
bli_packm_int( &a11, &a11_pack,
|
||||
cntx, cntl_sub_packm_a11( cntl ),
|
||||
cntx, bli_cntl_sub_packm_a11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
// x2 = x2 + alpha * A21 * x1;
|
||||
bli_gemv_int( BLIS_NO_TRANSPOSE,
|
||||
@@ -95,23 +95,23 @@ void bli_trmv_l_blk_var2( obj_t* alpha,
|
||||
&BLIS_ONE,
|
||||
&x2,
|
||||
cntx,
|
||||
cntl_sub_gemv_cp( cntl ) );
|
||||
bli_cntl_sub_gemv_cp( cntl ) );
|
||||
|
||||
// x1 = alpha * tril( A11 ) * x1;
|
||||
bli_trmv_int( alpha,
|
||||
&a11_pack,
|
||||
&x1_pack,
|
||||
cntx,
|
||||
cntl_sub_trmv( cntl ) );
|
||||
bli_cntl_sub_trmv( cntl ) );
|
||||
|
||||
// Copy/unpack x1 (if x1 was packed).
|
||||
bli_unpackv_int( &x1_pack, &x1,
|
||||
cntx, cntl_sub_unpackv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_unpackv_x1( cntl ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_packm_release( &a11_pack, cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
|
||||
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
|
||||
}
|
||||
|
||||
|
||||
@@ -61,7 +61,7 @@ void bli_trmv_u_blk_var1( obj_t* alpha,
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( ij, mn, a,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for A11, A12, x1, and x2.
|
||||
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
|
||||
@@ -75,23 +75,23 @@ void bli_trmv_u_blk_var1( obj_t* alpha,
|
||||
|
||||
// Initialize objects for packing A11 and x1 (if needed).
|
||||
bli_packm_init( &a11, &a11_pack,
|
||||
cntx, cntl_sub_packm_a11( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_init( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
// Copy/pack A11, x1 (if needed).
|
||||
bli_packm_int( &a11, &a11_pack,
|
||||
cntx, cntl_sub_packm_a11( cntl ),
|
||||
cntx, bli_cntl_sub_packm_a11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
// x1 = alpha * triu( A11 ) * x1;
|
||||
bli_trmv_int( alpha,
|
||||
&a11_pack,
|
||||
&x1_pack,
|
||||
cntx,
|
||||
cntl_sub_trmv( cntl ) );
|
||||
bli_cntl_sub_trmv( cntl ) );
|
||||
|
||||
// x1 = x1 + alpha * A12 * x2;
|
||||
bli_gemv_int( BLIS_NO_TRANSPOSE,
|
||||
@@ -102,16 +102,16 @@ void bli_trmv_u_blk_var1( obj_t* alpha,
|
||||
&BLIS_ONE,
|
||||
&x1_pack,
|
||||
cntx,
|
||||
cntl_sub_gemv_rp( cntl ) );
|
||||
bli_cntl_sub_gemv_rp( cntl ) );
|
||||
|
||||
// Copy/unpack x1 (if x1 was packed).
|
||||
bli_unpackv_int( &x1_pack, &x1,
|
||||
cntx, cntl_sub_unpackv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_unpackv_x1( cntl ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_packm_release( &a11_pack, cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
|
||||
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
|
||||
}
|
||||
|
||||
|
||||
@@ -61,7 +61,7 @@ void bli_trmv_u_blk_var2( obj_t* alpha,
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_b( ij, mn, a,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for A11, A21, x1, and x2.
|
||||
bli_acquire_mpart_br2tl( BLIS_SUBPART11,
|
||||
@@ -75,16 +75,16 @@ void bli_trmv_u_blk_var2( obj_t* alpha,
|
||||
|
||||
// Initialize objects for packing A11 and x1 (if needed).
|
||||
bli_packm_init( &a11, &a11_pack,
|
||||
cntx, cntl_sub_packm_a11( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_init( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
// Copy/pack A11, x1 (if needed).
|
||||
bli_packm_int( &a11, &a11_pack,
|
||||
cntx, cntl_sub_packm_a11( cntl ),
|
||||
cntx, bli_cntl_sub_packm_a11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
// x0 = x0 + alpha * A01 * x1;
|
||||
bli_gemv_int( BLIS_NO_TRANSPOSE,
|
||||
@@ -95,23 +95,23 @@ void bli_trmv_u_blk_var2( obj_t* alpha,
|
||||
&BLIS_ONE,
|
||||
&x0,
|
||||
cntx,
|
||||
cntl_sub_gemv_cp( cntl ) );
|
||||
bli_cntl_sub_gemv_cp( cntl ) );
|
||||
|
||||
// x1 = alpha * triu( A11 ) * x1;
|
||||
bli_trmv_int( alpha,
|
||||
&a11_pack,
|
||||
&x1_pack,
|
||||
cntx,
|
||||
cntl_sub_trmv( cntl ) );
|
||||
bli_cntl_sub_trmv( cntl ) );
|
||||
|
||||
// Copy/unpack x1 (if x1 was packed).
|
||||
bli_unpackv_int( &x1_pack, &x1,
|
||||
cntx, cntl_sub_unpackv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_unpackv_x1( cntl ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_packm_release( &a11_pack, cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
|
||||
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
|
||||
}
|
||||
|
||||
|
||||
@@ -47,7 +47,7 @@ struct trsv_s
|
||||
};
|
||||
typedef struct trsv_s trsv_t;
|
||||
|
||||
#define cntl_sub_trsv( cntl ) cntl->sub_trsv
|
||||
#define bli_cntl_sub_trsv( cntl ) cntl->sub_trsv
|
||||
|
||||
void bli_trsv_cntl_init( void );
|
||||
void bli_trsv_cntl_finalize( void );
|
||||
|
||||
@@ -115,8 +115,8 @@ void bli_trsv_int( obj_t* alpha,
|
||||
}
|
||||
|
||||
// Extract the variant number and implementation type.
|
||||
n = cntl_var_num( cntl );
|
||||
i = cntl_impl_type( cntl );
|
||||
n = bli_cntl_var_num( cntl );
|
||||
i = bli_cntl_impl_type( cntl );
|
||||
|
||||
// Index into the variant array to extract the correct function pointer.
|
||||
f = vars[uplo][n][i];
|
||||
|
||||
@@ -59,14 +59,14 @@ void bli_trsv_l_blk_var1( obj_t* alpha,
|
||||
// x = alpha * x;
|
||||
bli_scalv_int( alpha,
|
||||
x,
|
||||
cntx, cntl_sub_scalv( cntl ) );
|
||||
cntx, bli_cntl_sub_scalv( cntl ) );
|
||||
|
||||
// Partition diagonally.
|
||||
for ( ij = 0; ij < mn; ij += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( ij, mn, a,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for A11, A10, x1, and x0.
|
||||
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
|
||||
@@ -80,16 +80,16 @@ void bli_trsv_l_blk_var1( obj_t* alpha,
|
||||
|
||||
// Initialize objects for packing A11 and x1 (if needed).
|
||||
bli_packm_init( &a11, &a11_pack,
|
||||
cntx, cntl_sub_packm_a11( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_init( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
// Copy/pack A11, x1 (if needed).
|
||||
bli_packm_int( &a11, &a11_pack,
|
||||
cntx, cntl_sub_packm_a11( cntl ),
|
||||
cntx, bli_cntl_sub_packm_a11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
// x1 = x1 - A10 * x0;
|
||||
bli_gemv_int( BLIS_NO_TRANSPOSE,
|
||||
@@ -100,23 +100,23 @@ void bli_trsv_l_blk_var1( obj_t* alpha,
|
||||
&BLIS_ONE,
|
||||
&x1_pack,
|
||||
cntx,
|
||||
cntl_sub_gemv_rp( cntl ) );
|
||||
bli_cntl_sub_gemv_rp( cntl ) );
|
||||
|
||||
// x1 = x1 / tril( A11 );
|
||||
bli_trsv_int( &BLIS_ONE,
|
||||
&a11_pack,
|
||||
&x1_pack,
|
||||
cntx,
|
||||
cntl_sub_trsv( cntl ) );
|
||||
bli_cntl_sub_trsv( cntl ) );
|
||||
|
||||
// Copy/unpack x1 (if x1 was packed).
|
||||
bli_unpackv_int( &x1_pack, &x1,
|
||||
cntx, cntl_sub_unpackv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_unpackv_x1( cntl ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_packm_release( &a11_pack, cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
|
||||
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
|
||||
}
|
||||
|
||||
|
||||
@@ -59,14 +59,14 @@ void bli_trsv_l_blk_var2( obj_t* alpha,
|
||||
// x = alpha * x;
|
||||
bli_scalv_int( alpha,
|
||||
x,
|
||||
cntx, cntl_sub_scalv( cntl ) );
|
||||
cntx, bli_cntl_sub_scalv( cntl ) );
|
||||
|
||||
// Partition diagonally.
|
||||
for ( ij = 0; ij < mn; ij += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( ij, mn, a,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for A11, A21, x1, and x2.
|
||||
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
|
||||
@@ -80,23 +80,23 @@ void bli_trsv_l_blk_var2( obj_t* alpha,
|
||||
|
||||
// Initialize objects for packing A11 and x1 (if needed).
|
||||
bli_packm_init( &a11, &a11_pack,
|
||||
cntx, cntl_sub_packm_a11( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_init( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
// Copy/pack A11, x1 (if needed).
|
||||
bli_packm_int( &a11, &a11_pack,
|
||||
cntx, cntl_sub_packm_a11( cntl ),
|
||||
cntx, bli_cntl_sub_packm_a11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
// x1 = x1 / tril( A11 );
|
||||
bli_trsv_int( &BLIS_ONE,
|
||||
&a11_pack,
|
||||
&x1_pack,
|
||||
cntx,
|
||||
cntl_sub_trsv( cntl ) );
|
||||
bli_cntl_sub_trsv( cntl ) );
|
||||
|
||||
// x2 = x2 - A21 * x1;
|
||||
bli_gemv_int( BLIS_NO_TRANSPOSE,
|
||||
@@ -107,16 +107,16 @@ void bli_trsv_l_blk_var2( obj_t* alpha,
|
||||
&BLIS_ONE,
|
||||
&x2,
|
||||
cntx,
|
||||
cntl_sub_gemv_cp( cntl ) );
|
||||
bli_cntl_sub_gemv_cp( cntl ) );
|
||||
|
||||
// Copy/unpack x1 (if x1 was packed).
|
||||
bli_unpackv_int( &x1_pack, &x1,
|
||||
cntx, cntl_sub_unpackv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_unpackv_x1( cntl ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_packm_release( &a11_pack, cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
|
||||
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
|
||||
}
|
||||
|
||||
|
||||
@@ -59,14 +59,14 @@ void bli_trsv_u_blk_var1( obj_t* alpha,
|
||||
// x = alpha * x;
|
||||
bli_scalv_int( alpha,
|
||||
x,
|
||||
cntx, cntl_sub_scalv( cntl ) );
|
||||
cntx, bli_cntl_sub_scalv( cntl ) );
|
||||
|
||||
// Partition diagonally.
|
||||
for ( ij = 0; ij < mn; ij += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_b( ij, mn, a,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for A11, A12, x1, and x2.
|
||||
bli_acquire_mpart_br2tl( BLIS_SUBPART11,
|
||||
@@ -80,16 +80,16 @@ void bli_trsv_u_blk_var1( obj_t* alpha,
|
||||
|
||||
// Initialize objects for packing A11 and x1 (if needed).
|
||||
bli_packm_init( &a11, &a11_pack,
|
||||
cntx, cntl_sub_packm_a11( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_init( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
// Copy/pack A11, x1 (if needed).
|
||||
bli_packm_int( &a11, &a11_pack,
|
||||
cntx, cntl_sub_packm_a11( cntl ),
|
||||
cntx, bli_cntl_sub_packm_a11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
// x1 = x1 - A12 * x2;
|
||||
bli_gemv_int( BLIS_NO_TRANSPOSE,
|
||||
@@ -100,23 +100,23 @@ void bli_trsv_u_blk_var1( obj_t* alpha,
|
||||
&BLIS_ONE,
|
||||
&x1_pack,
|
||||
cntx,
|
||||
cntl_sub_gemv_rp( cntl ) );
|
||||
bli_cntl_sub_gemv_rp( cntl ) );
|
||||
|
||||
// x1 = x1 / triu( A11 );
|
||||
bli_trsv_int( &BLIS_ONE,
|
||||
&a11_pack,
|
||||
&x1_pack,
|
||||
cntx,
|
||||
cntl_sub_trsv( cntl ) );
|
||||
bli_cntl_sub_trsv( cntl ) );
|
||||
|
||||
// Copy/unpack x1 (if x1 was packed).
|
||||
bli_unpackv_int( &x1_pack, &x1,
|
||||
cntx, cntl_sub_unpackv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_unpackv_x1( cntl ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_packm_release( &a11_pack, cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
|
||||
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
|
||||
}
|
||||
|
||||
|
||||
@@ -59,14 +59,14 @@ void bli_trsv_u_blk_var2( obj_t* alpha,
|
||||
// x = alpha * x;
|
||||
bli_scalv_int( alpha,
|
||||
x,
|
||||
cntx, cntl_sub_scalv( cntl ) );
|
||||
cntx, bli_cntl_sub_scalv( cntl ) );
|
||||
|
||||
// Partition diagonally.
|
||||
for ( ij = 0; ij < mn; ij += b_alg )
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_b( ij, mn, a,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for A11, A01, x1, and x0.
|
||||
bli_acquire_mpart_br2tl( BLIS_SUBPART11,
|
||||
@@ -80,23 +80,23 @@ void bli_trsv_u_blk_var2( obj_t* alpha,
|
||||
|
||||
// Initialize objects for packing A11 and x1 (if needed).
|
||||
bli_packm_init( &a11, &a11_pack,
|
||||
cntx, cntl_sub_packm_a11( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_init( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
// Copy/pack A11, x1 (if needed).
|
||||
bli_packm_int( &a11, &a11_pack,
|
||||
cntx, cntl_sub_packm_a11( cntl ),
|
||||
cntx, bli_cntl_sub_packm_a11( cntl ),
|
||||
&BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packv_int( &x1, &x1_pack,
|
||||
cntx, cntl_sub_packv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_packv_x1( cntl ) );
|
||||
|
||||
// x1 = x1 / triu( A11 );
|
||||
bli_trsv_int( &BLIS_ONE,
|
||||
&a11_pack,
|
||||
&x1_pack,
|
||||
cntx,
|
||||
cntl_sub_trsv( cntl ) );
|
||||
bli_cntl_sub_trsv( cntl ) );
|
||||
|
||||
// x0 = x0 - A01 * x1;
|
||||
bli_gemv_int( BLIS_NO_TRANSPOSE,
|
||||
@@ -107,16 +107,16 @@ void bli_trsv_u_blk_var2( obj_t* alpha,
|
||||
&BLIS_ONE,
|
||||
&x0,
|
||||
cntx,
|
||||
cntl_sub_gemv_cp( cntl ) );
|
||||
bli_cntl_sub_gemv_cp( cntl ) );
|
||||
|
||||
// Copy/unpack x1 (if x1 was packed).
|
||||
bli_unpackv_int( &x1_pack, &x1,
|
||||
cntx, cntl_sub_unpackv_x1( cntl ) );
|
||||
cntx, bli_cntl_sub_unpackv_x1( cntl ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
bli_packm_release( &a11_pack, cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
|
||||
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
|
||||
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
|
||||
}
|
||||
|
||||
|
||||
@@ -66,3 +66,4 @@
|
||||
#include "bli_trmm.h"
|
||||
#include "bli_trmm3.h"
|
||||
#include "bli_trsm.h"
|
||||
|
||||
|
||||
336
frame/3/bli_l3_thrinfo.c
Normal file
336
frame/3/bli_l3_thrinfo.c
Normal file
@@ -0,0 +1,336 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "assert.h"
|
||||
|
||||
thrinfo_t* bli_l3_thrinfo_create
|
||||
(
|
||||
thrcomm_t* ocomm,
|
||||
dim_t ocomm_id,
|
||||
thrcomm_t* icomm,
|
||||
dim_t icomm_id,
|
||||
dim_t n_way,
|
||||
dim_t work_id,
|
||||
thrinfo_t* opackm,
|
||||
thrinfo_t* ipackm,
|
||||
thrinfo_t* sub_self
|
||||
)
|
||||
{
|
||||
return bli_thrinfo_create
|
||||
(
|
||||
ocomm, ocomm_id,
|
||||
icomm, icomm_id,
|
||||
n_way,
|
||||
work_id,
|
||||
opackm,
|
||||
ipackm,
|
||||
sub_self
|
||||
);
|
||||
}
|
||||
|
||||
void bli_l3_thrinfo_init
|
||||
(
|
||||
thrinfo_t* thread,
|
||||
thrcomm_t* ocomm,
|
||||
dim_t ocomm_id,
|
||||
thrcomm_t* icomm,
|
||||
dim_t icomm_id,
|
||||
dim_t n_way,
|
||||
dim_t work_id,
|
||||
thrinfo_t* opackm,
|
||||
thrinfo_t* ipackm,
|
||||
thrinfo_t* sub_self
|
||||
)
|
||||
{
|
||||
bli_thrinfo_init
|
||||
(
|
||||
thread,
|
||||
ocomm, ocomm_id,
|
||||
icomm, icomm_id,
|
||||
n_way,
|
||||
work_id,
|
||||
opackm,
|
||||
ipackm,
|
||||
sub_self
|
||||
);
|
||||
}
|
||||
|
||||
void bli_l3_thrinfo_init_single
|
||||
(
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
bli_thrinfo_init_single( thread );
|
||||
}
|
||||
|
||||
void bli_l3_thrinfo_free
|
||||
(
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
if ( thread == NULL ||
|
||||
thread == &BLIS_GEMM_SINGLE_THREADED ||
|
||||
thread == &BLIS_HERK_SINGLE_THREADED
|
||||
) return;
|
||||
|
||||
// Free Communicators
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
bli_thrcomm_free( thread->ocomm );
|
||||
if ( bli_thrinfo_sub_self( thread ) == NULL && bli_thread_am_ichief( thread ) )
|
||||
bli_thrcomm_free( thread->icomm );
|
||||
|
||||
// Free thrinfo chidren
|
||||
bli_packm_thrinfo_free( thread->opackm );
|
||||
bli_packm_thrinfo_free( thread->ipackm );
|
||||
bli_l3_thrinfo_free( thread->sub_self );
|
||||
bli_free_intl( thread );
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
thrinfo_t** bli_l3_thrinfo_create_paths
|
||||
(
|
||||
opid_t l3_op,
|
||||
side_t side
|
||||
)
|
||||
{
|
||||
dim_t jc_in, jc_way;
|
||||
dim_t kc_in, kc_way;
|
||||
dim_t ic_in, ic_way;
|
||||
dim_t jr_in, jr_way;
|
||||
dim_t ir_in, ir_way;
|
||||
|
||||
#ifdef BLIS_ENABLE_MULTITHREADING
|
||||
jc_in = bli_env_read_nway( "BLIS_JC_NT" );
|
||||
//kc_way = bli_env_read_nway( "BLIS_KC_NT" );
|
||||
kc_in = 1;
|
||||
ic_in = bli_env_read_nway( "BLIS_IC_NT" );
|
||||
jr_in = bli_env_read_nway( "BLIS_JR_NT" );
|
||||
ir_in = bli_env_read_nway( "BLIS_IR_NT" );
|
||||
#else
|
||||
jc_in = 1;
|
||||
kc_in = 1;
|
||||
ic_in = 1;
|
||||
jr_in = 1;
|
||||
ir_in = 1;
|
||||
#endif
|
||||
|
||||
if ( l3_op == BLIS_TRMM )
|
||||
{
|
||||
// We reconfigure the parallelism for trmm_r due to a dependency in
|
||||
// the jc loop. (NOTE: This dependency does not exist for trmm3.)
|
||||
if ( bli_is_right( side ) )
|
||||
{
|
||||
jc_way = 1;
|
||||
kc_way = kc_in;
|
||||
ic_way = ic_in;
|
||||
jr_way = jr_in * jc_in;
|
||||
ir_way = ir_in;
|
||||
}
|
||||
else // if ( bli_is_left( side ) )
|
||||
{
|
||||
jc_way = jc_in;
|
||||
kc_way = kc_in;
|
||||
ic_way = ic_in;
|
||||
jr_way = jr_in;
|
||||
ir_way = ir_in;
|
||||
}
|
||||
}
|
||||
else if ( l3_op == BLIS_TRSM )
|
||||
{
|
||||
if ( bli_is_right( side ) )
|
||||
{
|
||||
|
||||
jc_way = 1;
|
||||
kc_way = 1;
|
||||
ic_way = jc_in * ic_in * jr_in;
|
||||
jr_way = 1;
|
||||
ir_way = 1;
|
||||
}
|
||||
else // if ( bli_is_left( side ) )
|
||||
{
|
||||
jc_way = 1;
|
||||
kc_way = 1;
|
||||
ic_way = 1;
|
||||
jr_way = ic_in * jr_in * ir_in;
|
||||
ir_way = 1;
|
||||
}
|
||||
}
|
||||
else // all other level-3 operations
|
||||
{
|
||||
jc_way = jc_in;
|
||||
kc_way = kc_in;
|
||||
ic_way = ic_in;
|
||||
jr_way = jr_in;
|
||||
ir_way = ir_in;
|
||||
}
|
||||
|
||||
|
||||
dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
|
||||
assert( global_num_threads != 0 );
|
||||
|
||||
dim_t jc_nt = kc_way * ic_way * jr_way * ir_way;
|
||||
dim_t kc_nt = ic_way * jr_way * ir_way;
|
||||
dim_t ic_nt = jr_way * ir_way;
|
||||
dim_t jr_nt = ir_way;
|
||||
dim_t ir_nt = 1;
|
||||
|
||||
|
||||
thrinfo_t** paths = bli_malloc_intl( global_num_threads * sizeof( thrinfo_t* ) );
|
||||
|
||||
thrcomm_t* global_comm = bli_thrcomm_create( global_num_threads );
|
||||
|
||||
for( int a = 0; a < jc_way; a++ )
|
||||
{
|
||||
thrcomm_t* jc_comm = bli_thrcomm_create( jc_nt );
|
||||
|
||||
for( int b = 0; b < kc_way; b++ )
|
||||
{
|
||||
thrcomm_t* kc_comm = bli_thrcomm_create( kc_nt );
|
||||
|
||||
for( int c = 0; c < ic_way; c++ )
|
||||
{
|
||||
thrcomm_t* ic_comm = bli_thrcomm_create( ic_nt );
|
||||
|
||||
for( int d = 0; d < jr_way; d++ )
|
||||
{
|
||||
thrcomm_t* jr_comm = bli_thrcomm_create( jr_nt );
|
||||
|
||||
for( int e = 0; e < ir_way; e++ )
|
||||
{
|
||||
thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt );
|
||||
dim_t ir_comm_id = 0;
|
||||
dim_t jr_comm_id = e*ir_nt + ir_comm_id;
|
||||
dim_t ic_comm_id = d*jr_nt + jr_comm_id;
|
||||
dim_t kc_comm_id = c*ic_nt + ic_comm_id;
|
||||
dim_t jc_comm_id = b*kc_nt + kc_comm_id;
|
||||
dim_t global_comm_id = a*jc_nt + jc_comm_id;
|
||||
|
||||
// Macrokernel loops
|
||||
thrinfo_t* ir_info
|
||||
=
|
||||
bli_l3_thrinfo_create( jr_comm, jr_comm_id,
|
||||
ir_comm, ir_comm_id,
|
||||
ir_way, e,
|
||||
NULL, NULL, NULL );
|
||||
|
||||
thrinfo_t* jr_info
|
||||
=
|
||||
bli_l3_thrinfo_create( ic_comm, ic_comm_id,
|
||||
jr_comm, jr_comm_id,
|
||||
jr_way, d,
|
||||
NULL, NULL, ir_info );
|
||||
//blk_var_1
|
||||
thrinfo_t* pack_ic_in
|
||||
=
|
||||
bli_packm_thrinfo_create( ic_comm, ic_comm_id,
|
||||
jr_comm, jr_comm_id,
|
||||
ic_nt, ic_comm_id );
|
||||
|
||||
thrinfo_t* pack_ic_out
|
||||
=
|
||||
bli_packm_thrinfo_create( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
kc_nt, kc_comm_id );
|
||||
|
||||
thrinfo_t* ic_info
|
||||
=
|
||||
bli_l3_thrinfo_create( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
ic_way, c,
|
||||
pack_ic_out, pack_ic_in, jr_info );
|
||||
//blk_var_3
|
||||
thrinfo_t* pack_kc_in
|
||||
=
|
||||
bli_packm_thrinfo_create( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
kc_nt, kc_comm_id );
|
||||
|
||||
thrinfo_t* pack_kc_out
|
||||
=
|
||||
bli_packm_thrinfo_create( jc_comm, jc_comm_id,
|
||||
jc_comm, jc_comm_id,
|
||||
jc_nt, jc_comm_id );
|
||||
|
||||
thrinfo_t* kc_info
|
||||
=
|
||||
bli_l3_thrinfo_create( jc_comm, jc_comm_id,
|
||||
kc_comm, kc_comm_id,
|
||||
kc_way, b,
|
||||
pack_kc_out, pack_kc_in, ic_info );
|
||||
//blk_var_2
|
||||
thrinfo_t* pack_jc_in
|
||||
=
|
||||
bli_packm_thrinfo_create( jc_comm, jc_comm_id,
|
||||
kc_comm, kc_comm_id,
|
||||
jc_nt, jc_comm_id );
|
||||
|
||||
thrinfo_t* pack_jc_out
|
||||
=
|
||||
bli_packm_thrinfo_create( global_comm, global_comm_id,
|
||||
jc_comm, jc_comm_id,
|
||||
global_num_threads, global_comm_id );
|
||||
|
||||
thrinfo_t* jc_info
|
||||
=
|
||||
bli_l3_thrinfo_create( global_comm, global_comm_id,
|
||||
jc_comm, jc_comm_id,
|
||||
jc_way, a,
|
||||
pack_jc_out, pack_jc_in, kc_info );
|
||||
|
||||
paths[global_comm_id] = jc_info;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return paths;
|
||||
}
|
||||
|
||||
void bli_l3_thrinfo_free_paths
|
||||
(
|
||||
thrinfo_t** threads,
|
||||
dim_t num
|
||||
)
|
||||
{
|
||||
dim_t i;
|
||||
|
||||
for ( i = 0; i < num; ++i )
|
||||
bli_l3_thrinfo_free( threads[i] );
|
||||
|
||||
bli_free_intl( threads );
|
||||
}
|
||||
|
||||
@@ -32,48 +32,83 @@
|
||||
|
||||
*/
|
||||
|
||||
//
|
||||
// thrinfo_t macros specific to various level-3 operations.
|
||||
//
|
||||
|
||||
struct trmm_thrinfo_s //implements thrinfo_t
|
||||
{
|
||||
thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level
|
||||
dim_t ocomm_id; //Our thread id within that thread comm
|
||||
thread_comm_t* icomm; //The thread communicator for the other threads sharing the same work at this level
|
||||
dim_t icomm_id; //Our thread id within that thread comm
|
||||
// gemm
|
||||
|
||||
dim_t n_way; //Number of distinct caucuses used to parallelize the loop
|
||||
dim_t work_id; //What we're working on
|
||||
#define gemm_get_next_a_micropanel( thread, a1, step ) ( a1 + step * thread->n_way )
|
||||
#define gemm_get_next_b_micropanel( thread, b1, step ) ( b1 + step * thread->n_way )
|
||||
|
||||
packm_thrinfo_t* opackm;
|
||||
packm_thrinfo_t* ipackm;
|
||||
struct trmm_thrinfo_s* sub_trmm;
|
||||
};
|
||||
typedef struct trmm_thrinfo_s trmm_thrinfo_t;
|
||||
// herk
|
||||
|
||||
#define trmm_thread_sub_trmm( thread ) thread->sub_trmm
|
||||
#define trmm_thread_sub_opackm( thread ) thread->opackm
|
||||
#define trmm_thread_sub_ipackm( thread ) thread->ipackm
|
||||
#define herk_get_next_a_micropanel( thread, a1, step ) ( a1 + step * thread->n_way )
|
||||
#define herk_get_next_b_micropanel( thread, b1, step ) ( b1 + step * thread->n_way )
|
||||
|
||||
// trmm
|
||||
|
||||
#define trmm_r_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
|
||||
#define trmm_r_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
|
||||
#define trmm_l_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
|
||||
#define trmm_l_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
|
||||
|
||||
trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( bool_t jc_dependency );
|
||||
void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** info, dim_t n_threads );
|
||||
// trsm
|
||||
|
||||
void bli_setup_trmm_thrinfo_node( trmm_thrinfo_t* thread,
|
||||
thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
trmm_thrinfo_t* sub_trmm );
|
||||
// Non-zero when index and thread->work_id are congruent modulo thread->n_way.
// Parameters are parenthesized so that expression arguments (e.g. "i + 1",
// "&info") are reduced as a whole rather than split by operator precedence.
#define trsm_my_iter( index, thread ) ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way )
|
||||
|
||||
// Prototype only (definition not shown here). Returns a trmm_thrinfo_t*; the
// parameter list matches bli_setup_trmm_thrinfo_node() minus the leading
// target pointer.
// NOTE(review): presumably allocates a node and then initializes it with the
// given values; ownership of the result is not visible here -- confirm.
trmm_thrinfo_t* bli_create_trmm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
trmm_thrinfo_t* sub_trmm );
|
||||
//
|
||||
// thrinfo_t APIs specific to level-3 operations.
|
||||
//
|
||||
|
||||
// Prototype only (definition not shown here). Returns a thrinfo_t* and takes
// two communicator/id pairs (ocomm/ocomm_id, icomm/icomm_id), the n_way and
// work_id values, and three thrinfo_t pointers (opackm, ipackm, sub_self).
// The parameter list is that of bli_l3_thrinfo_init() without the leading
// target pointer.
// NOTE(review): presumably allocates the node and initializes it; who frees
// the result is not visible here -- confirm against the definition.
thrinfo_t* bli_l3_thrinfo_create
|
||||
(
|
||||
thrcomm_t* ocomm,
|
||||
dim_t ocomm_id,
|
||||
thrcomm_t* icomm,
|
||||
dim_t icomm_id,
|
||||
dim_t n_way,
|
||||
dim_t work_id,
|
||||
thrinfo_t* opackm,
|
||||
thrinfo_t* ipackm,
|
||||
thrinfo_t* sub_self
|
||||
);
|
||||
|
||||
// Prototype only (definition not shown here). Operates on a caller-supplied
// thrinfo_t (thread) and takes two communicator/id pairs, the n_way and
// work_id values, and three thrinfo_t pointers (opackm, ipackm, sub_self).
// NOTE(review): presumably stores each argument into the corresponding
// member of *thread -- confirm against the definition.
void bli_l3_thrinfo_init
|
||||
(
|
||||
thrinfo_t* thread,
|
||||
thrcomm_t* ocomm,
|
||||
dim_t ocomm_id,
|
||||
thrcomm_t* icomm,
|
||||
dim_t icomm_id,
|
||||
dim_t n_way,
|
||||
dim_t work_id,
|
||||
thrinfo_t* opackm,
|
||||
thrinfo_t* ipackm,
|
||||
thrinfo_t* sub_self
|
||||
);
|
||||
|
||||
// Prototype only (definition not shown here). Takes only the thrinfo_t to
// operate on.
// NOTE(review): by its name, presumably sets *thread up for single-threaded
// execution -- confirm against the definition.
void bli_l3_thrinfo_init_single
|
||||
(
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
// Prototype only (definition not shown here). Takes a single thrinfo_t*.
// NOTE(review): presumably releases a node obtained from
// bli_l3_thrinfo_create(); whether sub-nodes are released too is not visible
// here -- confirm against the definition.
void bli_l3_thrinfo_free
|
||||
(
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
// Prototype only (definition not shown here). Takes an operation id and a
// side and returns an array of thrinfo_t pointers. bli_gemm_front() calls it
// as ( BLIS_GEMM, BLIS_LEFT ), reads the thread count from element 0, and
// later hands the array to bli_l3_thrinfo_free_paths().
// NOTE(review): presumably one array entry per thread -- confirm.
thrinfo_t** bli_l3_thrinfo_create_paths
|
||||
(
|
||||
opid_t l3_op,
|
||||
side_t side
|
||||
);
|
||||
|
||||
// Prototype only (definition not shown here). Takes an array of thrinfo_t
// pointers and a count. bli_gemm_front() passes the array returned by
// bli_l3_thrinfo_create_paths() together with the thread count.
// NOTE(review): presumably frees every path and the array itself -- confirm.
void bli_l3_thrinfo_free_paths
|
||||
(
|
||||
thrinfo_t** threads,
|
||||
dim_t num
|
||||
);
|
||||
|
||||
// Prototype only (definition not shown here). Takes only the trmm_thrinfo_t
// to operate on.
// NOTE(review): by its name, presumably sets *thread up for single-threaded
// execution -- confirm against the definition.
void bli_setup_trmm_single_threaded_info( trmm_thrinfo_t* thread );
|
||||
@@ -39,7 +39,7 @@ void bli_gemm_blk_var1f( obj_t* a,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread )
|
||||
thrinfo_t* thread )
|
||||
{
|
||||
//The s is for "lives on the stack"
|
||||
obj_t b_pack_s;
|
||||
@@ -53,36 +53,36 @@ void bli_gemm_blk_var1f( obj_t* a,
|
||||
dim_t i;
|
||||
dim_t b_alg;
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
if( bli_thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing B.
|
||||
bli_obj_init_pack( &b_pack_s );
|
||||
bli_packm_init( b, &b_pack_s,
|
||||
cntx, cntl_sub_packm_b( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
// Since scalm doesn't support multithreading yet, must be done by chief thread (ew)
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntx, cntl_sub_scalm( cntl ) );
|
||||
cntx, bli_cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
b_pack = thread_obroadcast( thread, &b_pack_s );
|
||||
b_pack = bli_thread_obroadcast( thread, &b_pack_s );
|
||||
|
||||
// Initialize objects passed into bli_packm_init for A and C
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &a1_pack_s );
|
||||
bli_obj_init_pack( &c1_pack_s );
|
||||
}
|
||||
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
|
||||
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
|
||||
a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s );
|
||||
c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s );
|
||||
|
||||
// Pack B (if instructed).
|
||||
bli_packm_int( b, b_pack,
|
||||
cntx, cntl_sub_packm_b( cntl ),
|
||||
gemm_thread_sub_opackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_b( cntl ),
|
||||
bli_thrinfo_sub_opackm( thread ) );
|
||||
|
||||
dim_t my_start, my_end;
|
||||
bli_get_range_t2b( thread, a,
|
||||
bli_cntx_get_bmult( cntl_bszid( cntl ), cntx ),
|
||||
bli_thread_get_range_t2b( thread, a,
|
||||
bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ),
|
||||
&my_start, &my_end );
|
||||
|
||||
// Partition along the m dimension.
|
||||
@@ -93,7 +93,7 @@ void bli_gemm_blk_var1f( obj_t* a,
|
||||
// This causes the right blocksize to be used if c and a are
|
||||
// complex and b is real.
|
||||
b_alg = bli_determine_blocksize_f( i, my_end, a,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for A1 and C1.
|
||||
bli_acquire_mpart_t2b( BLIS_SUBPART1,
|
||||
@@ -102,23 +102,23 @@ void bli_gemm_blk_var1f( obj_t* a,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Initialize objects for packing A1 and C1.
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &a1, a1_pack,
|
||||
cntx, cntl_sub_packm_a( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &c1, c1_pack,
|
||||
cntx, cntl_sub_packm_c( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
bli_thread_ibarrier( thread );
|
||||
|
||||
// Pack A1 (if instructed).
|
||||
bli_packm_int( &a1, a1_pack,
|
||||
cntx, cntl_sub_packm_a( cntl ),
|
||||
gemm_thread_sub_ipackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_a( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
|
||||
// Pack C1 (if instructed).
|
||||
bli_packm_int( &c1, c1_pack,
|
||||
cntx, cntl_sub_packm_c( cntl ),
|
||||
gemm_thread_sub_ipackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_c( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
|
||||
// Perform gemm subproblem.
|
||||
bli_gemm_int( &BLIS_ONE,
|
||||
@@ -127,26 +127,26 @@ void bli_gemm_blk_var1f( obj_t* a,
|
||||
&BLIS_ONE,
|
||||
c1_pack,
|
||||
cntx,
|
||||
cntl_sub_gemm( cntl ),
|
||||
gemm_thread_sub_gemm( thread ) );
|
||||
bli_cntl_sub_gemm( cntl ),
|
||||
bli_thrinfo_sub_self( thread ) );
|
||||
|
||||
thread_ibarrier( thread );
|
||||
bli_thread_ibarrier( thread );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
// Currently must be done by 1 thread
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
cntx, cntl_sub_unpackm_c( cntl ),
|
||||
gemm_thread_sub_ipackm( thread ) );
|
||||
cntx, bli_cntl_sub_unpackm_c( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
thread_obarrier( thread );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_packm_release( b_pack, cntl_sub_packm_b( cntl ) );
|
||||
if( thread_am_ichief( thread ) ){
|
||||
bli_packm_release( a1_pack, cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) );
|
||||
bli_thread_obarrier( thread );
|
||||
if( bli_thread_am_ochief( thread ) )
|
||||
bli_packm_release( b_pack, bli_cntl_sub_packm_b( cntl ) );
|
||||
if( bli_thread_am_ichief( thread ) ){
|
||||
bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@ void bli_gemm_blk_var2f( obj_t* a,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread )
|
||||
thrinfo_t* thread )
|
||||
{
|
||||
obj_t a_pack_s;
|
||||
obj_t b1_pack_s, c1_pack_s;
|
||||
@@ -53,35 +53,35 @@ void bli_gemm_blk_var2f( obj_t* a,
|
||||
dim_t b_alg;
|
||||
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
if( bli_thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing A
|
||||
bli_obj_init_pack( &a_pack_s );
|
||||
bli_packm_init( a, &a_pack_s,
|
||||
cntx, cntl_sub_packm_a( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntx, cntl_sub_scalm( cntl ) );
|
||||
cntx, bli_cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
a_pack = thread_obroadcast( thread, &a_pack_s );
|
||||
a_pack = bli_thread_obroadcast( thread, &a_pack_s );
|
||||
|
||||
// Initialize pack objects for B and C that are passed into packm_init().
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &b1_pack_s );
|
||||
bli_obj_init_pack( &c1_pack_s );
|
||||
}
|
||||
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
|
||||
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
|
||||
b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s );
|
||||
c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s );
|
||||
|
||||
// Pack A (if instructed).
|
||||
bli_packm_int( a, a_pack,
|
||||
cntx, cntl_sub_packm_a( cntl ),
|
||||
gemm_thread_sub_opackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_a( cntl ),
|
||||
bli_thrinfo_sub_opackm( thread ) );
|
||||
|
||||
dim_t my_start, my_end;
|
||||
bli_get_range_l2r( thread, b,
|
||||
bli_cntx_get_bmult( cntl_bszid( cntl ), cntx ),
|
||||
bli_thread_get_range_l2r( thread, b,
|
||||
bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ),
|
||||
&my_start, &my_end );
|
||||
|
||||
// Partition along the n dimension.
|
||||
@@ -92,7 +92,7 @@ void bli_gemm_blk_var2f( obj_t* a,
|
||||
// This causes the right blocksize to be used if c and a are
|
||||
// complex and b is real.
|
||||
b_alg = bli_determine_blocksize_f( i, my_end, b,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for B1 and C1.
|
||||
bli_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
@@ -101,23 +101,23 @@ void bli_gemm_blk_var2f( obj_t* a,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Initialize objects for packing B1 and C1.
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &b1, b1_pack,
|
||||
cntx, cntl_sub_packm_b( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_init( &c1, c1_pack,
|
||||
cntx, cntl_sub_packm_c( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
bli_thread_ibarrier( thread );
|
||||
|
||||
// Pack B1 (if instructed).
|
||||
bli_packm_int( &b1, b1_pack,
|
||||
cntx, cntl_sub_packm_b( cntl ),
|
||||
gemm_thread_sub_ipackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_b( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
|
||||
// Pack C1 (if instructed).
|
||||
bli_packm_int( &c1, c1_pack,
|
||||
cntx, cntl_sub_packm_c( cntl ),
|
||||
gemm_thread_sub_ipackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_c( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
|
||||
// Perform gemm subproblem.
|
||||
bli_gemm_int( &BLIS_ONE,
|
||||
@@ -126,26 +126,26 @@ void bli_gemm_blk_var2f( obj_t* a,
|
||||
&BLIS_ONE,
|
||||
c1_pack,
|
||||
cntx,
|
||||
cntl_sub_gemm( cntl ),
|
||||
gemm_thread_sub_gemm( thread ) );
|
||||
bli_cntl_sub_gemm( cntl ),
|
||||
bli_thrinfo_sub_self( thread ) );
|
||||
|
||||
thread_ibarrier( thread );
|
||||
bli_thread_ibarrier( thread );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
// Currently must be done by 1 thread
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
cntx, cntl_sub_unpackm_c( cntl ),
|
||||
gemm_thread_sub_ipackm( thread ) );
|
||||
cntx, bli_cntl_sub_unpackm_c( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
thread_obarrier( thread );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_packm_release( a_pack, cntl_sub_packm_a( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_release( b1_pack, cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) );
|
||||
bli_thread_obarrier( thread );
|
||||
if( bli_thread_am_ochief( thread ) )
|
||||
bli_packm_release( a_pack, bli_cntl_sub_packm_a( cntl ) );
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@ void bli_gemm_blk_var3f( obj_t* a,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread )
|
||||
thrinfo_t* thread )
|
||||
{
|
||||
obj_t c_pack_s;
|
||||
obj_t a1_pack_s, b1_pack_s;
|
||||
@@ -53,31 +53,31 @@ void bli_gemm_blk_var3f( obj_t* a,
|
||||
dim_t b_alg;
|
||||
dim_t k_trans;
|
||||
|
||||
if( thread_am_ochief( thread ) ){
|
||||
if( bli_thread_am_ochief( thread ) ){
|
||||
// Initialize object for packing C
|
||||
bli_obj_init_pack( &c_pack_s );
|
||||
bli_packm_init( c, &c_pack_s,
|
||||
cntx, cntl_sub_packm_c( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntx, cntl_sub_scalm( cntl ) );
|
||||
cntx, bli_cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
c_pack = thread_obroadcast( thread, &c_pack_s );
|
||||
c_pack = bli_thread_obroadcast( thread, &c_pack_s );
|
||||
|
||||
// Initialize pack objects for A and B that are passed into packm_init().
|
||||
if( thread_am_ichief( thread ) ){
|
||||
if( bli_thread_am_ichief( thread ) ){
|
||||
bli_obj_init_pack( &a1_pack_s );
|
||||
bli_obj_init_pack( &b1_pack_s );
|
||||
}
|
||||
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
|
||||
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
|
||||
a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s );
|
||||
b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s );
|
||||
|
||||
// Pack C (if instructed).
|
||||
bli_packm_int( c, c_pack,
|
||||
cntx, cntl_sub_packm_c( cntl ),
|
||||
gemm_thread_sub_opackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_c( cntl ),
|
||||
bli_thrinfo_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
k_trans = bli_obj_width_after_trans( *a );
|
||||
@@ -90,7 +90,7 @@ void bli_gemm_blk_var3f( obj_t* a,
|
||||
// the kc blocksize so that we can implement the "nudging" of kc
|
||||
// to be a multiple of mr or nr, as needed.
|
||||
b_alg = bli_gemm_determine_kc_f( i, k_trans, a, b,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for A1 and B1.
|
||||
bli_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
@@ -99,23 +99,23 @@ void bli_gemm_blk_var3f( obj_t* a,
|
||||
i, b_alg, b, &b1 );
|
||||
|
||||
// Initialize objects for packing A1 and B1.
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &a1, a1_pack,
|
||||
cntx, cntl_sub_packm_a( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &b1, b1_pack,
|
||||
cntx, cntl_sub_packm_b( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_b( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
bli_thread_ibarrier( thread );
|
||||
|
||||
// Pack A1 (if instructed).
|
||||
bli_packm_int( &a1, a1_pack,
|
||||
cntx, cntl_sub_packm_a( cntl ),
|
||||
gemm_thread_sub_ipackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_a( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
|
||||
// Pack B1 (if instructed).
|
||||
bli_packm_int( &b1, b1_pack,
|
||||
cntx, cntl_sub_packm_b( cntl ),
|
||||
gemm_thread_sub_ipackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_b( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
|
||||
// Perform gemm subproblem.
|
||||
bli_gemm_int( &BLIS_ONE,
|
||||
@@ -124,8 +124,8 @@ void bli_gemm_blk_var3f( obj_t* a,
|
||||
&BLIS_ONE,
|
||||
c_pack,
|
||||
cntx,
|
||||
cntl_sub_gemm( cntl ),
|
||||
gemm_thread_sub_gemm( thread) );
|
||||
bli_cntl_sub_gemm( cntl ),
|
||||
bli_thrinfo_sub_self( thread) );
|
||||
|
||||
// This variant executes multiple rank-k updates. Therefore, if the
|
||||
// internal beta scalar on matrix C is non-zero, we must use it
|
||||
@@ -133,25 +133,25 @@ void bli_gemm_blk_var3f( obj_t* a,
|
||||
// And since c_pack is a local obj_t, we can simply overwrite the
|
||||
// internal beta scalar with BLIS_ONE once it has been used in the
|
||||
// first iteration.
|
||||
thread_ibarrier( thread );
|
||||
if ( i == 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack );
|
||||
bli_thread_ibarrier( thread );
|
||||
if ( i == 0 && bli_thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack );
|
||||
|
||||
}
|
||||
|
||||
thread_obarrier( thread );
|
||||
bli_thread_obarrier( thread );
|
||||
|
||||
// Unpack C (if C was packed).
|
||||
bli_unpackm_int( c_pack, c,
|
||||
cntx, cntl_sub_unpackm_c( cntl ),
|
||||
gemm_thread_sub_opackm( thread ) );
|
||||
cntx, bli_cntl_sub_unpackm_c( cntl ),
|
||||
bli_thrinfo_sub_opackm( thread ) );
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_packm_release( c_pack, cntl_sub_packm_c( cntl ) );
|
||||
if( thread_am_ichief( thread ) ){
|
||||
bli_packm_release( a1_pack, cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_release( b1_pack, cntl_sub_packm_b( cntl ) );
|
||||
if( bli_thread_am_ochief( thread ) )
|
||||
bli_packm_release( c_pack, bli_cntl_sub_packm_c( cntl ) );
|
||||
if( bli_thread_am_ichief( thread ) ){
|
||||
bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -46,7 +46,7 @@ struct gemm_s
|
||||
};
|
||||
typedef struct gemm_s gemm_t;
|
||||
|
||||
// Accessor for the sub_gemm member of the control-tree node cntl points to.
// The parameter and the whole expansion are parenthesized so the macro is safe
// with expression arguments such as "&node" and inside larger expressions.
#define cntl_sub_gemm( cntl ) ( (cntl)->sub_gemm )
|
||||
// Accessor for the sub_gemm member of the control-tree node cntl points to.
// The parameter and the whole expansion are parenthesized so the macro is safe
// with expression arguments such as "&node" and inside larger expressions.
#define bli_cntl_sub_gemm( cntl ) ( (cntl)->sub_gemm )
|
||||
|
||||
// Prototype only (definition not shown here); takes no arguments.
// NOTE(review): presumably builds the gemm control tree(s) and is paired with
// bli_gemm_cntl_finalize() -- confirm against the definition.
void bli_gemm_cntl_init( void );
|
||||
// Prototype only (definition not shown here); takes no arguments.
// NOTE(review): presumably tears down whatever bli_gemm_cntl_init() set up
// -- confirm against the definition.
void bli_gemm_cntl_finalize( void );
|
||||
|
||||
@@ -79,11 +79,11 @@ void bli_gemm_front( obj_t* alpha,
|
||||
bli_obj_induce_trans( c_local );
|
||||
}
|
||||
|
||||
gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths();
|
||||
dim_t n_threads = thread_num_threads( infos[0] );
|
||||
thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_GEMM, BLIS_LEFT );
|
||||
dim_t n_threads = bli_thread_num_threads( infos[0] );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
bli_l3_thread_decorator( n_threads,
|
||||
(l3_int_t) bli_gemm_int,
|
||||
alpha,
|
||||
&a_local,
|
||||
@@ -94,7 +94,7 @@ void bli_gemm_front( obj_t* alpha,
|
||||
(void*) cntl,
|
||||
(void**) infos );
|
||||
|
||||
bli_gemm_thrinfo_free_paths( infos, n_threads );
|
||||
bli_l3_thrinfo_free_paths( infos, n_threads );
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -41,7 +41,7 @@ typedef void (*FUNCPTR_T)( obj_t* a,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread );
|
||||
thrinfo_t* thread );
|
||||
|
||||
static FUNCPTR_T vars[6][3] =
|
||||
{
|
||||
@@ -61,7 +61,7 @@ void bli_gemm_int( obj_t* alpha,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread )
|
||||
thrinfo_t* thread )
|
||||
{
|
||||
obj_t a_local;
|
||||
obj_t b_local;
|
||||
@@ -82,9 +82,9 @@ void bli_gemm_int( obj_t* alpha,
|
||||
if ( bli_obj_has_zero_dim( *a ) ||
|
||||
bli_obj_has_zero_dim( *b ) )
|
||||
{
|
||||
if( thread_am_ochief( thread ) )
|
||||
if( bli_thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
thread_obarrier( thread );
|
||||
bli_thread_obarrier( thread );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -93,9 +93,9 @@ void bli_gemm_int( obj_t* alpha,
|
||||
if ( bli_obj_is_zeros( *a ) ||
|
||||
bli_obj_is_zeros( *b ) )
|
||||
{
|
||||
if( thread_am_ochief( thread ) )
|
||||
if( bli_thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
thread_obarrier( thread );
|
||||
bli_thread_obarrier( thread );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -111,9 +111,9 @@ void bli_gemm_int( obj_t* alpha,
|
||||
// strides and dimensions. Note that this transposition would normally
|
||||
// be handled explicitly in the packing of C, but if C is not being
|
||||
// packed, this is our last chance to handle the transposition.
|
||||
if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
|
||||
if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
|
||||
{
|
||||
//if( thread_am_ochief( thread ) ) {
|
||||
//if( bli_thread_am_ochief( thread ) ) {
|
||||
bli_obj_induce_trans( c_local );
|
||||
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
|
||||
// }
|
||||
@@ -134,8 +134,8 @@ void bli_gemm_int( obj_t* alpha,
|
||||
}
|
||||
|
||||
// Extract the variant number and implementation type.
|
||||
n = cntl_var_num( cntl );
|
||||
i = cntl_impl_type( cntl );
|
||||
n = bli_cntl_var_num( cntl );
|
||||
i = bli_cntl_impl_type( cntl );
|
||||
|
||||
// Index into the variant array to extract the correct function pointer.
|
||||
f = vars[n][i];
|
||||
|
||||
@@ -39,5 +39,5 @@ void bli_gemm_int( obj_t* alpha,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread );
|
||||
thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -50,7 +50,7 @@ typedef void (*FUNCPTR_T)(
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
gemm_thrinfo_t* thread
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
// File-local FUNCPTR_T table declared through the GENARRAY macro for
// gemm_ker_var2 (GENARRAY itself is defined elsewhere).
// NOTE(review): presumably one entry per datatype, selected with the
// execution datatype queried in bli_gemm_ker_var2() -- confirm.
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2);
|
||||
@@ -61,7 +61,7 @@ void bli_gemm_ker_var2( obj_t* a,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread )
|
||||
thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
|
||||
@@ -146,7 +146,7 @@ void PASTEMAC(ch,varname) \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
gemm_thrinfo_t* thread \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
@@ -236,11 +236,11 @@ void PASTEMAC(ch,varname) \
|
||||
bli_auxinfo_set_is_a( is_a, aux ); \
|
||||
bli_auxinfo_set_is_b( is_b, aux ); \
|
||||
\
|
||||
gemm_thrinfo_t* caucus = gemm_thread_sub_gemm( thread ); \
|
||||
dim_t jr_num_threads = thread_n_way( thread ); \
|
||||
dim_t jr_thread_id = thread_work_id( thread ); \
|
||||
dim_t ir_num_threads = thread_n_way( caucus ); \
|
||||
dim_t ir_thread_id = thread_work_id( caucus ); \
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_self( thread ); \
|
||||
dim_t jr_num_threads = bli_thread_n_way( thread ); \
|
||||
dim_t jr_thread_id = bli_thread_work_id( thread ); \
|
||||
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
|
||||
|
||||
@@ -1,78 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
// Per-thread bookkeeping for gemm: two communicator/id pairs, this level's
// work partitioning (n_way, work_id), and pointers to the thread info used for
// packing and for the next-level gemm call.
struct gemm_thrinfo_s //implements thrinfo_t
|
||||
{
|
||||
thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level
|
||||
dim_t ocomm_id; //Our thread id within that thread comm
|
||||
thread_comm_t* icomm; //The thread communicator for the other threads sharing the same work at this level (NOTE(review): same wording as ocomm; how the two communicators differ is not stated here)
|
||||
dim_t icomm_id; //Our thread id within that thread comm
|
||||
|
||||
dim_t n_way; //Number of distinct caucuses used to parallelize the loop
|
||||
dim_t work_id; //What we're working on
|
||||
|
||||
packm_thrinfo_t* opackm; //Read via gemm_thread_sub_opackm(); the blocked variants pass it to bli_packm_int() for the operand packed once, before the partitioning loop
|
||||
packm_thrinfo_t* ipackm; //Read via gemm_thread_sub_ipackm(); the blocked variants pass it to bli_packm_int()/bli_unpackm_int() for the per-iteration partitions
|
||||
struct gemm_thrinfo_s* sub_gemm; //Read via gemm_thread_sub_gemm(); passed to bli_gemm_int() for the subproblem
|
||||
};
|
||||
typedef struct gemm_thrinfo_s gemm_thrinfo_t;
|
||||
|
||||
// Accessor for the sub_gemm member of the object that thread points to.
// The parameter and the whole expansion are parenthesized so the macro is safe
// with expression arguments such as "&info" and inside larger expressions.
#define gemm_thread_sub_gemm( thread ) ( (thread)->sub_gemm )
|
||||
// Accessor for the opackm member of the object that thread points to.
// The parameter and the whole expansion are parenthesized so the macro is safe
// with expression arguments such as "&info" and inside larger expressions.
#define gemm_thread_sub_opackm( thread ) ( (thread)->opackm )
|
||||
// Accessor for the ipackm member of the object that thread points to.
// The parameter and the whole expansion are parenthesized so the macro is safe
// with expression arguments such as "&info" and inside larger expressions.
#define gemm_thread_sub_ipackm( thread ) ( (thread)->ipackm )
|
||||
|
||||
// For use in gemm micro-kernel
|
||||
// Yields a1 offset by ( step * thread->n_way ); thread must be a pointer to an
// object with an n_way member. Every parameter is parenthesized so that
// expression arguments (e.g. "i + 1" or "&info") expand with the intended
// precedence.
#define gemm_get_next_a_micropanel( thread, a1, step ) ( (a1) + (step) * (thread)->n_way )
|
||||
// Yields b1 offset by ( step * thread->n_way ); thread must be a pointer to an
// object with an n_way member. Every parameter is parenthesized so that
// expression arguments (e.g. "j + 1" or "&info") expand with the intended
// precedence.
#define gemm_get_next_b_micropanel( thread, b1, step ) ( (b1) + (step) * (thread)->n_way )
|
||||
|
||||
// Prototype only (definition not shown here). Returns an array of
// gemm_thrinfo_t pointers. bli_gemm_front() reads the thread count from
// element 0 of the result and later passes the array to
// bli_gemm_thrinfo_free_paths().
// NOTE(review): the empty parameter list "( )" leaves the arguments
// unspecified in C; "( void )" is presumably what was meant.
gemm_thrinfo_t** bli_create_gemm_thrinfo_paths( );
|
||||
// Prototype only (definition not shown here). Takes an array of
// gemm_thrinfo_t pointers (parameter left unnamed) and a count named
// n_threads. bli_gemm_front() passes the array returned by
// bli_create_gemm_thrinfo_paths() together with the thread count.
// NOTE(review): presumably frees every path and the array itself -- confirm.
void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t**, dim_t n_threads );
|
||||
|
||||
// Prototype only (definition not shown here). Operates on a caller-supplied
// gemm_thrinfo_t (thread); the remaining parameters carry the same names and
// types as the members of struct gemm_thrinfo_s.
// NOTE(review): presumably a field-by-field initializer -- confirm.
void bli_setup_gemm_thrinfo_node( gemm_thrinfo_t* thread,
|
||||
thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
gemm_thrinfo_t* sub_gemm );
|
||||
|
||||
// Prototype only (definition not shown here). Returns a gemm_thrinfo_t*; the
// parameter list matches bli_setup_gemm_thrinfo_node() minus the leading
// target pointer.
// NOTE(review): presumably allocates a node and then initializes it with the
// given values; ownership of the result is not visible here -- confirm.
gemm_thrinfo_t* bli_create_gemm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
gemm_thrinfo_t* sub_gemm );
|
||||
|
||||
// Prototype only (definition not shown here). Takes only the gemm_thrinfo_t
// to operate on.
// NOTE(review): by its name, presumably sets *thread up for single-threaded
// execution -- confirm against the definition.
void bli_setup_gemm_single_threaded_info( gemm_thrinfo_t* thread );
|
||||
@@ -47,7 +47,7 @@ void PASTEMAC0(opname) \
|
||||
obj_t* c, \
|
||||
cntx_t* cntx, \
|
||||
gemm_t* cntl, \
|
||||
gemm_thrinfo_t* thread \
|
||||
thrinfo_t* thread \
|
||||
);
|
||||
|
||||
GENPROT( gemm_blk_var1f )
|
||||
@@ -84,7 +84,7 @@ void PASTEMAC(ch,varname) \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
gemm_thrinfo_t* thread \
|
||||
thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemm_ker_var2 )
|
||||
|
||||
@@ -39,7 +39,7 @@ void bli_gemm_blk_var4f( obj_t* a,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread )
|
||||
thrinfo_t* thread )
|
||||
{
|
||||
//The s is for "lives on the stack"
|
||||
obj_t b_pack_s;
|
||||
@@ -58,37 +58,37 @@ void bli_gemm_blk_var4f( obj_t* a,
|
||||
cntx_t cntx_io = *cntx;
|
||||
cntx_t cntx_rpi = *cntx;
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
if( bli_thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing B.
|
||||
bli_obj_init_pack( &b_pack_s );
|
||||
bli_packm_init( b, &b_pack_s,
|
||||
cntx, cntl_sub_packm_b( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
// Since scalm doesn't support multithreading yet, must be done by
|
||||
// chief thread (ew)
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntx, cntl_sub_scalm( cntl ) );
|
||||
cntx, bli_cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
b_pack = thread_obroadcast( thread, &b_pack_s );
|
||||
b_pack = bli_thread_obroadcast( thread, &b_pack_s );
|
||||
|
||||
// Initialize objects passed into bli_packm_init for A and C
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &a1_pack_s );
|
||||
bli_obj_init_pack( &c1_pack_s );
|
||||
}
|
||||
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
|
||||
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
|
||||
a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s );
|
||||
c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s );
|
||||
|
||||
// Pack B (if instructed).
|
||||
bli_packm_int( b, b_pack,
|
||||
cntx, cntl_sub_packm_b( cntl ),
|
||||
gemm_thread_sub_opackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_b( cntl ),
|
||||
bli_thrinfo_sub_opackm( thread ) );
|
||||
|
||||
dim_t my_start, my_end;
|
||||
bli_get_range_t2b( thread, a,
|
||||
bli_cntx_get_bmult( cntl_bszid( cntl ), cntx ),
|
||||
bli_thread_get_range_t2b( thread, a,
|
||||
bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ),
|
||||
&my_start, &my_end );
|
||||
|
||||
// Partition along the m dimension.
|
||||
@@ -99,7 +99,7 @@ void bli_gemm_blk_var4f( obj_t* a,
|
||||
// This causes the right blocksize to be used if c and a are
|
||||
// complex and b is real.
|
||||
b_alg = bli_determine_blocksize_f( i, my_end, a,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for A1 and C1.
|
||||
bli_acquire_mpart_t2b( BLIS_SUBPART1,
|
||||
@@ -112,23 +112,23 @@ void bli_gemm_blk_var4f( obj_t* a,
|
||||
bli_gemm3m3_cntx_stage( 0, &cntx_ro );
|
||||
|
||||
// Initialize objects for packing A1 and C1.
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &a1, a1_pack,
|
||||
&cntx_ro, cntl_sub_packm_a( cntl ) );
|
||||
&cntx_ro, bli_cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &c1, c1_pack,
|
||||
&cntx_ro, cntl_sub_packm_c( cntl ) );
|
||||
&cntx_ro, bli_cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
bli_thread_ibarrier( thread );
|
||||
|
||||
// Pack A1 (if instructed).
|
||||
bli_packm_int( &a1, a1_pack,
|
||||
&cntx_ro, cntl_sub_packm_a( cntl ),
|
||||
gemm_thread_sub_ipackm( thread ) );
|
||||
&cntx_ro, bli_cntl_sub_packm_a( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
|
||||
// Pack C1 (if instructed).
|
||||
bli_packm_int( &c1, c1_pack,
|
||||
&cntx_ro, cntl_sub_packm_c( cntl ),
|
||||
gemm_thread_sub_ipackm( thread ) );
|
||||
&cntx_ro, bli_cntl_sub_packm_c( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
|
||||
// Perform gemm subproblem (real-only).
|
||||
bli_gemm_int( &BLIS_ONE,
|
||||
@@ -137,30 +137,30 @@ void bli_gemm_blk_var4f( obj_t* a,
|
||||
&BLIS_ONE,
|
||||
c1_pack,
|
||||
cntx,
|
||||
cntl_sub_gemm( cntl ),
|
||||
gemm_thread_sub_gemm( thread ) );
|
||||
bli_cntl_sub_gemm( cntl ),
|
||||
bli_thrinfo_sub_self( thread ) );
|
||||
|
||||
thread_ibarrier( thread );
|
||||
bli_thread_ibarrier( thread );
|
||||
|
||||
|
||||
// Only apply beta within the first of three subproblems.
|
||||
if ( thread_am_ichief( thread ) ) bli_obj_scalar_reset( c1_pack );
|
||||
if ( bli_thread_am_ichief( thread ) ) bli_obj_scalar_reset( c1_pack );
|
||||
|
||||
|
||||
// Initialize the context for the imag-only stage.
|
||||
bli_gemm3m3_cntx_stage( 1, &cntx_io );
|
||||
|
||||
// Initialize the object for packing A1 (imag-only stage).
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &a1, a1_pack,
|
||||
&cntx_io, cntl_sub_packm_a( cntl ) );
|
||||
&cntx_io, bli_cntl_sub_packm_a( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
bli_thread_ibarrier( thread );
|
||||
|
||||
// Pack A1 (if instructed).
|
||||
bli_packm_int( &a1, a1_pack,
|
||||
&cntx_io, cntl_sub_packm_a( cntl ),
|
||||
gemm_thread_sub_ipackm( thread ) );
|
||||
&cntx_io, bli_cntl_sub_packm_a( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
|
||||
// Perform gemm subproblem (imag-only).
|
||||
bli_gemm_int( &BLIS_ONE,
|
||||
@@ -169,26 +169,26 @@ void bli_gemm_blk_var4f( obj_t* a,
|
||||
&BLIS_ONE,
|
||||
c1_pack,
|
||||
cntx,
|
||||
cntl_sub_gemm( cntl ),
|
||||
gemm_thread_sub_gemm( thread ) );
|
||||
bli_cntl_sub_gemm( cntl ),
|
||||
bli_thrinfo_sub_self( thread ) );
|
||||
|
||||
thread_ibarrier( thread );
|
||||
bli_thread_ibarrier( thread );
|
||||
|
||||
|
||||
// Initialize the context for the real+imag stage.
|
||||
bli_gemm3m3_cntx_stage( 2, &cntx_rpi );
|
||||
|
||||
// Initialize objects for packing A1 and C1.
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &a1, a1_pack,
|
||||
&cntx_rpi, cntl_sub_packm_a( cntl ) );
|
||||
&cntx_rpi, bli_cntl_sub_packm_a( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
bli_thread_ibarrier( thread );
|
||||
|
||||
// Pack A1 (if instructed).
|
||||
bli_packm_int( &a1, a1_pack,
|
||||
&cntx_rpi, cntl_sub_packm_a( cntl ),
|
||||
gemm_thread_sub_ipackm( thread ) );
|
||||
&cntx_rpi, bli_cntl_sub_packm_a( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
|
||||
// Perform gemm subproblem (real+imag).
|
||||
bli_gemm_int( &BLIS_ONE,
|
||||
@@ -197,30 +197,30 @@ void bli_gemm_blk_var4f( obj_t* a,
|
||||
&BLIS_ONE,
|
||||
c1_pack,
|
||||
cntx,
|
||||
cntl_sub_gemm( cntl ),
|
||||
gemm_thread_sub_gemm( thread ) );
|
||||
bli_cntl_sub_gemm( cntl ),
|
||||
bli_thrinfo_sub_self( thread ) );
|
||||
|
||||
thread_ibarrier( thread );
|
||||
bli_thread_ibarrier( thread );
|
||||
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
// Currently must be done by 1 thread
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
cntx, cntl_sub_unpackm_c( cntl ),
|
||||
gemm_thread_sub_ipackm( thread ) );
|
||||
cntx, bli_cntl_sub_unpackm_c( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
thread_obarrier( thread );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_packm_release( b_pack, cntl_sub_packm_b( cntl ) );
|
||||
if( thread_am_ichief( thread ) ){
|
||||
bli_thread_obarrier( thread );
|
||||
if( bli_thread_am_ochief( thread ) )
|
||||
bli_packm_release( b_pack, bli_cntl_sub_packm_b( cntl ) );
|
||||
if( bli_thread_am_ichief( thread ) ){
|
||||
// It doesn't matter which packm cntl node we pass in, as long
|
||||
// as it is valid, packm_release() will release the mem_t entry
|
||||
// stored in a1_pack.
|
||||
bli_packm_release( a1_pack, cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) );
|
||||
bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -37,5 +37,5 @@ void bli_gemm_blk_var4f( obj_t* a,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread );
|
||||
thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -50,7 +50,7 @@ typedef void (*FUNCPTR_T)(
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
gemm_thrinfo_t* thread
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var3);
|
||||
@@ -61,7 +61,7 @@ void bli_gemm_ker_var3( obj_t* a,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread )
|
||||
thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
|
||||
@@ -146,7 +146,7 @@ void PASTEMAC(ch,varname) \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
gemm_thrinfo_t* thread \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
@@ -238,11 +238,11 @@ void PASTEMAC(ch,varname) \
|
||||
bli_auxinfo_set_is_a( is_a, aux ); \
|
||||
bli_auxinfo_set_is_b( is_b, aux ); \
|
||||
\
|
||||
gemm_thrinfo_t* caucus = gemm_thread_sub_gemm( thread ); \
|
||||
dim_t jr_num_threads = thread_n_way( thread ); \
|
||||
dim_t jr_thread_id = thread_work_id( thread ); \
|
||||
dim_t ir_num_threads = thread_n_way( caucus ); \
|
||||
dim_t ir_thread_id = thread_work_id( caucus ); \
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_self( thread ); \
|
||||
dim_t jr_num_threads = bli_thread_n_way( thread ); \
|
||||
dim_t jr_thread_id = bli_thread_work_id( thread ); \
|
||||
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
|
||||
|
||||
@@ -41,7 +41,7 @@ void bli_gemm_ker_var3( obj_t* a,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread );
|
||||
thrinfo_t* thread );
|
||||
|
||||
|
||||
//
|
||||
@@ -65,7 +65,7 @@ void PASTEMAC(ch,varname)( \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
void* gemm_ukr, \
|
||||
gemm_thrinfo_t* thread \
|
||||
thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemm_ker_var3 )
|
||||
|
||||
@@ -50,7 +50,7 @@ typedef void (*FUNCPTR_T)(
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
gemm_thrinfo_t* thread
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var4);
|
||||
@@ -61,7 +61,7 @@ void bli_gemm_ker_var4( obj_t* a,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread )
|
||||
thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
|
||||
@@ -146,7 +146,7 @@ void PASTEMAC(ch,varname) \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
gemm_thrinfo_t* thread \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
@@ -238,11 +238,11 @@ void PASTEMAC(ch,varname) \
|
||||
bli_auxinfo_set_is_a( is_a, aux ); \
|
||||
bli_auxinfo_set_is_b( is_b, aux ); \
|
||||
\
|
||||
gemm_thrinfo_t* caucus = gemm_thread_sub_gemm( thread ); \
|
||||
dim_t jr_num_threads = thread_n_way( thread ); \
|
||||
dim_t jr_thread_id = thread_work_id( thread ); \
|
||||
dim_t ir_num_threads = thread_n_way( caucus ); \
|
||||
dim_t ir_thread_id = thread_work_id( caucus ); \
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_self( thread ); \
|
||||
dim_t jr_num_threads = bli_thread_n_way( thread ); \
|
||||
dim_t jr_thread_id = bli_thread_work_id( thread ); \
|
||||
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
|
||||
|
||||
@@ -41,7 +41,7 @@ void bli_gemm_ker_var4( obj_t* a,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
gemm_t* cntl,
|
||||
gemm_thrinfo_t* thread );
|
||||
thrinfo_t* thread );
|
||||
|
||||
|
||||
//
|
||||
@@ -65,7 +65,7 @@ void PASTEMAC(ch,varname)( \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
void* gemm_ukr, \
|
||||
gemm_thrinfo_t* thread \
|
||||
thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemm_ker_var4 )
|
||||
|
||||
@@ -35,98 +35,25 @@
|
||||
#include "blis.h"
|
||||
#include "assert.h"
|
||||
|
||||
void bli_setup_herk_thrinfo_node( herk_thrinfo_t* thread,
|
||||
thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
herk_thrinfo_t* sub_herk )
|
||||
{
|
||||
thread->ocomm = ocomm;
|
||||
thread->ocomm_id = ocomm_id;
|
||||
thread->icomm = icomm;
|
||||
thread->icomm_id = icomm_id;
|
||||
thread->n_way = n_way;
|
||||
thread->work_id = work_id;
|
||||
thread->opackm = opackm;
|
||||
thread->ipackm = ipackm;
|
||||
thread->sub_herk = sub_herk;
|
||||
}
|
||||
|
||||
void bli_setup_herk_single_threaded_info( herk_thrinfo_t* thread )
|
||||
{
|
||||
thread->ocomm = &BLIS_SINGLE_COMM;
|
||||
thread->ocomm_id = 0;
|
||||
thread->icomm = &BLIS_SINGLE_COMM;
|
||||
thread->icomm_id = 0;
|
||||
thread->n_way = 1;
|
||||
thread->work_id = 0;
|
||||
thread->opackm = &BLIS_PACKM_SINGLE_THREADED;
|
||||
thread->ipackm = &BLIS_PACKM_SINGLE_THREADED;
|
||||
thread->sub_herk = thread;
|
||||
}
|
||||
|
||||
herk_thrinfo_t* bli_create_herk_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
herk_thrinfo_t* sub_herk )
|
||||
{
|
||||
herk_thrinfo_t* thread = ( herk_thrinfo_t* ) bli_malloc_intl( sizeof( herk_thrinfo_t ) );
|
||||
bli_setup_herk_thrinfo_node( thread, ocomm, ocomm_id,
|
||||
icomm, icomm_id,
|
||||
n_way, work_id,
|
||||
opackm,
|
||||
ipackm,
|
||||
sub_herk );
|
||||
return thread;
|
||||
}
|
||||
|
||||
void bli_herk_thrinfo_free( herk_thrinfo_t* thread)
|
||||
{
|
||||
if( thread == NULL ) return;
|
||||
|
||||
// Free Communicators
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_free_communicator( thread->ocomm );
|
||||
if( thread->sub_herk == NULL && thread_am_ichief( thread ) )
|
||||
bli_free_communicator( thread->icomm );
|
||||
|
||||
// Free Sub Thrinfos
|
||||
bli_packm_thrinfo_free( thread->opackm );
|
||||
bli_packm_thrinfo_free( thread->ipackm );
|
||||
bli_herk_thrinfo_free( thread->sub_herk );
|
||||
bli_free_intl( thread );
|
||||
|
||||
return;
|
||||
}
|
||||
void bli_herk_thrinfo_free_paths( herk_thrinfo_t** threads, dim_t num )
|
||||
{
|
||||
for( int i = 0; i < num; i++)
|
||||
bli_herk_thrinfo_free( threads[i] );
|
||||
bli_free_intl( threads );
|
||||
}
|
||||
|
||||
herk_thrinfo_t** bli_create_herk_thrinfo_paths( )
|
||||
#if 0
|
||||
thrinfo_t** bli_gemm_thrinfo_create_paths( void )
|
||||
{
|
||||
|
||||
#ifdef BLIS_ENABLE_MULTITHREADING
|
||||
dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" );
|
||||
// dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" );
|
||||
#ifdef BLIS_ENABLE_MULTITHREADING
|
||||
dim_t jc_way = bli_env_read_nway( "BLIS_JC_NT" );
|
||||
// dim_t kc_way = bli_env_read_nway( "BLIS_KC_NT" );
|
||||
dim_t kc_way = 1;
|
||||
dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" );
|
||||
dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" );
|
||||
dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" );
|
||||
dim_t ic_way = bli_env_read_nway( "BLIS_IC_NT" );
|
||||
dim_t jr_way = bli_env_read_nway( "BLIS_JR_NT" );
|
||||
dim_t ir_way = bli_env_read_nway( "BLIS_IR_NT" );
|
||||
#else
|
||||
dim_t jc_way = 1;
|
||||
dim_t kc_way = 1;
|
||||
dim_t ic_way = 1;
|
||||
dim_t ic_way = 1;
|
||||
dim_t jr_way = 1;
|
||||
dim_t ir_way = 1;
|
||||
#endif
|
||||
|
||||
|
||||
dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
|
||||
assert( global_num_threads != 0 );
|
||||
|
||||
@@ -137,78 +64,77 @@ herk_thrinfo_t** bli_create_herk_thrinfo_paths( )
|
||||
dim_t ir_nt = 1;
|
||||
|
||||
|
||||
herk_thrinfo_t** paths = (herk_thrinfo_t**) bli_malloc_intl( global_num_threads * sizeof( herk_thrinfo_t* ) );
|
||||
thrinfo_t** paths = bli_malloc_intl( global_num_threads * sizeof( thrinfo_t* ) );
|
||||
|
||||
thread_comm_t* global_comm = bli_create_communicator( global_num_threads );
|
||||
thrcomm_t* global_comm = bli_thrcomm_create( global_num_threads );
|
||||
for( int a = 0; a < jc_way; a++ )
|
||||
{
|
||||
thread_comm_t* jc_comm = bli_create_communicator( jc_nt );
|
||||
{
|
||||
thrcomm_t* jc_comm = bli_thrcomm_create( jc_nt );
|
||||
for( int b = 0; b < kc_way; b++ )
|
||||
{
|
||||
thread_comm_t* kc_comm = bli_create_communicator( kc_nt );
|
||||
{
|
||||
thrcomm_t* kc_comm = bli_thrcomm_create( kc_nt );
|
||||
for( int c = 0; c < ic_way; c++ )
|
||||
{
|
||||
thread_comm_t* ic_comm = bli_create_communicator( ic_nt );
|
||||
{
|
||||
thrcomm_t* ic_comm = bli_thrcomm_create( ic_nt );
|
||||
for( int d = 0; d < jr_way; d++ )
|
||||
{
|
||||
thread_comm_t* jr_comm = bli_create_communicator( jr_nt );
|
||||
for( int e = 0; e < ir_way; e++)
|
||||
{
|
||||
thread_comm_t* ir_comm = bli_create_communicator( ir_nt );
|
||||
{
|
||||
thrcomm_t* jr_comm = bli_thrcomm_create( jr_nt );
|
||||
for( int e = 0; e < ir_way; e++ )
|
||||
{
|
||||
thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt );
|
||||
dim_t ir_comm_id = 0;
|
||||
dim_t jr_comm_id = e*ir_nt + ir_comm_id;
|
||||
dim_t ic_comm_id = d*jr_nt + jr_comm_id;
|
||||
dim_t kc_comm_id = c*ic_nt + ic_comm_id;
|
||||
dim_t jc_comm_id = b*kc_nt + kc_comm_id;
|
||||
dim_t global_comm_id = a*jc_nt + jc_comm_id;
|
||||
|
||||
// Macrokernel loops
|
||||
herk_thrinfo_t* ir_info = bli_create_herk_thrinfo_node( jr_comm, jr_comm_id,
|
||||
|
||||
// Macrokernel loops
|
||||
thrinfo_t* ir_info = bli_l3_thrinfo_create_node( jr_comm, jr_comm_id,
|
||||
ir_comm, ir_comm_id,
|
||||
ir_way, e,
|
||||
ir_way, e,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
herk_thrinfo_t* jr_info = bli_create_herk_thrinfo_node( ic_comm, ic_comm_id,
|
||||
thrinfo_t* jr_info = bli_l3_thrinfo_create_node( ic_comm, ic_comm_id,
|
||||
jr_comm, jr_comm_id,
|
||||
jr_way, d,
|
||||
jr_way, d,
|
||||
NULL, NULL, ir_info);
|
||||
//blk_var_1
|
||||
packm_thrinfo_t* pack_ic_in = bli_create_packm_thread_info( ic_comm, ic_comm_id,
|
||||
packm_thrinfo_t* pack_ic_in = bli_packm_thrinfo_create( ic_comm, ic_comm_id,
|
||||
jr_comm, jr_comm_id,
|
||||
ic_nt, ic_comm_id );
|
||||
|
||||
packm_thrinfo_t* pack_ic_out = bli_create_packm_thread_info( kc_comm, kc_comm_id,
|
||||
packm_thrinfo_t* pack_ic_out = bli_packm_thrinfo_create( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
kc_nt, kc_comm_id );
|
||||
|
||||
herk_thrinfo_t* ic_info = bli_create_herk_thrinfo_node( kc_comm, kc_comm_id,
|
||||
thrinfo_t* ic_info = bli_l3_thrinfo_create_node( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
ic_way, c,
|
||||
ic_way, c,
|
||||
pack_ic_out, pack_ic_in, jr_info);
|
||||
//blk_var_3
|
||||
packm_thrinfo_t* pack_kc_in = bli_create_packm_thread_info( kc_comm, kc_comm_id,
|
||||
packm_thrinfo_t* pack_kc_in = bli_packm_thrinfo_create( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
kc_nt, kc_comm_id );
|
||||
|
||||
packm_thrinfo_t* pack_kc_out = bli_create_packm_thread_info( jc_comm, jc_comm_id,
|
||||
packm_thrinfo_t* pack_kc_out = bli_packm_thrinfo_create( jc_comm, jc_comm_id,
|
||||
jc_comm, jc_comm_id,
|
||||
jc_nt, jc_comm_id );
|
||||
|
||||
herk_thrinfo_t* kc_info = bli_create_herk_thrinfo_node( jc_comm, jc_comm_id,
|
||||
thrinfo_t* kc_info = bli_l3_thrinfo_create_node( jc_comm, jc_comm_id,
|
||||
kc_comm, kc_comm_id,
|
||||
kc_way, b,
|
||||
pack_kc_out, pack_kc_in, ic_info);
|
||||
|
||||
//blk_var_2
|
||||
packm_thrinfo_t* pack_jc_in = bli_create_packm_thread_info( jc_comm, jc_comm_id,
|
||||
packm_thrinfo_t* pack_jc_in = bli_packm_thrinfo_create( jc_comm, jc_comm_id,
|
||||
kc_comm, kc_comm_id,
|
||||
jc_nt, jc_comm_id );
|
||||
|
||||
packm_thrinfo_t* pack_jc_out = bli_create_packm_thread_info( global_comm, global_comm_id,
|
||||
packm_thrinfo_t* pack_jc_out = bli_packm_thrinfo_create( global_comm, global_comm_id,
|
||||
jc_comm, jc_comm_id,
|
||||
global_num_threads, global_comm_id );
|
||||
|
||||
herk_thrinfo_t* jc_info = bli_create_herk_thrinfo_node( global_comm, global_comm_id,
|
||||
thrinfo_t* jc_info = bli_l3_thrinfo_create_node( global_comm, global_comm_id,
|
||||
jc_comm, jc_comm_id,
|
||||
jc_way, a,
|
||||
pack_jc_out, pack_jc_in, kc_info);
|
||||
@@ -221,3 +147,4 @@ herk_thrinfo_t** bli_create_herk_thrinfo_paths( )
|
||||
}
|
||||
return paths;
|
||||
}
|
||||
#endif
|
||||
44
frame/3/gemm/old/bli_gemm_thread.h
Normal file
44
frame/3/gemm/old/bli_gemm_thread.h
Normal file
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#define bli_thrinfo_sub_self( thread ) thread->sub_l3op
|
||||
#define bli_thrinfo_sub_opackm( thread ) thread->opackm
|
||||
#define bli_thrinfo_sub_ipackm( thread ) thread->ipackm
|
||||
|
||||
// For use in gemm micro-kernel
|
||||
#define gemm_get_next_a_micropanel( thread, a1, step ) ( a1 + step * thread->n_way )
|
||||
#define gemm_get_next_b_micropanel( thread, b1, step ) ( b1 + step * thread->n_way )
|
||||
|
||||
//thrinfo_t** bli_gemm_thrinfo_create_paths( void );
|
||||
|
||||
@@ -86,11 +86,11 @@ void bli_hemm_front( side_t side,
|
||||
bli_obj_swap( a_local, b_local );
|
||||
}
|
||||
|
||||
gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths();
|
||||
dim_t n_threads = thread_num_threads( infos[0] );
|
||||
thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HEMM, BLIS_LEFT );
|
||||
dim_t n_threads = bli_thread_num_threads( infos[0] );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
bli_l3_thread_decorator( n_threads,
|
||||
(l3_int_t) bli_gemm_int,
|
||||
alpha,
|
||||
&a_local,
|
||||
@@ -101,7 +101,7 @@ void bli_hemm_front( side_t side,
|
||||
(void*) cntl,
|
||||
(void**) infos );
|
||||
|
||||
bli_gemm_thrinfo_free_paths( infos, n_threads );
|
||||
bli_l3_thrinfo_free_paths( infos, n_threads );
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -118,11 +118,11 @@ void bli_her2k_front( obj_t* alpha,
|
||||
#else
|
||||
|
||||
// Invoke herk twice, using beta only the first time.
|
||||
herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
|
||||
dim_t n_threads = thread_num_threads( infos[0] );
|
||||
thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HER2K, BLIS_LEFT );
|
||||
dim_t n_threads = bli_thread_num_threads( infos[0] );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
bli_l3_thread_decorator( n_threads,
|
||||
(l3_int_t) bli_herk_int,
|
||||
alpha,
|
||||
&a_local,
|
||||
@@ -133,7 +133,7 @@ void bli_her2k_front( obj_t* alpha,
|
||||
(void*) cntl,
|
||||
(void**) infos );
|
||||
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
bli_l3_thread_decorator( n_threads,
|
||||
(l3_int_t) bli_herk_int,
|
||||
&alpha_conj,
|
||||
&b_local,
|
||||
@@ -144,7 +144,7 @@ void bli_her2k_front( obj_t* alpha,
|
||||
(void*) cntl,
|
||||
(void**) infos );
|
||||
|
||||
bli_herk_thrinfo_free_paths( infos, n_threads );
|
||||
bli_l3_thrinfo_free_paths( infos, n_threads );
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@ void bli_herk_blk_var1f( obj_t* a,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
gemm_t* cntl,
|
||||
herk_thrinfo_t* thread )
|
||||
thrinfo_t* thread )
|
||||
{
|
||||
obj_t ah_pack_s;
|
||||
obj_t a1_pack_s, c1_pack_s;
|
||||
@@ -55,36 +55,36 @@ void bli_herk_blk_var1f( obj_t* a,
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_herk_prune_unref_mparts_m( a, ah, c );
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
if( bli_thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing A'.
|
||||
bli_obj_init_pack( &ah_pack_s );
|
||||
bli_packm_init( ah, &ah_pack_s,
|
||||
cntx, cntl_sub_packm_b( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
// Since scalm doesn't support multithreading yet, must be done by chief thread (ew)
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntx, cntl_sub_scalm( cntl ) );
|
||||
cntx, bli_cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
ah_pack = thread_obroadcast( thread, &ah_pack_s );
|
||||
ah_pack = bli_thread_obroadcast( thread, &ah_pack_s );
|
||||
|
||||
// Initialize pack objects that are passed into packm_init() for A and C.
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &a1_pack_s );
|
||||
bli_obj_init_pack( &c1_pack_s );
|
||||
}
|
||||
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
|
||||
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
|
||||
a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s );
|
||||
c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s );
|
||||
|
||||
// Pack A' (if instructed).
|
||||
bli_packm_int( ah, ah_pack,
|
||||
cntx, cntl_sub_packm_b( cntl ),
|
||||
herk_thread_sub_opackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_b( cntl ),
|
||||
bli_thrinfo_sub_opackm( thread ) );
|
||||
|
||||
dim_t my_start, my_end;
|
||||
bli_get_range_weighted_t2b( thread, c,
|
||||
bli_cntx_get_bmult( cntl_bszid( cntl ), cntx ),
|
||||
bli_thread_get_range_weighted_t2b( thread, c,
|
||||
bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ),
|
||||
&my_start, &my_end );
|
||||
|
||||
// Partition along the m dimension.
|
||||
@@ -92,7 +92,7 @@ void bli_herk_blk_var1f( obj_t* a,
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( i, my_end, a,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for A1 and C1.
|
||||
bli_acquire_mpart_t2b( BLIS_SUBPART1,
|
||||
@@ -101,23 +101,23 @@ void bli_herk_blk_var1f( obj_t* a,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Initialize objects for packing A1 and C1.
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &a1, a1_pack,
|
||||
cntx, cntl_sub_packm_a( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &c1, c1_pack,
|
||||
cntx, cntl_sub_packm_c( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
bli_thread_ibarrier( thread );
|
||||
|
||||
// Pack A1 (if instructed).
|
||||
bli_packm_int( &a1, a1_pack,
|
||||
cntx, cntl_sub_packm_a( cntl ),
|
||||
herk_thread_sub_ipackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_a( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
|
||||
// Pack C1 (if instructed).
|
||||
bli_packm_int( &c1, c1_pack,
|
||||
cntx, cntl_sub_packm_c( cntl ),
|
||||
herk_thread_sub_ipackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_c( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
|
||||
// Perform herk subproblem.
|
||||
bli_herk_int( &BLIS_ONE,
|
||||
@@ -126,25 +126,25 @@ void bli_herk_blk_var1f( obj_t* a,
|
||||
&BLIS_ONE,
|
||||
c1_pack,
|
||||
cntx,
|
||||
cntl_sub_gemm( cntl ),
|
||||
herk_thread_sub_herk( thread ) );
|
||||
bli_cntl_sub_gemm( cntl ),
|
||||
bli_thrinfo_sub_self( thread ) );
|
||||
|
||||
thread_ibarrier( thread );
|
||||
bli_thread_ibarrier( thread );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
cntx, cntl_sub_unpackm_c( cntl ),
|
||||
herk_thread_sub_ipackm( thread ) );
|
||||
cntx, bli_cntl_sub_unpackm_c( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
thread_obarrier( thread );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_packm_release( ah_pack, cntl_sub_packm_b( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_release( a1_pack, cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) );
|
||||
bli_thread_obarrier( thread );
|
||||
if( bli_thread_am_ochief( thread ) )
|
||||
bli_packm_release( ah_pack, bli_cntl_sub_packm_b( cntl ) );
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@ void bli_herk_blk_var2f( obj_t* a,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
gemm_t* cntl,
|
||||
herk_thrinfo_t* thread )
|
||||
thrinfo_t* thread )
|
||||
{
|
||||
obj_t a_pack_s;
|
||||
obj_t ah1_pack_s, c1_pack_s;
|
||||
@@ -55,35 +55,35 @@ void bli_herk_blk_var2f( obj_t* a,
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_herk_prune_unref_mparts_n( a, ah, c );
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
if( bli_thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing A
|
||||
bli_obj_init_pack( &a_pack_s );
|
||||
bli_packm_init( a, &a_pack_s,
|
||||
cntx, cntl_sub_packm_a( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntx, cntl_sub_scalm( cntl ) );
|
||||
cntx, bli_cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
a_pack = thread_obroadcast( thread, &a_pack_s );
|
||||
a_pack = bli_thread_obroadcast( thread, &a_pack_s );
|
||||
|
||||
// Initialize pack objects for C and A' that are passed into packm_init().
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &ah1_pack_s );
|
||||
bli_obj_init_pack( &c1_pack_s );
|
||||
}
|
||||
ah1_pack = thread_ibroadcast( thread, &ah1_pack_s );
|
||||
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
|
||||
ah1_pack = bli_thread_ibroadcast( thread, &ah1_pack_s );
|
||||
c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s );
|
||||
|
||||
// Pack A (if instructed).
|
||||
bli_packm_int( a, a_pack,
|
||||
cntx, cntl_sub_packm_a( cntl ),
|
||||
herk_thread_sub_opackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_a( cntl ),
|
||||
bli_thrinfo_sub_opackm( thread ) );
|
||||
|
||||
dim_t my_start, my_end;
|
||||
bli_get_range_weighted_l2r( thread, c,
|
||||
bli_cntx_get_bmult( cntl_bszid( cntl ), cntx ),
|
||||
bli_thread_get_range_weighted_l2r( thread, c,
|
||||
bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ),
|
||||
&my_start, &my_end );
|
||||
|
||||
// Partition along the n dimension.
|
||||
@@ -91,7 +91,7 @@ void bli_herk_blk_var2f( obj_t* a,
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( i, my_end, a,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for A1' and C1.
|
||||
bli_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
@@ -100,23 +100,23 @@ void bli_herk_blk_var2f( obj_t* a,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Initialize objects for packing A1' and C1.
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &ah1, ah1_pack,
|
||||
cntx, cntl_sub_packm_b( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_init( &c1, c1_pack,
|
||||
cntx, cntl_sub_packm_c( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread ) ;
|
||||
bli_thread_ibarrier( thread ) ;
|
||||
|
||||
// Pack A1' (if instructed).
|
||||
bli_packm_int( &ah1, ah1_pack,
|
||||
cntx, cntl_sub_packm_b( cntl ),
|
||||
herk_thread_sub_ipackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_b( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
|
||||
// Pack C1 (if instructed).
|
||||
bli_packm_int( &c1, c1_pack,
|
||||
cntx, cntl_sub_packm_c( cntl ),
|
||||
herk_thread_sub_ipackm( thread ) ) ;
|
||||
cntx, bli_cntl_sub_packm_c( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) ) ;
|
||||
|
||||
// Perform herk subproblem.
|
||||
bli_herk_int( &BLIS_ONE,
|
||||
@@ -125,25 +125,25 @@ void bli_herk_blk_var2f( obj_t* a,
|
||||
&BLIS_ONE,
|
||||
c1_pack,
|
||||
cntx,
|
||||
cntl_sub_gemm( cntl ),
|
||||
herk_thread_sub_herk( thread ) );
|
||||
bli_cntl_sub_gemm( cntl ),
|
||||
bli_thrinfo_sub_self( thread ) );
|
||||
|
||||
thread_ibarrier( thread );
|
||||
bli_thread_ibarrier( thread );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
cntx, cntl_sub_unpackm_c( cntl ),
|
||||
herk_thread_sub_ipackm( thread ) );
|
||||
cntx, bli_cntl_sub_unpackm_c( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
thread_obarrier( thread );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_packm_release( a_pack, cntl_sub_packm_a( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_release( ah1_pack, cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) );
|
||||
bli_thread_obarrier( thread );
|
||||
if( bli_thread_am_ochief( thread ) )
|
||||
bli_packm_release( a_pack, bli_cntl_sub_packm_a( cntl ) );
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_packm_release( ah1_pack, bli_cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@ void bli_herk_blk_var3f( obj_t* a,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
gemm_t* cntl,
|
||||
herk_thrinfo_t* thread )
|
||||
thrinfo_t* thread )
|
||||
{
|
||||
obj_t c_pack_s;
|
||||
obj_t a1_pack_s, ah1_pack_s;
|
||||
@@ -56,31 +56,31 @@ void bli_herk_blk_var3f( obj_t* a,
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_herk_prune_unref_mparts_k( a, ah, c );
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
if( bli_thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing C.
|
||||
bli_obj_init_pack( &c_pack_s );
|
||||
bli_packm_init( c, &c_pack_s,
|
||||
cntx, cntl_sub_packm_c( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_c( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntx, cntl_sub_scalm( cntl ) );
|
||||
cntx, bli_cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
c_pack = thread_obroadcast( thread, &c_pack_s );
|
||||
c_pack = bli_thread_obroadcast( thread, &c_pack_s );
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &a1_pack_s );
|
||||
bli_obj_init_pack( &ah1_pack_s );
|
||||
}
|
||||
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
|
||||
ah1_pack = thread_ibroadcast( thread, &ah1_pack_s );
|
||||
a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s );
|
||||
ah1_pack = bli_thread_ibroadcast( thread, &ah1_pack_s );
|
||||
|
||||
// Pack C (if instructed).
|
||||
bli_packm_int( c, c_pack,
|
||||
cntx, cntl_sub_packm_c( cntl ),
|
||||
herk_thread_sub_opackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_c( cntl ),
|
||||
bli_thrinfo_sub_opackm( thread ) );
|
||||
|
||||
// Query dimension in partitioning direction.
|
||||
k_trans = bli_obj_width_after_trans( *a );
|
||||
@@ -90,7 +90,7 @@ void bli_herk_blk_var3f( obj_t* a,
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( i, k_trans, a,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for A1 and A1'.
|
||||
bli_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
@@ -99,23 +99,23 @@ void bli_herk_blk_var3f( obj_t* a,
|
||||
i, b_alg, ah, &ah1 );
|
||||
|
||||
// Initialize objects for packing A1 and A1'.
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &a1, a1_pack,
|
||||
cntx, cntl_sub_packm_a( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &ah1, ah1_pack,
|
||||
cntx, cntl_sub_packm_b( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_b( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
bli_thread_ibarrier( thread );
|
||||
|
||||
// Pack A1 (if instructed).
|
||||
bli_packm_int( &a1, a1_pack,
|
||||
cntx, cntl_sub_packm_a( cntl ),
|
||||
herk_thread_sub_ipackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_a( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
|
||||
// Pack B1 (if instructed).
|
||||
bli_packm_int( &ah1, ah1_pack,
|
||||
cntx, cntl_sub_packm_b( cntl ),
|
||||
herk_thread_sub_ipackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_b( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
|
||||
// Perform herk subproblem.
|
||||
bli_herk_int( &BLIS_ONE,
|
||||
@@ -124,8 +124,8 @@ void bli_herk_blk_var3f( obj_t* a,
|
||||
&BLIS_ONE,
|
||||
c_pack,
|
||||
cntx,
|
||||
cntl_sub_gemm( cntl ),
|
||||
herk_thread_sub_herk( thread ) );
|
||||
bli_cntl_sub_gemm( cntl ),
|
||||
bli_thrinfo_sub_self( thread ) );
|
||||
|
||||
// This variant executes multiple rank-k updates. Therefore, if the
|
||||
// internal beta scalar on matrix C is non-zero, we must use it
|
||||
@@ -133,26 +133,26 @@ void bli_herk_blk_var3f( obj_t* a,
|
||||
// And since c_pack is a local obj_t, we can simply overwrite the
|
||||
// internal beta scalar with BLIS_ONE once it has been used in the
|
||||
// first iteration.
|
||||
thread_ibarrier( thread );
|
||||
if ( i == 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack );
|
||||
bli_thread_ibarrier( thread );
|
||||
if ( i == 0 && bli_thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack );
|
||||
|
||||
}
|
||||
|
||||
thread_obarrier( thread );
|
||||
bli_thread_obarrier( thread );
|
||||
|
||||
// Unpack C (if C was packed).
|
||||
bli_unpackm_int( c_pack, c,
|
||||
cntx, cntl_sub_unpackm_c( cntl ),
|
||||
herk_thread_sub_opackm( thread ) );
|
||||
cntx, bli_cntl_sub_unpackm_c( cntl ),
|
||||
bli_thrinfo_sub_opackm( thread ) );
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
bli_packm_release( c_pack, cntl_sub_packm_c( cntl ) );
|
||||
if( bli_thread_am_ochief( thread ) ) {
|
||||
bli_packm_release( c_pack, bli_cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_release( a1_pack, cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_release( ah1_pack, cntl_sub_packm_b( cntl ) );
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_release( ah1_pack, bli_cntl_sub_packm_b( cntl ) );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -84,11 +84,11 @@ void bli_herk_front( obj_t* alpha,
|
||||
bli_obj_induce_trans( c_local );
|
||||
}
|
||||
|
||||
herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
|
||||
dim_t n_threads = thread_num_threads( infos[0] );
|
||||
thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HERK, BLIS_LEFT );
|
||||
dim_t n_threads = bli_thread_num_threads( infos[0] );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
bli_l3_thread_decorator( n_threads,
|
||||
(l3_int_t) bli_herk_int,
|
||||
alpha,
|
||||
&a_local,
|
||||
@@ -99,7 +99,7 @@ void bli_herk_front( obj_t* alpha,
|
||||
(void*) cntl,
|
||||
(void**) infos );
|
||||
|
||||
bli_herk_thrinfo_free_paths( infos, n_threads );
|
||||
bli_l3_thrinfo_free_paths( infos, n_threads );
|
||||
|
||||
// The Hermitian rank-k product was computed as A*A', even for the
|
||||
// diagonal elements. Mathematically, the imaginary components of
|
||||
|
||||
@@ -41,7 +41,7 @@ typedef void (*FUNCPTR_T)( obj_t* a,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
gemm_t* cntl,
|
||||
herk_thrinfo_t* thread );
|
||||
thrinfo_t* thread );
|
||||
|
||||
static FUNCPTR_T vars[2][4][3] =
|
||||
{
|
||||
@@ -70,7 +70,7 @@ void bli_herk_int( obj_t* alpha,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
gemm_t* cntl,
|
||||
herk_thrinfo_t* thread )
|
||||
thrinfo_t* thread )
|
||||
{
|
||||
obj_t a_local;
|
||||
obj_t ah_local;
|
||||
@@ -91,9 +91,9 @@ void bli_herk_int( obj_t* alpha,
|
||||
if ( bli_obj_has_zero_dim( *a ) ||
|
||||
bli_obj_has_zero_dim( *ah ) )
|
||||
{
|
||||
if( thread_am_ochief( thread ) )
|
||||
if( bli_thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
thread_obarrier( thread );
|
||||
bli_thread_obarrier( thread );
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -109,7 +109,7 @@ void bli_herk_int( obj_t* alpha,
|
||||
// strides and dimensions. Note that this transposition would normally
|
||||
// be handled explicitly in the packing of C, but if C is not being
|
||||
// packed, this is our last chance to handle the transposition.
|
||||
if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
|
||||
if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
|
||||
{
|
||||
bli_obj_induce_trans( c_local );
|
||||
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
|
||||
@@ -134,8 +134,8 @@ void bli_herk_int( obj_t* alpha,
|
||||
else uplo = 1;
|
||||
|
||||
// Extract the variant number and implementation type.
|
||||
n = cntl_var_num( cntl );
|
||||
i = cntl_impl_type( cntl );
|
||||
n = bli_cntl_var_num( cntl );
|
||||
i = bli_cntl_impl_type( cntl );
|
||||
|
||||
// Index into the variant array to extract the correct function pointer.
|
||||
f = vars[uplo][n][i];
|
||||
|
||||
@@ -39,5 +39,5 @@ void bli_herk_int( obj_t* alpha,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
gemm_t* cntl,
|
||||
herk_thrinfo_t* thread );
|
||||
thrinfo_t* thread );
|
||||
|
||||
|
||||
@@ -51,7 +51,7 @@ typedef void (*FUNCPTR_T)(
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
herk_thrinfo_t* thread
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2);
|
||||
@@ -62,7 +62,7 @@ void bli_herk_l_ker_var2( obj_t* a,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
gemm_t* cntl,
|
||||
herk_thrinfo_t* thread )
|
||||
thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
|
||||
@@ -151,7 +151,7 @@ void PASTEMAC(ch,varname) \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
herk_thrinfo_t* thread \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
@@ -270,11 +270,11 @@ void PASTEMAC(ch,varname) \
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
herk_thrinfo_t* caucus = herk_thread_sub_herk( thread ); \
|
||||
dim_t jr_num_threads = thread_n_way( thread ); \
|
||||
dim_t jr_thread_id = thread_work_id( thread ); \
|
||||
dim_t ir_num_threads = thread_n_way( caucus ); \
|
||||
dim_t ir_thread_id = thread_work_id( caucus ); \
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_self( thread ); \
|
||||
dim_t jr_num_threads = bli_thread_n_way( thread ); \
|
||||
dim_t jr_thread_id = bli_thread_work_id( thread ); \
|
||||
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
|
||||
|
||||
@@ -1,79 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
struct herk_thrinfo_s //implements thrinfo_t
|
||||
{
|
||||
thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level
|
||||
dim_t ocomm_id; //Our thread id within that thread comm
|
||||
thread_comm_t* icomm; //The thread communicator for the other threads sharing the same work at this level
|
||||
dim_t icomm_id; //Our thread id within that thread comm
|
||||
|
||||
dim_t n_way; //Number of distinct caucuses used to parallelize the loop
|
||||
dim_t work_id; //What we're working on
|
||||
|
||||
packm_thrinfo_t* opackm;
|
||||
packm_thrinfo_t* ipackm;
|
||||
struct herk_thrinfo_s* sub_herk;
|
||||
};
|
||||
typedef struct herk_thrinfo_s herk_thrinfo_t;
|
||||
|
||||
#define herk_thread_sub_herk( thread ) thread->sub_herk
|
||||
#define herk_thread_sub_opackm( thread ) thread->opackm
|
||||
#define herk_thread_sub_ipackm( thread ) thread->ipackm
|
||||
|
||||
// For use in herk micro-kernel
|
||||
#define herk_get_next_a_micropanel( thread, a1, step ) ( a1 + step * thread->n_way )
|
||||
#define herk_get_next_b_micropanel( thread, b1, step ) ( b1 + step * thread->n_way )
|
||||
|
||||
|
||||
herk_thrinfo_t** bli_create_herk_thrinfo_paths( );
|
||||
void bli_herk_thrinfo_free_paths( herk_thrinfo_t** paths, dim_t n_threads );
|
||||
|
||||
void bli_setup_herk_thrinfo_node( herk_thrinfo_t* thread,
|
||||
thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
herk_thrinfo_t* sub_herk );
|
||||
|
||||
herk_thrinfo_t* bli_create_herk_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
herk_thrinfo_t* sub_herk );
|
||||
|
||||
void bli_setup_herk_single_threaded_info( herk_thrinfo_t* thread );
|
||||
@@ -51,7 +51,7 @@ typedef void (*FUNCPTR_T)(
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
herk_thrinfo_t* thread
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2);
|
||||
@@ -62,7 +62,7 @@ void bli_herk_u_ker_var2( obj_t* a,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
gemm_t* cntl,
|
||||
herk_thrinfo_t* thread )
|
||||
thrinfo_t* thread )
|
||||
{
|
||||
num_t dt_exec = bli_obj_execution_datatype( *c );
|
||||
|
||||
@@ -151,7 +151,7 @@ void PASTEMAC(ch,varname) \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
herk_thrinfo_t* thread \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
@@ -270,11 +270,11 @@ void PASTEMAC(ch,varname) \
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
herk_thrinfo_t* caucus = herk_thread_sub_herk( thread ); \
|
||||
dim_t jr_num_threads = thread_n_way( thread ); \
|
||||
dim_t jr_thread_id = thread_work_id( thread ); \
|
||||
dim_t ir_num_threads = thread_n_way( caucus ); \
|
||||
dim_t ir_thread_id = thread_work_id( caucus ); \
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_self( thread ); \
|
||||
dim_t jr_num_threads = bli_thread_n_way( thread ); \
|
||||
dim_t jr_thread_id = bli_thread_work_id( thread ); \
|
||||
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
|
||||
|
||||
@@ -47,7 +47,7 @@ void PASTEMAC0(opname) \
|
||||
obj_t* c, \
|
||||
cntx_t* cntx, \
|
||||
gemm_t* cntl, \
|
||||
herk_thrinfo_t* thread \
|
||||
thrinfo_t* thread \
|
||||
);
|
||||
|
||||
GENPROT( herk_blk_var1f )
|
||||
@@ -81,7 +81,7 @@ void PASTEMAC(ch,varname) \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
herk_thrinfo_t* thread \
|
||||
thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( herk_l_ker_var2 )
|
||||
|
||||
@@ -35,105 +35,25 @@
|
||||
#include "blis.h"
|
||||
#include "assert.h"
|
||||
|
||||
void bli_setup_trsm_thrinfo_node( trsm_thrinfo_t* thread,
|
||||
thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
trsm_thrinfo_t* sub_trsm )
|
||||
#if 0
|
||||
thrinfo_t** bli_herk_thrinfo_create_paths( void )
|
||||
{
|
||||
thread->ocomm = ocomm;
|
||||
thread->ocomm_id = ocomm_id;
|
||||
thread->icomm = icomm;
|
||||
thread->icomm_id = icomm_id;
|
||||
thread->n_way = n_way;
|
||||
thread->work_id = work_id;
|
||||
thread->opackm = opackm;
|
||||
thread->ipackm = ipackm;
|
||||
thread->sub_trsm = sub_trsm;
|
||||
}
|
||||
|
||||
void bli_setup_trsm_single_threaded_info( trsm_thrinfo_t* thread )
|
||||
{
|
||||
thread->ocomm = &BLIS_SINGLE_COMM;
|
||||
thread->ocomm_id = 0;
|
||||
thread->icomm = &BLIS_SINGLE_COMM;
|
||||
thread->icomm_id = 0;
|
||||
thread->n_way = 1;
|
||||
thread->work_id = 0;
|
||||
thread->opackm = &BLIS_PACKM_SINGLE_THREADED;
|
||||
thread->ipackm = &BLIS_PACKM_SINGLE_THREADED;
|
||||
thread->sub_trsm = thread;
|
||||
}
|
||||
|
||||
trsm_thrinfo_t* bli_create_trsm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
|
||||
thread_comm_t* icomm, dim_t icomm_id,
|
||||
dim_t n_way, dim_t work_id,
|
||||
packm_thrinfo_t* opackm,
|
||||
packm_thrinfo_t* ipackm,
|
||||
trsm_thrinfo_t* sub_trsm )
|
||||
{
|
||||
trsm_thrinfo_t* thread = ( trsm_thrinfo_t* ) bli_malloc_intl( sizeof( trsm_thrinfo_t ) );
|
||||
bli_setup_trsm_thrinfo_node( thread, ocomm, ocomm_id,
|
||||
icomm, icomm_id,
|
||||
n_way, work_id,
|
||||
opackm,
|
||||
ipackm,
|
||||
sub_trsm );
|
||||
return thread;
|
||||
}
|
||||
|
||||
void bli_trsm_thrinfo_free( trsm_thrinfo_t* thread)
|
||||
{
|
||||
if( thread == NULL ) return;
|
||||
|
||||
// Free Communicators
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_free_communicator( thread->ocomm );
|
||||
if( thread->sub_trsm == NULL && thread_am_ichief( thread ) )
|
||||
bli_free_communicator( thread->icomm );
|
||||
|
||||
// Free Sub Thrinfos
|
||||
bli_packm_thrinfo_free( thread->opackm );
|
||||
bli_packm_thrinfo_free( thread->ipackm );
|
||||
bli_trsm_thrinfo_free( thread->sub_trsm );
|
||||
bli_free_intl( thread );
|
||||
|
||||
return;
|
||||
}
|
||||
void bli_trsm_thrinfo_free_paths( trsm_thrinfo_t** threads, dim_t num )
|
||||
{
|
||||
for( int i = 0; i < num; i++)
|
||||
bli_trsm_thrinfo_free( threads[i] );
|
||||
bli_free_intl( threads );
|
||||
}
|
||||
|
||||
trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( bool_t right_sided )
|
||||
{
|
||||
dim_t jc_way = 1;
|
||||
#ifdef BLIS_ENABLE_MULTITHREADING
|
||||
dim_t jc_way = bli_env_read_nway( "BLIS_JC_NT" );
|
||||
// dim_t kc_way = bli_env_read_nway( "BLIS_KC_NT" );
|
||||
dim_t kc_way = 1;
|
||||
dim_t ic_way = bli_env_read_nway( "BLIS_IC_NT" );
|
||||
dim_t jr_way = bli_env_read_nway( "BLIS_JR_NT" );
|
||||
dim_t ir_way = bli_env_read_nway( "BLIS_IR_NT" );
|
||||
#else
|
||||
dim_t jc_way = 1;
|
||||
dim_t kc_way = 1;
|
||||
dim_t ic_way = 1;
|
||||
dim_t jr_way = 1;
|
||||
dim_t ir_way = 1;
|
||||
|
||||
#ifdef BLIS_ENABLE_MULTITHREADING
|
||||
dim_t jc_in = bli_read_nway_from_env( "BLIS_JC_NT" );
|
||||
/*dim_t kc_in = bli_read_nway_from_env( "BLIS_KC_NT" );*/
|
||||
dim_t ic_in = bli_read_nway_from_env( "BLIS_IC_NT" );
|
||||
dim_t jr_in = bli_read_nway_from_env( "BLIS_JR_NT" );
|
||||
dim_t ir_in = bli_read_nway_from_env( "BLIS_IR_NT" );
|
||||
|
||||
if(right_sided) {
|
||||
ic_way = jc_in * ic_in * jr_in;
|
||||
ir_way = ir_in;
|
||||
}
|
||||
else {
|
||||
jc_way = jc_in;
|
||||
jr_way = jr_in * ic_in * ir_in;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
|
||||
assert( global_num_threads != 0 );
|
||||
|
||||
@@ -144,78 +64,77 @@ trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( bool_t right_sided )
|
||||
dim_t ir_nt = 1;
|
||||
|
||||
|
||||
trsm_thrinfo_t** paths = (trsm_thrinfo_t**) bli_malloc_intl( global_num_threads * sizeof( trsm_thrinfo_t* ) );
|
||||
thrinfo_t** paths = bli_malloc_intl( global_num_threads * sizeof( thrinfo_t* ) );
|
||||
|
||||
thread_comm_t* global_comm = bli_create_communicator( global_num_threads );
|
||||
thrcomm_t* global_comm = bli_thrcomm_create( global_num_threads );
|
||||
for( int a = 0; a < jc_way; a++ )
|
||||
{
|
||||
thread_comm_t* jc_comm = bli_create_communicator( jc_nt );
|
||||
{
|
||||
thrcomm_t* jc_comm = bli_thrcomm_create( jc_nt );
|
||||
for( int b = 0; b < kc_way; b++ )
|
||||
{
|
||||
thread_comm_t* kc_comm = bli_create_communicator( kc_nt );
|
||||
{
|
||||
thrcomm_t* kc_comm = bli_thrcomm_create( kc_nt );
|
||||
for( int c = 0; c < ic_way; c++ )
|
||||
{
|
||||
thread_comm_t* ic_comm = bli_create_communicator( ic_nt );
|
||||
{
|
||||
thrcomm_t* ic_comm = bli_thrcomm_create( ic_nt );
|
||||
for( int d = 0; d < jr_way; d++ )
|
||||
{
|
||||
thread_comm_t* jr_comm = bli_create_communicator( jr_nt );
|
||||
for( int e = 0; e < ir_way; e++)
|
||||
{
|
||||
thread_comm_t* ir_comm = bli_create_communicator( ir_nt );
|
||||
{
|
||||
thrcomm_t* jr_comm = bli_thrcomm_create( jr_nt );
|
||||
for( int e = 0; e < ir_way; e++ )
|
||||
{
|
||||
thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt );
|
||||
dim_t ir_comm_id = 0;
|
||||
dim_t jr_comm_id = e*ir_nt + ir_comm_id;
|
||||
dim_t ic_comm_id = d*jr_nt + jr_comm_id;
|
||||
dim_t kc_comm_id = c*ic_nt + ic_comm_id;
|
||||
dim_t jc_comm_id = b*kc_nt + kc_comm_id;
|
||||
dim_t global_comm_id = a*jc_nt + jc_comm_id;
|
||||
|
||||
|
||||
// Macrokernel loops
|
||||
trsm_thrinfo_t* ir_info = bli_create_trsm_thrinfo_node( jr_comm, jr_comm_id,
|
||||
// Macrokernel loops
|
||||
thrinfo_t* ir_info = bli_l3_thrinfo_create_node( jr_comm, jr_comm_id,
|
||||
ir_comm, ir_comm_id,
|
||||
ir_way, e,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
trsm_thrinfo_t* jr_info = bli_create_trsm_thrinfo_node( ic_comm, ic_comm_id,
|
||||
thrinfo_t* jr_info = bli_l3_thrinfo_create_node( ic_comm, ic_comm_id,
|
||||
jr_comm, jr_comm_id,
|
||||
jr_way, d,
|
||||
NULL, NULL, ir_info);
|
||||
//blk_var_1
|
||||
packm_thrinfo_t* pack_ic_in = bli_create_packm_thread_info( ic_comm, ic_comm_id,
|
||||
packm_thrinfo_t* pack_ic_in = bli_packm_thrinfo_create( ic_comm, ic_comm_id,
|
||||
jr_comm, jr_comm_id,
|
||||
ic_nt, ic_comm_id );
|
||||
|
||||
packm_thrinfo_t* pack_ic_out = bli_create_packm_thread_info( kc_comm, kc_comm_id,
|
||||
packm_thrinfo_t* pack_ic_out = bli_packm_thrinfo_create( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
kc_nt, kc_comm_id );
|
||||
|
||||
trsm_thrinfo_t* ic_info = bli_create_trsm_thrinfo_node( kc_comm, kc_comm_id,
|
||||
thrinfo_t* ic_info = bli_l3_thrinfo_create_node( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
ic_way, c,
|
||||
pack_ic_out, pack_ic_in, jr_info);
|
||||
//blk_var_3
|
||||
packm_thrinfo_t* pack_kc_in = bli_create_packm_thread_info( kc_comm, kc_comm_id,
|
||||
packm_thrinfo_t* pack_kc_in = bli_packm_thrinfo_create( kc_comm, kc_comm_id,
|
||||
ic_comm, ic_comm_id,
|
||||
kc_nt, kc_comm_id );
|
||||
|
||||
packm_thrinfo_t* pack_kc_out = bli_create_packm_thread_info( jc_comm, jc_comm_id,
|
||||
packm_thrinfo_t* pack_kc_out = bli_packm_thrinfo_create( jc_comm, jc_comm_id,
|
||||
jc_comm, jc_comm_id,
|
||||
jc_nt, jc_comm_id );
|
||||
|
||||
trsm_thrinfo_t* kc_info = bli_create_trsm_thrinfo_node( jc_comm, jc_comm_id,
|
||||
thrinfo_t* kc_info = bli_l3_thrinfo_create_node( jc_comm, jc_comm_id,
|
||||
kc_comm, kc_comm_id,
|
||||
kc_way, b,
|
||||
pack_kc_out, pack_kc_in, ic_info);
|
||||
//blk_var_2
|
||||
packm_thrinfo_t* pack_jc_in = bli_create_packm_thread_info( jc_comm, jc_comm_id,
|
||||
packm_thrinfo_t* pack_jc_in = bli_packm_thrinfo_create( jc_comm, jc_comm_id,
|
||||
kc_comm, kc_comm_id,
|
||||
jc_nt, jc_comm_id );
|
||||
|
||||
packm_thrinfo_t* pack_jc_out = bli_create_packm_thread_info( global_comm, global_comm_id,
|
||||
packm_thrinfo_t* pack_jc_out = bli_packm_thrinfo_create( global_comm, global_comm_id,
|
||||
jc_comm, jc_comm_id,
|
||||
global_num_threads, global_comm_id );
|
||||
|
||||
trsm_thrinfo_t* jc_info = bli_create_trsm_thrinfo_node( global_comm, global_comm_id,
|
||||
thrinfo_t* jc_info = bli_l3_thrinfo_create_node( global_comm, global_comm_id,
|
||||
jc_comm, jc_comm_id,
|
||||
jc_way, a,
|
||||
pack_jc_out, pack_jc_in, kc_info);
|
||||
@@ -228,3 +147,4 @@ trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( bool_t right_sided )
|
||||
}
|
||||
return paths;
|
||||
}
|
||||
#endif
|
||||
44
frame/3/herk/old/bli_herk_thread.h
Normal file
44
frame/3/herk/old/bli_herk_thread.h
Normal file
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#define bli_thrinfo_sub_self( thread ) thread->sub_l3op
|
||||
#define bli_thrinfo_sub_opackm( thread ) thread->opackm
|
||||
#define bli_thrinfo_sub_ipackm( thread ) thread->ipackm
|
||||
|
||||
// For use in herk micro-kernel
|
||||
#define herk_get_next_a_micropanel( thread, a1, step ) ( a1 + step * thread->n_way )
|
||||
#define herk_get_next_b_micropanel( thread, b1, step ) ( b1 + step * thread->n_way )
|
||||
|
||||
//thrinfo_t** bli_herk_thrinfo_create_paths( void );
|
||||
|
||||
@@ -85,11 +85,11 @@ void bli_symm_front( side_t side,
|
||||
bli_obj_swap( a_local, b_local );
|
||||
}
|
||||
|
||||
gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths();
|
||||
dim_t n_threads = thread_num_threads( infos[0] );
|
||||
thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYMM, BLIS_LEFT );
|
||||
dim_t n_threads = bli_thread_num_threads( infos[0] );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
bli_l3_thread_decorator( n_threads,
|
||||
(l3_int_t) bli_gemm_int,
|
||||
alpha,
|
||||
&a_local,
|
||||
@@ -100,7 +100,7 @@ void bli_symm_front( side_t side,
|
||||
(void*) cntl,
|
||||
(void**) infos );
|
||||
|
||||
bli_gemm_thrinfo_free_paths( infos, n_threads );
|
||||
bli_l3_thrinfo_free_paths( infos, n_threads );
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -98,11 +98,11 @@ void bli_syr2k_front( obj_t* alpha,
|
||||
cntl );
|
||||
#else
|
||||
// Invoke herk twice, using beta only the first time.
|
||||
herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
|
||||
dim_t n_threads = thread_num_threads( infos[0] );
|
||||
thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYR2K, BLIS_LEFT );
|
||||
dim_t n_threads = bli_thread_num_threads( infos[0] );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
bli_l3_thread_decorator( n_threads,
|
||||
(l3_int_t) bli_herk_int,
|
||||
alpha,
|
||||
&a_local,
|
||||
@@ -113,7 +113,7 @@ void bli_syr2k_front( obj_t* alpha,
|
||||
(void*) cntl,
|
||||
(void**) infos );
|
||||
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
bli_l3_thread_decorator( n_threads,
|
||||
(l3_int_t) bli_herk_int,
|
||||
alpha,
|
||||
&b_local,
|
||||
@@ -124,7 +124,7 @@ void bli_syr2k_front( obj_t* alpha,
|
||||
(void*) cntl,
|
||||
(void**) infos );
|
||||
|
||||
bli_herk_thrinfo_free_paths( infos, n_threads );
|
||||
bli_l3_thrinfo_free_paths( infos, n_threads );
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
@@ -78,11 +78,11 @@ void bli_syrk_front( obj_t* alpha,
|
||||
bli_obj_induce_trans( c_local );
|
||||
}
|
||||
|
||||
herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
|
||||
dim_t n_threads = thread_num_threads( infos[0] );
|
||||
thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYRK, BLIS_LEFT );
|
||||
dim_t n_threads = bli_thread_num_threads( infos[0] );
|
||||
|
||||
// Invoke the internal back-end.
|
||||
bli_level3_thread_decorator( n_threads,
|
||||
bli_l3_thread_decorator( n_threads,
|
||||
(l3_int_t) bli_herk_int,
|
||||
alpha,
|
||||
&a_local,
|
||||
@@ -93,7 +93,7 @@ void bli_syrk_front( obj_t* alpha,
|
||||
(void*) cntl,
|
||||
(void**) infos );
|
||||
|
||||
bli_herk_thrinfo_free_paths( infos, n_threads );
|
||||
bli_l3_thrinfo_free_paths( infos, n_threads );
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@ void bli_trmm_blk_var1f( obj_t* a,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
gemm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
thrinfo_t* thread )
|
||||
{
|
||||
obj_t b_pack_s;
|
||||
obj_t a1_pack_s, c1_pack_s;
|
||||
@@ -55,32 +55,32 @@ void bli_trmm_blk_var1f( obj_t* a,
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_trmm_prune_unref_mparts_m( a, b, c );
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
if( bli_thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing B.
|
||||
bli_obj_init_pack( &b_pack_s );
|
||||
bli_packm_init( b, &b_pack_s,
|
||||
cntx, cntl_sub_packm_b( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_b( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
// Since scalm doesn't support multithreading yet, must be done by chief thread (ew)
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntx, cntl_sub_scalm( cntl ) );
|
||||
cntx, bli_cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
b_pack = thread_obroadcast( thread, &b_pack_s );
|
||||
b_pack = bli_thread_obroadcast( thread, &b_pack_s );
|
||||
|
||||
// Initialize all pack objects that are passed into packm_init().
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &a1_pack_s );
|
||||
bli_obj_init_pack( &c1_pack_s );
|
||||
}
|
||||
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
|
||||
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
|
||||
a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s );
|
||||
c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s );
|
||||
|
||||
// Pack B (if instructed).
|
||||
bli_packm_int( b, b_pack,
|
||||
cntx, cntl_sub_packm_b( cntl ),
|
||||
trmm_thread_sub_opackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_b( cntl ),
|
||||
bli_thrinfo_sub_opackm( thread ) );
|
||||
|
||||
// Set the default length of and offset to the non-zero part of A.
|
||||
//m_trans = bli_obj_length_after_trans( *a );
|
||||
@@ -96,8 +96,8 @@ void bli_trmm_blk_var1f( obj_t* a,
|
||||
// bli_obj_width_after_trans( *a );
|
||||
|
||||
dim_t my_start, my_end;
|
||||
bli_get_range_weighted_t2b( thread, a,
|
||||
bli_cntx_get_bmult( cntl_bszid( cntl ), cntx ),
|
||||
bli_thread_get_range_weighted_t2b( thread, a,
|
||||
bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ),
|
||||
&my_start, &my_end );
|
||||
|
||||
// Partition along the m dimension.
|
||||
@@ -105,7 +105,7 @@ void bli_trmm_blk_var1f( obj_t* a,
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( i, my_end, a,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for A1 and C1.
|
||||
bli_acquire_mpart_t2b( BLIS_SUBPART1,
|
||||
@@ -114,23 +114,23 @@ void bli_trmm_blk_var1f( obj_t* a,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Initialize objects for packing A1 and C1.
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &a1, a1_pack,
|
||||
cntx, cntl_sub_packm_a( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_init( &c1, c1_pack,
|
||||
cntx, cntl_sub_packm_c( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
bli_thread_ibarrier( thread );
|
||||
|
||||
// Pack A1 (if instructed).
|
||||
bli_packm_int( &a1, a1_pack,
|
||||
cntx, cntl_sub_packm_a( cntl ),
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_a( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
|
||||
// Pack C1 (if instructed).
|
||||
bli_packm_int( &c1, c1_pack,
|
||||
cntx, cntl_sub_packm_c( cntl ),
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_c( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
|
||||
// Perform trmm subproblem.
|
||||
bli_trmm_int( &BLIS_ONE,
|
||||
@@ -139,24 +139,24 @@ void bli_trmm_blk_var1f( obj_t* a,
|
||||
&BLIS_ONE,
|
||||
c1_pack,
|
||||
cntx,
|
||||
cntl_sub_gemm( cntl ),
|
||||
trmm_thread_sub_trmm( thread ) );
|
||||
thread_ibarrier( thread );
|
||||
bli_cntl_sub_gemm( cntl ),
|
||||
bli_thrinfo_sub_self( thread ) );
|
||||
bli_thread_ibarrier( thread );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
cntx, cntl_sub_unpackm_c( cntl ),
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
cntx, bli_cntl_sub_unpackm_c( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
thread_obarrier( thread );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_packm_release( b_pack, cntl_sub_packm_b( cntl ) );
|
||||
if( thread_am_ichief( thread ) ){
|
||||
bli_packm_release( a1_pack, cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) );
|
||||
bli_thread_obarrier( thread );
|
||||
if( bli_thread_am_ochief( thread ) )
|
||||
bli_packm_release( b_pack, bli_cntl_sub_packm_b( cntl ) );
|
||||
if( bli_thread_am_ichief( thread ) ){
|
||||
bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) );
|
||||
bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@ void bli_trmm_blk_var2b( obj_t* a,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
gemm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
thrinfo_t* thread )
|
||||
{
|
||||
obj_t a_pack_s;
|
||||
obj_t b1_pack_s, c1_pack_s;
|
||||
@@ -55,35 +55,35 @@ void bli_trmm_blk_var2b( obj_t* a,
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_trmm_prune_unref_mparts_n( a, b, c );
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
if( bli_thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing A
|
||||
bli_obj_init_pack( &a_pack_s );
|
||||
bli_packm_init( a, &a_pack_s,
|
||||
cntx, cntl_sub_packm_a( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntx, cntl_sub_scalm( cntl ) );
|
||||
cntx, bli_cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
a_pack = thread_obroadcast( thread, &a_pack_s );
|
||||
a_pack = bli_thread_obroadcast( thread, &a_pack_s );
|
||||
|
||||
// Initialize pack objects for B and C that are passed into packm_init().
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &b1_pack_s );
|
||||
bli_obj_init_pack( &c1_pack_s );
|
||||
}
|
||||
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
|
||||
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
|
||||
b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s );
|
||||
c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s );
|
||||
|
||||
// Pack A (if instructed).
|
||||
bli_packm_int( a, a_pack,
|
||||
cntx, cntl_sub_packm_a( cntl ),
|
||||
trmm_thread_sub_opackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_a( cntl ),
|
||||
bli_thrinfo_sub_opackm( thread ) );
|
||||
|
||||
dim_t my_start, my_end;
|
||||
bli_get_range_weighted_r2l( thread, b,
|
||||
bli_cntx_get_bmult( cntl_bszid( cntl ), cntx ),
|
||||
bli_thread_get_range_weighted_r2l( thread, b,
|
||||
bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ),
|
||||
&my_start, &my_end );
|
||||
|
||||
// Partition along the n dimension.
|
||||
@@ -91,7 +91,7 @@ void bli_trmm_blk_var2b( obj_t* a,
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_b( i, my_end, b,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for B1 and C1.
|
||||
bli_acquire_mpart_r2l( BLIS_SUBPART1,
|
||||
@@ -100,23 +100,23 @@ void bli_trmm_blk_var2b( obj_t* a,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Initialize objects for packing A1 and B1.
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &b1, b1_pack,
|
||||
cntx, cntl_sub_packm_b( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_init( &c1, c1_pack,
|
||||
cntx, cntl_sub_packm_c( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
bli_thread_ibarrier( thread );
|
||||
|
||||
// Pack B1 (if instructed).
|
||||
bli_packm_int( &b1, b1_pack,
|
||||
cntx, cntl_sub_packm_b( cntl ),
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_b( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
|
||||
// Pack C1 (if instructed).
|
||||
bli_packm_int( &c1, c1_pack,
|
||||
cntx, cntl_sub_packm_c( cntl ),
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_c( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
|
||||
// Perform trmm subproblem.
|
||||
bli_trmm_int( &BLIS_ONE,
|
||||
@@ -125,24 +125,24 @@ void bli_trmm_blk_var2b( obj_t* a,
|
||||
&BLIS_ONE,
|
||||
c1_pack,
|
||||
cntx,
|
||||
cntl_sub_gemm( cntl ),
|
||||
trmm_thread_sub_trmm( thread ) );
|
||||
thread_ibarrier( thread );
|
||||
bli_cntl_sub_gemm( cntl ),
|
||||
bli_thrinfo_sub_self( thread ) );
|
||||
bli_thread_ibarrier( thread );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
cntx, cntl_sub_unpackm_c( cntl ),
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
cntx, bli_cntl_sub_unpackm_c( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
thread_obarrier( thread );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_packm_release( a_pack, cntl_sub_packm_a( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_release( b1_pack, cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) );
|
||||
bli_thread_obarrier( thread );
|
||||
if( bli_thread_am_ochief( thread ) )
|
||||
bli_packm_release( a_pack, bli_cntl_sub_packm_a( cntl ) );
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@ void bli_trmm_blk_var2f( obj_t* a,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
gemm_t* cntl,
|
||||
trmm_thrinfo_t* thread )
|
||||
thrinfo_t* thread )
|
||||
{
|
||||
obj_t a_pack_s;
|
||||
obj_t b1_pack_s, c1_pack_s;
|
||||
@@ -55,35 +55,35 @@ void bli_trmm_blk_var2f( obj_t* a,
|
||||
// Prune any zero region that exists along the partitioning dimension.
|
||||
bli_trmm_prune_unref_mparts_n( a, b, c );
|
||||
|
||||
if( thread_am_ochief( thread ) ) {
|
||||
if( bli_thread_am_ochief( thread ) ) {
|
||||
// Initialize object for packing A
|
||||
bli_obj_init_pack( &a_pack_s );
|
||||
bli_packm_init( a, &a_pack_s,
|
||||
cntx, cntl_sub_packm_a( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_a( cntl ) );
|
||||
|
||||
// Scale C by beta (if instructed).
|
||||
bli_scalm_int( &BLIS_ONE,
|
||||
c,
|
||||
cntx, cntl_sub_scalm( cntl ) );
|
||||
cntx, bli_cntl_sub_scalm( cntl ) );
|
||||
}
|
||||
a_pack = thread_obroadcast( thread, &a_pack_s );
|
||||
a_pack = bli_thread_obroadcast( thread, &a_pack_s );
|
||||
|
||||
// Initialize pack objects for B and C that are passed into packm_init().
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_obj_init_pack( &b1_pack_s );
|
||||
bli_obj_init_pack( &c1_pack_s );
|
||||
}
|
||||
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
|
||||
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
|
||||
b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s );
|
||||
c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s );
|
||||
|
||||
// Pack A (if instructed).
|
||||
bli_packm_int( a, a_pack,
|
||||
cntx, cntl_sub_packm_a( cntl ),
|
||||
trmm_thread_sub_opackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_a( cntl ),
|
||||
bli_thrinfo_sub_opackm( thread ) );
|
||||
|
||||
dim_t my_start, my_end;
|
||||
bli_get_range_weighted_l2r( thread, b,
|
||||
bli_cntx_get_bmult( cntl_bszid( cntl ), cntx ),
|
||||
bli_thread_get_range_weighted_l2r( thread, b,
|
||||
bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ),
|
||||
&my_start, &my_end );
|
||||
|
||||
// Partition along the n dimension.
|
||||
@@ -91,7 +91,7 @@ void bli_trmm_blk_var2f( obj_t* a,
|
||||
{
|
||||
// Determine the current algorithmic blocksize.
|
||||
b_alg = bli_determine_blocksize_f( i, my_end, b,
|
||||
cntl_bszid( cntl ), cntx );
|
||||
bli_cntl_bszid( cntl ), cntx );
|
||||
|
||||
// Acquire partitions for B1 and C1.
|
||||
bli_acquire_mpart_l2r( BLIS_SUBPART1,
|
||||
@@ -100,23 +100,23 @@ void bli_trmm_blk_var2f( obj_t* a,
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Initialize objects for packing A1 and B1.
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_packm_init( &b1, b1_pack,
|
||||
cntx, cntl_sub_packm_b( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_init( &c1, c1_pack,
|
||||
cntx, cntl_sub_packm_c( cntl ) );
|
||||
cntx, bli_cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
thread_ibarrier( thread );
|
||||
bli_thread_ibarrier( thread );
|
||||
|
||||
// Pack B1 (if instructed).
|
||||
bli_packm_int( &b1, b1_pack,
|
||||
cntx, cntl_sub_packm_b( cntl ),
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_b( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
|
||||
// Pack C1 (if instructed).
|
||||
bli_packm_int( &c1, c1_pack,
|
||||
cntx, cntl_sub_packm_c( cntl ),
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
cntx, bli_cntl_sub_packm_c( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
|
||||
// Perform trmm subproblem.
|
||||
bli_trmm_int( &BLIS_ONE,
|
||||
@@ -125,24 +125,24 @@ void bli_trmm_blk_var2f( obj_t* a,
|
||||
&BLIS_ONE,
|
||||
c1_pack,
|
||||
cntx,
|
||||
cntl_sub_gemm( cntl ),
|
||||
trmm_thread_sub_trmm( thread ) );
|
||||
thread_ibarrier( thread );
|
||||
bli_cntl_sub_gemm( cntl ),
|
||||
bli_thrinfo_sub_self( thread ) );
|
||||
bli_thread_ibarrier( thread );
|
||||
|
||||
// Unpack C1 (if C1 was packed).
|
||||
bli_unpackm_int( c1_pack, &c1,
|
||||
cntx, cntl_sub_unpackm_c( cntl ),
|
||||
trmm_thread_sub_ipackm( thread ) );
|
||||
cntx, bli_cntl_sub_unpackm_c( cntl ),
|
||||
bli_thrinfo_sub_ipackm( thread ) );
|
||||
}
|
||||
|
||||
// If any packing buffers were acquired within packm, release them back
|
||||
// to the memory manager.
|
||||
thread_obarrier( thread );
|
||||
if( thread_am_ochief( thread ) )
|
||||
bli_packm_release( a_pack, cntl_sub_packm_a( cntl ) );
|
||||
if( thread_am_ichief( thread ) ) {
|
||||
bli_packm_release( b1_pack, cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) );
|
||||
bli_thread_obarrier( thread );
|
||||
if( bli_thread_am_ochief( thread ) )
|
||||
bli_packm_release( a_pack, bli_cntl_sub_packm_a( cntl ) );
|
||||
if( bli_thread_am_ichief( thread ) ) {
|
||||
bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) );
|
||||
bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user