Reorganized code, APIs related to multithreading.

Details:
- Reorganized code and renamed files defining APIs related to multithreading.
  All code that is not specific to a particular operation is now located in a
  new directory: frame/thread. Code is now organized, roughly, by the
  namespace to which it belongs (see below).
- Consolidated all operation-specific *_thrinfo_t object types into a single
  thrinfo_t object type. Operation-specific level-3 *_thrinfo_t APIs were
  also consolidated, leaving bli_l3_thrinfo_*() and bli_packm_thrinfo_*()
  functions (aside from a few general purpose bli_thrinfo_*() functions).
- Renamed thread_comm_t object type to thrcomm_t.
- Renamed many of the routines and functions (and macros) for multithreading.
  We now have the following API namespaces:
  - bli_thrinfo_*(): functions related to thrinfo_t objects
  - bli_thrcomm_*(): functions related to thrcomm_t objects.
  - bli_thread_*(): general-purpose functions, such as initialization,
    finalization, and computing ranges. (For now, some macros, such as
    bli_thread_[io]broadcast() and bli_thread_[io]barrier() use the
    bli_thread_ namespace prefix, even though bli_thrinfo_ may be more
    appropriate.)
- Renamed thread-related macros so that they use a bli_ prefix.
- Renamed control tree-related macros so that they use a bli_ prefix (to be
  consistent with the thread-related macros that were also renamed).
- Removed #undef BLIS_SIMD_ALIGN_SIZE from dunnington's bli_kernel.h. This
  #undef was a temporary workaround for macro defaults that were being
  applied in the wrong order; that ordering problem was recently fixed.
This commit is contained in:
Field G. Van Zee
2016-06-06 13:32:04 -05:00
parent 232530e88f
commit 096895c5d5
171 changed files with 3370 additions and 2769 deletions

View File

@@ -38,7 +38,6 @@
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
#undef BLIS_SIMD_ALIGN_SIZE
#define BLIS_SIMD_ALIGN_SIZE 16
// -- Cache blocksizes --

View File

@@ -43,11 +43,11 @@ typedef struct packv_s packv_t;
// Field accessors for control tree nodes. Each macro expands to a member
// access through the node pointer given as cntl, so it can be read or
// assigned through. The argument is parenthesized so that pointer
// expressions such as p + 1 or &node expand to the intended member access
// instead of binding -> to only the last operand.
#define cntl_bmid( cntl )              ( cntl )->bmid

// Unprefixed accessor names.
#define cntl_sub_packv( cntl )         ( cntl )->sub_packv
#define cntl_sub_packv_x( cntl )       ( cntl )->sub_packv_x
#define cntl_sub_packv_x1( cntl )      ( cntl )->sub_packv_x1
#define cntl_sub_packv_y( cntl )       ( cntl )->sub_packv_y
#define cntl_sub_packv_y1( cntl )      ( cntl )->sub_packv_y1

// bli_-prefixed accessor names; each expands to the same field as its
// unprefixed counterpart above.
#define bli_cntl_sub_packv( cntl )     ( cntl )->sub_packv
#define bli_cntl_sub_packv_x( cntl )   ( cntl )->sub_packv_x
#define bli_cntl_sub_packv_x1( cntl )  ( cntl )->sub_packv_x1
#define bli_cntl_sub_packv_y( cntl )   ( cntl )->sub_packv_y
#define bli_cntl_sub_packv_y1( cntl )  ( cntl )->sub_packv_y1
void bli_packv_cntl_init( void );
void bli_packv_cntl_finalize( void );

View File

@@ -58,7 +58,7 @@ void bli_packv_init
// First check if we are to skip this operation because the control tree
// is NULL, and if so, simply alias the object to its packed counterpart.
if ( cntl_is_noop( cntl ) )
if ( bli_cntl_is_noop( cntl ) )
{
bli_obj_alias_to( *a, *p );
return;
@@ -217,7 +217,7 @@ void bli_packv_release
packv_t* cntl
)
{
if ( !cntl_is_noop( cntl ) )
if ( !bli_cntl_is_noop( cntl ) )
bli_obj_release_pack( p );
}

View File

@@ -80,7 +80,7 @@ void bli_packv_int( obj_t* a,
// First check if we are to skip this operation because the control tree
// is NULL. We return without taking any action because a was already
// aliased to p in packv_init().
if ( cntl_is_noop( cntl ) )
if ( bli_cntl_is_noop( cntl ) )
{
return;
}
@@ -114,8 +114,8 @@ void bli_packv_int( obj_t* a,
}
// Extract the variant number and implementation type.
n = cntl_var_num( cntl );
i = cntl_impl_type( cntl );
n = bli_cntl_var_num( cntl );
i = bli_cntl_impl_type( cntl );
// Index into the variant array to extract the correct function pointer.
f = vars[n][i];

View File

@@ -39,7 +39,7 @@ struct scalv_s
};
typedef struct scalv_s scalv_t;
// Accessors for the sub_scalv field of a control tree node, reached through
// the node pointer given as cntl. Both names expand to the same member
// access. The argument is parenthesized so that pointer expressions such
// as p + 1 or &node expand to the intended member access.
#define cntl_sub_scalv( cntl )      ( cntl )->sub_scalv
#define bli_cntl_sub_scalv( cntl )  ( cntl )->sub_scalv
void bli_scalv_cntl_init( void );
void bli_scalv_cntl_finalize( void );

View File

@@ -61,14 +61,14 @@ void bli_scalv_int( obj_t* alpha,
bli_scalv_check( alpha, x );
// First check if we are to skip this operation.
if ( cntl_is_noop( cntl ) ) return;
if ( bli_cntl_is_noop( cntl ) ) return;
// Return early if the alpha scalar equals one.
if ( bli_obj_equals( alpha, &BLIS_ONE ) ) return;
// Extract the variant number and implementation type.
n = cntl_var_num( cntl );
i = cntl_impl_type( cntl );
n = bli_cntl_var_num( cntl );
i = bli_cntl_impl_type( cntl );
// Index into the variant array to extract the correct function pointer.
f = vars[n][i];

View File

@@ -39,11 +39,11 @@ struct unpackv_s
};
typedef struct unpackv_s unpackv_t;
// Accessors for the sub_unpackv* fields of a control tree node, reached
// through the node pointer given as cntl. The argument is parenthesized so
// that pointer expressions such as p + 1 or &node expand to the intended
// member access instead of binding -> to only the last operand.

// Unprefixed accessor names.
#define cntl_sub_unpackv( cntl )         ( cntl )->sub_unpackv
#define cntl_sub_unpackv_x( cntl )       ( cntl )->sub_unpackv_x
#define cntl_sub_unpackv_x1( cntl )      ( cntl )->sub_unpackv_x1
#define cntl_sub_unpackv_y( cntl )       ( cntl )->sub_unpackv_y
#define cntl_sub_unpackv_y1( cntl )      ( cntl )->sub_unpackv_y1

// bli_-prefixed accessor names; each expands to the same field as its
// unprefixed counterpart above.
#define bli_cntl_sub_unpackv( cntl )     ( cntl )->sub_unpackv
#define bli_cntl_sub_unpackv_x( cntl )   ( cntl )->sub_unpackv_x
#define bli_cntl_sub_unpackv_x1( cntl )  ( cntl )->sub_unpackv_x1
#define bli_cntl_sub_unpackv_y( cntl )   ( cntl )->sub_unpackv_y
#define bli_cntl_sub_unpackv_y1( cntl )  ( cntl )->sub_unpackv_y1
void bli_unpackv_cntl_init( void );
void bli_unpackv_cntl_finalize( void );

View File

@@ -79,7 +79,7 @@ void bli_unpackv_int( obj_t* p,
// First check if we are to skip this operation because the control tree
// is NULL, and if so, simply return.
if ( cntl_is_noop( cntl ) )
if ( bli_cntl_is_noop( cntl ) )
{
return;
}
@@ -116,8 +116,8 @@ void bli_unpackv_int( obj_t* p,
// Now we are ready to proceed with the unpacking.
// Extract the variant number and implementation type.
n = cntl_var_num( cntl );
i = cntl_impl_type( cntl );
n = bli_cntl_var_num( cntl );
i = bli_cntl_impl_type( cntl );
// Index into the variant array to extract the correct function pointer.
f = vars[n][i];

View File

@@ -57,7 +57,7 @@ typedef void (*FUNCPTR_T)(
dim_t pd_p, inc_t ps_p,
void* packm_ker,
cntx_t* cntx,
packm_thrinfo_t* thread
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1);
@@ -96,7 +96,7 @@ static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
void bli_packm_blk_var1( obj_t* c,
obj_t* p,
cntx_t* cntx,
packm_thrinfo_t* t )
thrinfo_t* t )
{
num_t dt_cp = bli_obj_datatype( *c );
@@ -156,7 +156,7 @@ void bli_packm_blk_var1( obj_t* c,
// real domain micro-kernels. (In the aforementioned situation,
// applying a real scalar is easy, but applying a complex one is
// harder, so we avoid the need altogether with the code below.)
if( thread_am_ochief( t ) )
if( bli_thread_am_ochief( t ) )
{
if ( bli_obj_scalar_has_nonzero_imag( p ) )
{
@@ -177,7 +177,7 @@ void bli_packm_blk_var1( obj_t* c,
kappa_p = &BLIS_ONE;
}
}
kappa_p = thread_obroadcast( t, kappa_p );
kappa_p = bli_thread_obroadcast( t, kappa_p );
// Acquire the buffer to the kappa chosen above.
buf_kappa = bli_obj_buffer_for_1x1( dt_cp, *kappa_p );
@@ -280,7 +280,7 @@ void PASTEMAC(ch,varname) \
dim_t pd_p, inc_t ps_p, \
void* packm_ker, \
cntx_t* cntx, \
packm_thrinfo_t* thread \
thrinfo_t* thread \
) \
{ \
PASTECH2(ch,opname,_ft) packm_ker_cast = packm_ker; \

View File

@@ -35,7 +35,7 @@
void bli_packm_blk_var1( obj_t* c,
obj_t* p,
cntx_t* cntx,
packm_thrinfo_t* t );
thrinfo_t* t );
#undef GENTPROT
@@ -63,7 +63,7 @@ void PASTEMAC(ch,varname) \
dim_t pd_p, inc_t ps_p, \
void* packm_ker, \
cntx_t* cntx, \
packm_thrinfo_t* thread \
thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC( packm_blk_var1 )

View File

@@ -55,13 +55,13 @@ typedef struct packm_s packm_t;
// Field accessors for control tree nodes, reached through the node pointer
// given as cntl. Each macro expands to a plain member access, so it can be
// read or assigned through. The argument is parenthesized so that pointer
// expressions such as p + 1 or &node expand to the intended member access
// instead of binding -> to only the last operand.
#define cntl_pack_schema( cntl )        ( cntl )->pack_schema
#define cntl_pack_buf_type( cntl )      ( cntl )->pack_buf_type

// Unprefixed accessor names for the sub_packm* fields.
#define cntl_sub_packm( cntl )          ( cntl )->sub_packm
#define cntl_sub_packm_a( cntl )        ( cntl )->sub_packm_a
#define cntl_sub_packm_a11( cntl )      ( cntl )->sub_packm_a11
#define cntl_sub_packm_b( cntl )        ( cntl )->sub_packm_b
#define cntl_sub_packm_b11( cntl )      ( cntl )->sub_packm_b11
#define cntl_sub_packm_c( cntl )        ( cntl )->sub_packm_c
#define cntl_sub_packm_c11( cntl )      ( cntl )->sub_packm_c11

// bli_-prefixed accessor names; each expands to the same field as its
// unprefixed counterpart above.
#define bli_cntl_sub_packm( cntl )      ( cntl )->sub_packm
#define bli_cntl_sub_packm_a( cntl )    ( cntl )->sub_packm_a
#define bli_cntl_sub_packm_a11( cntl )  ( cntl )->sub_packm_a11
#define bli_cntl_sub_packm_b( cntl )    ( cntl )->sub_packm_b
#define bli_cntl_sub_packm_b11( cntl )  ( cntl )->sub_packm_b11
#define bli_cntl_sub_packm_c( cntl )    ( cntl )->sub_packm_c
#define bli_cntl_sub_packm_c11( cntl )  ( cntl )->sub_packm_c11
void bli_packm_cntl_init( void );
void bli_packm_cntl_finalize( void );

View File

@@ -60,7 +60,7 @@ void bli_packm_init( obj_t* a,
// First check if we are to skip this operation because the control tree
// is NULL, and if so, simply alias the object to its packed counterpart.
if ( cntl_is_noop( cntl ) )
if ( bli_cntl_is_noop( cntl ) )
{
bli_obj_alias_to( *a, *p );
return;
@@ -581,7 +581,7 @@ void bli_packm_init_pack( invdiag_t invert_diag,
void bli_packm_release( obj_t* p,
packm_t* cntl )
{
if ( !cntl_is_noop( cntl ) )
if ( !bli_cntl_is_noop( cntl ) )
bli_obj_release_pack( p );
}

View File

@@ -39,7 +39,7 @@
typedef void (*FUNCPTR_T)( obj_t* a,
obj_t* p,
cntx_t* cntx,
packm_thrinfo_t* t );
thrinfo_t* t );
static FUNCPTR_T vars[6][3] =
{
@@ -56,7 +56,7 @@ void bli_packm_int( obj_t* a,
obj_t* p,
cntx_t* cntx,
packm_t* cntl,
packm_thrinfo_t* thread )
thrinfo_t* thread )
{
varnum_t n;
impl_t i;
@@ -73,7 +73,7 @@ void bli_packm_int( obj_t* a,
// First check if we are to skip this operation because the control tree
// is NULL. We return without taking any action because a was already
// aliased to p in packm_init().
if ( cntl_is_noop( cntl ) )
if ( bli_cntl_is_noop( cntl ) )
{
return;
}
@@ -115,8 +115,8 @@ void bli_packm_int( obj_t* a,
// Extract the variant number and implementation type.
n = cntl_var_num( cntl );
i = cntl_impl_type( cntl );
n = bli_cntl_var_num( cntl );
i = bli_cntl_impl_type( cntl );
// Index into the variant array to extract the correct function pointer.
f = vars[n][i];
@@ -128,6 +128,6 @@ void bli_packm_int( obj_t* a,
thread );
// Barrier so that packing is done before computation
thread_obarrier( thread );
bli_thread_obarrier( thread );
}

View File

@@ -36,5 +36,5 @@ void bli_packm_int( obj_t* a,
obj_t* p,
cntx_t* cntx,
packm_t* cntl,
packm_thrinfo_t* thread );
thrinfo_t* thread );

View File

@@ -0,0 +1,111 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Allocate a thrinfo_t with bli_malloc_intl() and initialize it for packm.
//
// The communicators, communicator ids, n_way and work_id are forwarded
// unchanged to bli_thrinfo_init(); the last three arguments of that call
// are passed as NULL. Returns the newly allocated object, which
// bli_packm_thrinfo_free() releases with bli_free_intl().
//
// NOTE(review): the allocation result is used without a check here;
// presumably bli_malloc_intl() handles allocation failure itself -- confirm.
thrinfo_t* bli_packm_thrinfo_create
     (
       thrcomm_t* ocomm,
       dim_t      ocomm_id,
       thrcomm_t* icomm,
       dim_t      icomm_id,
       dim_t      n_way,
       dim_t      work_id
     )
{
	thrinfo_t* info = bli_malloc_intl( sizeof( *info ) );

	bli_thrinfo_init( info,
	                  ocomm, ocomm_id,
	                  icomm, icomm_id,
	                  n_way, work_id,
	                  NULL, NULL, NULL );

	return info;
}
// Initialize a caller-provided thrinfo_t for packm.
//
// Thin wrapper around bli_thrinfo_init(): thread, the two communicators
// with their ids, n_way and work_id are forwarded as given, and the last
// three arguments of bli_thrinfo_init() are passed as NULL.
void bli_packm_thrinfo_init
     (
       thrinfo_t* thread,
       thrcomm_t* ocomm,
       dim_t      ocomm_id,
       thrcomm_t* icomm,
       dim_t      icomm_id,
       dim_t      n_way,
       dim_t      work_id
     )
{
	bli_thrinfo_init( thread,
	                  ocomm, ocomm_id,
	                  icomm, icomm_id,
	                  n_way, work_id,
	                  NULL, NULL, NULL );
}
// Initialize a caller-provided thrinfo_t with the single-communicator
// settings: both communicators are &BLIS_SINGLE_COMM, both communicator
// ids are 0, n_way is 1 and work_id is 0.
void bli_packm_thrinfo_init_single
     (
       thrinfo_t* thread
     )
{
	// Name the constants so the call below reads without positional guessing.
	thrcomm_t* const single_comm = &BLIS_SINGLE_COMM;
	const dim_t      comm_id     = 0;
	const dim_t      n_way       = 1;
	const dim_t      work_id     = 0;

	bli_packm_thrinfo_init( thread,
	                        single_comm, comm_id,
	                        single_comm, comm_id,
	                        n_way, work_id );
}
// Release a thrinfo_t with bli_free_intl().
//
// Nothing is done when thread is NULL or when it is the address of
// BLIS_PACKM_SINGLE_THREADED.
void bli_packm_thrinfo_free
     (
       thrinfo_t* thread
     )
{
	// A null pointer has nothing to release.
	if ( thread == NULL ) return;

	// BLIS_PACKM_SINGLE_THREADED is never handed to bli_free_intl();
	// presumably it is a global that was not obtained from
	// bli_malloc_intl() -- confirm.
	if ( thread == &BLIS_PACKM_SINGLE_THREADED ) return;

	bli_free_intl( thread );
}

View File

@@ -32,23 +32,44 @@
*/
// Thread bookkeeping object used by packm: two communicator/id pairs
// (ocomm and icomm) plus the n_way and work_id values.
struct packm_thrinfo_s //implements thrinfo_t
{
thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level
dim_t ocomm_id; //Our thread id within that thread comm
// NOTE(review): the comment on icomm below is word-for-word the same as the
// one on ocomm; presumably icomm is a different communicator and the
// comment was copied -- confirm and reword.
thread_comm_t* icomm; //The thread communicator for the other threads sharing the same work at this level
dim_t icomm_id; //Our thread id within that thread comm
dim_t n_way; //Number of distinct caucuses used to parallelize the loop
dim_t work_id; //What we're working on
};
// Convenience alias for struct packm_thrinfo_s.
typedef struct packm_thrinfo_s packm_thrinfo_t;
//
// thrinfo_t macros specific to packm.
//
// Evaluates to nonzero when index and thread->work_id leave the same
// remainder modulo thread->n_way, and to zero otherwise. Both arguments are
// parenthesized so that compound expressions (e.g. an index of i + 1, or a
// thread of &info) are applied as a whole. Note that thread is evaluated
// three times, so it should not have side effects.
#define packm_thread_my_iter( index, thread ) ( ( index ) % ( thread )->n_way == ( thread )->work_id % ( thread )->n_way )
void bli_packm_thrinfo_free( packm_thrinfo_t* thread );
packm_thrinfo_t* bli_create_packm_thread_info( thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
dim_t n_way, dim_t work_id );
void bli_setup_packm_thread_info( packm_thrinfo_t* thread, thread_comm_t* ocomm, dim_t ocomm_id, thread_comm_t* icomm, dim_t icomm_id,
dim_t n_way, dim_t work_id );
void bli_setup_packm_single_threaded_info( packm_thrinfo_t* thread );
//
// thrinfo_t APIs specific to packm.
//
// Allocates a thrinfo_t with bli_malloc_intl(), initializes it through
// bli_thrinfo_init() with the given communicators, communicator ids, n_way
// and work_id (the remaining bli_thrinfo_init() arguments are NULL), and
// returns it.
thrinfo_t* bli_packm_thrinfo_create
(
thrcomm_t* ocomm,
dim_t ocomm_id,
thrcomm_t* icomm,
dim_t icomm_id,
dim_t n_way,
dim_t work_id
);
// Initializes the caller-provided thread object through bli_thrinfo_init()
// with the given communicators, communicator ids, n_way and work_id (the
// remaining bli_thrinfo_init() arguments are NULL).
void bli_packm_thrinfo_init
(
thrinfo_t* thread,
thrcomm_t* ocomm,
dim_t ocomm_id,
thrcomm_t* icomm,
dim_t icomm_id,
dim_t n_way,
dim_t work_id
);
// Initializes the caller-provided thread object with &BLIS_SINGLE_COMM for
// both communicators, communicator ids of 0, n_way of 1 and work_id of 0.
void bli_packm_thrinfo_init_single
(
thrinfo_t* thread
);
// Releases thread with bli_free_intl(), unless thread is NULL or is the
// address of BLIS_PACKM_SINGLE_THREADED, in which case nothing is done.
void bli_packm_thrinfo_free
(
thrinfo_t* thread
);

View File

@@ -58,7 +58,7 @@ static FUNCPTR_T GENARRAY(ftypes,packm_unb_var1);
void bli_packm_unb_var1( obj_t* c,
obj_t* p,
cntx_t* cntx,
packm_thrinfo_t* thread )
thrinfo_t* thread )
{
num_t dt_cp = bli_obj_datatype( *c );
@@ -96,7 +96,7 @@ void bli_packm_unb_var1( obj_t* c,
// function pointer.
f = ftypes[dt_cp];
if( thread_am_ochief( thread ) ) {
if( bli_thread_am_ochief( thread ) ) {
// Invoke the function.
f
(

View File

@@ -35,7 +35,7 @@
void bli_packm_unb_var1( obj_t* c,
obj_t* p,
cntx_t* cntx,
packm_thrinfo_t* thread );
thrinfo_t* thread );
#undef GENTPROT

View File

@@ -39,7 +39,7 @@ struct scalm_s
};
typedef struct scalm_s scalm_t;
// Accessors for the sub_scalm field of a control tree node, reached through
// the node pointer given as cntl. Both names expand to the same member
// access. The argument is parenthesized so that pointer expressions such
// as p + 1 or &node expand to the intended member access.
#define cntl_sub_scalm( cntl )      ( cntl )->sub_scalm
#define bli_cntl_sub_scalm( cntl )  ( cntl )->sub_scalm
void bli_scalm_cntl_init( void );
void bli_scalm_cntl_finalize( void );

View File

@@ -64,7 +64,7 @@ void bli_scalm_int( obj_t* alpha,
bli_scalm_check( alpha, x );
// First check if we are to skip this operation.
if ( cntl_is_noop( cntl ) ) return;
if ( bli_cntl_is_noop( cntl ) ) return;
// Return early if both alpha and the scalar attached to x are unit.
if ( bli_obj_equals( alpha, &BLIS_ONE ) &&
@@ -85,8 +85,8 @@ void bli_scalm_int( obj_t* alpha,
//}
// Extract the variant number and implementation type.
n = cntl_var_num( cntl );
i = cntl_impl_type( cntl );
n = bli_cntl_var_num( cntl );
i = bli_cntl_impl_type( cntl );
// Index into the variant array to extract the correct function pointer.
f = vars[n][i];

View File

@@ -40,13 +40,13 @@ struct unpackm_s
};
typedef struct unpackm_s unpackm_t;
// Accessors for the sub_unpackm* fields of a control tree node, reached
// through the node pointer given as cntl. The argument is parenthesized so
// that pointer expressions such as p + 1 or &node expand to the intended
// member access instead of binding -> to only the last operand.

// Unprefixed accessor names.
#define cntl_sub_unpackm( cntl )          ( cntl )->sub_unpackm
#define cntl_sub_unpackm_a( cntl )        ( cntl )->sub_unpackm_a
#define cntl_sub_unpackm_a11( cntl )      ( cntl )->sub_unpackm_a11
#define cntl_sub_unpackm_b( cntl )        ( cntl )->sub_unpackm_b
#define cntl_sub_unpackm_b11( cntl )      ( cntl )->sub_unpackm_b11
#define cntl_sub_unpackm_c( cntl )        ( cntl )->sub_unpackm_c
#define cntl_sub_unpackm_c11( cntl )      ( cntl )->sub_unpackm_c11

// bli_-prefixed accessor names; each expands to the same field as its
// unprefixed counterpart above.
#define bli_cntl_sub_unpackm( cntl )      ( cntl )->sub_unpackm
#define bli_cntl_sub_unpackm_a( cntl )    ( cntl )->sub_unpackm_a
#define bli_cntl_sub_unpackm_a11( cntl )  ( cntl )->sub_unpackm_a11
#define bli_cntl_sub_unpackm_b( cntl )    ( cntl )->sub_unpackm_b
#define bli_cntl_sub_unpackm_b11( cntl )  ( cntl )->sub_unpackm_b11
#define bli_cntl_sub_unpackm_c( cntl )    ( cntl )->sub_unpackm_c
#define bli_cntl_sub_unpackm_c11( cntl )  ( cntl )->sub_unpackm_c11
void bli_unpackm_cntl_init( void );
void bli_unpackm_cntl_finalize( void );

View File

@@ -52,7 +52,7 @@ void bli_unpackm_int( obj_t* p,
obj_t* a,
cntx_t* cntx,
unpackm_t* cntl,
packm_thrinfo_t* thread )
thrinfo_t* thread )
{
// The unpackm operation consists of an optional post-process: castm.
// (This post-process is analogous to the castm pre-process in packm.)
@@ -77,7 +77,7 @@ void bli_unpackm_int( obj_t* p,
// First check if we are to skip this operation because the control tree
// is NULL, and if so, simply return.
if ( cntl_is_noop( cntl ) )
if ( bli_cntl_is_noop( cntl ) )
{
return;
}
@@ -118,20 +118,20 @@ void bli_unpackm_int( obj_t* p,
// Now we are ready to proceed with the unpacking.
// Extract the variant number and implementation type.
n = cntl_var_num( cntl );
i = cntl_impl_type( cntl );
n = bli_cntl_var_num( cntl );
i = bli_cntl_impl_type( cntl );
// Index into the variant array to extract the correct function pointer.
f = vars[n][i];
// Invoke the variant.
if( thread_am_ochief( thread ) ) {
if( bli_thread_am_ochief( thread ) ) {
f( p,
&c,
cntx,
cntl );
}
thread_obarrier( thread );
bli_thread_obarrier( thread );
// Now, if necessary, we cast the contents of c to matrix a. If casting
// was not necessary, then we are done because the call to the unpackm

View File

@@ -36,7 +36,7 @@ void bli_unpackm_int( obj_t* p,
obj_t* a,
cntx_t* cntx,
unpackm_t* cntl,
packm_thrinfo_t* thread );
thrinfo_t* thread );
/*
void bli_unpackm_init_cast( obj_t* p,

View File

@@ -61,7 +61,7 @@ void bli_gemv_blk_var1( obj_t* alpha,
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( i, m_trans, a,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A1 and y1.
bli_acquire_mpart_t2b( BLIS_SUBPART1,
@@ -71,16 +71,16 @@ void bli_gemv_blk_var1( obj_t* alpha,
// Initialize objects for packing A1 and y1 (if needed).
bli_packm_init( &a1, &a1_pack,
cntx, cntl_sub_packm_a( cntl ) );
cntx, bli_cntl_sub_packm_a( cntl ) );
bli_packv_init( &y1, &y1_pack,
cntx, cntl_sub_packv_y( cntl ) );
cntx, bli_cntl_sub_packv_y( cntl ) );
// Copy/pack A1, y1 (if needed).
bli_packm_int( &a1, &a1_pack,
cntx, cntl_sub_packm_a( cntl ),
cntx, bli_cntl_sub_packm_a( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &y1, &y1_pack,
cntx, cntl_sub_packv_y( cntl ) );
cntx, bli_cntl_sub_packv_y( cntl ) );
// y1 = beta * y1 + alpha * A1 * x;
bli_gemv_int( BLIS_NO_TRANSPOSE,
@@ -91,16 +91,16 @@ void bli_gemv_blk_var1( obj_t* alpha,
beta,
&y1_pack,
cntx,
cntl_sub_gemv( cntl ) );
bli_cntl_sub_gemv( cntl ) );
// Copy/unpack y1 (if y1 was packed).
bli_unpackv_int( &y1_pack, &y1,
cntx, cntl_sub_unpackv_y( cntl ) );
cntx, bli_cntl_sub_unpackv_y( cntl ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_packm_release( &a1_pack, cntl_sub_packm_a( cntl ) );
bli_packv_release( &y1_pack, cntl_sub_packv_y( cntl ) );
bli_packm_release( &a1_pack, bli_cntl_sub_packm_a( cntl ) );
bli_packv_release( &y1_pack, bli_cntl_sub_packv_y( cntl ) );
}

View File

@@ -59,14 +59,14 @@ void bli_gemv_blk_var2( obj_t* alpha,
// y = beta * y;
bli_scalv_int( beta,
y,
cntx, cntl_sub_scalv( cntl ) );
cntx, bli_cntl_sub_scalv( cntl ) );
// Partition along the "k" dimension (n dimension of A).
for ( i = 0; i < n_trans; i += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( i, n_trans, a,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A1 and x1.
bli_acquire_mpart_l2r( BLIS_SUBPART1,
@@ -76,16 +76,16 @@ void bli_gemv_blk_var2( obj_t* alpha,
// Initialize objects for packing A1 and x1 (if needed).
bli_packm_init( &a1, &a1_pack,
cntx, cntl_sub_packm_a( cntl ) );
cntx, bli_cntl_sub_packm_a( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, cntl_sub_packv_x( cntl ) );
cntx, bli_cntl_sub_packv_x( cntl ) );
// Copy/pack A1, x1 (if needed).
bli_packm_int( &a1, &a1_pack,
cntx, cntl_sub_packm_a( cntl ),
cntx, bli_cntl_sub_packm_a( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, cntl_sub_packv_x( cntl ) );
cntx, bli_cntl_sub_packv_x( cntl ) );
// y = y + alpha * A1 * x1;
bli_gemv_int( BLIS_NO_TRANSPOSE,
@@ -96,12 +96,12 @@ void bli_gemv_blk_var2( obj_t* alpha,
&BLIS_ONE,
y,
cntx,
cntl_sub_gemv( cntl ) );
bli_cntl_sub_gemv( cntl ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_packm_release( &a1_pack, cntl_sub_packm_a( cntl ) );
bli_packv_release( &x1_pack, cntl_sub_packv_x( cntl ) );
bli_packm_release( &a1_pack, bli_cntl_sub_packm_a( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x( cntl ) );
}

View File

@@ -46,13 +46,13 @@ struct gemv_s
};
typedef struct gemv_s gemv_t;
// Accessors for the sub_gemv* fields of a control tree node, reached
// through the node pointer given as cntl. The argument is parenthesized so
// that pointer expressions such as p + 1 or &node expand to the intended
// member access instead of binding -> to only the last operand.

// Unprefixed accessor names.
#define cntl_sub_gemv( cntl )           ( cntl )->sub_gemv
#define cntl_sub_gemv_rp( cntl )        ( cntl )->sub_gemv_rp
#define cntl_sub_gemv_cp( cntl )        ( cntl )->sub_gemv_cp
#define cntl_sub_gemv_n_rp( cntl )      ( cntl )->sub_gemv_n_rp
#define cntl_sub_gemv_n_cp( cntl )      ( cntl )->sub_gemv_n_cp
#define cntl_sub_gemv_t_rp( cntl )      ( cntl )->sub_gemv_t_rp
#define cntl_sub_gemv_t_cp( cntl )      ( cntl )->sub_gemv_t_cp

// bli_-prefixed accessor names; each expands to the same field as its
// unprefixed counterpart above.
#define bli_cntl_sub_gemv( cntl )       ( cntl )->sub_gemv
#define bli_cntl_sub_gemv_rp( cntl )    ( cntl )->sub_gemv_rp
#define bli_cntl_sub_gemv_cp( cntl )    ( cntl )->sub_gemv_cp
#define bli_cntl_sub_gemv_n_rp( cntl )  ( cntl )->sub_gemv_n_rp
#define bli_cntl_sub_gemv_n_cp( cntl )  ( cntl )->sub_gemv_n_cp
#define bli_cntl_sub_gemv_t_rp( cntl )  ( cntl )->sub_gemv_t_rp
#define bli_cntl_sub_gemv_t_cp( cntl )  ( cntl )->sub_gemv_t_cp
void bli_gemv_cntl_init( void );
void bli_gemv_cntl_finalize( void );

View File

@@ -88,8 +88,8 @@ void bli_gemv_int( trans_t transa,
}
// Extract the variant number and implementation type.
n = cntl_var_num( cntl );
i = cntl_impl_type( cntl );
n = bli_cntl_var_num( cntl );
i = bli_cntl_impl_type( cntl );
// Index into the variant array to extract the correct function pointer.
f = vars[n][i];

View File

@@ -60,7 +60,7 @@ void bli_ger_blk_var1( obj_t* alpha,
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( i, m_trans, a,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A1 and x1.
bli_acquire_mpart_t2b( BLIS_SUBPART1,
@@ -70,16 +70,16 @@ void bli_ger_blk_var1( obj_t* alpha,
// Initialize objects for packing A1 and x1 (if needed).
bli_packm_init( &a1, &a1_pack,
cntx, cntl_sub_packm_a( cntl ) );
cntx, bli_cntl_sub_packm_a( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, cntl_sub_packv_x( cntl ) );
cntx, bli_cntl_sub_packv_x( cntl ) );
// Copy/pack A1, x1 (if needed).
bli_packm_int( &a1, &a1_pack,
cntx, cntl_sub_packm_a( cntl ),
cntx, bli_cntl_sub_packm_a( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, cntl_sub_packv_x( cntl ) );
cntx, bli_cntl_sub_packv_x( cntl ) );
// A1 = A1 + alpha * x1 * y;
bli_ger_int( BLIS_NO_CONJUGATE,
@@ -89,17 +89,17 @@ void bli_ger_blk_var1( obj_t* alpha,
y,
&a1_pack,
cntx,
cntl_sub_ger( cntl ) );
bli_cntl_sub_ger( cntl ) );
// Copy/unpack A1 (if A1 was packed).
bli_unpackm_int( &a1_pack, &a1,
cntx, cntl_sub_unpackm_a( cntl ),
cntx, bli_cntl_sub_unpackm_a( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_packm_release( &a1_pack, cntl_sub_packm_a( cntl ) );
bli_packv_release( &x1_pack, cntl_sub_packv_x( cntl ) );
bli_packm_release( &a1_pack, bli_cntl_sub_packm_a( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x( cntl ) );
}

View File

@@ -60,7 +60,7 @@ void bli_ger_blk_var2( obj_t* alpha,
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( i, n_trans, a,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A1 and y1.
bli_acquire_mpart_l2r( BLIS_SUBPART1,
@@ -70,16 +70,16 @@ void bli_ger_blk_var2( obj_t* alpha,
// Initialize objects for packing A1 and y1 (if needed).
bli_packm_init( &a1, &a1_pack,
cntx, cntl_sub_packm_a( cntl ) );
cntx, bli_cntl_sub_packm_a( cntl ) );
bli_packv_init( &y1, &y1_pack,
cntx, cntl_sub_packv_y( cntl ) );
cntx, bli_cntl_sub_packv_y( cntl ) );
// Copy/pack A1, y1 (if needed).
bli_packm_int( &a1, &a1_pack,
cntx, cntl_sub_packm_a( cntl ),
cntx, bli_cntl_sub_packm_a( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &y1, &y1_pack,
cntx, cntl_sub_packv_y( cntl ) );
cntx, bli_cntl_sub_packv_y( cntl ) );
// A1 = A1 + alpha * x * y1;
bli_ger_int( BLIS_NO_CONJUGATE,
@@ -89,17 +89,17 @@ void bli_ger_blk_var2( obj_t* alpha,
&y1_pack,
&a1_pack,
cntx,
cntl_sub_ger( cntl ) );
bli_cntl_sub_ger( cntl ) );
// Copy/unpack A1 (if A1 was packed).
bli_unpackm_int( &a1_pack, &a1,
cntx, cntl_sub_unpackm_a( cntl ),
cntx, bli_cntl_sub_unpackm_a( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_packm_release( &a1_pack, cntl_sub_packm_a( cntl ) );
bli_packv_release( &y1_pack, cntl_sub_packv_y( cntl ) );
bli_packm_release( &a1_pack, bli_cntl_sub_packm_a( cntl ) );
bli_packv_release( &y1_pack, bli_cntl_sub_packv_y( cntl ) );
}

View File

@@ -45,9 +45,9 @@ struct ger_s
};
typedef struct ger_s ger_t;
// Accessors for the sub_ger* fields of a control tree node, reached through
// the node pointer given as cntl. The argument is parenthesized so that
// pointer expressions such as p + 1 or &node expand to the intended member
// access instead of binding -> to only the last operand.

// Unprefixed accessor names.
#define cntl_sub_ger( cntl )         ( cntl )->sub_ger
#define cntl_sub_ger_rp( cntl )      ( cntl )->sub_ger_rp
#define cntl_sub_ger_cp( cntl )      ( cntl )->sub_ger_cp

// bli_-prefixed accessor names; each expands to the same field as its
// unprefixed counterpart above.
#define bli_cntl_sub_ger( cntl )     ( cntl )->sub_ger
#define bli_cntl_sub_ger_rp( cntl )  ( cntl )->sub_ger_rp
#define bli_cntl_sub_ger_cp( cntl )  ( cntl )->sub_ger_cp
void bli_ger_cntl_init( void );
void bli_ger_cntl_finalize( void );

View File

@@ -107,15 +107,15 @@ void bli_ger_int( conj_t conjx,
// If we are about to call a leaf-level implementation, and matrix A
// still needs a transposition, then we must induce one by swapping the
// strides and dimensions.
if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( a_local ) )
if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( a_local ) )
{
bli_obj_induce_trans( a_local );
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, a_local );
}
// Extract the variant number and implementation type.
n = cntl_var_num( cntl );
i = cntl_impl_type( cntl );
n = bli_cntl_var_num( cntl );
i = bli_cntl_impl_type( cntl );
// Index into the variant array to extract the correct function pointer.
f = vars[n][i];

View File

@@ -74,14 +74,14 @@ void bli_hemv_blk_var1( conj_t conjh,
// y = beta * y;
bli_scalv_int( beta,
y,
cntx, cntl_sub_scalv( cntl ) );
cntx, bli_cntl_sub_scalv( cntl ) );
// Partition diagonally.
for ( ij = 0; ij < mn; ij += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( ij, mn, a,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A11, A10, x1, x0, y1, and y0.
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
@@ -99,20 +99,20 @@ void bli_hemv_blk_var1( conj_t conjh,
// Initialize objects for packing A11, x1, and y1 (if needed).
bli_packm_init( &a11, &a11_pack,
cntx, cntl_sub_packm_a11( cntl ) );
cntx, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_init( &y1, &y1_pack,
cntx, cntl_sub_packv_y1( cntl ) );
cntx, bli_cntl_sub_packv_y1( cntl ) );
// Copy/pack A11, x1, y1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntx, cntl_sub_packm_a11( cntl ),
cntx, bli_cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_int( &y1, &y1_pack,
cntx, cntl_sub_packv_y1( cntl ) );
cntx, bli_cntl_sub_packv_y1( cntl ) );
// y0 = y0 + alpha * A10' * x1;
bli_gemv_int( bli_apply_conj( conjh, BLIS_TRANSPOSE ),
@@ -123,7 +123,7 @@ void bli_hemv_blk_var1( conj_t conjh,
&BLIS_ONE,
&y0,
cntx,
cntl_sub_gemv_t_rp( cntl ) );
bli_cntl_sub_gemv_t_rp( cntl ) );
// y1 = y1 + alpha * A11 * x1;
bli_hemv_int( conjh,
@@ -133,7 +133,7 @@ void bli_hemv_blk_var1( conj_t conjh,
&BLIS_ONE,
&y1_pack,
cntx,
cntl_sub_hemv( cntl ) );
bli_cntl_sub_hemv( cntl ) );
// y1 = y1 + alpha * A10 * x0;
bli_gemv_int( BLIS_NO_TRANSPOSE,
@@ -144,17 +144,17 @@ void bli_hemv_blk_var1( conj_t conjh,
&BLIS_ONE,
&y1_pack,
cntx,
cntl_sub_gemv_n_rp( cntl ) );
bli_cntl_sub_gemv_n_rp( cntl ) );
// Copy/unpack y1 (if y1 was packed).
bli_unpackv_int( &y1_pack, &y1,
cntx, cntl_sub_unpackv_y1( cntl ) );
cntx, bli_cntl_sub_unpackv_y1( cntl ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_packm_release( &a11_pack, cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
bli_packv_release( &y1_pack, cntl_sub_packv_y1( cntl ) );
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) );
}

View File

@@ -75,14 +75,14 @@ void bli_hemv_blk_var2( conj_t conjh,
// y = beta * y;
bli_scalv_int( beta,
y,
cntx, cntl_sub_scalv( cntl ) );
cntx, bli_cntl_sub_scalv( cntl ) );
// Partition diagonally.
for ( ij = 0; ij < mn; ij += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( ij, mn, a,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A11, A10, A21, x1, x0, x2, y1, and y0.
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
@@ -102,20 +102,20 @@ void bli_hemv_blk_var2( conj_t conjh,
// Initialize objects for packing A11, x1, and y1 (if needed).
bli_packm_init( &a11, &a11_pack,
cntx, cntl_sub_packm_a11( cntl ) );
cntx, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_init( &y1, &y1_pack,
cntx, cntl_sub_packv_y1( cntl ) );
cntx, bli_cntl_sub_packv_y1( cntl ) );
// Copy/pack A11, x1, y1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntx, cntl_sub_packm_a11( cntl ),
cntx, bli_cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_int( &y1, &y1_pack,
cntx, cntl_sub_packv_y1( cntl ) );
cntx, bli_cntl_sub_packv_y1( cntl ) );
// y1 = y1 + alpha * A10 * x0;
bli_gemv_int( BLIS_NO_TRANSPOSE,
@@ -126,7 +126,7 @@ void bli_hemv_blk_var2( conj_t conjh,
&BLIS_ONE,
&y1_pack,
cntx,
cntl_sub_gemv_n_rp( cntl ) );
bli_cntl_sub_gemv_n_rp( cntl ) );
// y1 = y1 + alpha * A11 * x1;
bli_hemv_int( conjh,
@@ -136,7 +136,7 @@ void bli_hemv_blk_var2( conj_t conjh,
&BLIS_ONE,
&y1_pack,
cntx,
cntl_sub_hemv( cntl ) );
bli_cntl_sub_hemv( cntl ) );
// y1 = y1 + alpha * A21' * x2;
bli_gemv_int( bli_apply_conj( conjh, BLIS_TRANSPOSE ),
@@ -147,17 +147,17 @@ void bli_hemv_blk_var2( conj_t conjh,
&BLIS_ONE,
&y1_pack,
cntx,
cntl_sub_gemv_t_cp( cntl ) );
bli_cntl_sub_gemv_t_cp( cntl ) );
// Copy/unpack y1 (if y1 was packed).
bli_unpackv_int( &y1_pack, &y1,
cntx, cntl_sub_unpackv_y1( cntl ) );
cntx, bli_cntl_sub_unpackv_y1( cntl ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_packm_release( &a11_pack, cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
bli_packv_release( &y1_pack, cntl_sub_packv_y1( cntl ) );
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) );
}

View File

@@ -74,14 +74,14 @@ void bli_hemv_blk_var3( conj_t conjh,
// y = beta * y;
bli_scalv_int( beta,
y,
cntx, cntl_sub_scalv( cntl ) );
cntx, bli_cntl_sub_scalv( cntl ) );
// Partition diagonally.
for ( ij = 0; ij < mn; ij += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( ij, mn, a,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A11, A21, x1, x2, y1, and y2.
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
@@ -99,20 +99,20 @@ void bli_hemv_blk_var3( conj_t conjh,
// Initialize objects for packing A11, x1, and y1 (if needed).
bli_packm_init( &a11, &a11_pack,
cntx, cntl_sub_packm_a11( cntl ) );
cntx, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_init( &y1, &y1_pack,
cntx, cntl_sub_packv_y1( cntl ) );
cntx, bli_cntl_sub_packv_y1( cntl ) );
// Copy/pack A11, x1, y1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntx, cntl_sub_packm_a11( cntl ),
cntx, bli_cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_int( &y1, &y1_pack,
cntx, cntl_sub_packv_y1( cntl ) );
cntx, bli_cntl_sub_packv_y1( cntl ) );
// y1 = y1 + alpha * A21' * x2;
bli_gemv_int( bli_apply_conj( conjh, BLIS_TRANSPOSE ),
@@ -123,7 +123,7 @@ void bli_hemv_blk_var3( conj_t conjh,
&BLIS_ONE,
&y1_pack,
cntx,
cntl_sub_gemv_t_cp( cntl ) );
bli_cntl_sub_gemv_t_cp( cntl ) );
// y1 = y1 + alpha * A11 * x1;
bli_hemv_int( conjh,
@@ -133,7 +133,7 @@ void bli_hemv_blk_var3( conj_t conjh,
&BLIS_ONE,
&y1_pack,
cntx,
cntl_sub_hemv( cntl ) );
bli_cntl_sub_hemv( cntl ) );
// y2 = y2 + alpha * A21 * x1;
bli_gemv_int( BLIS_NO_TRANSPOSE,
@@ -144,17 +144,17 @@ void bli_hemv_blk_var3( conj_t conjh,
&BLIS_ONE,
&y2,
cntx,
cntl_sub_gemv_n_cp( cntl ) );
bli_cntl_sub_gemv_n_cp( cntl ) );
// Copy/unpack y1 (if y1 was packed).
bli_unpackv_int( &y1_pack, &y1,
cntx, cntl_sub_unpackv_y1( cntl ) );
cntx, bli_cntl_sub_unpackv_y1( cntl ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_packm_release( &a11_pack, cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
bli_packv_release( &y1_pack, cntl_sub_packv_y1( cntl ) );
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) );
}

View File

@@ -75,14 +75,14 @@ void bli_hemv_blk_var4( conj_t conjh,
// y = beta * y;
bli_scalv_int( beta,
y,
cntx, cntl_sub_scalv( cntl ) );
cntx, bli_cntl_sub_scalv( cntl ) );
// Partition diagonally.
for ( ij = 0; ij < mn; ij += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( ij, mn, a,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A11, A10, A21, x1, y1, y0, and y2.
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
@@ -102,20 +102,20 @@ void bli_hemv_blk_var4( conj_t conjh,
// Initialize objects for packing A11, x1, and y1 (if needed).
bli_packm_init( &a11, &a11_pack,
cntx, cntl_sub_packm_a11( cntl ) );
cntx, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_init( &y1, &y1_pack,
cntx, cntl_sub_packv_y1( cntl ) );
cntx, bli_cntl_sub_packv_y1( cntl ) );
// Copy/pack A11, x1, y1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntx, cntl_sub_packm_a11( cntl ),
cntx, bli_cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_int( &y1, &y1_pack,
cntx, cntl_sub_packv_y1( cntl ) );
cntx, bli_cntl_sub_packv_y1( cntl ) );
// y0 = y0 + alpha * A10' * x1;
bli_gemv_int( bli_apply_conj( conjh, BLIS_TRANSPOSE ),
@@ -126,7 +126,7 @@ void bli_hemv_blk_var4( conj_t conjh,
&BLIS_ONE,
&y0,
cntx,
cntl_sub_gemv_t_rp( cntl ) );
bli_cntl_sub_gemv_t_rp( cntl ) );
// y1 = y1 + alpha * A11 * x1;
bli_hemv_int( conjh,
@@ -136,7 +136,7 @@ void bli_hemv_blk_var4( conj_t conjh,
&BLIS_ONE,
&y1_pack,
cntx,
cntl_sub_hemv( cntl ) );
bli_cntl_sub_hemv( cntl ) );
// y2 = y2 + alpha * A21 * x1;
bli_gemv_int( BLIS_NO_TRANSPOSE,
@@ -147,17 +147,17 @@ void bli_hemv_blk_var4( conj_t conjh,
&BLIS_ONE,
&y2,
cntx,
cntl_sub_gemv_n_cp( cntl ) );
bli_cntl_sub_gemv_n_cp( cntl ) );
// Copy/unpack y1 (if y1 was packed).
bli_unpackv_int( &y1_pack, &y1,
cntx, cntl_sub_unpackv_y1( cntl ) );
cntx, bli_cntl_sub_unpackv_y1( cntl ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_packm_release( &a11_pack, cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
bli_packv_release( &y1_pack, cntl_sub_packv_y1( cntl ) );
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) );
}

View File

@@ -50,7 +50,7 @@ struct hemv_s
};
typedef struct hemv_s hemv_t;
// Accessors for the sub_hemv field of a control tree node, given a pointer
// to that node. Both the legacy name and the bli_-prefixed name are provided.
// The argument and the whole expansion are parenthesized so that pointer
// expressions (e.g. base + i) and neighboring operators bind as intended;
// the result is still an lvalue.
#define cntl_sub_hemv( cntl )     ( (cntl)->sub_hemv )
#define bli_cntl_sub_hemv( cntl ) ( (cntl)->sub_hemv )
void bli_hemv_cntl_init( void );
void bli_hemv_cntl_finalize( void );

View File

@@ -96,7 +96,7 @@ void bli_hemv_int( conj_t conjh,
// triangular case. But we only need to do this for blocked algorithms,
// since unblocked algorithms are responsible for handling the upper case
// explicitly (and they should not be inspecting the transposition bit anyway).
if ( cntl_is_blocked( cntl ) && bli_obj_is_upper( *a ) )
if ( bli_cntl_is_blocked( cntl ) && bli_obj_is_upper( *a ) )
{
bli_obj_toggle_conj( a_local );
bli_obj_toggle_trans( a_local );
@@ -104,8 +104,8 @@ void bli_hemv_int( conj_t conjh,
*/
// Extract the variant number and implementation type.
n = cntl_var_num( cntl );
i = cntl_impl_type( cntl );
n = bli_cntl_var_num( cntl );
i = bli_cntl_impl_type( cntl );
// Index into the variant array to extract the correct function pointer.
f = vars[n][i];

View File

@@ -71,7 +71,7 @@ void bli_her_blk_var1( conj_t conjh,
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( ij, mn, c,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for C11, C10, x1, and x0.
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
@@ -85,16 +85,16 @@ void bli_her_blk_var1( conj_t conjh,
// Initialize objects for packing C11 and x1 (if needed).
bli_packm_init( &c11, &c11_pack,
cntx, cntl_sub_packm_c11( cntl ) );
cntx, bli_cntl_sub_packm_c11( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
// Copy/pack C11, x1 (if needed).
bli_packm_int( &c11, &c11_pack,
cntx, cntl_sub_packm_c11( cntl ),
cntx, bli_cntl_sub_packm_c11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
// C10 = C10 + alpha * x1 * x0';
bli_ger_int( BLIS_NO_CONJUGATE,
@@ -104,7 +104,7 @@ void bli_her_blk_var1( conj_t conjh,
&x0,
&c10,
cntx,
cntl_sub_ger( cntl ) );
bli_cntl_sub_ger( cntl ) );
// C11 = C11 + alpha * x1 * x1';
bli_her_int( conjh,
@@ -112,17 +112,17 @@ void bli_her_blk_var1( conj_t conjh,
&x1_pack,
&c11_pack,
cntx,
cntl_sub_her( cntl ) );
bli_cntl_sub_her( cntl ) );
// Copy/unpack C11 (if C11 was packed).
bli_unpackm_int( &c11_pack, &c11,
cntx, cntl_sub_unpackm_c11( cntl ),
cntx, bli_cntl_sub_unpackm_c11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_packm_release( &c11_pack, cntl_sub_packm_c11( cntl ) );
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
bli_packm_release( &c11_pack, bli_cntl_sub_packm_c11( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
}

View File

@@ -71,7 +71,7 @@ void bli_her_blk_var2( conj_t conjh,
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( ij, mn, c,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for C11, C21, x1, and x2.
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
@@ -85,16 +85,16 @@ void bli_her_blk_var2( conj_t conjh,
// Initialize objects for packing C11 and x1 (if needed).
bli_packm_init( &c11, &c11_pack,
cntx, cntl_sub_packm_c11( cntl ) );
cntx, bli_cntl_sub_packm_c11( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
// Copy/pack C11, x1 (if needed).
bli_packm_int( &c11, &c11_pack,
cntx, cntl_sub_packm_c11( cntl ),
cntx, bli_cntl_sub_packm_c11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
// C21 = C21 + alpha * x2 * x1';
bli_ger_int( BLIS_NO_CONJUGATE,
@@ -104,7 +104,7 @@ void bli_her_blk_var2( conj_t conjh,
&x1_pack,
&c21,
cntx,
cntl_sub_ger( cntl ) );
bli_cntl_sub_ger( cntl ) );
// C11 = C11 + alpha * x1 * x1';
bli_her_int( conjh,
@@ -112,17 +112,17 @@ void bli_her_blk_var2( conj_t conjh,
&x1_pack,
&c11_pack,
cntx,
cntl_sub_her( cntl ) );
bli_cntl_sub_her( cntl ) );
// Copy/unpack C11 (if C11 was packed).
bli_unpackm_int( &c11_pack, &c11,
cntx, cntl_sub_unpackm_c11( cntl ),
cntx, bli_cntl_sub_unpackm_c11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_packm_release( &c11_pack, cntl_sub_packm_c11( cntl ) );
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
bli_packm_release( &c11_pack, bli_cntl_sub_packm_c11( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
}

View File

@@ -45,7 +45,7 @@ struct her_s
};
typedef struct her_s her_t;
// Accessors for the sub_her field of a control tree node, given a pointer
// to that node. Both the legacy name and the bli_-prefixed name are provided.
// The argument and the whole expansion are parenthesized so that pointer
// expressions (e.g. base + i) and neighboring operators bind as intended;
// the result is still an lvalue.
#define cntl_sub_her( cntl )     ( (cntl)->sub_her )
#define bli_cntl_sub_her( cntl ) ( (cntl)->sub_her )
void bli_her_cntl_init( void );
void bli_her_cntl_finalize( void );

View File

@@ -92,8 +92,8 @@ void bli_her_int( conj_t conjh,
}
// Extract the variant number and implementation type.
n = cntl_var_num( cntl );
i = cntl_impl_type( cntl );
n = bli_cntl_var_num( cntl );
i = bli_cntl_impl_type( cntl );
// Index into the variant array to extract the correct function pointer.
f = vars[n][i];

View File

@@ -76,7 +76,7 @@ void bli_her2_blk_var1( conj_t conjh,
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( ij, mn, c,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for C11, C10, x1, x0, y1, and y0.
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
@@ -94,20 +94,20 @@ void bli_her2_blk_var1( conj_t conjh,
// Initialize objects for packing C11, x1, and y1 (if needed).
bli_packm_init( &c11, &c11_pack,
cntx, cntl_sub_packm_c11( cntl ) );
cntx, bli_cntl_sub_packm_c11( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_init( &y1, &y1_pack,
cntx, cntl_sub_packv_y1( cntl ) );
cntx, bli_cntl_sub_packv_y1( cntl ) );
// Copy/pack C11, x1, y1 (if needed).
bli_packm_int( &c11, &c11_pack,
cntx, cntl_sub_packm_c11( cntl ),
cntx, bli_cntl_sub_packm_c11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_int( &y1, &y1_pack,
cntx, cntl_sub_packv_y1( cntl ) );
cntx, bli_cntl_sub_packv_y1( cntl ) );
// C10 = C10 + alpha * x1 * y0';
bli_ger_int( BLIS_NO_CONJUGATE,
@@ -117,7 +117,7 @@ void bli_her2_blk_var1( conj_t conjh,
&y0,
&c10,
cntx,
cntl_sub_ger_rp( cntl ) );
bli_cntl_sub_ger_rp( cntl ) );
// C10 = C10 + conj(alpha) * y1 * x0';
bli_ger_int( BLIS_NO_CONJUGATE,
@@ -127,7 +127,7 @@ void bli_her2_blk_var1( conj_t conjh,
&x0,
&c10,
cntx,
cntl_sub_ger_rp( cntl ) );
bli_cntl_sub_ger_rp( cntl ) );
// C11 = C11 + alpha * x1 * y1' + conj(alpha) * y1 * x1';
bli_her2_int( conjh,
@@ -137,18 +137,18 @@ void bli_her2_blk_var1( conj_t conjh,
&y1_pack,
&c11_pack,
cntx,
cntl_sub_her2( cntl ) );
bli_cntl_sub_her2( cntl ) );
// Copy/unpack C11 (if C11 was packed).
bli_unpackm_int( &c11_pack, &c11,
cntx, cntl_sub_unpackm_c11( cntl ),
cntx, bli_cntl_sub_unpackm_c11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_packm_release( &c11_pack, cntl_sub_packm_c11( cntl ) );
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
bli_packv_release( &y1_pack, cntl_sub_packv_y1( cntl ) );
bli_packm_release( &c11_pack, bli_cntl_sub_packm_c11( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) );
}

View File

@@ -77,7 +77,7 @@ void bli_her2_blk_var2( conj_t conjh,
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( ij, mn, c,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for C11, C10, C21, x1, x0, x2, and y1.
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
@@ -97,20 +97,20 @@ void bli_her2_blk_var2( conj_t conjh,
// Initialize objects for packing C11, x1, and y1 (if needed).
bli_packm_init( &c11, &c11_pack,
cntx, cntl_sub_packm_c11( cntl ) );
cntx, bli_cntl_sub_packm_c11( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_init( &y1, &y1_pack,
cntx, cntl_sub_packv_y1( cntl ) );
cntx, bli_cntl_sub_packv_y1( cntl ) );
// Copy/pack C11, x1, y1 (if needed).
bli_packm_int( &c11, &c11_pack,
cntx, cntl_sub_packm_c11( cntl ),
cntx, bli_cntl_sub_packm_c11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_int( &y1, &y1_pack,
cntx, cntl_sub_packv_y1( cntl ) );
cntx, bli_cntl_sub_packv_y1( cntl ) );
// C10 = C10 + conj(alpha) * y1 * x0';
bli_ger_int( BLIS_NO_CONJUGATE,
@@ -120,7 +120,7 @@ void bli_her2_blk_var2( conj_t conjh,
&x0,
&c10,
cntx,
cntl_sub_ger_rp( cntl ) );
bli_cntl_sub_ger_rp( cntl ) );
// C21 = C21 + alpha * x2 * y1';
bli_ger_int( BLIS_NO_CONJUGATE,
@@ -130,7 +130,7 @@ void bli_her2_blk_var2( conj_t conjh,
&y1_pack,
&c21,
cntx,
cntl_sub_ger_cp( cntl ) );
bli_cntl_sub_ger_cp( cntl ) );
// C11 = C11 + alpha * x1 * y1' + conj(alpha) * y1 * x1';
bli_her2_int( conjh,
@@ -140,18 +140,18 @@ void bli_her2_blk_var2( conj_t conjh,
&y1_pack,
&c11_pack,
cntx,
cntl_sub_her2( cntl ) );
bli_cntl_sub_her2( cntl ) );
// Copy/unpack C11 (if C11 was packed).
bli_unpackm_int( &c11_pack, &c11,
cntx, cntl_sub_unpackm_c11( cntl ),
cntx, bli_cntl_sub_unpackm_c11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_packm_release( &c11_pack, cntl_sub_packm_c11( cntl ) );
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
bli_packv_release( &y1_pack, cntl_sub_packv_y1( cntl ) );
bli_packm_release( &c11_pack, bli_cntl_sub_packm_c11( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) );
}

View File

@@ -77,7 +77,7 @@ void bli_her2_blk_var3( conj_t conjh,
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( ij, mn, c,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for C11, C10, C21, x1, y1, y0, and y2.
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
@@ -97,20 +97,20 @@ void bli_her2_blk_var3( conj_t conjh,
// Initialize objects for packing C11, x1, and y1 (if needed).
bli_packm_init( &c11, &c11_pack,
cntx, cntl_sub_packm_c11( cntl ) );
cntx, bli_cntl_sub_packm_c11( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_init( &y1, &y1_pack,
cntx, cntl_sub_packv_y1( cntl ) );
cntx, bli_cntl_sub_packv_y1( cntl ) );
// Copy/pack C11, x1, y1 (if needed).
bli_packm_int( &c11, &c11_pack,
cntx, cntl_sub_packm_c11( cntl ),
cntx, bli_cntl_sub_packm_c11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_int( &y1, &y1_pack,
cntx, cntl_sub_packv_y1( cntl ) );
cntx, bli_cntl_sub_packv_y1( cntl ) );
// C10 = C10 + alpha * x1 * y0';
bli_ger_int( BLIS_NO_CONJUGATE,
@@ -120,7 +120,7 @@ void bli_her2_blk_var3( conj_t conjh,
&y0,
&c10,
cntx,
cntl_sub_ger_rp( cntl ) );
bli_cntl_sub_ger_rp( cntl ) );
// C21 = C21 + conj(alpha) * y2 * x1';
bli_ger_int( BLIS_NO_CONJUGATE,
@@ -130,7 +130,7 @@ void bli_her2_blk_var3( conj_t conjh,
&x1_pack,
&c21,
cntx,
cntl_sub_ger_cp( cntl ) );
bli_cntl_sub_ger_cp( cntl ) );
// C11 = C11 + alpha * x1 * y1' + conj(alpha) * y1 * x1';
bli_her2_int( conjh,
@@ -140,18 +140,18 @@ void bli_her2_blk_var3( conj_t conjh,
&y1_pack,
&c11_pack,
cntx,
cntl_sub_her2( cntl ) );
bli_cntl_sub_her2( cntl ) );
// Copy/unpack C11 (if C11 was packed).
bli_unpackm_int( &c11_pack, &c11,
cntx, cntl_sub_unpackm_c11( cntl ),
cntx, bli_cntl_sub_unpackm_c11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_packm_release( &c11_pack, cntl_sub_packm_c11( cntl ) );
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
bli_packv_release( &y1_pack, cntl_sub_packv_y1( cntl ) );
bli_packm_release( &c11_pack, bli_cntl_sub_packm_c11( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) );
}

View File

@@ -76,7 +76,7 @@ void bli_her2_blk_var4( conj_t conjh,
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( ij, mn, c,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for C11, C21, x1, x2, y1, and y2.
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
@@ -94,20 +94,20 @@ void bli_her2_blk_var4( conj_t conjh,
// Initialize objects for packing C11, x1, and y1 (if needed).
bli_packm_init( &c11, &c11_pack,
cntx, cntl_sub_packm_c11( cntl ) );
cntx, bli_cntl_sub_packm_c11( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_init( &y1, &y1_pack,
cntx, cntl_sub_packv_y1( cntl ) );
cntx, bli_cntl_sub_packv_y1( cntl ) );
// Copy/pack C11, x1, y1 (if needed).
bli_packm_int( &c11, &c11_pack,
cntx, cntl_sub_packm_c11( cntl ),
cntx, bli_cntl_sub_packm_c11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_int( &y1, &y1_pack,
cntx, cntl_sub_packv_y1( cntl ) );
cntx, bli_cntl_sub_packv_y1( cntl ) );
// C21 = C21 + alpha * x2 * y1';
bli_ger_int( BLIS_NO_CONJUGATE,
@@ -117,7 +117,7 @@ void bli_her2_blk_var4( conj_t conjh,
&y1_pack,
&c21,
cntx,
cntl_sub_ger_cp( cntl ) );
bli_cntl_sub_ger_cp( cntl ) );
// C21 = C21 + conj(alpha) * y2 * x1';
bli_ger_int( BLIS_NO_CONJUGATE,
@@ -127,7 +127,7 @@ void bli_her2_blk_var4( conj_t conjh,
&x1_pack,
&c21,
cntx,
cntl_sub_ger_cp( cntl ) );
bli_cntl_sub_ger_cp( cntl ) );
// C11 = C11 + alpha * x1 * y1' + conj(alpha) * y1 * x1';
bli_her2_int( conjh,
@@ -137,18 +137,18 @@ void bli_her2_blk_var4( conj_t conjh,
&y1_pack,
&c11_pack,
cntx,
cntl_sub_her2( cntl ) );
bli_cntl_sub_her2( cntl ) );
// Copy/unpack C11 (if C11 was packed).
bli_unpackm_int( &c11_pack, &c11,
cntx, cntl_sub_unpackm_c11( cntl ),
cntx, bli_cntl_sub_unpackm_c11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_packm_release( &c11_pack, cntl_sub_packm_c11( cntl ) );
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
bli_packv_release( &y1_pack, cntl_sub_packv_y1( cntl ) );
bli_packm_release( &c11_pack, bli_cntl_sub_packm_c11( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) );
}

View File

@@ -47,7 +47,7 @@ struct her2_s
};
typedef struct her2_s her2_t;
// Accessors for the sub_her2 field of a control tree node, given a pointer
// to that node. Both the legacy name and the bli_-prefixed name are provided.
// The argument and the whole expansion are parenthesized so that pointer
// expressions (e.g. base + i) and neighboring operators bind as intended;
// the result is still an lvalue.
#define cntl_sub_her2( cntl )     ( (cntl)->sub_her2 )
#define bli_cntl_sub_her2( cntl ) ( (cntl)->sub_her2 )
void bli_her2_cntl_init( void );
void bli_her2_cntl_finalize( void );

View File

@@ -115,8 +115,8 @@ void bli_her2_int( conj_t conjh,
// Extract the variant number and implementation type.
n = cntl_var_num( cntl );
i = cntl_impl_type( cntl );
n = bli_cntl_var_num( cntl );
i = bli_cntl_impl_type( cntl );
// Index into the variant array to extract the correct function pointer.
f = vars[n][i];

View File

@@ -46,7 +46,7 @@ struct trmv_s
};
typedef struct trmv_s trmv_t;
// Accessors for the sub_trmv field of a control tree node, given a pointer
// to that node. Both the legacy name and the bli_-prefixed name are provided.
// The argument and the whole expansion are parenthesized so that pointer
// expressions (e.g. base + i) and neighboring operators bind as intended;
// the result is still an lvalue.
#define cntl_sub_trmv( cntl )     ( (cntl)->sub_trmv )
#define bli_cntl_sub_trmv( cntl ) ( (cntl)->sub_trmv )
void bli_trmv_cntl_init( void );
void bli_trmv_cntl_finalize( void );

View File

@@ -115,8 +115,8 @@ void bli_trmv_int( obj_t* alpha,
}
// Extract the variant number and implementation type.
n = cntl_var_num( cntl );
i = cntl_impl_type( cntl );
n = bli_cntl_var_num( cntl );
i = bli_cntl_impl_type( cntl );
// Index into the variant array to extract the correct function pointer.
f = vars[uplo][n][i];

View File

@@ -61,7 +61,7 @@ void bli_trmv_l_blk_var1( obj_t* alpha,
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_b( ij, mn, a,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A11, A10, x1, and x0.
bli_acquire_mpart_br2tl( BLIS_SUBPART11,
@@ -75,23 +75,23 @@ void bli_trmv_l_blk_var1( obj_t* alpha,
// Initialize objects for packing A11 and x1 (if needed).
bli_packm_init( &a11, &a11_pack,
cntx, cntl_sub_packm_a11( cntl ) );
cntx, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
// Copy/pack A11, x1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntx, cntl_sub_packm_a11( cntl ),
cntx, bli_cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
// x1 = alpha * tril( A11 ) * x1;
bli_trmv_int( alpha,
&a11_pack,
&x1_pack,
cntx,
cntl_sub_trmv( cntl ) );
bli_cntl_sub_trmv( cntl ) );
// x1 = x1 + alpha * A10 * x0;
bli_gemv_int( BLIS_NO_TRANSPOSE,
@@ -102,16 +102,16 @@ void bli_trmv_l_blk_var1( obj_t* alpha,
&BLIS_ONE,
&x1_pack,
cntx,
cntl_sub_gemv_rp( cntl ) );
bli_cntl_sub_gemv_rp( cntl ) );
// Copy/unpack x1 (if x1 was packed).
bli_unpackv_int( &x1_pack, &x1,
cntx, cntl_sub_unpackv_x1( cntl ) );
cntx, bli_cntl_sub_unpackv_x1( cntl ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_packm_release( &a11_pack, cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
}

View File

@@ -61,7 +61,7 @@ void bli_trmv_l_blk_var2( obj_t* alpha,
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_b( ij, mn, a,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A11, A21, x1, and x2.
bli_acquire_mpart_br2tl( BLIS_SUBPART11,
@@ -75,16 +75,16 @@ void bli_trmv_l_blk_var2( obj_t* alpha,
// Initialize objects for packing A11 and x1 (if needed).
bli_packm_init( &a11, &a11_pack,
cntx, cntl_sub_packm_a11( cntl ) );
cntx, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
// Copy/pack A11, x1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntx, cntl_sub_packm_a11( cntl ),
cntx, bli_cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
// x2 = x2 + alpha * A21 * x1;
bli_gemv_int( BLIS_NO_TRANSPOSE,
@@ -95,23 +95,23 @@ void bli_trmv_l_blk_var2( obj_t* alpha,
&BLIS_ONE,
&x2,
cntx,
cntl_sub_gemv_cp( cntl ) );
bli_cntl_sub_gemv_cp( cntl ) );
// x1 = alpha * tril( A11 ) * x1;
bli_trmv_int( alpha,
&a11_pack,
&x1_pack,
cntx,
cntl_sub_trmv( cntl ) );
bli_cntl_sub_trmv( cntl ) );
// Copy/unpack x1 (if x1 was packed).
bli_unpackv_int( &x1_pack, &x1,
cntx, cntl_sub_unpackv_x1( cntl ) );
cntx, bli_cntl_sub_unpackv_x1( cntl ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_packm_release( &a11_pack, cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
}

View File

@@ -61,7 +61,7 @@ void bli_trmv_u_blk_var1( obj_t* alpha,
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( ij, mn, a,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A11, A12, x1, and x2.
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
@@ -75,23 +75,23 @@ void bli_trmv_u_blk_var1( obj_t* alpha,
// Initialize objects for packing A11 and x1 (if needed).
bli_packm_init( &a11, &a11_pack,
cntx, cntl_sub_packm_a11( cntl ) );
cntx, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
// Copy/pack A11, x1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntx, cntl_sub_packm_a11( cntl ),
cntx, bli_cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
// x1 = alpha * triu( A11 ) * x1;
bli_trmv_int( alpha,
&a11_pack,
&x1_pack,
cntx,
cntl_sub_trmv( cntl ) );
bli_cntl_sub_trmv( cntl ) );
// x1 = x1 + alpha * A12 * x2;
bli_gemv_int( BLIS_NO_TRANSPOSE,
@@ -102,16 +102,16 @@ void bli_trmv_u_blk_var1( obj_t* alpha,
&BLIS_ONE,
&x1_pack,
cntx,
cntl_sub_gemv_rp( cntl ) );
bli_cntl_sub_gemv_rp( cntl ) );
// Copy/unpack x1 (if x1 was packed).
bli_unpackv_int( &x1_pack, &x1,
cntx, cntl_sub_unpackv_x1( cntl ) );
cntx, bli_cntl_sub_unpackv_x1( cntl ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_packm_release( &a11_pack, cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
}

View File

@@ -61,7 +61,7 @@ void bli_trmv_u_blk_var2( obj_t* alpha,
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_b( ij, mn, a,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A11, A01, x1, and x0.
bli_acquire_mpart_br2tl( BLIS_SUBPART11,
@@ -75,16 +75,16 @@ void bli_trmv_u_blk_var2( obj_t* alpha,
// Initialize objects for packing A11 and x1 (if needed).
bli_packm_init( &a11, &a11_pack,
cntx, cntl_sub_packm_a11( cntl ) );
cntx, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
// Copy/pack A11, x1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntx, cntl_sub_packm_a11( cntl ),
cntx, bli_cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
// x0 = x0 + alpha * A01 * x1;
bli_gemv_int( BLIS_NO_TRANSPOSE,
@@ -95,23 +95,23 @@ void bli_trmv_u_blk_var2( obj_t* alpha,
&BLIS_ONE,
&x0,
cntx,
cntl_sub_gemv_cp( cntl ) );
bli_cntl_sub_gemv_cp( cntl ) );
// x1 = alpha * triu( A11 ) * x1;
bli_trmv_int( alpha,
&a11_pack,
&x1_pack,
cntx,
cntl_sub_trmv( cntl ) );
bli_cntl_sub_trmv( cntl ) );
// Copy/unpack x1 (if x1 was packed).
bli_unpackv_int( &x1_pack, &x1,
cntx, cntl_sub_unpackv_x1( cntl ) );
cntx, bli_cntl_sub_unpackv_x1( cntl ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_packm_release( &a11_pack, cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
}

View File

@@ -47,7 +47,7 @@ struct trsv_s
};
typedef struct trsv_s trsv_t;
// Accessors for the sub_trsv field of a control tree node, given a pointer
// to that node. Both the legacy name and the bli_-prefixed name are provided.
// The argument and the whole expansion are parenthesized so that pointer
// expressions (e.g. base + i) and neighboring operators bind as intended;
// the result is still an lvalue.
#define cntl_sub_trsv( cntl )     ( (cntl)->sub_trsv )
#define bli_cntl_sub_trsv( cntl ) ( (cntl)->sub_trsv )
void bli_trsv_cntl_init( void );
void bli_trsv_cntl_finalize( void );

View File

@@ -115,8 +115,8 @@ void bli_trsv_int( obj_t* alpha,
}
// Extract the variant number and implementation type.
n = cntl_var_num( cntl );
i = cntl_impl_type( cntl );
n = bli_cntl_var_num( cntl );
i = bli_cntl_impl_type( cntl );
// Index into the variant array to extract the correct function pointer.
f = vars[uplo][n][i];

View File

@@ -59,14 +59,14 @@ void bli_trsv_l_blk_var1( obj_t* alpha,
// x = alpha * x;
bli_scalv_int( alpha,
x,
cntx, cntl_sub_scalv( cntl ) );
cntx, bli_cntl_sub_scalv( cntl ) );
// Partition diagonally.
for ( ij = 0; ij < mn; ij += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( ij, mn, a,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A11, A10, x1, and x0.
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
@@ -80,16 +80,16 @@ void bli_trsv_l_blk_var1( obj_t* alpha,
// Initialize objects for packing A11 and x1 (if needed).
bli_packm_init( &a11, &a11_pack,
cntx, cntl_sub_packm_a11( cntl ) );
cntx, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
// Copy/pack A11, x1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntx, cntl_sub_packm_a11( cntl ),
cntx, bli_cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
// x1 = x1 - A10 * x0;
bli_gemv_int( BLIS_NO_TRANSPOSE,
@@ -100,23 +100,23 @@ void bli_trsv_l_blk_var1( obj_t* alpha,
&BLIS_ONE,
&x1_pack,
cntx,
cntl_sub_gemv_rp( cntl ) );
bli_cntl_sub_gemv_rp( cntl ) );
// x1 = x1 / tril( A11 );
bli_trsv_int( &BLIS_ONE,
&a11_pack,
&x1_pack,
cntx,
cntl_sub_trsv( cntl ) );
bli_cntl_sub_trsv( cntl ) );
// Copy/unpack x1 (if x1 was packed).
bli_unpackv_int( &x1_pack, &x1,
cntx, cntl_sub_unpackv_x1( cntl ) );
cntx, bli_cntl_sub_unpackv_x1( cntl ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_packm_release( &a11_pack, cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
}

View File

@@ -59,14 +59,14 @@ void bli_trsv_l_blk_var2( obj_t* alpha,
// x = alpha * x;
bli_scalv_int( alpha,
x,
cntx, cntl_sub_scalv( cntl ) );
cntx, bli_cntl_sub_scalv( cntl ) );
// Partition diagonally.
for ( ij = 0; ij < mn; ij += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( ij, mn, a,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A11, A21, x1, and x2.
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
@@ -80,23 +80,23 @@ void bli_trsv_l_blk_var2( obj_t* alpha,
// Initialize objects for packing A11 and x1 (if needed).
bli_packm_init( &a11, &a11_pack,
cntx, cntl_sub_packm_a11( cntl ) );
cntx, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
// Copy/pack A11, x1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntx, cntl_sub_packm_a11( cntl ),
cntx, bli_cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
// x1 = x1 / tril( A11 );
bli_trsv_int( &BLIS_ONE,
&a11_pack,
&x1_pack,
cntx,
cntl_sub_trsv( cntl ) );
bli_cntl_sub_trsv( cntl ) );
// x2 = x2 - A21 * x1;
bli_gemv_int( BLIS_NO_TRANSPOSE,
@@ -107,16 +107,16 @@ void bli_trsv_l_blk_var2( obj_t* alpha,
&BLIS_ONE,
&x2,
cntx,
cntl_sub_gemv_cp( cntl ) );
bli_cntl_sub_gemv_cp( cntl ) );
// Copy/unpack x1 (if x1 was packed).
bli_unpackv_int( &x1_pack, &x1,
cntx, cntl_sub_unpackv_x1( cntl ) );
cntx, bli_cntl_sub_unpackv_x1( cntl ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_packm_release( &a11_pack, cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
}

View File

@@ -59,14 +59,14 @@ void bli_trsv_u_blk_var1( obj_t* alpha,
// x = alpha * x;
bli_scalv_int( alpha,
x,
cntx, cntl_sub_scalv( cntl ) );
cntx, bli_cntl_sub_scalv( cntl ) );
// Partition diagonally.
for ( ij = 0; ij < mn; ij += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_b( ij, mn, a,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A11, A12, x1, and x2.
bli_acquire_mpart_br2tl( BLIS_SUBPART11,
@@ -80,16 +80,16 @@ void bli_trsv_u_blk_var1( obj_t* alpha,
// Initialize objects for packing A11 and x1 (if needed).
bli_packm_init( &a11, &a11_pack,
cntx, cntl_sub_packm_a11( cntl ) );
cntx, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
// Copy/pack A11, x1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntx, cntl_sub_packm_a11( cntl ),
cntx, bli_cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
// x1 = x1 - A12 * x2;
bli_gemv_int( BLIS_NO_TRANSPOSE,
@@ -100,23 +100,23 @@ void bli_trsv_u_blk_var1( obj_t* alpha,
&BLIS_ONE,
&x1_pack,
cntx,
cntl_sub_gemv_rp( cntl ) );
bli_cntl_sub_gemv_rp( cntl ) );
// x1 = x1 / triu( A11 );
bli_trsv_int( &BLIS_ONE,
&a11_pack,
&x1_pack,
cntx,
cntl_sub_trsv( cntl ) );
bli_cntl_sub_trsv( cntl ) );
// Copy/unpack x1 (if x1 was packed).
bli_unpackv_int( &x1_pack, &x1,
cntx, cntl_sub_unpackv_x1( cntl ) );
cntx, bli_cntl_sub_unpackv_x1( cntl ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_packm_release( &a11_pack, cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
}

View File

@@ -59,14 +59,14 @@ void bli_trsv_u_blk_var2( obj_t* alpha,
// x = alpha * x;
bli_scalv_int( alpha,
x,
cntx, cntl_sub_scalv( cntl ) );
cntx, bli_cntl_sub_scalv( cntl ) );
// Partition diagonally.
for ( ij = 0; ij < mn; ij += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_b( ij, mn, a,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A11, A01, x1, and x0.
bli_acquire_mpart_br2tl( BLIS_SUBPART11,
@@ -80,23 +80,23 @@ void bli_trsv_u_blk_var2( obj_t* alpha,
// Initialize objects for packing A11 and x1 (if needed).
bli_packm_init( &a11, &a11_pack,
cntx, cntl_sub_packm_a11( cntl ) );
cntx, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
// Copy/pack A11, x1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntx, cntl_sub_packm_a11( cntl ),
cntx, bli_cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, cntl_sub_packv_x1( cntl ) );
cntx, bli_cntl_sub_packv_x1( cntl ) );
// x1 = x1 / triu( A11 );
bli_trsv_int( &BLIS_ONE,
&a11_pack,
&x1_pack,
cntx,
cntl_sub_trsv( cntl ) );
bli_cntl_sub_trsv( cntl ) );
// x0 = x0 - A01 * x1;
bli_gemv_int( BLIS_NO_TRANSPOSE,
@@ -107,16 +107,16 @@ void bli_trsv_u_blk_var2( obj_t* alpha,
&BLIS_ONE,
&x0,
cntx,
cntl_sub_gemv_cp( cntl ) );
bli_cntl_sub_gemv_cp( cntl ) );
// Copy/unpack x1 (if x1 was packed).
bli_unpackv_int( &x1_pack, &x1,
cntx, cntl_sub_unpackv_x1( cntl ) );
cntx, bli_cntl_sub_unpackv_x1( cntl ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
bli_packm_release( &a11_pack, cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, cntl_sub_packv_x1( cntl ) );
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
}

View File

@@ -66,3 +66,4 @@
#include "bli_trmm.h"
#include "bli_trmm3.h"
#include "bli_trsm.h"

336
frame/3/bli_l3_thrinfo.c Normal file
View File

@@ -0,0 +1,336 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "assert.h"
// Creates a thrinfo_t node for a level-3 operation.
//
// This is a thin level-3 entry point: all nine arguments are forwarded,
// unchanged and in the same order, to bli_thrinfo_create(), and whatever that
// function returns is returned to the caller.
//
//   ocomm, ocomm_id  - outer communicator and this thread's id within it
//   icomm, icomm_id  - inner communicator and this thread's id within it
//   n_way, work_id   - ways of parallelism for the loop and this thread's
//                      work id
//   opackm, ipackm   - packm thrinfo_t nodes attached to the new node
//   sub_self         - thrinfo_t node for the next level-3 loop (or NULL)
thrinfo_t* bli_l3_thrinfo_create
     (
       thrcomm_t* ocomm,
       dim_t      ocomm_id,
       thrcomm_t* icomm,
       dim_t      icomm_id,
       dim_t      n_way,
       dim_t      work_id,
       thrinfo_t* opackm,
       thrinfo_t* ipackm,
       thrinfo_t* sub_self
     )
{
	thrinfo_t* thread = bli_thrinfo_create( ocomm, ocomm_id,
	                                        icomm, icomm_id,
	                                        n_way,
	                                        work_id,
	                                        opackm,
	                                        ipackm,
	                                        sub_self );

	return thread;
}
// Initializes an existing thrinfo_t node for a level-3 operation.
//
// The node pointed to by 'thread' is supplied by the caller. This function
// forwards 'thread' and the remaining nine arguments, unchanged and in the
// same order, to bli_thrinfo_init() and returns nothing.
void bli_l3_thrinfo_init
     (
       thrinfo_t* thread,
       thrcomm_t* ocomm,
       dim_t      ocomm_id,
       thrcomm_t* icomm,
       dim_t      icomm_id,
       dim_t      n_way,
       dim_t      work_id,
       thrinfo_t* opackm,
       thrinfo_t* ipackm,
       thrinfo_t* sub_self
     )
{
	bli_thrinfo_init( thread,
	                  ocomm, ocomm_id,
	                  icomm, icomm_id,
	                  n_way, work_id,
	                  opackm, ipackm,
	                  sub_self );
}
// Initializes 'thread' via the general-purpose bli_thrinfo_init_single().
// No level-3-specific work is done here; the pointer is passed straight
// through.
void bli_l3_thrinfo_init_single( thrinfo_t* thread )
{
	bli_thrinfo_init_single( thread );
}
// Recursively frees a level-3 thrinfo_t node and everything hanging off it.
//
// Calling this with NULL, or with the address of one of the global
// BLIS_GEMM_SINGLE_THREADED / BLIS_HERK_SINGLE_THREADED objects, is a no-op.
// Otherwise the communicators, the two packm nodes, the sub_self chain and
// finally the node itself are released, in that order.
void bli_l3_thrinfo_free
     (
       thrinfo_t* thread
     )
{
	// Nothing to do for an absent node.
	if ( thread == NULL ) return;

	// The global single-threaded nodes are never released here.
	if ( thread == &BLIS_GEMM_SINGLE_THREADED ) return;
	if ( thread == &BLIS_HERK_SINGLE_THREADED ) return;

	// Free communicators. The outer communicator is released only by the
	// thread for which bli_thread_am_ochief() holds.
	if ( bli_thread_am_ochief( thread ) )
	{
		bli_thrcomm_free( thread->ocomm );
	}

	// The inner communicator is released only at the last node of the chain
	// (no sub_self), and only by the thread for which bli_thread_am_ichief()
	// holds.
	if ( bli_thrinfo_sub_self( thread ) == NULL )
	{
		if ( bli_thread_am_ichief( thread ) )
		{
			bli_thrcomm_free( thread->icomm );
		}
	}

	// Free thrinfo children: both packm nodes, then the rest of the chain.
	bli_packm_thrinfo_free( thread->opackm );
	bli_packm_thrinfo_free( thread->ipackm );
	bli_l3_thrinfo_free( thread->sub_self );

	// Finally release the node itself.
	bli_free_intl( thread );
}
// -----------------------------------------------------------------------------
// Builds the array of per-thread thrinfo_t "paths" for a level-3 operation.
//
// For each of the five loops (jc, kc, ic, jr, ir) a number of ways of
// parallelism is chosen, based on the environment (when multithreading is
// enabled), the operation id 'l3_op' and, for trmm/trsm, 'side'. One chain of
// thrinfo_t nodes (jc -> kc -> ic -> jr -> ir) is then created per thread.
//
// Returns an array, obtained from bli_malloc_intl(), holding
// jc_way * kc_way * ic_way * jr_way * ir_way pointers, each pointing at the
// jc-level node of one thread's chain.
thrinfo_t** bli_l3_thrinfo_create_paths
     (
       opid_t l3_op,
       side_t side
     )
{
	// Requested ways of parallelism per loop. Everything defaults to one way;
	// with multithreading enabled the jc, ic, jr and ir values are read from
	// the environment. BLIS_KC_NT is not consulted: the kc loop always gets
	// one way.
	dim_t jc_in = 1;
	dim_t kc_in = 1;
	dim_t ic_in = 1;
	dim_t jr_in = 1;
	dim_t ir_in = 1;

#ifdef BLIS_ENABLE_MULTITHREADING
	jc_in = bli_env_read_nway( "BLIS_JC_NT" );
	ic_in = bli_env_read_nway( "BLIS_IC_NT" );
	jr_in = bli_env_read_nway( "BLIS_JR_NT" );
	ir_in = bli_env_read_nway( "BLIS_IR_NT" );
#endif

	// Ways actually used. Start from the requested values, which is what
	// trmm_l and all operations other than trmm/trsm use as-is.
	dim_t jc_way = jc_in;
	dim_t kc_way = kc_in;
	dim_t ic_way = ic_in;
	dim_t jr_way = jr_in;
	dim_t ir_way = ir_in;

	if ( l3_op == BLIS_TRMM )
	{
		// We reconfigure the parallelism for trmm_r due to a dependency in
		// the jc loop: the jc ways are folded into the jr loop instead.
		// (NOTE: This dependency does not exist for trmm3.)
		if ( bli_is_right( side ) )
		{
			jc_way = 1;
			jr_way = jr_in * jc_in;
		}
	}
	else if ( l3_op == BLIS_TRSM )
	{
		// trsm parallelizes a single loop; every other loop gets one way.
		jc_way = 1;
		kc_way = 1;
		ic_way = 1;
		jr_way = 1;
		ir_way = 1;

		// NOTE(review): the right-side product omits ir_in while the
		// left-side product omits jc_in -- confirm this is intended.
		if ( bli_is_right( side ) )
		{
			ic_way = jc_in * ic_in * jr_in;
		}
		else // if ( bli_is_left( side ) )
		{
			jr_way = ic_in * jr_in * ir_in;
		}
	}

	// Total number of threads (and therefore of paths).
	dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
	assert( global_num_threads != 0 );

	// Number of threads that share one communicator at each loop level.
	dim_t ir_nt = 1;
	dim_t jr_nt = ir_way;
	dim_t ic_nt = jr_way * ir_way;
	dim_t kc_nt = ic_way * jr_way * ir_way;
	dim_t jc_nt = kc_way * ic_way * jr_way * ir_way;

	thrinfo_t** paths       = bli_malloc_intl( global_num_threads * sizeof( thrinfo_t* ) );
	thrcomm_t*  global_comm = bli_thrcomm_create( global_num_threads );

	// One communicator is created per iteration at every nesting level; the
	// innermost body runs exactly once per thread.
	for ( int jc_idx = 0; jc_idx < jc_way; jc_idx++ )
	{
		thrcomm_t* jc_comm = bli_thrcomm_create( jc_nt );

		for ( int kc_idx = 0; kc_idx < kc_way; kc_idx++ )
		{
			thrcomm_t* kc_comm = bli_thrcomm_create( kc_nt );

			for ( int ic_idx = 0; ic_idx < ic_way; ic_idx++ )
			{
				thrcomm_t* ic_comm = bli_thrcomm_create( ic_nt );

				for ( int jr_idx = 0; jr_idx < jr_way; jr_idx++ )
				{
					thrcomm_t* jr_comm = bli_thrcomm_create( jr_nt );

					for ( int ir_idx = 0; ir_idx < ir_way; ir_idx++ )
					{
						thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt );

						// This thread's id within each enclosing
						// communicator, built up from the innermost level
						// outwards.
						dim_t ir_comm_id     = 0;
						dim_t jr_comm_id     = ir_idx*ir_nt + ir_comm_id;
						dim_t ic_comm_id     = jr_idx*jr_nt + jr_comm_id;
						dim_t kc_comm_id     = ic_idx*ic_nt + ic_comm_id;
						dim_t jc_comm_id     = kc_idx*kc_nt + kc_comm_id;
						dim_t global_comm_id = jc_idx*jc_nt + jc_comm_id;

						// Macrokernel loops
						thrinfo_t* ir_info =
						    bli_l3_thrinfo_create( jr_comm, jr_comm_id,
						                           ir_comm, ir_comm_id,
						                           ir_way, ir_idx,
						                           NULL, NULL, NULL );

						thrinfo_t* jr_info =
						    bli_l3_thrinfo_create( ic_comm, ic_comm_id,
						                           jr_comm, jr_comm_id,
						                           jr_way, jr_idx,
						                           NULL, NULL, ir_info );

						//blk_var_1
						thrinfo_t* pack_ic_in =
						    bli_packm_thrinfo_create( ic_comm, ic_comm_id,
						                              jr_comm, jr_comm_id,
						                              ic_nt, ic_comm_id );

						thrinfo_t* pack_ic_out =
						    bli_packm_thrinfo_create( kc_comm, kc_comm_id,
						                              ic_comm, ic_comm_id,
						                              kc_nt, kc_comm_id );

						thrinfo_t* ic_info =
						    bli_l3_thrinfo_create( kc_comm, kc_comm_id,
						                           ic_comm, ic_comm_id,
						                           ic_way, ic_idx,
						                           pack_ic_out, pack_ic_in, jr_info );

						//blk_var_3
						thrinfo_t* pack_kc_in =
						    bli_packm_thrinfo_create( kc_comm, kc_comm_id,
						                              ic_comm, ic_comm_id,
						                              kc_nt, kc_comm_id );

						// NOTE(review): jc_comm is passed as both the outer
						// and the inner communicator here -- confirm this is
						// intended.
						thrinfo_t* pack_kc_out =
						    bli_packm_thrinfo_create( jc_comm, jc_comm_id,
						                              jc_comm, jc_comm_id,
						                              jc_nt, jc_comm_id );

						thrinfo_t* kc_info =
						    bli_l3_thrinfo_create( jc_comm, jc_comm_id,
						                           kc_comm, kc_comm_id,
						                           kc_way, kc_idx,
						                           pack_kc_out, pack_kc_in, ic_info );

						//blk_var_2
						thrinfo_t* pack_jc_in =
						    bli_packm_thrinfo_create( jc_comm, jc_comm_id,
						                              kc_comm, kc_comm_id,
						                              jc_nt, jc_comm_id );

						thrinfo_t* pack_jc_out =
						    bli_packm_thrinfo_create( global_comm, global_comm_id,
						                              jc_comm, jc_comm_id,
						                              global_num_threads, global_comm_id );

						thrinfo_t* jc_info =
						    bli_l3_thrinfo_create( global_comm, global_comm_id,
						                           jc_comm, jc_comm_id,
						                           jc_way, jc_idx,
						                           pack_jc_out, pack_jc_in, kc_info );

						// The jc-level node is the entry point of this
						// thread's path.
						paths[global_comm_id] = jc_info;
					}
				}
			}
		}
	}

	return paths;
}
// Releases an array of level-3 thrinfo_t paths.
//
// Each of the first 'num' entries of 'threads' is handed to
// bli_l3_thrinfo_free(), after which the array itself is released with
// bli_free_intl().
void bli_l3_thrinfo_free_paths
     (
       thrinfo_t** threads,
       dim_t       num
     )
{
	for ( dim_t path = 0; path < num; path++ )
	{
		bli_l3_thrinfo_free( threads[path] );
	}

	bli_free_intl( threads );
}

View File

@@ -32,48 +32,83 @@
*/
//
// thrinfo_t macros specific to various level-3 operations.
//
struct trmm_thrinfo_s //implements thrinfo_t
{
thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level
dim_t ocomm_id; //Our thread id within that thread comm
thread_comm_t* icomm; //The thread communicator for the other threads sharing the same work at this level
dim_t icomm_id; //Our thread id within that thread comm
// gemm
dim_t n_way; //Number of distinct caucuses used to parallelize the loop
dim_t work_id; //What we're working on
#define gemm_get_next_a_micropanel( thread, a1, step ) ( a1 + step * thread->n_way )
#define gemm_get_next_b_micropanel( thread, b1, step ) ( b1 + step * thread->n_way )
packm_thrinfo_t* opackm;
packm_thrinfo_t* ipackm;
struct trmm_thrinfo_s* sub_trmm;
};
typedef struct trmm_thrinfo_s trmm_thrinfo_t;
// herk
#define trmm_thread_sub_trmm( thread ) thread->sub_trmm
#define trmm_thread_sub_opackm( thread ) thread->opackm
#define trmm_thread_sub_ipackm( thread ) thread->ipackm
#define herk_get_next_a_micropanel( thread, a1, step ) ( a1 + step * thread->n_way )
#define herk_get_next_b_micropanel( thread, b1, step ) ( b1 + step * thread->n_way )
// trmm
#define trmm_r_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
#define trmm_r_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
#define trmm_l_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
#define trmm_l_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
trmm_thrinfo_t** bli_create_trmm_thrinfo_paths( bool_t jc_dependency );
void bli_trmm_thrinfo_free_paths( trmm_thrinfo_t** info, dim_t n_threads );
// trsm
void bli_setup_trmm_thrinfo_node( trmm_thrinfo_t* thread,
thread_comm_t* ocomm, dim_t ocomm_id,
thread_comm_t* icomm, dim_t icomm_id,
dim_t n_way, dim_t work_id,
packm_thrinfo_t* opackm,
packm_thrinfo_t* ipackm,
trmm_thrinfo_t* sub_trmm );
#define trsm_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
trmm_thrinfo_t* bli_create_trmm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
thread_comm_t* icomm, dim_t icomm_id,
dim_t n_way, dim_t work_id,
packm_thrinfo_t* opackm,
packm_thrinfo_t* ipackm,
trmm_thrinfo_t* sub_trmm );
//
// thrinfo_t APIs specific to level-3 operations.
//
thrinfo_t* bli_l3_thrinfo_create
(
thrcomm_t* ocomm,
dim_t ocomm_id,
thrcomm_t* icomm,
dim_t icomm_id,
dim_t n_way,
dim_t work_id,
thrinfo_t* opackm,
thrinfo_t* ipackm,
thrinfo_t* sub_self
);
void bli_l3_thrinfo_init
(
thrinfo_t* thread,
thrcomm_t* ocomm,
dim_t ocomm_id,
thrcomm_t* icomm,
dim_t icomm_id,
dim_t n_way,
dim_t work_id,
thrinfo_t* opackm,
thrinfo_t* ipackm,
thrinfo_t* sub_self
);
void bli_l3_thrinfo_init_single
(
thrinfo_t* thread
);
void bli_l3_thrinfo_free
(
thrinfo_t* thread
);
// -----------------------------------------------------------------------------
thrinfo_t** bli_l3_thrinfo_create_paths
(
opid_t l3_op,
side_t side
);
void bli_l3_thrinfo_free_paths
(
thrinfo_t** threads,
dim_t num
);
void bli_setup_trmm_single_threaded_info( trmm_thrinfo_t* thread );

View File

@@ -39,7 +39,7 @@ void bli_gemm_blk_var1f( obj_t* a,
obj_t* c,
cntx_t* cntx,
gemm_t* cntl,
gemm_thrinfo_t* thread )
thrinfo_t* thread )
{
//The s is for "lives on the stack"
obj_t b_pack_s;
@@ -53,36 +53,36 @@ void bli_gemm_blk_var1f( obj_t* a,
dim_t i;
dim_t b_alg;
if( thread_am_ochief( thread ) ) {
if( bli_thread_am_ochief( thread ) ) {
// Initialize object for packing B.
bli_obj_init_pack( &b_pack_s );
bli_packm_init( b, &b_pack_s,
cntx, cntl_sub_packm_b( cntl ) );
cntx, bli_cntl_sub_packm_b( cntl ) );
// Scale C by beta (if instructed).
// Since scalm doesn't support multithreading yet, must be done by chief thread (ew)
bli_scalm_int( &BLIS_ONE,
c,
cntx, cntl_sub_scalm( cntl ) );
cntx, bli_cntl_sub_scalm( cntl ) );
}
b_pack = thread_obroadcast( thread, &b_pack_s );
b_pack = bli_thread_obroadcast( thread, &b_pack_s );
// Initialize objects passed into bli_packm_init for A and C
if( thread_am_ichief( thread ) ) {
if( bli_thread_am_ichief( thread ) ) {
bli_obj_init_pack( &a1_pack_s );
bli_obj_init_pack( &c1_pack_s );
}
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s );
c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s );
// Pack B (if instructed).
bli_packm_int( b, b_pack,
cntx, cntl_sub_packm_b( cntl ),
gemm_thread_sub_opackm( thread ) );
cntx, bli_cntl_sub_packm_b( cntl ),
bli_thrinfo_sub_opackm( thread ) );
dim_t my_start, my_end;
bli_get_range_t2b( thread, a,
bli_cntx_get_bmult( cntl_bszid( cntl ), cntx ),
bli_thread_get_range_t2b( thread, a,
bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ),
&my_start, &my_end );
// Partition along the m dimension.
@@ -93,7 +93,7 @@ void bli_gemm_blk_var1f( obj_t* a,
// This causes the right blocksize to be used if c and a are
// complex and b is real.
b_alg = bli_determine_blocksize_f( i, my_end, a,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A1 and C1.
bli_acquire_mpart_t2b( BLIS_SUBPART1,
@@ -102,23 +102,23 @@ void bli_gemm_blk_var1f( obj_t* a,
i, b_alg, c, &c1 );
// Initialize objects for packing A1 and C1.
if( thread_am_ichief( thread ) ) {
if( bli_thread_am_ichief( thread ) ) {
bli_packm_init( &a1, a1_pack,
cntx, cntl_sub_packm_a( cntl ) );
cntx, bli_cntl_sub_packm_a( cntl ) );
bli_packm_init( &c1, c1_pack,
cntx, cntl_sub_packm_c( cntl ) );
cntx, bli_cntl_sub_packm_c( cntl ) );
}
thread_ibarrier( thread );
bli_thread_ibarrier( thread );
// Pack A1 (if instructed).
bli_packm_int( &a1, a1_pack,
cntx, cntl_sub_packm_a( cntl ),
gemm_thread_sub_ipackm( thread ) );
cntx, bli_cntl_sub_packm_a( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
// Pack C1 (if instructed).
bli_packm_int( &c1, c1_pack,
cntx, cntl_sub_packm_c( cntl ),
gemm_thread_sub_ipackm( thread ) );
cntx, bli_cntl_sub_packm_c( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
// Perform gemm subproblem.
bli_gemm_int( &BLIS_ONE,
@@ -127,26 +127,26 @@ void bli_gemm_blk_var1f( obj_t* a,
&BLIS_ONE,
c1_pack,
cntx,
cntl_sub_gemm( cntl ),
gemm_thread_sub_gemm( thread ) );
bli_cntl_sub_gemm( cntl ),
bli_thrinfo_sub_self( thread ) );
thread_ibarrier( thread );
bli_thread_ibarrier( thread );
// Unpack C1 (if C1 was packed).
// Currently must be done by 1 thread
bli_unpackm_int( c1_pack, &c1,
cntx, cntl_sub_unpackm_c( cntl ),
gemm_thread_sub_ipackm( thread ) );
cntx, bli_cntl_sub_unpackm_c( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
thread_obarrier( thread );
if( thread_am_ochief( thread ) )
bli_packm_release( b_pack, cntl_sub_packm_b( cntl ) );
if( thread_am_ichief( thread ) ){
bli_packm_release( a1_pack, cntl_sub_packm_a( cntl ) );
bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) );
bli_thread_obarrier( thread );
if( bli_thread_am_ochief( thread ) )
bli_packm_release( b_pack, bli_cntl_sub_packm_b( cntl ) );
if( bli_thread_am_ichief( thread ) ){
bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) );
bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) );
}
}

View File

@@ -39,7 +39,7 @@ void bli_gemm_blk_var2f( obj_t* a,
obj_t* c,
cntx_t* cntx,
gemm_t* cntl,
gemm_thrinfo_t* thread )
thrinfo_t* thread )
{
obj_t a_pack_s;
obj_t b1_pack_s, c1_pack_s;
@@ -53,35 +53,35 @@ void bli_gemm_blk_var2f( obj_t* a,
dim_t b_alg;
if( thread_am_ochief( thread ) ) {
if( bli_thread_am_ochief( thread ) ) {
// Initialize object for packing A
bli_obj_init_pack( &a_pack_s );
bli_packm_init( a, &a_pack_s,
cntx, cntl_sub_packm_a( cntl ) );
cntx, bli_cntl_sub_packm_a( cntl ) );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntx, cntl_sub_scalm( cntl ) );
cntx, bli_cntl_sub_scalm( cntl ) );
}
a_pack = thread_obroadcast( thread, &a_pack_s );
a_pack = bli_thread_obroadcast( thread, &a_pack_s );
// Initialize pack objects for B and C that are passed into packm_init().
if( thread_am_ichief( thread ) ) {
if( bli_thread_am_ichief( thread ) ) {
bli_obj_init_pack( &b1_pack_s );
bli_obj_init_pack( &c1_pack_s );
}
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s );
c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s );
// Pack A (if instructed).
bli_packm_int( a, a_pack,
cntx, cntl_sub_packm_a( cntl ),
gemm_thread_sub_opackm( thread ) );
cntx, bli_cntl_sub_packm_a( cntl ),
bli_thrinfo_sub_opackm( thread ) );
dim_t my_start, my_end;
bli_get_range_l2r( thread, b,
bli_cntx_get_bmult( cntl_bszid( cntl ), cntx ),
bli_thread_get_range_l2r( thread, b,
bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ),
&my_start, &my_end );
// Partition along the n dimension.
@@ -92,7 +92,7 @@ void bli_gemm_blk_var2f( obj_t* a,
// This causes the right blocksize to be used if c and a are
// complex and b is real.
b_alg = bli_determine_blocksize_f( i, my_end, b,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for B1 and C1.
bli_acquire_mpart_l2r( BLIS_SUBPART1,
@@ -101,23 +101,23 @@ void bli_gemm_blk_var2f( obj_t* a,
i, b_alg, c, &c1 );
// Initialize objects for packing B1 and C1.
if( thread_am_ichief( thread ) ) {
if( bli_thread_am_ichief( thread ) ) {
bli_packm_init( &b1, b1_pack,
cntx, cntl_sub_packm_b( cntl ) );
cntx, bli_cntl_sub_packm_b( cntl ) );
bli_packm_init( &c1, c1_pack,
cntx, cntl_sub_packm_c( cntl ) );
cntx, bli_cntl_sub_packm_c( cntl ) );
}
thread_ibarrier( thread );
bli_thread_ibarrier( thread );
// Pack B1 (if instructed).
bli_packm_int( &b1, b1_pack,
cntx, cntl_sub_packm_b( cntl ),
gemm_thread_sub_ipackm( thread ) );
cntx, bli_cntl_sub_packm_b( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
// Pack C1 (if instructed).
bli_packm_int( &c1, c1_pack,
cntx, cntl_sub_packm_c( cntl ),
gemm_thread_sub_ipackm( thread ) );
cntx, bli_cntl_sub_packm_c( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
// Perform gemm subproblem.
bli_gemm_int( &BLIS_ONE,
@@ -126,26 +126,26 @@ void bli_gemm_blk_var2f( obj_t* a,
&BLIS_ONE,
c1_pack,
cntx,
cntl_sub_gemm( cntl ),
gemm_thread_sub_gemm( thread ) );
bli_cntl_sub_gemm( cntl ),
bli_thrinfo_sub_self( thread ) );
thread_ibarrier( thread );
bli_thread_ibarrier( thread );
// Unpack C1 (if C1 was packed).
// Currently must be done by 1 thread
bli_unpackm_int( c1_pack, &c1,
cntx, cntl_sub_unpackm_c( cntl ),
gemm_thread_sub_ipackm( thread ) );
cntx, bli_cntl_sub_unpackm_c( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
thread_obarrier( thread );
if( thread_am_ochief( thread ) )
bli_packm_release( a_pack, cntl_sub_packm_a( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_release( b1_pack, cntl_sub_packm_b( cntl ) );
bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) );
bli_thread_obarrier( thread );
if( bli_thread_am_ochief( thread ) )
bli_packm_release( a_pack, bli_cntl_sub_packm_a( cntl ) );
if( bli_thread_am_ichief( thread ) ) {
bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) );
bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) );
}
}

View File

@@ -39,7 +39,7 @@ void bli_gemm_blk_var3f( obj_t* a,
obj_t* c,
cntx_t* cntx,
gemm_t* cntl,
gemm_thrinfo_t* thread )
thrinfo_t* thread )
{
obj_t c_pack_s;
obj_t a1_pack_s, b1_pack_s;
@@ -53,31 +53,31 @@ void bli_gemm_blk_var3f( obj_t* a,
dim_t b_alg;
dim_t k_trans;
if( thread_am_ochief( thread ) ){
if( bli_thread_am_ochief( thread ) ){
// Initialize object for packing C
bli_obj_init_pack( &c_pack_s );
bli_packm_init( c, &c_pack_s,
cntx, cntl_sub_packm_c( cntl ) );
cntx, bli_cntl_sub_packm_c( cntl ) );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntx, cntl_sub_scalm( cntl ) );
cntx, bli_cntl_sub_scalm( cntl ) );
}
c_pack = thread_obroadcast( thread, &c_pack_s );
c_pack = bli_thread_obroadcast( thread, &c_pack_s );
// Initialize pack objects for A and B that are passed into packm_init().
if( thread_am_ichief( thread ) ){
if( bli_thread_am_ichief( thread ) ){
bli_obj_init_pack( &a1_pack_s );
bli_obj_init_pack( &b1_pack_s );
}
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s );
b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s );
// Pack C (if instructed).
bli_packm_int( c, c_pack,
cntx, cntl_sub_packm_c( cntl ),
gemm_thread_sub_opackm( thread ) );
cntx, bli_cntl_sub_packm_c( cntl ),
bli_thrinfo_sub_opackm( thread ) );
// Query dimension in partitioning direction.
k_trans = bli_obj_width_after_trans( *a );
@@ -90,7 +90,7 @@ void bli_gemm_blk_var3f( obj_t* a,
// the kc blocksize so that we can implement the "nudging" of kc
// to be a multiple of mr or nr, as needed.
b_alg = bli_gemm_determine_kc_f( i, k_trans, a, b,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A1 and B1.
bli_acquire_mpart_l2r( BLIS_SUBPART1,
@@ -99,23 +99,23 @@ void bli_gemm_blk_var3f( obj_t* a,
i, b_alg, b, &b1 );
// Initialize objects for packing A1 and B1.
if( thread_am_ichief( thread ) ) {
if( bli_thread_am_ichief( thread ) ) {
bli_packm_init( &a1, a1_pack,
cntx, cntl_sub_packm_a( cntl ) );
cntx, bli_cntl_sub_packm_a( cntl ) );
bli_packm_init( &b1, b1_pack,
cntx, cntl_sub_packm_b( cntl ) );
cntx, bli_cntl_sub_packm_b( cntl ) );
}
thread_ibarrier( thread );
bli_thread_ibarrier( thread );
// Pack A1 (if instructed).
bli_packm_int( &a1, a1_pack,
cntx, cntl_sub_packm_a( cntl ),
gemm_thread_sub_ipackm( thread ) );
cntx, bli_cntl_sub_packm_a( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
// Pack B1 (if instructed).
bli_packm_int( &b1, b1_pack,
cntx, cntl_sub_packm_b( cntl ),
gemm_thread_sub_ipackm( thread ) );
cntx, bli_cntl_sub_packm_b( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
// Perform gemm subproblem.
bli_gemm_int( &BLIS_ONE,
@@ -124,8 +124,8 @@ void bli_gemm_blk_var3f( obj_t* a,
&BLIS_ONE,
c_pack,
cntx,
cntl_sub_gemm( cntl ),
gemm_thread_sub_gemm( thread) );
bli_cntl_sub_gemm( cntl ),
bli_thrinfo_sub_self( thread) );
// This variant executes multiple rank-k updates. Therefore, if the
// internal beta scalar on matrix C is non-zero, we must use it
@@ -133,25 +133,25 @@ void bli_gemm_blk_var3f( obj_t* a,
// And since c_pack is a local obj_t, we can simply overwrite the
// internal beta scalar with BLIS_ONE once it has been used in the
// first iteration.
thread_ibarrier( thread );
if ( i == 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack );
bli_thread_ibarrier( thread );
if ( i == 0 && bli_thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack );
}
thread_obarrier( thread );
bli_thread_obarrier( thread );
// Unpack C (if C was packed).
bli_unpackm_int( c_pack, c,
cntx, cntl_sub_unpackm_c( cntl ),
gemm_thread_sub_opackm( thread ) );
cntx, bli_cntl_sub_unpackm_c( cntl ),
bli_thrinfo_sub_opackm( thread ) );
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
if( thread_am_ochief( thread ) )
bli_packm_release( c_pack, cntl_sub_packm_c( cntl ) );
if( thread_am_ichief( thread ) ){
bli_packm_release( a1_pack, cntl_sub_packm_a( cntl ) );
bli_packm_release( b1_pack, cntl_sub_packm_b( cntl ) );
if( bli_thread_am_ochief( thread ) )
bli_packm_release( c_pack, bli_cntl_sub_packm_c( cntl ) );
if( bli_thread_am_ichief( thread ) ){
bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) );
bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) );
}
}

View File

@@ -46,7 +46,7 @@ struct gemm_s
};
typedef struct gemm_s gemm_t;
#define cntl_sub_gemm( cntl ) cntl->sub_gemm
#define bli_cntl_sub_gemm( cntl ) cntl->sub_gemm
void bli_gemm_cntl_init( void );
void bli_gemm_cntl_finalize( void );

View File

@@ -79,11 +79,11 @@ void bli_gemm_front( obj_t* alpha,
bli_obj_induce_trans( c_local );
}
gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_GEMM, BLIS_LEFT );
dim_t n_threads = bli_thread_num_threads( infos[0] );
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
bli_l3_thread_decorator( n_threads,
(l3_int_t) bli_gemm_int,
alpha,
&a_local,
@@ -94,7 +94,7 @@ void bli_gemm_front( obj_t* alpha,
(void*) cntl,
(void**) infos );
bli_gemm_thrinfo_free_paths( infos, n_threads );
bli_l3_thrinfo_free_paths( infos, n_threads );
}

View File

@@ -41,7 +41,7 @@ typedef void (*FUNCPTR_T)( obj_t* a,
obj_t* c,
cntx_t* cntx,
gemm_t* cntl,
gemm_thrinfo_t* thread );
thrinfo_t* thread );
static FUNCPTR_T vars[6][3] =
{
@@ -61,7 +61,7 @@ void bli_gemm_int( obj_t* alpha,
obj_t* c,
cntx_t* cntx,
gemm_t* cntl,
gemm_thrinfo_t* thread )
thrinfo_t* thread )
{
obj_t a_local;
obj_t b_local;
@@ -82,9 +82,9 @@ void bli_gemm_int( obj_t* alpha,
if ( bli_obj_has_zero_dim( *a ) ||
bli_obj_has_zero_dim( *b ) )
{
if( thread_am_ochief( thread ) )
if( bli_thread_am_ochief( thread ) )
bli_scalm( beta, c );
thread_obarrier( thread );
bli_thread_obarrier( thread );
return;
}
@@ -93,9 +93,9 @@ void bli_gemm_int( obj_t* alpha,
if ( bli_obj_is_zeros( *a ) ||
bli_obj_is_zeros( *b ) )
{
if( thread_am_ochief( thread ) )
if( bli_thread_am_ochief( thread ) )
bli_scalm( beta, c );
thread_obarrier( thread );
bli_thread_obarrier( thread );
return;
}
@@ -111,9 +111,9 @@ void bli_gemm_int( obj_t* alpha,
// strides and dimensions. Note that this transposition would normally
// be handled explicitly in the packing of C, but if C is not being
// packed, this is our last chance to handle the transposition.
if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
{
//if( thread_am_ochief( thread ) ) {
//if( bli_thread_am_ochief( thread ) ) {
bli_obj_induce_trans( c_local );
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
// }
@@ -134,8 +134,8 @@ void bli_gemm_int( obj_t* alpha,
}
// Extract the variant number and implementation type.
n = cntl_var_num( cntl );
i = cntl_impl_type( cntl );
n = bli_cntl_var_num( cntl );
i = bli_cntl_impl_type( cntl );
// Index into the variant array to extract the correct function pointer.
f = vars[n][i];

View File

@@ -39,5 +39,5 @@ void bli_gemm_int( obj_t* alpha,
obj_t* c,
cntx_t* cntx,
gemm_t* cntl,
gemm_thrinfo_t* thread );
thrinfo_t* thread );

View File

@@ -50,7 +50,7 @@ typedef void (*FUNCPTR_T)(
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
gemm_thrinfo_t* thread
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2);
@@ -61,7 +61,7 @@ void bli_gemm_ker_var2( obj_t* a,
obj_t* c,
cntx_t* cntx,
gemm_t* cntl,
gemm_thrinfo_t* thread )
thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );
@@ -146,7 +146,7 @@ void PASTEMAC(ch,varname) \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
gemm_thrinfo_t* thread \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
@@ -236,11 +236,11 @@ void PASTEMAC(ch,varname) \
bli_auxinfo_set_is_a( is_a, aux ); \
bli_auxinfo_set_is_b( is_b, aux ); \
\
gemm_thrinfo_t* caucus = gemm_thread_sub_gemm( thread ); \
dim_t jr_num_threads = thread_n_way( thread ); \
dim_t jr_thread_id = thread_work_id( thread ); \
dim_t ir_num_threads = thread_n_way( caucus ); \
dim_t ir_thread_id = thread_work_id( caucus ); \
thrinfo_t* caucus = bli_thrinfo_sub_self( thread ); \
dim_t jr_num_threads = bli_thread_n_way( thread ); \
dim_t jr_thread_id = bli_thread_work_id( thread ); \
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \

View File

@@ -1,78 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Per-thread bookkeeping node for one parallelized loop of gemm.
// Nodes form a singly linked chain through sub_gemm, and each node also
// points at two packm thread-info objects (opackm and ipackm).
// NOTE(review): ocomm and icomm originally carried the same comment. Judging
// by how the path-creation code in this commit passes arguments, ocomm looks
// like the communicator of the enclosing loop level and icomm the one for this
// level -- confirm before relying on it.
struct gemm_thrinfo_s //implements thrinfo_t
{
thread_comm_t* ocomm; //"Outer" thread communicator (presumably shared with the enclosing level -- see note above)
dim_t ocomm_id; //Our thread id within ocomm
thread_comm_t* icomm; //"Inner" thread communicator (presumably shared by the threads of this level -- see note above)
dim_t icomm_id; //Our thread id within icomm
dim_t n_way; //Number of distinct caucuses used to parallelize the loop
dim_t work_id; //What we're working on
packm_thrinfo_t* opackm; //packm thread info returned by gemm_thread_sub_opackm()
packm_thrinfo_t* ipackm; //packm thread info returned by gemm_thread_sub_ipackm()
struct gemm_thrinfo_s* sub_gemm; //Next node in the chain, returned by gemm_thread_sub_gemm()
};
typedef struct gemm_thrinfo_s gemm_thrinfo_t;
// Accessors for the child objects stored in a gemm_thrinfo_t.
// The argument and the whole expansion are parenthesized so that any
// pointer-valued expression (e.g. &info or infos[i]) may be passed and the
// result may be used inside a larger expression without precedence surprises.
#define gemm_thread_sub_gemm( thread )   ( (thread)->sub_gemm )
#define gemm_thread_sub_opackm( thread ) ( (thread)->opackm )
#define gemm_thread_sub_ipackm( thread ) ( (thread)->ipackm )

// For use in gemm micro-kernel
// Evaluate to a1 (or b1) advanced by step * thread->n_way. Every macro
// argument is parenthesized so that compound arguments such as "k + 1" are
// multiplied as a whole.
#define gemm_get_next_a_micropanel( thread, a1, step ) ( (a1) + (step) * (thread)->n_way )
#define gemm_get_next_b_micropanel( thread, b1, step ) ( (b1) + (step) * (thread)->n_way )
// Create the array of per-thread gemm_thrinfo_t pointers used by the level-3
// front-ends; callers read the thread count from element 0 and later hand the
// array to bli_gemm_thrinfo_free_paths(). Takes no arguments, which is now
// stated explicitly with (void) so that stray arguments are diagnosed.
gemm_thrinfo_t** bli_create_gemm_thrinfo_paths( void );

// Release an array obtained from bli_create_gemm_thrinfo_paths().
// n_threads is the number of entries in the array.
void bli_gemm_thrinfo_free_paths( gemm_thrinfo_t** threads, dim_t n_threads );

// Initialize an existing node from the given communicators, ids, packm
// thread infos and child node.
// NOTE(review): the definition is not visible here; presumably each argument
// is stored in the struct field of the same name -- confirm.
void bli_setup_gemm_thrinfo_node( gemm_thrinfo_t* thread,
                                  thread_comm_t* ocomm, dim_t ocomm_id,
                                  thread_comm_t* icomm, dim_t icomm_id,
                                  dim_t n_way, dim_t work_id,
                                  packm_thrinfo_t* opackm,
                                  packm_thrinfo_t* ipackm,
                                  gemm_thrinfo_t* sub_gemm );

// Same arguments as bli_setup_gemm_thrinfo_node() minus the node itself;
// returns a node pointer.
// NOTE(review): presumably the node is freshly allocated -- confirm against
// the definition.
gemm_thrinfo_t* bli_create_gemm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
                                              thread_comm_t* icomm, dim_t icomm_id,
                                              dim_t n_way, dim_t work_id,
                                              packm_thrinfo_t* opackm,
                                              packm_thrinfo_t* ipackm,
                                              gemm_thrinfo_t* sub_gemm );

// Initialize an existing node for single-threaded execution.
// NOTE(review): definition not visible here -- confirm the exact field values.
void bli_setup_gemm_single_threaded_info( gemm_thrinfo_t* thread );

View File

@@ -47,7 +47,7 @@ void PASTEMAC0(opname) \
obj_t* c, \
cntx_t* cntx, \
gemm_t* cntl, \
gemm_thrinfo_t* thread \
thrinfo_t* thread \
);
GENPROT( gemm_blk_var1f )
@@ -84,7 +84,7 @@ void PASTEMAC(ch,varname) \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
gemm_thrinfo_t* thread \
thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC( gemm_ker_var2 )

View File

@@ -39,7 +39,7 @@ void bli_gemm_blk_var4f( obj_t* a,
obj_t* c,
cntx_t* cntx,
gemm_t* cntl,
gemm_thrinfo_t* thread )
thrinfo_t* thread )
{
//The s is for "lives on the stack"
obj_t b_pack_s;
@@ -58,37 +58,37 @@ void bli_gemm_blk_var4f( obj_t* a,
cntx_t cntx_io = *cntx;
cntx_t cntx_rpi = *cntx;
if( thread_am_ochief( thread ) ) {
if( bli_thread_am_ochief( thread ) ) {
// Initialize object for packing B.
bli_obj_init_pack( &b_pack_s );
bli_packm_init( b, &b_pack_s,
cntx, cntl_sub_packm_b( cntl ) );
cntx, bli_cntl_sub_packm_b( cntl ) );
// Scale C by beta (if instructed).
// Since scalm doesn't support multithreading yet, must be done by
// chief thread (ew)
bli_scalm_int( &BLIS_ONE,
c,
cntx, cntl_sub_scalm( cntl ) );
cntx, bli_cntl_sub_scalm( cntl ) );
}
b_pack = thread_obroadcast( thread, &b_pack_s );
b_pack = bli_thread_obroadcast( thread, &b_pack_s );
// Initialize objects passed into bli_packm_init for A and C
if( thread_am_ichief( thread ) ) {
if( bli_thread_am_ichief( thread ) ) {
bli_obj_init_pack( &a1_pack_s );
bli_obj_init_pack( &c1_pack_s );
}
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s );
c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s );
// Pack B (if instructed).
bli_packm_int( b, b_pack,
cntx, cntl_sub_packm_b( cntl ),
gemm_thread_sub_opackm( thread ) );
cntx, bli_cntl_sub_packm_b( cntl ),
bli_thrinfo_sub_opackm( thread ) );
dim_t my_start, my_end;
bli_get_range_t2b( thread, a,
bli_cntx_get_bmult( cntl_bszid( cntl ), cntx ),
bli_thread_get_range_t2b( thread, a,
bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ),
&my_start, &my_end );
// Partition along the m dimension.
@@ -99,7 +99,7 @@ void bli_gemm_blk_var4f( obj_t* a,
// This causes the right blocksize to be used if c and a are
// complex and b is real.
b_alg = bli_determine_blocksize_f( i, my_end, a,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A1 and C1.
bli_acquire_mpart_t2b( BLIS_SUBPART1,
@@ -112,23 +112,23 @@ void bli_gemm_blk_var4f( obj_t* a,
bli_gemm3m3_cntx_stage( 0, &cntx_ro );
// Initialize objects for packing A1 and C1.
if( thread_am_ichief( thread ) ) {
if( bli_thread_am_ichief( thread ) ) {
bli_packm_init( &a1, a1_pack,
&cntx_ro, cntl_sub_packm_a( cntl ) );
&cntx_ro, bli_cntl_sub_packm_a( cntl ) );
bli_packm_init( &c1, c1_pack,
&cntx_ro, cntl_sub_packm_c( cntl ) );
&cntx_ro, bli_cntl_sub_packm_c( cntl ) );
}
thread_ibarrier( thread );
bli_thread_ibarrier( thread );
// Pack A1 (if instructed).
bli_packm_int( &a1, a1_pack,
&cntx_ro, cntl_sub_packm_a( cntl ),
gemm_thread_sub_ipackm( thread ) );
&cntx_ro, bli_cntl_sub_packm_a( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
// Pack C1 (if instructed).
bli_packm_int( &c1, c1_pack,
&cntx_ro, cntl_sub_packm_c( cntl ),
gemm_thread_sub_ipackm( thread ) );
&cntx_ro, bli_cntl_sub_packm_c( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
// Perform gemm subproblem (real-only).
bli_gemm_int( &BLIS_ONE,
@@ -137,30 +137,30 @@ void bli_gemm_blk_var4f( obj_t* a,
&BLIS_ONE,
c1_pack,
cntx,
cntl_sub_gemm( cntl ),
gemm_thread_sub_gemm( thread ) );
bli_cntl_sub_gemm( cntl ),
bli_thrinfo_sub_self( thread ) );
thread_ibarrier( thread );
bli_thread_ibarrier( thread );
// Only apply beta within the first of three subproblems.
if ( thread_am_ichief( thread ) ) bli_obj_scalar_reset( c1_pack );
if ( bli_thread_am_ichief( thread ) ) bli_obj_scalar_reset( c1_pack );
// Initialize the context for the imag-only stage.
bli_gemm3m3_cntx_stage( 1, &cntx_io );
// Initialize objects for packing A1 and C1.
if( thread_am_ichief( thread ) ) {
if( bli_thread_am_ichief( thread ) ) {
bli_packm_init( &a1, a1_pack,
&cntx_io, cntl_sub_packm_a( cntl ) );
&cntx_io, bli_cntl_sub_packm_a( cntl ) );
}
thread_ibarrier( thread );
bli_thread_ibarrier( thread );
// Pack A1 (if instructed).
bli_packm_int( &a1, a1_pack,
&cntx_io, cntl_sub_packm_a( cntl ),
gemm_thread_sub_ipackm( thread ) );
&cntx_io, bli_cntl_sub_packm_a( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
// Perform gemm subproblem (imag-only).
bli_gemm_int( &BLIS_ONE,
@@ -169,26 +169,26 @@ void bli_gemm_blk_var4f( obj_t* a,
&BLIS_ONE,
c1_pack,
cntx,
cntl_sub_gemm( cntl ),
gemm_thread_sub_gemm( thread ) );
bli_cntl_sub_gemm( cntl ),
bli_thrinfo_sub_self( thread ) );
thread_ibarrier( thread );
bli_thread_ibarrier( thread );
// Initialize the context for the real+imag stage.
bli_gemm3m3_cntx_stage( 2, &cntx_rpi );
// Initialize objects for packing A1 and C1.
if( thread_am_ichief( thread ) ) {
if( bli_thread_am_ichief( thread ) ) {
bli_packm_init( &a1, a1_pack,
&cntx_rpi, cntl_sub_packm_a( cntl ) );
&cntx_rpi, bli_cntl_sub_packm_a( cntl ) );
}
thread_ibarrier( thread );
bli_thread_ibarrier( thread );
// Pack A1 (if instructed).
bli_packm_int( &a1, a1_pack,
&cntx_rpi, cntl_sub_packm_a( cntl ),
gemm_thread_sub_ipackm( thread ) );
&cntx_rpi, bli_cntl_sub_packm_a( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
// Perform gemm subproblem (real+imag).
bli_gemm_int( &BLIS_ONE,
@@ -197,30 +197,30 @@ void bli_gemm_blk_var4f( obj_t* a,
&BLIS_ONE,
c1_pack,
cntx,
cntl_sub_gemm( cntl ),
gemm_thread_sub_gemm( thread ) );
bli_cntl_sub_gemm( cntl ),
bli_thrinfo_sub_self( thread ) );
thread_ibarrier( thread );
bli_thread_ibarrier( thread );
// Unpack C1 (if C1 was packed).
// Currently must be done by 1 thread
bli_unpackm_int( c1_pack, &c1,
cntx, cntl_sub_unpackm_c( cntl ),
gemm_thread_sub_ipackm( thread ) );
cntx, bli_cntl_sub_unpackm_c( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
thread_obarrier( thread );
if( thread_am_ochief( thread ) )
bli_packm_release( b_pack, cntl_sub_packm_b( cntl ) );
if( thread_am_ichief( thread ) ){
bli_thread_obarrier( thread );
if( bli_thread_am_ochief( thread ) )
bli_packm_release( b_pack, bli_cntl_sub_packm_b( cntl ) );
if( bli_thread_am_ichief( thread ) ){
// It doesn't matter which packm cntl node we pass in, as long
// as it is valid, packm_release() will release the mem_t entry
// stored in a1_pack.
bli_packm_release( a1_pack, cntl_sub_packm_a( cntl ) );
bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) );
bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) );
bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) );
}
}

View File

@@ -37,5 +37,5 @@ void bli_gemm_blk_var4f( obj_t* a,
obj_t* c,
cntx_t* cntx,
gemm_t* cntl,
gemm_thrinfo_t* thread );
thrinfo_t* thread );

View File

@@ -50,7 +50,7 @@ typedef void (*FUNCPTR_T)(
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
gemm_thrinfo_t* thread
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var3);
@@ -61,7 +61,7 @@ void bli_gemm_ker_var3( obj_t* a,
obj_t* c,
cntx_t* cntx,
gemm_t* cntl,
gemm_thrinfo_t* thread )
thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );
@@ -146,7 +146,7 @@ void PASTEMAC(ch,varname) \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
gemm_thrinfo_t* thread \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
@@ -238,11 +238,11 @@ void PASTEMAC(ch,varname) \
bli_auxinfo_set_is_a( is_a, aux ); \
bli_auxinfo_set_is_b( is_b, aux ); \
\
gemm_thrinfo_t* caucus = gemm_thread_sub_gemm( thread ); \
dim_t jr_num_threads = thread_n_way( thread ); \
dim_t jr_thread_id = thread_work_id( thread ); \
dim_t ir_num_threads = thread_n_way( caucus ); \
dim_t ir_thread_id = thread_work_id( caucus ); \
thrinfo_t* caucus = bli_thrinfo_sub_self( thread ); \
dim_t jr_num_threads = bli_thread_n_way( thread ); \
dim_t jr_thread_id = bli_thread_work_id( thread ); \
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \

View File

@@ -41,7 +41,7 @@ void bli_gemm_ker_var3( obj_t* a,
obj_t* c,
cntx_t* cntx,
gemm_t* cntl,
gemm_thrinfo_t* thread );
thrinfo_t* thread );
//
@@ -65,7 +65,7 @@ void PASTEMAC(ch,varname)( \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
void* gemm_ukr, \
gemm_thrinfo_t* thread \
thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC( gemm_ker_var3 )

View File

@@ -50,7 +50,7 @@ typedef void (*FUNCPTR_T)(
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
gemm_thrinfo_t* thread
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var4);
@@ -61,7 +61,7 @@ void bli_gemm_ker_var4( obj_t* a,
obj_t* c,
cntx_t* cntx,
gemm_t* cntl,
gemm_thrinfo_t* thread )
thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );
@@ -146,7 +146,7 @@ void PASTEMAC(ch,varname) \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
gemm_thrinfo_t* thread \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
@@ -238,11 +238,11 @@ void PASTEMAC(ch,varname) \
bli_auxinfo_set_is_a( is_a, aux ); \
bli_auxinfo_set_is_b( is_b, aux ); \
\
gemm_thrinfo_t* caucus = gemm_thread_sub_gemm( thread ); \
dim_t jr_num_threads = thread_n_way( thread ); \
dim_t jr_thread_id = thread_work_id( thread ); \
dim_t ir_num_threads = thread_n_way( caucus ); \
dim_t ir_thread_id = thread_work_id( caucus ); \
thrinfo_t* caucus = bli_thrinfo_sub_self( thread ); \
dim_t jr_num_threads = bli_thread_n_way( thread ); \
dim_t jr_thread_id = bli_thread_work_id( thread ); \
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \

View File

@@ -41,7 +41,7 @@ void bli_gemm_ker_var4( obj_t* a,
obj_t* c,
cntx_t* cntx,
gemm_t* cntl,
gemm_thrinfo_t* thread );
thrinfo_t* thread );
//
@@ -65,7 +65,7 @@ void PASTEMAC(ch,varname)( \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
void* gemm_ukr, \
gemm_thrinfo_t* thread \
thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC( gemm_ker_var4 )

View File

@@ -35,98 +35,25 @@
#include "blis.h"
#include "assert.h"
// Populate every field of an existing herk_thrinfo_t node.
//
// thread    - node to fill in; it is dereferenced unconditionally.
// ocomm,    - stored in thread->ocomm and thread->ocomm_id.
// ocomm_id
// icomm,    - stored in thread->icomm and thread->icomm_id.
// icomm_id
// n_way,    - stored in thread->n_way and thread->work_id.
// work_id
// opackm,   - stored in thread->opackm and thread->ipackm.
// ipackm
// sub_herk  - stored in thread->sub_herk (may be NULL; the free routine
//             treats a NULL sub_herk as the end of the chain).
//
// Pointers are stored as given; nothing is copied or allocated here.
void bli_setup_herk_thrinfo_node( herk_thrinfo_t* thread,
thread_comm_t* ocomm, dim_t ocomm_id,
thread_comm_t* icomm, dim_t icomm_id,
dim_t n_way, dim_t work_id,
packm_thrinfo_t* opackm,
packm_thrinfo_t* ipackm,
herk_thrinfo_t* sub_herk )
{
thread->ocomm = ocomm;
thread->ocomm_id = ocomm_id;
thread->icomm = icomm;
thread->icomm_id = icomm_id;
thread->n_way = n_way;
thread->work_id = work_id;
thread->opackm = opackm;
thread->ipackm = ipackm;
thread->sub_herk = sub_herk;
}
// Initialize an existing node for the single-threaded case: both
// communicators point at BLIS_SINGLE_COMM with id 0, the loop is 1-way with
// work id 0, and both packm infos point at BLIS_PACKM_SINGLE_THREADED.
void bli_setup_herk_single_threaded_info( herk_thrinfo_t* thread )
{
thread->ocomm = &BLIS_SINGLE_COMM;
thread->ocomm_id = 0;
thread->icomm = &BLIS_SINGLE_COMM;
thread->icomm_id = 0;
thread->n_way = 1;
thread->work_id = 0;
thread->opackm = &BLIS_PACKM_SINGLE_THREADED;
thread->ipackm = &BLIS_PACKM_SINGLE_THREADED;
// The node is its own child, so following sub_herk always yields this node.
// NOTE(review): because sub_herk is never NULL here, passing such a node to
// bli_herk_thrinfo_free() would recurse without end -- confirm it is never
// freed that way.
thread->sub_herk = thread;
}
// Allocate a herk_thrinfo_t with bli_malloc_intl(), fill it in through
// bli_setup_herk_thrinfo_node() using the arguments exactly as given, and
// return the new node.
herk_thrinfo_t* bli_create_herk_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
                                              thread_comm_t* icomm, dim_t icomm_id,
                                              dim_t n_way, dim_t work_id,
                                              packm_thrinfo_t* opackm,
                                              packm_thrinfo_t* ipackm,
                                              herk_thrinfo_t* sub_herk )
{
    herk_thrinfo_t* node = ( herk_thrinfo_t* ) bli_malloc_intl( sizeof( *node ) );

    bli_setup_herk_thrinfo_node( node,
                                 ocomm, ocomm_id,
                                 icomm, icomm_id,
                                 n_way, work_id,
                                 opackm, ipackm,
                                 sub_herk );

    return node;
}
// Recursively release a herk_thrinfo_t chain. A NULL argument is a no-op.
// For each node, in order:
//   - the outer communicator is freed if this thread is its chief;
//   - the inner communicator is freed only at the last node of the chain
//     (sub_herk == NULL) and only if this thread is its chief;
//   - both packm thread infos and then the child node are freed;
//   - finally the node itself is returned via bli_free_intl().
void bli_herk_thrinfo_free( herk_thrinfo_t* thread)
{
    if( thread != NULL )
    {
        // Free communicators.
        if( thread_am_ochief( thread ) )
        {
            bli_free_communicator( thread->ocomm );
        }
        if( thread->sub_herk == NULL )
        {
            if( thread_am_ichief( thread ) )
            {
                bli_free_communicator( thread->icomm );
            }
        }

        // Free the sub-thrinfos, then the node itself.
        bli_packm_thrinfo_free( thread->opackm );
        bli_packm_thrinfo_free( thread->ipackm );
        bli_herk_thrinfo_free( thread->sub_herk );
        bli_free_intl( thread );
    }
}
// Free every thrinfo chain in threads[0 .. num-1] with
// bli_herk_thrinfo_free(), then free the array itself.
// The loop index uses dim_t, the same type as the bound, so the comparison
// and increment never mix integer widths.
void bli_herk_thrinfo_free_paths( herk_thrinfo_t** threads, dim_t num )
{
    for( dim_t i = 0; i < num; i++ )
    {
        bli_herk_thrinfo_free( threads[i] );
    }
    bli_free_intl( threads );
}
herk_thrinfo_t** bli_create_herk_thrinfo_paths( )
#if 0
thrinfo_t** bli_gemm_thrinfo_create_paths( void )
{
#ifdef BLIS_ENABLE_MULTITHREADING
dim_t jc_way = bli_read_nway_from_env( "BLIS_JC_NT" );
// dim_t kc_way = bli_read_nway_from_env( "BLIS_KC_NT" );
#ifdef BLIS_ENABLE_MULTITHREADING
dim_t jc_way = bli_env_read_nway( "BLIS_JC_NT" );
// dim_t kc_way = bli_env_read_nway( "BLIS_KC_NT" );
dim_t kc_way = 1;
dim_t ic_way = bli_read_nway_from_env( "BLIS_IC_NT" );
dim_t jr_way = bli_read_nway_from_env( "BLIS_JR_NT" );
dim_t ir_way = bli_read_nway_from_env( "BLIS_IR_NT" );
dim_t ic_way = bli_env_read_nway( "BLIS_IC_NT" );
dim_t jr_way = bli_env_read_nway( "BLIS_JR_NT" );
dim_t ir_way = bli_env_read_nway( "BLIS_IR_NT" );
#else
dim_t jc_way = 1;
dim_t kc_way = 1;
dim_t ic_way = 1;
dim_t ic_way = 1;
dim_t jr_way = 1;
dim_t ir_way = 1;
#endif
dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
assert( global_num_threads != 0 );
@@ -137,78 +64,77 @@ herk_thrinfo_t** bli_create_herk_thrinfo_paths( )
dim_t ir_nt = 1;
herk_thrinfo_t** paths = (herk_thrinfo_t**) bli_malloc_intl( global_num_threads * sizeof( herk_thrinfo_t* ) );
thrinfo_t** paths = bli_malloc_intl( global_num_threads * sizeof( thrinfo_t* ) );
thread_comm_t* global_comm = bli_create_communicator( global_num_threads );
thrcomm_t* global_comm = bli_thrcomm_create( global_num_threads );
for( int a = 0; a < jc_way; a++ )
{
thread_comm_t* jc_comm = bli_create_communicator( jc_nt );
{
thrcomm_t* jc_comm = bli_thrcomm_create( jc_nt );
for( int b = 0; b < kc_way; b++ )
{
thread_comm_t* kc_comm = bli_create_communicator( kc_nt );
{
thrcomm_t* kc_comm = bli_thrcomm_create( kc_nt );
for( int c = 0; c < ic_way; c++ )
{
thread_comm_t* ic_comm = bli_create_communicator( ic_nt );
{
thrcomm_t* ic_comm = bli_thrcomm_create( ic_nt );
for( int d = 0; d < jr_way; d++ )
{
thread_comm_t* jr_comm = bli_create_communicator( jr_nt );
for( int e = 0; e < ir_way; e++)
{
thread_comm_t* ir_comm = bli_create_communicator( ir_nt );
{
thrcomm_t* jr_comm = bli_thrcomm_create( jr_nt );
for( int e = 0; e < ir_way; e++ )
{
thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt );
dim_t ir_comm_id = 0;
dim_t jr_comm_id = e*ir_nt + ir_comm_id;
dim_t ic_comm_id = d*jr_nt + jr_comm_id;
dim_t kc_comm_id = c*ic_nt + ic_comm_id;
dim_t jc_comm_id = b*kc_nt + kc_comm_id;
dim_t global_comm_id = a*jc_nt + jc_comm_id;
// Macrokernel loops
herk_thrinfo_t* ir_info = bli_create_herk_thrinfo_node( jr_comm, jr_comm_id,
// Macrokernel loops
thrinfo_t* ir_info = bli_l3_thrinfo_create_node( jr_comm, jr_comm_id,
ir_comm, ir_comm_id,
ir_way, e,
ir_way, e,
NULL, NULL, NULL);
herk_thrinfo_t* jr_info = bli_create_herk_thrinfo_node( ic_comm, ic_comm_id,
thrinfo_t* jr_info = bli_l3_thrinfo_create_node( ic_comm, ic_comm_id,
jr_comm, jr_comm_id,
jr_way, d,
jr_way, d,
NULL, NULL, ir_info);
//blk_var_1
packm_thrinfo_t* pack_ic_in = bli_create_packm_thread_info( ic_comm, ic_comm_id,
packm_thrinfo_t* pack_ic_in = bli_packm_thrinfo_create( ic_comm, ic_comm_id,
jr_comm, jr_comm_id,
ic_nt, ic_comm_id );
packm_thrinfo_t* pack_ic_out = bli_create_packm_thread_info( kc_comm, kc_comm_id,
packm_thrinfo_t* pack_ic_out = bli_packm_thrinfo_create( kc_comm, kc_comm_id,
ic_comm, ic_comm_id,
kc_nt, kc_comm_id );
herk_thrinfo_t* ic_info = bli_create_herk_thrinfo_node( kc_comm, kc_comm_id,
thrinfo_t* ic_info = bli_l3_thrinfo_create_node( kc_comm, kc_comm_id,
ic_comm, ic_comm_id,
ic_way, c,
ic_way, c,
pack_ic_out, pack_ic_in, jr_info);
//blk_var_3
packm_thrinfo_t* pack_kc_in = bli_create_packm_thread_info( kc_comm, kc_comm_id,
packm_thrinfo_t* pack_kc_in = bli_packm_thrinfo_create( kc_comm, kc_comm_id,
ic_comm, ic_comm_id,
kc_nt, kc_comm_id );
packm_thrinfo_t* pack_kc_out = bli_create_packm_thread_info( jc_comm, jc_comm_id,
packm_thrinfo_t* pack_kc_out = bli_packm_thrinfo_create( jc_comm, jc_comm_id,
jc_comm, jc_comm_id,
jc_nt, jc_comm_id );
herk_thrinfo_t* kc_info = bli_create_herk_thrinfo_node( jc_comm, jc_comm_id,
thrinfo_t* kc_info = bli_l3_thrinfo_create_node( jc_comm, jc_comm_id,
kc_comm, kc_comm_id,
kc_way, b,
pack_kc_out, pack_kc_in, ic_info);
//blk_var_2
packm_thrinfo_t* pack_jc_in = bli_create_packm_thread_info( jc_comm, jc_comm_id,
packm_thrinfo_t* pack_jc_in = bli_packm_thrinfo_create( jc_comm, jc_comm_id,
kc_comm, kc_comm_id,
jc_nt, jc_comm_id );
packm_thrinfo_t* pack_jc_out = bli_create_packm_thread_info( global_comm, global_comm_id,
packm_thrinfo_t* pack_jc_out = bli_packm_thrinfo_create( global_comm, global_comm_id,
jc_comm, jc_comm_id,
global_num_threads, global_comm_id );
herk_thrinfo_t* jc_info = bli_create_herk_thrinfo_node( global_comm, global_comm_id,
thrinfo_t* jc_info = bli_l3_thrinfo_create_node( global_comm, global_comm_id,
jc_comm, jc_comm_id,
jc_way, a,
pack_jc_out, pack_jc_in, kc_info);
@@ -221,3 +147,4 @@ herk_thrinfo_t** bli_create_herk_thrinfo_paths( )
}
return paths;
}
#endif

View File

@@ -0,0 +1,44 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Accessors for the child objects stored in a thrinfo_t: the next node of the
// same operation (sub_l3op) and the two packm thread infos.
// The argument and the whole expansion are parenthesized so that any
// pointer-valued expression (e.g. &info or infos[i]) may be passed and the
// result may be used inside a larger expression without precedence surprises.
#define bli_thrinfo_sub_self( thread )   ( (thread)->sub_l3op )
#define bli_thrinfo_sub_opackm( thread ) ( (thread)->opackm )
#define bli_thrinfo_sub_ipackm( thread ) ( (thread)->ipackm )

// For use in gemm micro-kernel
// Evaluate to a1 (or b1) advanced by step * thread->n_way. Every macro
// argument is parenthesized so that compound arguments such as "k + 1" are
// multiplied as a whole.
#define gemm_get_next_a_micropanel( thread, a1, step ) ( (a1) + (step) * (thread)->n_way )
#define gemm_get_next_b_micropanel( thread, b1, step ) ( (b1) + (step) * (thread)->n_way )
//thrinfo_t** bli_gemm_thrinfo_create_paths( void );

View File

@@ -86,11 +86,11 @@ void bli_hemm_front( side_t side,
bli_obj_swap( a_local, b_local );
}
gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HEMM, BLIS_LEFT );
dim_t n_threads = bli_thread_num_threads( infos[0] );
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
bli_l3_thread_decorator( n_threads,
(l3_int_t) bli_gemm_int,
alpha,
&a_local,
@@ -101,7 +101,7 @@ void bli_hemm_front( side_t side,
(void*) cntl,
(void**) infos );
bli_gemm_thrinfo_free_paths( infos, n_threads );
bli_l3_thrinfo_free_paths( infos, n_threads );
}

View File

@@ -118,11 +118,11 @@ void bli_her2k_front( obj_t* alpha,
#else
// Invoke herk twice, using beta only the first time.
herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HER2K, BLIS_LEFT );
dim_t n_threads = bli_thread_num_threads( infos[0] );
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
bli_l3_thread_decorator( n_threads,
(l3_int_t) bli_herk_int,
alpha,
&a_local,
@@ -133,7 +133,7 @@ void bli_her2k_front( obj_t* alpha,
(void*) cntl,
(void**) infos );
bli_level3_thread_decorator( n_threads,
bli_l3_thread_decorator( n_threads,
(l3_int_t) bli_herk_int,
&alpha_conj,
&b_local,
@@ -144,7 +144,7 @@ void bli_her2k_front( obj_t* alpha,
(void*) cntl,
(void**) infos );
bli_herk_thrinfo_free_paths( infos, n_threads );
bli_l3_thrinfo_free_paths( infos, n_threads );
#endif

View File

@@ -39,7 +39,7 @@ void bli_herk_blk_var1f( obj_t* a,
obj_t* c,
cntx_t* cntx,
gemm_t* cntl,
herk_thrinfo_t* thread )
thrinfo_t* thread )
{
obj_t ah_pack_s;
obj_t a1_pack_s, c1_pack_s;
@@ -55,36 +55,36 @@ void bli_herk_blk_var1f( obj_t* a,
// Prune any zero region that exists along the partitioning dimension.
bli_herk_prune_unref_mparts_m( a, ah, c );
if( thread_am_ochief( thread ) ) {
if( bli_thread_am_ochief( thread ) ) {
// Initialize object for packing A'.
bli_obj_init_pack( &ah_pack_s );
bli_packm_init( ah, &ah_pack_s,
cntx, cntl_sub_packm_b( cntl ) );
cntx, bli_cntl_sub_packm_b( cntl ) );
// Scale C by beta (if instructed).
// Since scalm doesn't support multithreading yet, must be done by chief thread (ew)
bli_scalm_int( &BLIS_ONE,
c,
cntx, cntl_sub_scalm( cntl ) );
cntx, bli_cntl_sub_scalm( cntl ) );
}
ah_pack = thread_obroadcast( thread, &ah_pack_s );
ah_pack = bli_thread_obroadcast( thread, &ah_pack_s );
// Initialize pack objects that are passed into packm_init() for A and C.
if( thread_am_ichief( thread ) ) {
if( bli_thread_am_ichief( thread ) ) {
bli_obj_init_pack( &a1_pack_s );
bli_obj_init_pack( &c1_pack_s );
}
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s );
c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s );
// Pack A' (if instructed).
bli_packm_int( ah, ah_pack,
cntx, cntl_sub_packm_b( cntl ),
herk_thread_sub_opackm( thread ) );
cntx, bli_cntl_sub_packm_b( cntl ),
bli_thrinfo_sub_opackm( thread ) );
dim_t my_start, my_end;
bli_get_range_weighted_t2b( thread, c,
bli_cntx_get_bmult( cntl_bszid( cntl ), cntx ),
bli_thread_get_range_weighted_t2b( thread, c,
bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ),
&my_start, &my_end );
// Partition along the m dimension.
@@ -92,7 +92,7 @@ void bli_herk_blk_var1f( obj_t* a,
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( i, my_end, a,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A1 and C1.
bli_acquire_mpart_t2b( BLIS_SUBPART1,
@@ -101,23 +101,23 @@ void bli_herk_blk_var1f( obj_t* a,
i, b_alg, c, &c1 );
// Initialize objects for packing A1 and C1.
if( thread_am_ichief( thread ) ) {
if( bli_thread_am_ichief( thread ) ) {
bli_packm_init( &a1, a1_pack,
cntx, cntl_sub_packm_a( cntl ) );
cntx, bli_cntl_sub_packm_a( cntl ) );
bli_packm_init( &c1, c1_pack,
cntx, cntl_sub_packm_c( cntl ) );
cntx, bli_cntl_sub_packm_c( cntl ) );
}
thread_ibarrier( thread );
bli_thread_ibarrier( thread );
// Pack A1 (if instructed).
bli_packm_int( &a1, a1_pack,
cntx, cntl_sub_packm_a( cntl ),
herk_thread_sub_ipackm( thread ) );
cntx, bli_cntl_sub_packm_a( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
// Pack C1 (if instructed).
bli_packm_int( &c1, c1_pack,
cntx, cntl_sub_packm_c( cntl ),
herk_thread_sub_ipackm( thread ) );
cntx, bli_cntl_sub_packm_c( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
// Perform herk subproblem.
bli_herk_int( &BLIS_ONE,
@@ -126,25 +126,25 @@ void bli_herk_blk_var1f( obj_t* a,
&BLIS_ONE,
c1_pack,
cntx,
cntl_sub_gemm( cntl ),
herk_thread_sub_herk( thread ) );
bli_cntl_sub_gemm( cntl ),
bli_thrinfo_sub_self( thread ) );
thread_ibarrier( thread );
bli_thread_ibarrier( thread );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( c1_pack, &c1,
cntx, cntl_sub_unpackm_c( cntl ),
herk_thread_sub_ipackm( thread ) );
cntx, bli_cntl_sub_unpackm_c( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
thread_obarrier( thread );
if( thread_am_ochief( thread ) )
bli_packm_release( ah_pack, cntl_sub_packm_b( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_release( a1_pack, cntl_sub_packm_a( cntl ) );
bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) );
bli_thread_obarrier( thread );
if( bli_thread_am_ochief( thread ) )
bli_packm_release( ah_pack, bli_cntl_sub_packm_b( cntl ) );
if( bli_thread_am_ichief( thread ) ) {
bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) );
bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) );
}
}

View File

@@ -39,7 +39,7 @@ void bli_herk_blk_var2f( obj_t* a,
obj_t* c,
cntx_t* cntx,
gemm_t* cntl,
herk_thrinfo_t* thread )
thrinfo_t* thread )
{
obj_t a_pack_s;
obj_t ah1_pack_s, c1_pack_s;
@@ -55,35 +55,35 @@ void bli_herk_blk_var2f( obj_t* a,
// Prune any zero region that exists along the partitioning dimension.
bli_herk_prune_unref_mparts_n( a, ah, c );
if( thread_am_ochief( thread ) ) {
if( bli_thread_am_ochief( thread ) ) {
// Initialize object for packing A
bli_obj_init_pack( &a_pack_s );
bli_packm_init( a, &a_pack_s,
cntx, cntl_sub_packm_a( cntl ) );
cntx, bli_cntl_sub_packm_a( cntl ) );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntx, cntl_sub_scalm( cntl ) );
cntx, bli_cntl_sub_scalm( cntl ) );
}
a_pack = thread_obroadcast( thread, &a_pack_s );
a_pack = bli_thread_obroadcast( thread, &a_pack_s );
// Initialize pack objects for C and A' that are passed into packm_init().
if( thread_am_ichief( thread ) ) {
if( bli_thread_am_ichief( thread ) ) {
bli_obj_init_pack( &ah1_pack_s );
bli_obj_init_pack( &c1_pack_s );
}
ah1_pack = thread_ibroadcast( thread, &ah1_pack_s );
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
ah1_pack = bli_thread_ibroadcast( thread, &ah1_pack_s );
c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s );
// Pack A (if instructed).
bli_packm_int( a, a_pack,
cntx, cntl_sub_packm_a( cntl ),
herk_thread_sub_opackm( thread ) );
cntx, bli_cntl_sub_packm_a( cntl ),
bli_thrinfo_sub_opackm( thread ) );
dim_t my_start, my_end;
bli_get_range_weighted_l2r( thread, c,
bli_cntx_get_bmult( cntl_bszid( cntl ), cntx ),
bli_thread_get_range_weighted_l2r( thread, c,
bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ),
&my_start, &my_end );
// Partition along the n dimension.
@@ -91,7 +91,7 @@ void bli_herk_blk_var2f( obj_t* a,
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( i, my_end, a,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A1' and C1.
bli_acquire_mpart_l2r( BLIS_SUBPART1,
@@ -100,23 +100,23 @@ void bli_herk_blk_var2f( obj_t* a,
i, b_alg, c, &c1 );
// Initialize objects for packing A1' and C1.
if( thread_am_ichief( thread ) ) {
if( bli_thread_am_ichief( thread ) ) {
bli_packm_init( &ah1, ah1_pack,
cntx, cntl_sub_packm_b( cntl ) );
cntx, bli_cntl_sub_packm_b( cntl ) );
bli_packm_init( &c1, c1_pack,
cntx, cntl_sub_packm_c( cntl ) );
cntx, bli_cntl_sub_packm_c( cntl ) );
}
thread_ibarrier( thread ) ;
bli_thread_ibarrier( thread ) ;
// Pack A1' (if instructed).
bli_packm_int( &ah1, ah1_pack,
cntx, cntl_sub_packm_b( cntl ),
herk_thread_sub_ipackm( thread ) );
cntx, bli_cntl_sub_packm_b( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
// Pack C1 (if instructed).
bli_packm_int( &c1, c1_pack,
cntx, cntl_sub_packm_c( cntl ),
herk_thread_sub_ipackm( thread ) ) ;
cntx, bli_cntl_sub_packm_c( cntl ),
bli_thrinfo_sub_ipackm( thread ) ) ;
// Perform herk subproblem.
bli_herk_int( &BLIS_ONE,
@@ -125,25 +125,25 @@ void bli_herk_blk_var2f( obj_t* a,
&BLIS_ONE,
c1_pack,
cntx,
cntl_sub_gemm( cntl ),
herk_thread_sub_herk( thread ) );
bli_cntl_sub_gemm( cntl ),
bli_thrinfo_sub_self( thread ) );
thread_ibarrier( thread );
bli_thread_ibarrier( thread );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( c1_pack, &c1,
cntx, cntl_sub_unpackm_c( cntl ),
herk_thread_sub_ipackm( thread ) );
cntx, bli_cntl_sub_unpackm_c( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
thread_obarrier( thread );
if( thread_am_ochief( thread ) )
bli_packm_release( a_pack, cntl_sub_packm_a( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_release( ah1_pack, cntl_sub_packm_b( cntl ) );
bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) );
bli_thread_obarrier( thread );
if( bli_thread_am_ochief( thread ) )
bli_packm_release( a_pack, bli_cntl_sub_packm_a( cntl ) );
if( bli_thread_am_ichief( thread ) ) {
bli_packm_release( ah1_pack, bli_cntl_sub_packm_b( cntl ) );
bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) );
}
}

View File

@@ -39,7 +39,7 @@ void bli_herk_blk_var3f( obj_t* a,
obj_t* c,
cntx_t* cntx,
gemm_t* cntl,
herk_thrinfo_t* thread )
thrinfo_t* thread )
{
obj_t c_pack_s;
obj_t a1_pack_s, ah1_pack_s;
@@ -56,31 +56,31 @@ void bli_herk_blk_var3f( obj_t* a,
// Prune any zero region that exists along the partitioning dimension.
bli_herk_prune_unref_mparts_k( a, ah, c );
if( thread_am_ochief( thread ) ) {
if( bli_thread_am_ochief( thread ) ) {
// Initialize object for packing C.
bli_obj_init_pack( &c_pack_s );
bli_packm_init( c, &c_pack_s,
cntx, cntl_sub_packm_c( cntl ) );
cntx, bli_cntl_sub_packm_c( cntl ) );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntx, cntl_sub_scalm( cntl ) );
cntx, bli_cntl_sub_scalm( cntl ) );
}
c_pack = thread_obroadcast( thread, &c_pack_s );
c_pack = bli_thread_obroadcast( thread, &c_pack_s );
// Initialize all pack objects that are passed into packm_init().
if( thread_am_ichief( thread ) ) {
if( bli_thread_am_ichief( thread ) ) {
bli_obj_init_pack( &a1_pack_s );
bli_obj_init_pack( &ah1_pack_s );
}
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
ah1_pack = thread_ibroadcast( thread, &ah1_pack_s );
a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s );
ah1_pack = bli_thread_ibroadcast( thread, &ah1_pack_s );
// Pack C (if instructed).
bli_packm_int( c, c_pack,
cntx, cntl_sub_packm_c( cntl ),
herk_thread_sub_opackm( thread ) );
cntx, bli_cntl_sub_packm_c( cntl ),
bli_thrinfo_sub_opackm( thread ) );
// Query dimension in partitioning direction.
k_trans = bli_obj_width_after_trans( *a );
@@ -90,7 +90,7 @@ void bli_herk_blk_var3f( obj_t* a,
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( i, k_trans, a,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A1 and A1'.
bli_acquire_mpart_l2r( BLIS_SUBPART1,
@@ -99,23 +99,23 @@ void bli_herk_blk_var3f( obj_t* a,
i, b_alg, ah, &ah1 );
// Initialize objects for packing A1 and A1'.
if( thread_am_ichief( thread ) ) {
if( bli_thread_am_ichief( thread ) ) {
bli_packm_init( &a1, a1_pack,
cntx, cntl_sub_packm_a( cntl ) );
cntx, bli_cntl_sub_packm_a( cntl ) );
bli_packm_init( &ah1, ah1_pack,
cntx, cntl_sub_packm_b( cntl ) );
cntx, bli_cntl_sub_packm_b( cntl ) );
}
thread_ibarrier( thread );
bli_thread_ibarrier( thread );
// Pack A1 (if instructed).
bli_packm_int( &a1, a1_pack,
cntx, cntl_sub_packm_a( cntl ),
herk_thread_sub_ipackm( thread ) );
cntx, bli_cntl_sub_packm_a( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
// Pack A1' (if instructed).
bli_packm_int( &ah1, ah1_pack,
cntx, cntl_sub_packm_b( cntl ),
herk_thread_sub_ipackm( thread ) );
cntx, bli_cntl_sub_packm_b( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
// Perform herk subproblem.
bli_herk_int( &BLIS_ONE,
@@ -124,8 +124,8 @@ void bli_herk_blk_var3f( obj_t* a,
&BLIS_ONE,
c_pack,
cntx,
cntl_sub_gemm( cntl ),
herk_thread_sub_herk( thread ) );
bli_cntl_sub_gemm( cntl ),
bli_thrinfo_sub_self( thread ) );
// This variant executes multiple rank-k updates. Therefore, if the
// internal beta scalar on matrix C is non-zero, we must use it
@@ -133,26 +133,26 @@ void bli_herk_blk_var3f( obj_t* a,
// And since c_pack is a local obj_t, we can simply overwrite the
// internal beta scalar with BLIS_ONE once it has been used in the
// first iteration.
thread_ibarrier( thread );
if ( i == 0 && thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack );
bli_thread_ibarrier( thread );
if ( i == 0 && bli_thread_am_ichief( thread ) ) bli_obj_scalar_reset( c_pack );
}
thread_obarrier( thread );
bli_thread_obarrier( thread );
// Unpack C (if C was packed).
bli_unpackm_int( c_pack, c,
cntx, cntl_sub_unpackm_c( cntl ),
herk_thread_sub_opackm( thread ) );
cntx, bli_cntl_sub_unpackm_c( cntl ),
bli_thrinfo_sub_opackm( thread ) );
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
if( thread_am_ochief( thread ) ) {
bli_packm_release( c_pack, cntl_sub_packm_c( cntl ) );
if( bli_thread_am_ochief( thread ) ) {
bli_packm_release( c_pack, bli_cntl_sub_packm_c( cntl ) );
}
if( thread_am_ichief( thread ) ) {
bli_packm_release( a1_pack, cntl_sub_packm_a( cntl ) );
bli_packm_release( ah1_pack, cntl_sub_packm_b( cntl ) );
if( bli_thread_am_ichief( thread ) ) {
bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) );
bli_packm_release( ah1_pack, bli_cntl_sub_packm_b( cntl ) );
}
}

View File

@@ -84,11 +84,11 @@ void bli_herk_front( obj_t* alpha,
bli_obj_induce_trans( c_local );
}
herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_HERK, BLIS_LEFT );
dim_t n_threads = bli_thread_num_threads( infos[0] );
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
bli_l3_thread_decorator( n_threads,
(l3_int_t) bli_herk_int,
alpha,
&a_local,
@@ -99,7 +99,7 @@ void bli_herk_front( obj_t* alpha,
(void*) cntl,
(void**) infos );
bli_herk_thrinfo_free_paths( infos, n_threads );
bli_l3_thrinfo_free_paths( infos, n_threads );
// The Hermitian rank-k product was computed as A*A', even for the
// diagonal elements. Mathematically, the imaginary components of

View File

@@ -41,7 +41,7 @@ typedef void (*FUNCPTR_T)( obj_t* a,
obj_t* c,
cntx_t* cntx,
gemm_t* cntl,
herk_thrinfo_t* thread );
thrinfo_t* thread );
static FUNCPTR_T vars[2][4][3] =
{
@@ -70,7 +70,7 @@ void bli_herk_int( obj_t* alpha,
obj_t* c,
cntx_t* cntx,
gemm_t* cntl,
herk_thrinfo_t* thread )
thrinfo_t* thread )
{
obj_t a_local;
obj_t ah_local;
@@ -91,9 +91,9 @@ void bli_herk_int( obj_t* alpha,
if ( bli_obj_has_zero_dim( *a ) ||
bli_obj_has_zero_dim( *ah ) )
{
if( thread_am_ochief( thread ) )
if( bli_thread_am_ochief( thread ) )
bli_scalm( beta, c );
thread_obarrier( thread );
bli_thread_obarrier( thread );
return;
}
@@ -109,7 +109,7 @@ void bli_herk_int( obj_t* alpha,
// strides and dimensions. Note that this transposition would normally
// be handled explicitly in the packing of C, but if C is not being
// packed, this is our last chance to handle the transposition.
if ( cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( *c ) )
{
bli_obj_induce_trans( c_local );
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, c_local );
@@ -134,8 +134,8 @@ void bli_herk_int( obj_t* alpha,
else uplo = 1;
// Extract the variant number and implementation type.
n = cntl_var_num( cntl );
i = cntl_impl_type( cntl );
n = bli_cntl_var_num( cntl );
i = bli_cntl_impl_type( cntl );
// Index into the variant array to extract the correct function pointer.
f = vars[uplo][n][i];

View File

@@ -39,5 +39,5 @@ void bli_herk_int( obj_t* alpha,
obj_t* c,
cntx_t* cntx,
gemm_t* cntl,
herk_thrinfo_t* thread );
thrinfo_t* thread );

View File

@@ -51,7 +51,7 @@ typedef void (*FUNCPTR_T)(
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
herk_thrinfo_t* thread
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2);
@@ -62,7 +62,7 @@ void bli_herk_l_ker_var2( obj_t* a,
obj_t* c,
cntx_t* cntx,
gemm_t* cntl,
herk_thrinfo_t* thread )
thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );
@@ -151,7 +151,7 @@ void PASTEMAC(ch,varname) \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
herk_thrinfo_t* thread \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
@@ -270,11 +270,11 @@ void PASTEMAC(ch,varname) \
b1 = b_cast; \
c1 = c_cast; \
\
herk_thrinfo_t* caucus = herk_thread_sub_herk( thread ); \
dim_t jr_num_threads = thread_n_way( thread ); \
dim_t jr_thread_id = thread_work_id( thread ); \
dim_t ir_num_threads = thread_n_way( caucus ); \
dim_t ir_thread_id = thread_work_id( caucus ); \
thrinfo_t* caucus = bli_thrinfo_sub_self( thread ); \
dim_t jr_num_threads = bli_thread_n_way( thread ); \
dim_t jr_thread_id = bli_thread_work_id( thread ); \
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \

View File

@@ -1,79 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Thread-info node for the herk operation. Each node holds two thread
// communicators together with this thread's id in each, the parameters that
// describe how a loop is divided among threads, two packm thread-info
// objects, and a pointer to another herk_thrinfo_s node.
struct herk_thrinfo_s //implements thrinfo_t
{
thread_comm_t* ocomm; //The thread communicator for the other threads sharing the same work at this level
dim_t ocomm_id; //Our thread id within ocomm
thread_comm_t* icomm; //Second thread communicator. NOTE(review): presumably the "inner" group of threads that jointly execute the sub-node's work -- confirm
dim_t icomm_id; //Our thread id within icomm
dim_t n_way; //Number of distinct caucuses used to parallelize the loop
dim_t work_id; //What we're working on. NOTE(review): presumably the index (0..n_way-1) of the caucus this thread belongs to -- confirm
packm_thrinfo_t* opackm; //packm thread info; read via herk_thread_sub_opackm()
packm_thrinfo_t* ipackm; //packm thread info; read via herk_thread_sub_ipackm()
struct herk_thrinfo_s* sub_herk; //Linked herk node; read via herk_thread_sub_herk()
};
typedef struct herk_thrinfo_s herk_thrinfo_t;
// Accessors for the sub-objects attached to a herk_thrinfo_t node. The
// argument and the whole expansion are parenthesized so that any
// pointer-valued expression can be passed safely.
#define herk_thread_sub_herk( thread ) ( (thread)->sub_herk )
#define herk_thread_sub_opackm( thread ) ( (thread)->opackm )
#define herk_thread_sub_ipackm( thread ) ( (thread)->ipackm )
// For use in herk micro-kernel: advance a micro-panel pointer by
// step * n_way elements. Every parameter is parenthesized so that an
// expression such as "a + b" passed as step is multiplied as a whole.
#define herk_get_next_a_micropanel( thread, a1, step ) ( (a1) + (step) * (thread)->n_way )
#define herk_get_next_b_micropanel( thread, b1, step ) ( (b1) + (step) * (thread)->n_way )
// Returns an array of herk_thrinfo_t pointers.
// NOTE(review): presumably one entry per thread, to be released with
// bli_herk_thrinfo_free_paths() -- confirm against the definition.
herk_thrinfo_t** bli_create_herk_thrinfo_paths( );
// Frees an array of n_threads paths.
// NOTE(review): presumably the array returned by
// bli_create_herk_thrinfo_paths() -- confirm.
void bli_herk_thrinfo_free_paths( herk_thrinfo_t** paths, dim_t n_threads );
// Initializes an existing node. The parameters after thread carry the same
// names and types as the fields of struct herk_thrinfo_s.
// NOTE(review): presumably each is stored into the same-named field -- confirm.
void bli_setup_herk_thrinfo_node( herk_thrinfo_t* thread,
thread_comm_t* ocomm, dim_t ocomm_id,
thread_comm_t* icomm, dim_t icomm_id,
dim_t n_way, dim_t work_id,
packm_thrinfo_t* opackm,
packm_thrinfo_t* ipackm,
herk_thrinfo_t* sub_herk );
// Same parameter list as bli_setup_herk_thrinfo_node() minus the node itself;
// returns a herk_thrinfo_t pointer.
// NOTE(review): presumably allocates the node before initializing it -- confirm.
herk_thrinfo_t* bli_create_herk_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
thread_comm_t* icomm, dim_t icomm_id,
dim_t n_way, dim_t work_id,
packm_thrinfo_t* opackm,
packm_thrinfo_t* ipackm,
herk_thrinfo_t* sub_herk );
// NOTE(review): judging by the name, fills in thread for single-threaded
// execution -- confirm against the definition.
void bli_setup_herk_single_threaded_info( herk_thrinfo_t* thread );

View File

@@ -51,7 +51,7 @@ typedef void (*FUNCPTR_T)(
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
herk_thrinfo_t* thread
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2);
@@ -62,7 +62,7 @@ void bli_herk_u_ker_var2( obj_t* a,
obj_t* c,
cntx_t* cntx,
gemm_t* cntl,
herk_thrinfo_t* thread )
thrinfo_t* thread )
{
num_t dt_exec = bli_obj_execution_datatype( *c );
@@ -151,7 +151,7 @@ void PASTEMAC(ch,varname) \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
herk_thrinfo_t* thread \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
@@ -270,11 +270,11 @@ void PASTEMAC(ch,varname) \
b1 = b_cast; \
c1 = c_cast; \
\
herk_thrinfo_t* caucus = herk_thread_sub_herk( thread ); \
dim_t jr_num_threads = thread_n_way( thread ); \
dim_t jr_thread_id = thread_work_id( thread ); \
dim_t ir_num_threads = thread_n_way( caucus ); \
dim_t ir_thread_id = thread_work_id( caucus ); \
thrinfo_t* caucus = bli_thrinfo_sub_self( thread ); \
dim_t jr_num_threads = bli_thread_n_way( thread ); \
dim_t jr_thread_id = bli_thread_work_id( thread ); \
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \

View File

@@ -47,7 +47,7 @@ void PASTEMAC0(opname) \
obj_t* c, \
cntx_t* cntx, \
gemm_t* cntl, \
herk_thrinfo_t* thread \
thrinfo_t* thread \
);
GENPROT( herk_blk_var1f )
@@ -81,7 +81,7 @@ void PASTEMAC(ch,varname) \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
herk_thrinfo_t* thread \
thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC( herk_l_ker_var2 )

View File

@@ -35,105 +35,25 @@
#include "blis.h"
#include "assert.h"
void bli_setup_trsm_thrinfo_node( trsm_thrinfo_t* thread,
thread_comm_t* ocomm, dim_t ocomm_id,
thread_comm_t* icomm, dim_t icomm_id,
dim_t n_way, dim_t work_id,
packm_thrinfo_t* opackm,
packm_thrinfo_t* ipackm,
trsm_thrinfo_t* sub_trsm )
#if 0
thrinfo_t** bli_herk_thrinfo_create_paths( void )
{
thread->ocomm = ocomm;
thread->ocomm_id = ocomm_id;
thread->icomm = icomm;
thread->icomm_id = icomm_id;
thread->n_way = n_way;
thread->work_id = work_id;
thread->opackm = opackm;
thread->ipackm = ipackm;
thread->sub_trsm = sub_trsm;
}
void bli_setup_trsm_single_threaded_info( trsm_thrinfo_t* thread )
{
thread->ocomm = &BLIS_SINGLE_COMM;
thread->ocomm_id = 0;
thread->icomm = &BLIS_SINGLE_COMM;
thread->icomm_id = 0;
thread->n_way = 1;
thread->work_id = 0;
thread->opackm = &BLIS_PACKM_SINGLE_THREADED;
thread->ipackm = &BLIS_PACKM_SINGLE_THREADED;
thread->sub_trsm = thread;
}
trsm_thrinfo_t* bli_create_trsm_thrinfo_node( thread_comm_t* ocomm, dim_t ocomm_id,
thread_comm_t* icomm, dim_t icomm_id,
dim_t n_way, dim_t work_id,
packm_thrinfo_t* opackm,
packm_thrinfo_t* ipackm,
trsm_thrinfo_t* sub_trsm )
{
trsm_thrinfo_t* thread = ( trsm_thrinfo_t* ) bli_malloc_intl( sizeof( trsm_thrinfo_t ) );
bli_setup_trsm_thrinfo_node( thread, ocomm, ocomm_id,
icomm, icomm_id,
n_way, work_id,
opackm,
ipackm,
sub_trsm );
return thread;
}
void bli_trsm_thrinfo_free( trsm_thrinfo_t* thread)
{
if( thread == NULL ) return;
// Free Communicators
if( thread_am_ochief( thread ) )
bli_free_communicator( thread->ocomm );
if( thread->sub_trsm == NULL && thread_am_ichief( thread ) )
bli_free_communicator( thread->icomm );
// Free Sub Thrinfos
bli_packm_thrinfo_free( thread->opackm );
bli_packm_thrinfo_free( thread->ipackm );
bli_trsm_thrinfo_free( thread->sub_trsm );
bli_free_intl( thread );
return;
}
void bli_trsm_thrinfo_free_paths( trsm_thrinfo_t** threads, dim_t num )
{
for( int i = 0; i < num; i++)
bli_trsm_thrinfo_free( threads[i] );
bli_free_intl( threads );
}
trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( bool_t right_sided )
{
dim_t jc_way = 1;
#ifdef BLIS_ENABLE_MULTITHREADING
dim_t jc_way = bli_env_read_nway( "BLIS_JC_NT" );
// dim_t kc_way = bli_env_read_nway( "BLIS_KC_NT" );
dim_t kc_way = 1;
dim_t ic_way = bli_env_read_nway( "BLIS_IC_NT" );
dim_t jr_way = bli_env_read_nway( "BLIS_JR_NT" );
dim_t ir_way = bli_env_read_nway( "BLIS_IR_NT" );
#else
dim_t jc_way = 1;
dim_t kc_way = 1;
dim_t ic_way = 1;
dim_t jr_way = 1;
dim_t ir_way = 1;
#ifdef BLIS_ENABLE_MULTITHREADING
dim_t jc_in = bli_read_nway_from_env( "BLIS_JC_NT" );
/*dim_t kc_in = bli_read_nway_from_env( "BLIS_KC_NT" );*/
dim_t ic_in = bli_read_nway_from_env( "BLIS_IC_NT" );
dim_t jr_in = bli_read_nway_from_env( "BLIS_JR_NT" );
dim_t ir_in = bli_read_nway_from_env( "BLIS_IR_NT" );
if(right_sided) {
ic_way = jc_in * ic_in * jr_in;
ir_way = ir_in;
}
else {
jc_way = jc_in;
jr_way = jr_in * ic_in * ir_in;
}
#endif
dim_t global_num_threads = jc_way * kc_way * ic_way * jr_way * ir_way;
assert( global_num_threads != 0 );
@@ -144,78 +64,77 @@ trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( bool_t right_sided )
dim_t ir_nt = 1;
trsm_thrinfo_t** paths = (trsm_thrinfo_t**) bli_malloc_intl( global_num_threads * sizeof( trsm_thrinfo_t* ) );
thrinfo_t** paths = bli_malloc_intl( global_num_threads * sizeof( thrinfo_t* ) );
thread_comm_t* global_comm = bli_create_communicator( global_num_threads );
thrcomm_t* global_comm = bli_thrcomm_create( global_num_threads );
for( int a = 0; a < jc_way; a++ )
{
thread_comm_t* jc_comm = bli_create_communicator( jc_nt );
{
thrcomm_t* jc_comm = bli_thrcomm_create( jc_nt );
for( int b = 0; b < kc_way; b++ )
{
thread_comm_t* kc_comm = bli_create_communicator( kc_nt );
{
thrcomm_t* kc_comm = bli_thrcomm_create( kc_nt );
for( int c = 0; c < ic_way; c++ )
{
thread_comm_t* ic_comm = bli_create_communicator( ic_nt );
{
thrcomm_t* ic_comm = bli_thrcomm_create( ic_nt );
for( int d = 0; d < jr_way; d++ )
{
thread_comm_t* jr_comm = bli_create_communicator( jr_nt );
for( int e = 0; e < ir_way; e++)
{
thread_comm_t* ir_comm = bli_create_communicator( ir_nt );
{
thrcomm_t* jr_comm = bli_thrcomm_create( jr_nt );
for( int e = 0; e < ir_way; e++ )
{
thrcomm_t* ir_comm = bli_thrcomm_create( ir_nt );
dim_t ir_comm_id = 0;
dim_t jr_comm_id = e*ir_nt + ir_comm_id;
dim_t ic_comm_id = d*jr_nt + jr_comm_id;
dim_t kc_comm_id = c*ic_nt + ic_comm_id;
dim_t jc_comm_id = b*kc_nt + kc_comm_id;
dim_t global_comm_id = a*jc_nt + jc_comm_id;
// Macrokernel loops
trsm_thrinfo_t* ir_info = bli_create_trsm_thrinfo_node( jr_comm, jr_comm_id,
// Macrokernel loops
thrinfo_t* ir_info = bli_l3_thrinfo_create_node( jr_comm, jr_comm_id,
ir_comm, ir_comm_id,
ir_way, e,
NULL, NULL, NULL);
trsm_thrinfo_t* jr_info = bli_create_trsm_thrinfo_node( ic_comm, ic_comm_id,
thrinfo_t* jr_info = bli_l3_thrinfo_create_node( ic_comm, ic_comm_id,
jr_comm, jr_comm_id,
jr_way, d,
NULL, NULL, ir_info);
//blk_var_1
packm_thrinfo_t* pack_ic_in = bli_create_packm_thread_info( ic_comm, ic_comm_id,
packm_thrinfo_t* pack_ic_in = bli_packm_thrinfo_create( ic_comm, ic_comm_id,
jr_comm, jr_comm_id,
ic_nt, ic_comm_id );
packm_thrinfo_t* pack_ic_out = bli_create_packm_thread_info( kc_comm, kc_comm_id,
packm_thrinfo_t* pack_ic_out = bli_packm_thrinfo_create( kc_comm, kc_comm_id,
ic_comm, ic_comm_id,
kc_nt, kc_comm_id );
trsm_thrinfo_t* ic_info = bli_create_trsm_thrinfo_node( kc_comm, kc_comm_id,
thrinfo_t* ic_info = bli_l3_thrinfo_create_node( kc_comm, kc_comm_id,
ic_comm, ic_comm_id,
ic_way, c,
pack_ic_out, pack_ic_in, jr_info);
//blk_var_3
packm_thrinfo_t* pack_kc_in = bli_create_packm_thread_info( kc_comm, kc_comm_id,
packm_thrinfo_t* pack_kc_in = bli_packm_thrinfo_create( kc_comm, kc_comm_id,
ic_comm, ic_comm_id,
kc_nt, kc_comm_id );
packm_thrinfo_t* pack_kc_out = bli_create_packm_thread_info( jc_comm, jc_comm_id,
packm_thrinfo_t* pack_kc_out = bli_packm_thrinfo_create( jc_comm, jc_comm_id,
jc_comm, jc_comm_id,
jc_nt, jc_comm_id );
trsm_thrinfo_t* kc_info = bli_create_trsm_thrinfo_node( jc_comm, jc_comm_id,
thrinfo_t* kc_info = bli_l3_thrinfo_create_node( jc_comm, jc_comm_id,
kc_comm, kc_comm_id,
kc_way, b,
pack_kc_out, pack_kc_in, ic_info);
//blk_var_2
packm_thrinfo_t* pack_jc_in = bli_create_packm_thread_info( jc_comm, jc_comm_id,
packm_thrinfo_t* pack_jc_in = bli_packm_thrinfo_create( jc_comm, jc_comm_id,
kc_comm, kc_comm_id,
jc_nt, jc_comm_id );
packm_thrinfo_t* pack_jc_out = bli_create_packm_thread_info( global_comm, global_comm_id,
packm_thrinfo_t* pack_jc_out = bli_packm_thrinfo_create( global_comm, global_comm_id,
jc_comm, jc_comm_id,
global_num_threads, global_comm_id );
trsm_thrinfo_t* jc_info = bli_create_trsm_thrinfo_node( global_comm, global_comm_id,
thrinfo_t* jc_info = bli_l3_thrinfo_create_node( global_comm, global_comm_id,
jc_comm, jc_comm_id,
jc_way, a,
pack_jc_out, pack_jc_in, kc_info);
@@ -228,3 +147,4 @@ trsm_thrinfo_t** bli_create_trsm_thrinfo_paths( bool_t right_sided )
}
return paths;
}
#endif

View File

@@ -0,0 +1,44 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Accessors for the sub-objects attached to a thrinfo_t node. The argument
// and the whole expansion are parenthesized so that any pointer-valued
// expression can be passed safely.
#define bli_thrinfo_sub_self( thread ) ( (thread)->sub_l3op )
#define bli_thrinfo_sub_opackm( thread ) ( (thread)->opackm )
#define bli_thrinfo_sub_ipackm( thread ) ( (thread)->ipackm )
// For use in herk micro-kernel: advance a micro-panel pointer by
// step * n_way elements. Every parameter is parenthesized so that an
// expression such as "a + b" passed as step is multiplied as a whole.
#define herk_get_next_a_micropanel( thread, a1, step ) ( (a1) + (step) * (thread)->n_way )
#define herk_get_next_b_micropanel( thread, b1, step ) ( (b1) + (step) * (thread)->n_way )
//thrinfo_t** bli_herk_thrinfo_create_paths( void );

View File

@@ -85,11 +85,11 @@ void bli_symm_front( side_t side,
bli_obj_swap( a_local, b_local );
}
gemm_thrinfo_t** infos = bli_create_gemm_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYMM, BLIS_LEFT );
dim_t n_threads = bli_thread_num_threads( infos[0] );
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
bli_l3_thread_decorator( n_threads,
(l3_int_t) bli_gemm_int,
alpha,
&a_local,
@@ -100,7 +100,7 @@ void bli_symm_front( side_t side,
(void*) cntl,
(void**) infos );
bli_gemm_thrinfo_free_paths( infos, n_threads );
bli_l3_thrinfo_free_paths( infos, n_threads );
}

View File

@@ -98,11 +98,11 @@ void bli_syr2k_front( obj_t* alpha,
cntl );
#else
// Invoke herk twice, using beta only the first time.
herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYR2K, BLIS_LEFT );
dim_t n_threads = bli_thread_num_threads( infos[0] );
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
bli_l3_thread_decorator( n_threads,
(l3_int_t) bli_herk_int,
alpha,
&a_local,
@@ -113,7 +113,7 @@ void bli_syr2k_front( obj_t* alpha,
(void*) cntl,
(void**) infos );
bli_level3_thread_decorator( n_threads,
bli_l3_thread_decorator( n_threads,
(l3_int_t) bli_herk_int,
alpha,
&b_local,
@@ -124,7 +124,7 @@ void bli_syr2k_front( obj_t* alpha,
(void*) cntl,
(void**) infos );
bli_herk_thrinfo_free_paths( infos, n_threads );
bli_l3_thrinfo_free_paths( infos, n_threads );
#endif
}

View File

@@ -78,11 +78,11 @@ void bli_syrk_front( obj_t* alpha,
bli_obj_induce_trans( c_local );
}
herk_thrinfo_t** infos = bli_create_herk_thrinfo_paths();
dim_t n_threads = thread_num_threads( infos[0] );
thrinfo_t** infos = bli_l3_thrinfo_create_paths( BLIS_SYRK, BLIS_LEFT );
dim_t n_threads = bli_thread_num_threads( infos[0] );
// Invoke the internal back-end.
bli_level3_thread_decorator( n_threads,
bli_l3_thread_decorator( n_threads,
(l3_int_t) bli_herk_int,
alpha,
&a_local,
@@ -93,7 +93,7 @@ void bli_syrk_front( obj_t* alpha,
(void*) cntl,
(void**) infos );
bli_herk_thrinfo_free_paths( infos, n_threads );
bli_l3_thrinfo_free_paths( infos, n_threads );
}

View File

@@ -39,7 +39,7 @@ void bli_trmm_blk_var1f( obj_t* a,
obj_t* c,
cntx_t* cntx,
gemm_t* cntl,
trmm_thrinfo_t* thread )
thrinfo_t* thread )
{
obj_t b_pack_s;
obj_t a1_pack_s, c1_pack_s;
@@ -55,32 +55,32 @@ void bli_trmm_blk_var1f( obj_t* a,
// Prune any zero region that exists along the partitioning dimension.
bli_trmm_prune_unref_mparts_m( a, b, c );
if( thread_am_ochief( thread ) ) {
if( bli_thread_am_ochief( thread ) ) {
// Initialize object for packing B.
bli_obj_init_pack( &b_pack_s );
bli_packm_init( b, &b_pack_s,
cntx, cntl_sub_packm_b( cntl ) );
cntx, bli_cntl_sub_packm_b( cntl ) );
// Scale C by beta (if instructed).
// Since scalm doesn't support multithreading yet, this must be done by the chief thread.
bli_scalm_int( &BLIS_ONE,
c,
cntx, cntl_sub_scalm( cntl ) );
cntx, bli_cntl_sub_scalm( cntl ) );
}
b_pack = thread_obroadcast( thread, &b_pack_s );
b_pack = bli_thread_obroadcast( thread, &b_pack_s );
// Initialize all pack objects that are passed into packm_init().
if( thread_am_ichief( thread ) ) {
if( bli_thread_am_ichief( thread ) ) {
bli_obj_init_pack( &a1_pack_s );
bli_obj_init_pack( &c1_pack_s );
}
a1_pack = thread_ibroadcast( thread, &a1_pack_s );
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
a1_pack = bli_thread_ibroadcast( thread, &a1_pack_s );
c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s );
// Pack B (if instructed).
bli_packm_int( b, b_pack,
cntx, cntl_sub_packm_b( cntl ),
trmm_thread_sub_opackm( thread ) );
cntx, bli_cntl_sub_packm_b( cntl ),
bli_thrinfo_sub_opackm( thread ) );
// Set the default length of and offset to the non-zero part of A.
//m_trans = bli_obj_length_after_trans( *a );
@@ -96,8 +96,8 @@ void bli_trmm_blk_var1f( obj_t* a,
// bli_obj_width_after_trans( *a );
dim_t my_start, my_end;
bli_get_range_weighted_t2b( thread, a,
bli_cntx_get_bmult( cntl_bszid( cntl ), cntx ),
bli_thread_get_range_weighted_t2b( thread, a,
bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ),
&my_start, &my_end );
// Partition along the m dimension.
@@ -105,7 +105,7 @@ void bli_trmm_blk_var1f( obj_t* a,
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( i, my_end, a,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A1 and C1.
bli_acquire_mpart_t2b( BLIS_SUBPART1,
@@ -114,23 +114,23 @@ void bli_trmm_blk_var1f( obj_t* a,
i, b_alg, c, &c1 );
// Initialize objects for packing A1 and C1.
if( thread_am_ichief( thread ) ) {
if( bli_thread_am_ichief( thread ) ) {
bli_packm_init( &a1, a1_pack,
cntx, cntl_sub_packm_a( cntl ) );
cntx, bli_cntl_sub_packm_a( cntl ) );
bli_packm_init( &c1, c1_pack,
cntx, cntl_sub_packm_c( cntl ) );
cntx, bli_cntl_sub_packm_c( cntl ) );
}
thread_ibarrier( thread );
bli_thread_ibarrier( thread );
// Pack A1 (if instructed).
bli_packm_int( &a1, a1_pack,
cntx, cntl_sub_packm_a( cntl ),
trmm_thread_sub_ipackm( thread ) );
cntx, bli_cntl_sub_packm_a( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
// Pack C1 (if instructed).
bli_packm_int( &c1, c1_pack,
cntx, cntl_sub_packm_c( cntl ),
trmm_thread_sub_ipackm( thread ) );
cntx, bli_cntl_sub_packm_c( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
// Perform trmm subproblem.
bli_trmm_int( &BLIS_ONE,
@@ -139,24 +139,24 @@ void bli_trmm_blk_var1f( obj_t* a,
&BLIS_ONE,
c1_pack,
cntx,
cntl_sub_gemm( cntl ),
trmm_thread_sub_trmm( thread ) );
thread_ibarrier( thread );
bli_cntl_sub_gemm( cntl ),
bli_thrinfo_sub_self( thread ) );
bli_thread_ibarrier( thread );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( c1_pack, &c1,
cntx, cntl_sub_unpackm_c( cntl ),
trmm_thread_sub_ipackm( thread ) );
cntx, bli_cntl_sub_unpackm_c( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
thread_obarrier( thread );
if( thread_am_ochief( thread ) )
bli_packm_release( b_pack, cntl_sub_packm_b( cntl ) );
if( thread_am_ichief( thread ) ){
bli_packm_release( a1_pack, cntl_sub_packm_a( cntl ) );
bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) );
bli_thread_obarrier( thread );
if( bli_thread_am_ochief( thread ) )
bli_packm_release( b_pack, bli_cntl_sub_packm_b( cntl ) );
if( bli_thread_am_ichief( thread ) ){
bli_packm_release( a1_pack, bli_cntl_sub_packm_a( cntl ) );
bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) );
}
}

View File

@@ -39,7 +39,7 @@ void bli_trmm_blk_var2b( obj_t* a,
obj_t* c,
cntx_t* cntx,
gemm_t* cntl,
trmm_thrinfo_t* thread )
thrinfo_t* thread )
{
obj_t a_pack_s;
obj_t b1_pack_s, c1_pack_s;
@@ -55,35 +55,35 @@ void bli_trmm_blk_var2b( obj_t* a,
// Prune any zero region that exists along the partitioning dimension.
bli_trmm_prune_unref_mparts_n( a, b, c );
if( thread_am_ochief( thread ) ) {
if( bli_thread_am_ochief( thread ) ) {
// Initialize object for packing A
bli_obj_init_pack( &a_pack_s );
bli_packm_init( a, &a_pack_s,
cntx, cntl_sub_packm_a( cntl ) );
cntx, bli_cntl_sub_packm_a( cntl ) );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntx, cntl_sub_scalm( cntl ) );
cntx, bli_cntl_sub_scalm( cntl ) );
}
a_pack = thread_obroadcast( thread, &a_pack_s );
a_pack = bli_thread_obroadcast( thread, &a_pack_s );
// Initialize pack objects for B and C that are passed into packm_init().
if( thread_am_ichief( thread ) ) {
if( bli_thread_am_ichief( thread ) ) {
bli_obj_init_pack( &b1_pack_s );
bli_obj_init_pack( &c1_pack_s );
}
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s );
c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s );
// Pack A (if instructed).
bli_packm_int( a, a_pack,
cntx, cntl_sub_packm_a( cntl ),
trmm_thread_sub_opackm( thread ) );
cntx, bli_cntl_sub_packm_a( cntl ),
bli_thrinfo_sub_opackm( thread ) );
dim_t my_start, my_end;
bli_get_range_weighted_r2l( thread, b,
bli_cntx_get_bmult( cntl_bszid( cntl ), cntx ),
bli_thread_get_range_weighted_r2l( thread, b,
bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ),
&my_start, &my_end );
// Partition along the n dimension.
@@ -91,7 +91,7 @@ void bli_trmm_blk_var2b( obj_t* a,
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_b( i, my_end, b,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for B1 and C1.
bli_acquire_mpart_r2l( BLIS_SUBPART1,
@@ -100,23 +100,23 @@ void bli_trmm_blk_var2b( obj_t* a,
i, b_alg, c, &c1 );
// Initialize objects for packing B1 and C1.
if( thread_am_ichief( thread ) ) {
if( bli_thread_am_ichief( thread ) ) {
bli_packm_init( &b1, b1_pack,
cntx, cntl_sub_packm_b( cntl ) );
cntx, bli_cntl_sub_packm_b( cntl ) );
bli_packm_init( &c1, c1_pack,
cntx, cntl_sub_packm_c( cntl ) );
cntx, bli_cntl_sub_packm_c( cntl ) );
}
thread_ibarrier( thread );
bli_thread_ibarrier( thread );
// Pack B1 (if instructed).
bli_packm_int( &b1, b1_pack,
cntx, cntl_sub_packm_b( cntl ),
trmm_thread_sub_ipackm( thread ) );
cntx, bli_cntl_sub_packm_b( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
// Pack C1 (if instructed).
bli_packm_int( &c1, c1_pack,
cntx, cntl_sub_packm_c( cntl ),
trmm_thread_sub_ipackm( thread ) );
cntx, bli_cntl_sub_packm_c( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
// Perform trmm subproblem.
bli_trmm_int( &BLIS_ONE,
@@ -125,24 +125,24 @@ void bli_trmm_blk_var2b( obj_t* a,
&BLIS_ONE,
c1_pack,
cntx,
cntl_sub_gemm( cntl ),
trmm_thread_sub_trmm( thread ) );
thread_ibarrier( thread );
bli_cntl_sub_gemm( cntl ),
bli_thrinfo_sub_self( thread ) );
bli_thread_ibarrier( thread );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( c1_pack, &c1,
cntx, cntl_sub_unpackm_c( cntl ),
trmm_thread_sub_ipackm( thread ) );
cntx, bli_cntl_sub_unpackm_c( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
thread_obarrier( thread );
if( thread_am_ochief( thread ) )
bli_packm_release( a_pack, cntl_sub_packm_a( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_release( b1_pack, cntl_sub_packm_b( cntl ) );
bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) );
bli_thread_obarrier( thread );
if( bli_thread_am_ochief( thread ) )
bli_packm_release( a_pack, bli_cntl_sub_packm_a( cntl ) );
if( bli_thread_am_ichief( thread ) ) {
bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) );
bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) );
}
}

View File

@@ -39,7 +39,7 @@ void bli_trmm_blk_var2f( obj_t* a,
obj_t* c,
cntx_t* cntx,
gemm_t* cntl,
trmm_thrinfo_t* thread )
thrinfo_t* thread )
{
obj_t a_pack_s;
obj_t b1_pack_s, c1_pack_s;
@@ -55,35 +55,35 @@ void bli_trmm_blk_var2f( obj_t* a,
// Prune any zero region that exists along the partitioning dimension.
bli_trmm_prune_unref_mparts_n( a, b, c );
if( thread_am_ochief( thread ) ) {
if( bli_thread_am_ochief( thread ) ) {
// Initialize object for packing A
bli_obj_init_pack( &a_pack_s );
bli_packm_init( a, &a_pack_s,
cntx, cntl_sub_packm_a( cntl ) );
cntx, bli_cntl_sub_packm_a( cntl ) );
// Scale C by beta (if instructed).
bli_scalm_int( &BLIS_ONE,
c,
cntx, cntl_sub_scalm( cntl ) );
cntx, bli_cntl_sub_scalm( cntl ) );
}
a_pack = thread_obroadcast( thread, &a_pack_s );
a_pack = bli_thread_obroadcast( thread, &a_pack_s );
// Initialize pack objects for B and C that are passed into packm_init().
if( thread_am_ichief( thread ) ) {
if( bli_thread_am_ichief( thread ) ) {
bli_obj_init_pack( &b1_pack_s );
bli_obj_init_pack( &c1_pack_s );
}
b1_pack = thread_ibroadcast( thread, &b1_pack_s );
c1_pack = thread_ibroadcast( thread, &c1_pack_s );
b1_pack = bli_thread_ibroadcast( thread, &b1_pack_s );
c1_pack = bli_thread_ibroadcast( thread, &c1_pack_s );
// Pack A (if instructed).
bli_packm_int( a, a_pack,
cntx, cntl_sub_packm_a( cntl ),
trmm_thread_sub_opackm( thread ) );
cntx, bli_cntl_sub_packm_a( cntl ),
bli_thrinfo_sub_opackm( thread ) );
dim_t my_start, my_end;
bli_get_range_weighted_l2r( thread, b,
bli_cntx_get_bmult( cntl_bszid( cntl ), cntx ),
bli_thread_get_range_weighted_l2r( thread, b,
bli_cntx_get_bmult( bli_cntl_bszid( cntl ), cntx ),
&my_start, &my_end );
// Partition along the n dimension.
@@ -91,7 +91,7 @@ void bli_trmm_blk_var2f( obj_t* a,
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( i, my_end, b,
cntl_bszid( cntl ), cntx );
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for B1 and C1.
bli_acquire_mpart_l2r( BLIS_SUBPART1,
@@ -100,23 +100,23 @@ void bli_trmm_blk_var2f( obj_t* a,
i, b_alg, c, &c1 );
// Initialize objects for packing B1 and C1.
if( thread_am_ichief( thread ) ) {
if( bli_thread_am_ichief( thread ) ) {
bli_packm_init( &b1, b1_pack,
cntx, cntl_sub_packm_b( cntl ) );
cntx, bli_cntl_sub_packm_b( cntl ) );
bli_packm_init( &c1, c1_pack,
cntx, cntl_sub_packm_c( cntl ) );
cntx, bli_cntl_sub_packm_c( cntl ) );
}
thread_ibarrier( thread );
bli_thread_ibarrier( thread );
// Pack B1 (if instructed).
bli_packm_int( &b1, b1_pack,
cntx, cntl_sub_packm_b( cntl ),
trmm_thread_sub_ipackm( thread ) );
cntx, bli_cntl_sub_packm_b( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
// Pack C1 (if instructed).
bli_packm_int( &c1, c1_pack,
cntx, cntl_sub_packm_c( cntl ),
trmm_thread_sub_ipackm( thread ) );
cntx, bli_cntl_sub_packm_c( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
// Perform trmm subproblem.
bli_trmm_int( &BLIS_ONE,
@@ -125,24 +125,24 @@ void bli_trmm_blk_var2f( obj_t* a,
&BLIS_ONE,
c1_pack,
cntx,
cntl_sub_gemm( cntl ),
trmm_thread_sub_trmm( thread ) );
thread_ibarrier( thread );
bli_cntl_sub_gemm( cntl ),
bli_thrinfo_sub_self( thread ) );
bli_thread_ibarrier( thread );
// Unpack C1 (if C1 was packed).
bli_unpackm_int( c1_pack, &c1,
cntx, cntl_sub_unpackm_c( cntl ),
trmm_thread_sub_ipackm( thread ) );
cntx, bli_cntl_sub_unpackm_c( cntl ),
bli_thrinfo_sub_ipackm( thread ) );
}
// If any packing buffers were acquired within packm, release them back
// to the memory manager.
thread_obarrier( thread );
if( thread_am_ochief( thread ) )
bli_packm_release( a_pack, cntl_sub_packm_a( cntl ) );
if( thread_am_ichief( thread ) ) {
bli_packm_release( b1_pack, cntl_sub_packm_b( cntl ) );
bli_packm_release( c1_pack, cntl_sub_packm_c( cntl ) );
bli_thread_obarrier( thread );
if( bli_thread_am_ochief( thread ) )
bli_packm_release( a_pack, bli_cntl_sub_packm_a( cntl ) );
if( bli_thread_am_ichief( thread ) ) {
bli_packm_release( b1_pack, bli_cntl_sub_packm_b( cntl ) );
bli_packm_release( c1_pack, bli_cntl_sub_packm_c( cntl ) );
}
}

Some files were not shown because too many files have changed in this diff Show More