mirror of
https://github.com/amd/blis.git
synced 2026-05-13 18:52:14 +00:00
Enable user-customized packm ukernel/variant. (#549)
Details:
- Added four new fields to obj_t: .pack_fn, .pack_params, .ker_fn, and
.ker_params. These fields store pointers to functions and data that
will allow the user to more flexibly create custom operations while
recycling BLIS's existing partitioning infrastructure.
- Updated the typed API of the packm variant and structure-aware kernels
to replace the diagonal offset with panel offsets, and changed the
strides of both C and P to inc/ldim semantics. Updated the object API
of the packm variant to include rntm_t*.
- Removed the packm variant function pointer from the packm cntl_t node
definition since it has been replaced by the .pack_fn pointer in the
obj_t.
- Updated bli_packm_int() to read the new packm variant function pointer
from the obj_t and call it instead of from the cntl_t node.
- Moved some of the logic of bli_l3_packm.c to a new file,
bli_packm_alloc.c.
- Rewrote bli_packm_blk_var1.c so that it uses byte (char*) pointers
instead of typed pointers, allowing a single function to be used
regardless of datatype. This obviated having a separate implementation
in bli_packm_blk_var1_md.c. Also relegated handling of scalars to a
new function, bli_packm_scalar().
- Employed a new standard whereby right-hand matrix operands ("B") are
always packed as column-stored row panels -- that is, identically to
that of left-hand matrix operands ("A"). This means that while we pack
matrix A normally, we actually pack B in a transposed state. This
allowed us to simplify a lot of code throughout the framework, and
also affected some of the logic in bli_l3_packa() and _packb().
- Simplified bli_packm_init.c in light of the new B^T convention
described above. bli_packm_init()--which is now called from within
bli_packm_blk_var1()--also now calls bli_packm_alloc() and returns
a bool that indicates whether packing should be performed (or
skipped).
- Consolidated bli_gemm_int() and bli_trsm_int() into a bli_l3_int(),
which, among other things, defaults the new .pack_fn field of the
obj_t to bli_packm_blk_var1() if the field is NULL.
- Defined a new function, bli_obj_reset_origin(), which permanently
refocuses the view of an object so that it "forgets" any offsets from
its original pointer. This function also sets the object's root field
to itself. Calls to bli_obj_reset_origin() for each matrix operand
appear in the _front() functions, after the obj_t's are aliased. This
resetting of the underlying matrices' origins is needed in preparation
for more advanced features from within custom packm kernels.
- Redefined bli_pba_rntm_set_pba() from a regular function to a static
inline function.
- Updated gemm_ukr, gemmtrsm_ukr, and trsm_ukr testsuite modules to use
libblis_test_pobj_create() to create local packed objects. Previously,
these packed objects were created by calling lower-level functions.
This commit is contained in:
@@ -1307,7 +1307,6 @@ bli_pba_init_pools
|
||||
bli_pba_pool_size
|
||||
bli_pba_query
|
||||
bli_pba_release
|
||||
bli_pba_rntm_set_pba
|
||||
bli_memsys_finalize
|
||||
bli_memsys_init
|
||||
bli_mkherm
|
||||
|
||||
@@ -50,21 +50,23 @@
|
||||
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
pack_t schema, \
|
||||
bool invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_len, \
|
||||
dim_t panel_dim_max, \
|
||||
dim_t panel_len_max, \
|
||||
dim_t panel_dim_off, \
|
||||
dim_t panel_len_off, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
ctype* restrict c, inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t ldp, \
|
||||
inc_t is_p, \
|
||||
cntx_t* cntx \
|
||||
cntx_t* cntx, \
|
||||
void* params \
|
||||
);
|
||||
|
||||
INSERT_GENTDEF( packm )
|
||||
|
||||
@@ -48,6 +48,7 @@ typedef void (*PASTECH(opname,_var_oft)) \
|
||||
obj_t* a, \
|
||||
obj_t* p, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
cntl_t* cntl, \
|
||||
thrinfo_t* thread \
|
||||
);
|
||||
|
||||
@@ -33,15 +33,15 @@
|
||||
|
||||
*/
|
||||
|
||||
#include "bli_packm_alloc.h"
|
||||
#include "bli_packm_cntl.h"
|
||||
#include "bli_packm_check.h"
|
||||
#include "bli_packm_init.h"
|
||||
#include "bli_packm_int.h"
|
||||
#include "bli_packm_scalar.h"
|
||||
|
||||
#include "bli_packm_part.h"
|
||||
|
||||
#include "bli_packm_var.h"
|
||||
|
||||
#include "bli_packm_struc_cxk.h"
|
||||
#include "bli_packm_struc_cxk_1er.h"
|
||||
|
||||
@@ -50,6 +50,8 @@
|
||||
|
||||
// Mixed datatype support.
|
||||
#ifdef BLIS_ENABLE_GEMM_MD
|
||||
#include "bli_packm_md.h"
|
||||
#include "bli_packm_struc_cxk_md.h"
|
||||
#endif
|
||||
|
||||
#include "bli_packm_blk_var1.h"
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -33,78 +33,67 @@
|
||||
|
||||
*/
|
||||
|
||||
//
|
||||
// Prototype object-based interfaces.
|
||||
//
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENPROT
|
||||
#define GENPROT( opname ) \
|
||||
\
|
||||
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
|
||||
( \
|
||||
obj_t* c, \
|
||||
obj_t* p, \
|
||||
cntx_t* cntx, \
|
||||
cntl_t* cntl, \
|
||||
thrinfo_t* t \
|
||||
);
|
||||
void* bli_packm_alloc
|
||||
(
|
||||
siz_t size_needed,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
// Query the pack buffer type from the control tree node.
|
||||
packbuf_t pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl );
|
||||
|
||||
GENPROT( packm_unb_var1 )
|
||||
GENPROT( packm_blk_var1 )
|
||||
// Query the address of the mem_t entry within the control tree node.
|
||||
mem_t* cntl_mem_p = bli_cntl_pack_mem( cntl );
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces with void pointer operands.
|
||||
//
|
||||
mem_t* local_mem_p;
|
||||
mem_t local_mem_s;
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
trans_t transc, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
cntx_t* cntx \
|
||||
);
|
||||
siz_t cntl_mem_size = 0;
|
||||
|
||||
INSERT_GENTPROT_BASIC0( packm_unb_var1 )
|
||||
if ( bli_mem_is_alloc( cntl_mem_p ) )
|
||||
cntl_mem_size = bli_mem_size( cntl_mem_p );
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
trans_t transc, \
|
||||
pack_t schema, \
|
||||
bool invdiag, \
|
||||
bool revifup, \
|
||||
bool reviflo, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t is_p, \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
void_fp packm_ker, \
|
||||
cntx_t* cntx, \
|
||||
thrinfo_t* thread \
|
||||
);
|
||||
if ( cntl_mem_size < size_needed )
|
||||
{
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
{
|
||||
// The chief thread releases the existing block associated with
|
||||
// the mem_t entry in the control tree, and then re-acquires a
|
||||
// new block, saving the associated mem_t entry to local_mem_s.
|
||||
if ( bli_mem_is_alloc( cntl_mem_p ) )
|
||||
{
|
||||
bli_pba_release
|
||||
(
|
||||
rntm,
|
||||
cntl_mem_p
|
||||
);
|
||||
}
|
||||
bli_pba_acquire_m
|
||||
(
|
||||
rntm,
|
||||
size_needed,
|
||||
pack_buf_type,
|
||||
&local_mem_s
|
||||
);
|
||||
}
|
||||
|
||||
INSERT_GENTPROT_BASIC0( packm_blk_var1 )
|
||||
// Broadcast the address of the chief thread's local mem_t entry to
|
||||
// all threads.
|
||||
local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
|
||||
|
||||
// Save the chief thread's local mem_t entry to the mem_t field in
|
||||
// this thread's control tree node.
|
||||
*cntl_mem_p = *local_mem_p;
|
||||
|
||||
// Barrier so that the master thread doesn't return from the function
|
||||
// before we are done reading.
|
||||
bli_thread_barrier( thread );
|
||||
}
|
||||
|
||||
return bli_mem_buffer( cntl_mem_p );
|
||||
}
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -33,13 +32,11 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_l3_packm
|
||||
(
|
||||
obj_t* x,
|
||||
obj_t* x_pack,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
BLIS_EXPORT_BLIS void* bli_packm_alloc
|
||||
(
|
||||
siz_t size_needed,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
@@ -35,35 +35,6 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T packm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
struc_t strucc,
|
||||
doff_t diagoffc,
|
||||
diag_t diagc,
|
||||
uplo_t uploc,
|
||||
trans_t transc,
|
||||
pack_t schema,
|
||||
bool invdiag,
|
||||
bool revifup,
|
||||
bool reviflo,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t m_max,
|
||||
dim_t n_max,
|
||||
void* kappa,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* p, inc_t rs_p, inc_t cs_p,
|
||||
inc_t is_p,
|
||||
dim_t pd_p, inc_t ps_p,
|
||||
void_fp packm_ker,
|
||||
cntx_t* cntx,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1);
|
||||
|
||||
|
||||
static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
|
||||
{
|
||||
@@ -79,614 +50,265 @@ static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
|
||||
NULL, bli_zpackm_struc_cxk_1er, } },
|
||||
};
|
||||
|
||||
static void_fp GENARRAY2_ALL(packm_struc_cxk_md,packm_struc_cxk_md);
|
||||
|
||||
void bli_packm_blk_var1
|
||||
(
|
||||
obj_t* c,
|
||||
obj_t* p,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* t
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
#ifdef BLIS_ENABLE_GEMM_MD
|
||||
// Call a different packm implementation when the storage and target
|
||||
// datatypes differ.
|
||||
if ( bli_obj_dt( c ) != bli_obj_target_dt( c ) )
|
||||
{
|
||||
bli_packm_blk_var1_md( c, p, cntx, cntl, t );
|
||||
// Extract various fields from the control tree.
|
||||
pack_t schema = bli_cntl_packm_params_pack_schema( cntl );
|
||||
bool invdiag = bli_cntl_packm_params_does_invert_diag( cntl );
|
||||
bool revifup = bli_cntl_packm_params_rev_iter_if_upper( cntl );
|
||||
bool reviflo = bli_cntl_packm_params_rev_iter_if_lower( cntl );
|
||||
|
||||
// Every thread initializes p and determines the size of memory
|
||||
// block needed (which gets embedded into the otherwise "blank" mem_t
|
||||
// entry in the control tree node). Return early if no packing is required.
|
||||
if ( !bli_packm_init( c, p, cntx, rntm, cntl, thread ) )
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
num_t dt_p = bli_obj_dt( p );
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_packm_int_check( c, p, cntx );
|
||||
|
||||
struc_t strucc = bli_obj_struc( c );
|
||||
doff_t diagoffc = bli_obj_diag_offset( c );
|
||||
diag_t diagc = bli_obj_diag( c );
|
||||
uplo_t uploc = bli_obj_uplo( c );
|
||||
trans_t transc = bli_obj_conjtrans_status( c );
|
||||
pack_t schema = bli_obj_pack_schema( p );
|
||||
bool invdiag = bli_obj_has_inverted_diag( p );
|
||||
bool revifup = bli_obj_is_pack_rev_if_upper( p );
|
||||
bool reviflo = bli_obj_is_pack_rev_if_lower( p );
|
||||
num_t dt_c = bli_obj_dt( c );
|
||||
dim_t dt_c_size = bli_dt_size( dt_c );
|
||||
|
||||
dim_t m_p = bli_obj_length( p );
|
||||
dim_t n_p = bli_obj_width( p );
|
||||
dim_t m_max_p = bli_obj_padded_length( p );
|
||||
dim_t n_max_p = bli_obj_padded_width( p );
|
||||
num_t dt_p = bli_obj_dt( p );
|
||||
dim_t dt_p_size = bli_dt_size( dt_p );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
struc_t strucc = bli_obj_struc( c );
|
||||
doff_t diagoffc = bli_obj_diag_offset( c );
|
||||
diag_t diagc = bli_obj_diag( c );
|
||||
uplo_t uploc = bli_obj_uplo( c );
|
||||
conj_t conjc = bli_obj_conj_status( c );
|
||||
|
||||
void* buf_p = bli_obj_buffer_at_off( p );
|
||||
inc_t rs_p = bli_obj_row_stride( p );
|
||||
inc_t cs_p = bli_obj_col_stride( p );
|
||||
inc_t is_p = bli_obj_imag_stride( p );
|
||||
dim_t pd_p = bli_obj_panel_dim( p );
|
||||
inc_t ps_p = bli_obj_panel_stride( p );
|
||||
dim_t iter_dim = bli_obj_length( p );
|
||||
dim_t panel_len_full = bli_obj_width( p );
|
||||
dim_t panel_len_max = bli_obj_padded_width( p );
|
||||
|
||||
obj_t kappa;
|
||||
void* buf_kappa;
|
||||
char* c_cast = bli_obj_buffer_at_off( c );
|
||||
inc_t incc = bli_obj_row_stride( c );
|
||||
inc_t ldc = bli_obj_col_stride( c );
|
||||
dim_t panel_dim_off = bli_obj_row_off( c );
|
||||
dim_t panel_len_off = bli_obj_col_off( c );
|
||||
|
||||
func_t* packm_kers;
|
||||
void_fp packm_ker;
|
||||
char* p_cast = bli_obj_buffer( p );
|
||||
inc_t ldp = bli_obj_col_stride( p );
|
||||
inc_t is_p = bli_obj_imag_stride( p );
|
||||
dim_t panel_dim_max = bli_obj_panel_dim( p );
|
||||
inc_t ps_p = bli_obj_panel_stride( p );
|
||||
|
||||
FUNCPTR_T f;
|
||||
doff_t diagoffc_inc = ( doff_t )panel_dim_max;
|
||||
|
||||
obj_t kappa_local;
|
||||
char* kappa_cast = bli_packm_scalar( &kappa_local, p );
|
||||
|
||||
// Treatment of kappa (ie: packing during scaling) depends on
|
||||
// whether we are executing an induced method.
|
||||
if ( bli_is_nat_packed( schema ) )
|
||||
// we use the default lookup table to determine the right func_t
|
||||
// for the current schema.
|
||||
func_t* packm_kers = &packm_struc_cxk_kers[ bli_pack_schema_index( schema ) ];
|
||||
|
||||
// Query the datatype-specific function pointer from the func_t object.
|
||||
packm_ker_vft packm_ker_cast = bli_func_get_dt( dt_p, packm_kers );
|
||||
|
||||
// For mixed-precision gemm, select the proper kernel (only dense panels).
|
||||
if ( dt_c != dt_p )
|
||||
{
|
||||
// This branch is for native execution, where we assume that
|
||||
// the micro-kernel will always apply the alpha scalar of the
|
||||
// higher-level operation. Thus, we use BLIS_ONE for kappa so
|
||||
// that the underlying packm implementation does not perform
|
||||
// any scaling during packing.
|
||||
buf_kappa = bli_obj_buffer_for_const( dt_p, &BLIS_ONE );
|
||||
packm_ker_cast = packm_struc_cxk_md[ dt_c ][ dt_p ];
|
||||
}
|
||||
else // if ( bli_is_ind_packed( schema ) )
|
||||
{
|
||||
obj_t* kappa_p;
|
||||
|
||||
// The value for kappa we use will depend on whether the scalar
|
||||
// attached to A has a nonzero imaginary component. If it does,
|
||||
// then we will apply the scalar during packing to facilitate
|
||||
// implementing induced complex domain algorithms in terms of
|
||||
// real domain micro-kernels. (In the aforementioned situation,
|
||||
// applying a real scalar is easy, but applying a complex one is
|
||||
// harder, so we avoid the need altogether with the code below.)
|
||||
if ( bli_obj_scalar_has_nonzero_imag( p ) )
|
||||
// Query the address of the packm params field of the obj_t. The user might
|
||||
// have set this field in order to specify a custom packm kernel.
|
||||
packm_blk_var1_params_t* params = bli_obj_pack_params( c );
|
||||
|
||||
if ( params && params->ukr_fn[ dt_c ][ dt_p ] )
|
||||
{
|
||||
// Query the user-provided packing kernel from the obj_t. If provided,
|
||||
// this overrides the kernel determined above.
|
||||
packm_ker_cast = params->ukr_fn[ dt_c ][ dt_p ];
|
||||
}
|
||||
|
||||
/* Compute the total number of iterations we'll need. */
|
||||
dim_t n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 );
|
||||
|
||||
/* Set the initial values and increments for indices related to C and P
|
||||
based on whether reverse iteration was requested. */
|
||||
dim_t ic0, ip0;
|
||||
doff_t ic_inc, ip_inc;
|
||||
|
||||
if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) ||
|
||||
( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) )
|
||||
{
|
||||
ic0 = (n_iter - 1) * panel_dim_max;
|
||||
ic_inc = -panel_dim_max;
|
||||
ip0 = n_iter - 1;
|
||||
ip_inc = -1;
|
||||
}
|
||||
else
|
||||
{
|
||||
ic0 = 0;
|
||||
ic_inc = panel_dim_max;
|
||||
ip0 = 0;
|
||||
ip_inc = 1;
|
||||
}
|
||||
|
||||
// Query the number of threads and thread ids from the current thread's
|
||||
// packm thrinfo_t node.
|
||||
const dim_t nt = bli_thread_n_way( thread );
|
||||
const dim_t tid = bli_thread_work_id( thread );
|
||||
|
||||
// Determine the thread range and increment using the current thread's
|
||||
// packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
|
||||
// will depend on whether slab or round-robin partitioning was requested
|
||||
// at configure-time.
|
||||
dim_t it_start, it_end, it_inc;
|
||||
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc );
|
||||
|
||||
char* p_begin = p_cast;
|
||||
|
||||
// Iterate over every logical micropanel in the source matrix.
|
||||
for ( dim_t ic = ic0, ip = ip0, it = 0; it < n_iter;
|
||||
ic += ic_inc, ip += ip_inc, it += 1 )
|
||||
{
|
||||
dim_t panel_dim_i = bli_min( panel_dim_max, iter_dim - ic );
|
||||
dim_t panel_dim_off_i = panel_dim_off + ic;
|
||||
|
||||
doff_t diagoffc_i = diagoffc + (ip )*diagoffc_inc;
|
||||
char* c_begin = c_cast + (ic )*incc*dt_c_size;
|
||||
|
||||
inc_t p_inc = ps_p;
|
||||
|
||||
// NOTE: We MUST use round-robin partitioning when packing
|
||||
// micropanels of a triangular matrix. Hermitian/symmetric
|
||||
// and general packing may use slab or round-robin, depending
|
||||
// on which was selected at configure-time.
|
||||
// The definition of bli_packm_my_iter() will depend on whether slab
|
||||
// or round-robin partitioning was requested at configure-time.
|
||||
bool my_iter = bli_is_triangular( strucc )
|
||||
? bli_packm_my_iter_rr( it, it_start, it_end, tid, nt )
|
||||
: bli_packm_my_iter ( it, it_start, it_end, tid, nt );
|
||||
|
||||
if ( bli_is_triangular( strucc ) &&
|
||||
bli_is_unstored_subpart_n( diagoffc_i, uploc, panel_dim_i, panel_len_full ) )
|
||||
{
|
||||
//printf( "applying non-zero imag kappa\n" );
|
||||
// This case executes if the panel belongs to a triangular
|
||||
// matrix AND is completely unstored (ie: zero). If the panel
|
||||
// is unstored, we do nothing. (Notice that we don't even
|
||||
// increment p_begin.)
|
||||
|
||||
// Detach the scalar.
|
||||
bli_obj_scalar_detach( p, &kappa );
|
||||
continue;
|
||||
}
|
||||
else if ( bli_is_triangular( strucc ) &&
|
||||
bli_intersects_diag_n( diagoffc_i, panel_dim_i, panel_len_full ) )
|
||||
{
|
||||
// This case executes if the panel belongs to a triangular
|
||||
// matrix AND is diagonal-intersecting. Notice that we
|
||||
// cannot bury the following conditional logic into
|
||||
// packm_struc_cxk() because we need to know the value of
|
||||
// panel_len_max_i so we can properly increment p_inc.
|
||||
|
||||
// Reset the attached scalar (to 1.0).
|
||||
bli_obj_scalar_reset( p );
|
||||
// Sanity check. Diagonals should not intersect the short end of
|
||||
// a micro-panel. If they do, then the constraint that cache
|
||||
// blocksizes be whole multiples of the register blocksizes
|
||||
// was somehow violated.
|
||||
if ( diagoffc_i < 0 )
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
|
||||
|
||||
kappa_p = &kappa;
|
||||
dim_t panel_off_i;
|
||||
dim_t panel_len_i;
|
||||
dim_t panel_len_max_i;
|
||||
|
||||
if ( bli_is_lower( uploc ) )
|
||||
{
|
||||
panel_off_i = 0;
|
||||
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i;
|
||||
panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max,
|
||||
panel_len_max );
|
||||
}
|
||||
else // if ( bli_is_upper( uploc ) )
|
||||
{
|
||||
panel_off_i = bli_abs( diagoffc_i );
|
||||
panel_len_i = panel_len_full - panel_off_i;
|
||||
panel_len_max_i = panel_len_max - panel_off_i;
|
||||
}
|
||||
|
||||
dim_t panel_len_off_i = panel_off_i + panel_len_off;
|
||||
|
||||
char* c_use = c_begin + (panel_off_i )*ldc*dt_c_size;
|
||||
char* p_use = p_begin;
|
||||
|
||||
// We need to re-compute the imaginary stride as a function of
|
||||
// panel_len_max_i since triangular packed matrices have panels
|
||||
// of varying lengths. NOTE: This imaginary stride value is
|
||||
// only referenced by the packm kernels for induced methods.
|
||||
inc_t is_p_use = ldp * panel_len_max_i;
|
||||
|
||||
// We nudge the imaginary stride up by one if it is odd.
|
||||
is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 );
|
||||
|
||||
if ( my_iter )
|
||||
{
|
||||
packm_ker_cast( strucc,
|
||||
diagc,
|
||||
uploc,
|
||||
conjc,
|
||||
schema,
|
||||
invdiag,
|
||||
panel_dim_i,
|
||||
panel_len_i,
|
||||
panel_dim_max,
|
||||
panel_len_max_i,
|
||||
panel_dim_off_i,
|
||||
panel_len_off_i,
|
||||
kappa_cast,
|
||||
c_use, incc, ldc,
|
||||
p_use, ldp,
|
||||
is_p_use,
|
||||
cntx,
|
||||
params );
|
||||
}
|
||||
|
||||
// NOTE: This value is usually LESS than ps_p because triangular
|
||||
// matrices usually have several micro-panels that are shorter
|
||||
// than a "full" micro-panel.
|
||||
p_inc = is_p_use;
|
||||
}
|
||||
else
|
||||
{
|
||||
// If the internal scalar of A has only a real component, then
|
||||
// we will apply it later (in the micro-kernel), and so we will
|
||||
// use BLIS_ONE to indicate no scaling during packing.
|
||||
kappa_p = &BLIS_ONE;
|
||||
// This case executes if the panel is either dense, or belongs
|
||||
// to a Hermitian or symmetric matrix, which includes stored,
|
||||
// unstored, and diagonal-intersecting panels.
|
||||
|
||||
if ( my_iter )
|
||||
{
|
||||
packm_ker_cast( bli_is_triangular( strucc ) ? BLIS_GENERAL : strucc,
|
||||
diagc,
|
||||
uploc,
|
||||
conjc,
|
||||
schema,
|
||||
invdiag,
|
||||
panel_dim_i,
|
||||
panel_len_full,
|
||||
panel_dim_max,
|
||||
panel_len_max,
|
||||
panel_dim_off_i,
|
||||
panel_len_off,
|
||||
kappa_cast,
|
||||
c_begin, incc, ldc,
|
||||
p_begin, ldp, is_p,
|
||||
cntx,
|
||||
params );
|
||||
}
|
||||
}
|
||||
|
||||
// Acquire the buffer to the kappa chosen above.
|
||||
buf_kappa = bli_obj_buffer_for_1x1( dt_p, kappa_p );
|
||||
p_begin += p_inc*dt_p_size;
|
||||
}
|
||||
|
||||
|
||||
// The original idea here was to read the packm_ukr from the context
|
||||
// if it is non-NULL. The problem is, it requires that we be able to
|
||||
// assume that the packm_ukr field is initialized to NULL, which it
|
||||
// currently is not.
|
||||
|
||||
//func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx );
|
||||
|
||||
//if ( bli_func_is_null_dt( dt_c, cntx_packm_kers ) )
|
||||
{
|
||||
// If the packm structure-aware kernel func_t in the context is
|
||||
// NULL (which is the default value after the context is created),
|
||||
// we use the default lookup table to determine the right func_t
|
||||
// for the current schema.
|
||||
const dim_t i = bli_pack_schema_index( schema );
|
||||
|
||||
packm_kers = &packm_struc_cxk_kers[ i ];
|
||||
}
|
||||
#if 0
|
||||
else // cntx's packm func_t overrides
|
||||
{
|
||||
// If the packm structure-aware kernel func_t in the context is
|
||||
// non-NULL (ie: assumed to be valid), we use that instead.
|
||||
//packm_kers = bli_cntx_packm_ukrs( cntx );
|
||||
packm_kers = cntx_packm_kers;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Query the datatype-specific function pointer from the func_t object.
|
||||
packm_ker = bli_func_get_dt( dt_p, packm_kers );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_p];
|
||||
|
||||
// Invoke the function.
|
||||
f( strucc,
|
||||
diagoffc,
|
||||
diagc,
|
||||
uploc,
|
||||
transc,
|
||||
schema,
|
||||
invdiag,
|
||||
revifup,
|
||||
reviflo,
|
||||
m_p,
|
||||
n_p,
|
||||
m_max_p,
|
||||
n_max_p,
|
||||
buf_kappa,
|
||||
buf_c, rs_c, cs_c,
|
||||
buf_p, rs_p, cs_p,
|
||||
is_p,
|
||||
pd_p, ps_p,
|
||||
packm_ker,
|
||||
cntx,
|
||||
t );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNCR
|
||||
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
trans_t transc, \
|
||||
pack_t schema, \
|
||||
bool invdiag, \
|
||||
bool revifup, \
|
||||
bool reviflo, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t is_p, \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
void_fp packm_ker, \
|
||||
cntx_t* cntx, \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
PASTECH2(ch,opname,_ker_ft) packm_ker_cast = packm_ker; \
|
||||
\
|
||||
ctype* restrict kappa_cast = kappa; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict p_cast = p; \
|
||||
ctype* restrict c_begin; \
|
||||
ctype* restrict p_begin; \
|
||||
\
|
||||
dim_t iter_dim; \
|
||||
dim_t n_iter; \
|
||||
dim_t it, ic, ip; \
|
||||
dim_t ic0, ip0; \
|
||||
doff_t ic_inc, ip_inc; \
|
||||
doff_t diagoffc_i; \
|
||||
doff_t diagoffc_inc; \
|
||||
dim_t panel_len_full; \
|
||||
dim_t panel_len_i; \
|
||||
dim_t panel_len_max; \
|
||||
dim_t panel_len_max_i; \
|
||||
dim_t panel_dim_i; \
|
||||
dim_t panel_dim_max; \
|
||||
dim_t panel_off_i; \
|
||||
inc_t vs_c; \
|
||||
inc_t ldc; \
|
||||
inc_t ldp, p_inc; \
|
||||
dim_t* m_panel_full; \
|
||||
dim_t* n_panel_full; \
|
||||
dim_t* m_panel_use; \
|
||||
dim_t* n_panel_use; \
|
||||
dim_t* m_panel_max; \
|
||||
dim_t* n_panel_max; \
|
||||
conj_t conjc; \
|
||||
bool row_stored; \
|
||||
bool col_stored; \
|
||||
inc_t is_p_use; \
|
||||
\
|
||||
ctype* restrict c_use; \
|
||||
ctype* restrict p_use; \
|
||||
doff_t diagoffp_i; \
|
||||
\
|
||||
\
|
||||
/* If C is zeros and part of a triangular matrix, then we don't need
|
||||
to pack it. */ \
|
||||
if ( bli_is_zeros( uploc ) && \
|
||||
bli_is_triangular( strucc ) ) return; \
|
||||
\
|
||||
/* Extract the conjugation bit from the transposition argument. */ \
|
||||
conjc = bli_extract_conj( transc ); \
|
||||
\
|
||||
/* If c needs a transposition, induce it so that we can more simply
|
||||
express the remaining parameters and code. */ \
|
||||
if ( bli_does_trans( transc ) ) \
|
||||
{ \
|
||||
bli_swap_incs( &rs_c, &cs_c ); \
|
||||
bli_negate_diag_offset( &diagoffc ); \
|
||||
bli_toggle_uplo( &uploc ); \
|
||||
bli_toggle_trans( &transc ); \
|
||||
} \
|
||||
\
|
||||
/* Create flags to indicate row or column storage. Note that the
|
||||
schema bit that encodes row or column is describing the form of
|
||||
micro-panel, not the storage in the micro-panel. Hence the
|
||||
mismatch in "row" and "column" semantics. */ \
|
||||
row_stored = bli_is_col_packed( schema ); \
|
||||
col_stored = bli_is_row_packed( schema ); \
|
||||
\
|
||||
/* If the row storage flag indicates row storage, then we are packing
|
||||
to column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( row_stored ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panels. */ \
|
||||
iter_dim = n; \
|
||||
panel_len_full = m; \
|
||||
panel_len_max = m_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
ldc = rs_c; \
|
||||
vs_c = cs_c; \
|
||||
diagoffc_inc = -( doff_t )panel_dim_max; \
|
||||
ldp = rs_p; \
|
||||
m_panel_full = &m; \
|
||||
n_panel_full = &panel_dim_i; \
|
||||
m_panel_use = &panel_len_i; \
|
||||
n_panel_use = &panel_dim_i; \
|
||||
m_panel_max = &panel_len_max_i; \
|
||||
n_panel_max = &panel_dim_max; \
|
||||
} \
|
||||
else /* if ( col_stored ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panels. */ \
|
||||
iter_dim = m; \
|
||||
panel_len_full = n; \
|
||||
panel_len_max = n_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
ldc = cs_c; \
|
||||
vs_c = rs_c; \
|
||||
diagoffc_inc = ( doff_t )panel_dim_max; \
|
||||
ldp = cs_p; \
|
||||
m_panel_full = &panel_dim_i; \
|
||||
n_panel_full = &n; \
|
||||
m_panel_use = &panel_dim_i; \
|
||||
n_panel_use = &panel_len_i; \
|
||||
m_panel_max = &panel_dim_max; \
|
||||
n_panel_max = &panel_len_max_i; \
|
||||
} \
|
||||
\
|
||||
/* Compute the total number of iterations we'll need. */ \
|
||||
n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
|
||||
\
|
||||
/* Set the initial values and increments for indices related to C and P
|
||||
based on whether reverse iteration was requested. */ \
|
||||
if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
|
||||
( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
|
||||
{ \
|
||||
ic0 = (n_iter - 1) * panel_dim_max; \
|
||||
ic_inc = -panel_dim_max; \
|
||||
ip0 = n_iter - 1; \
|
||||
ip_inc = -1; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
ic0 = 0; \
|
||||
ic_inc = panel_dim_max; \
|
||||
ip0 = 0; \
|
||||
ip_inc = 1; \
|
||||
} \
|
||||
\
|
||||
p_begin = p_cast; \
|
||||
\
|
||||
/* Query the number of threads and thread ids from the current thread's
|
||||
packm thrinfo_t node. */ \
|
||||
const dim_t nt = bli_thread_n_way( thread ); \
|
||||
const dim_t tid = bli_thread_work_id( thread ); \
|
||||
\
|
||||
dim_t it_start, it_end, it_inc; \
|
||||
\
|
||||
/* Determine the thread range and increment using the current thread's
|
||||
packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
|
||||
will depend on whether slab or round-robin partitioning was requested
|
||||
at configure-time. */ \
|
||||
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
|
||||
\
|
||||
/* Iterate over every logical micropanel in the source matrix. */ \
|
||||
for ( ic = ic0, ip = ip0, it = 0; it < n_iter; \
|
||||
ic += ic_inc, ip += ip_inc, it += 1 ) \
|
||||
{ \
|
||||
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
|
||||
\
|
||||
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
|
||||
c_begin = c_cast + (ic )*vs_c; \
|
||||
\
|
||||
if ( bli_is_triangular( strucc ) && \
|
||||
bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
|
||||
{ \
|
||||
/* This case executes if the panel belongs to a triangular
|
||||
matrix AND is completely unstored (ie: zero). If the panel
|
||||
is unstored, we do nothing. (Notice that we don't even
|
||||
increment p_begin.) */ \
|
||||
\
|
||||
continue; \
|
||||
} \
|
||||
else if ( bli_is_triangular( strucc ) && \
|
||||
bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
|
||||
{ \
|
||||
/* This case executes if the panel belongs to a triangular
|
||||
matrix AND is diagonal-intersecting. Notice that we
|
||||
cannot bury the following conditional logic into
|
||||
packm_struc_cxk() because we need to know the value of
|
||||
panel_len_max_i so we can properly increment p_inc. */ \
|
||||
\
|
||||
/* Sanity check. Diagonals should not intersect the short end of
|
||||
a micro-panel. If they do, then somehow the constraints on
|
||||
cache blocksizes being a whole multiple of the register
|
||||
blocksizes was somehow violated. */ \
|
||||
if ( ( col_stored && diagoffc_i < 0 ) || \
|
||||
( row_stored && diagoffc_i > 0 ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
if ( ( row_stored && bli_is_upper( uploc ) ) || \
|
||||
( col_stored && bli_is_lower( uploc ) ) ) \
|
||||
{ \
|
||||
panel_off_i = 0; \
|
||||
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
|
||||
panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \
|
||||
panel_len_max ); \
|
||||
diagoffp_i = diagoffc_i; \
|
||||
} \
|
||||
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
|
||||
( col_stored && bli_is_upper( uploc ) ) ) */ \
|
||||
{ \
|
||||
panel_off_i = bli_abs( diagoffc_i ); \
|
||||
panel_len_i = panel_len_full - panel_off_i; \
|
||||
panel_len_max_i = panel_len_max - panel_off_i; \
|
||||
diagoffp_i = 0; \
|
||||
} \
|
||||
\
|
||||
c_use = c_begin + (panel_off_i )*ldc; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
/* We need to re-compute the imaginary stride as a function of
|
||||
panel_len_max_i since triangular packed matrices have panels
|
||||
of varying lengths. NOTE: This imaginary stride value is
|
||||
only referenced by the packm kernels for induced methods. */ \
|
||||
is_p_use = ldp * panel_len_max_i; \
|
||||
\
|
||||
/* We nudge the imaginary stride up by one if it is odd. */ \
|
||||
is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \
|
||||
\
|
||||
/* NOTE: We MUST use round-robin partitioning when packing
|
||||
micropanels of a triangular matrix. Hermitian/symmetric
|
||||
and general packing may use slab or round-robin, depending
|
||||
on which was selected at configure-time. */ \
|
||||
if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \
|
||||
{ \
|
||||
packm_ker_cast( strucc, \
|
||||
diagoffp_i, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
schema, \
|
||||
invdiag, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p, \
|
||||
is_p_use, \
|
||||
cntx ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is usually LESS than ps_p because triangular
|
||||
matrices usually have several micro-panels that are shorter
|
||||
than a "full" micro-panel. */ \
|
||||
p_inc = is_p_use; \
|
||||
} \
|
||||
else if ( bli_is_herm_or_symm( strucc ) ) \
|
||||
{ \
|
||||
/* This case executes if the panel belongs to a Hermitian or
|
||||
symmetric matrix, which includes stored, unstored, and
|
||||
diagonal-intersecting panels. */ \
|
||||
\
|
||||
c_use = c_begin; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
is_p_use = is_p; \
|
||||
\
|
||||
/* The definition of bli_packm_my_iter() will depend on whether slab
|
||||
or round-robin partitioning was requested at configure-time. */ \
|
||||
if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
|
||||
{ \
|
||||
packm_ker_cast( strucc, \
|
||||
diagoffc_i, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
schema, \
|
||||
invdiag, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p, \
|
||||
is_p_use, \
|
||||
cntx ); \
|
||||
} \
|
||||
\
|
||||
p_inc = ps_p; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* This case executes if the panel is general, or, if the
|
||||
panel is part of a triangular matrix and is neither unstored
|
||||
(ie: zero) nor diagonal-intersecting. */ \
|
||||
\
|
||||
c_use = c_begin; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
is_p_use = is_p; \
|
||||
\
|
||||
/* The definition of bli_packm_my_iter() will depend on whether slab
|
||||
or round-robin partitioning was requested at configure-time. */ \
|
||||
if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
|
||||
{ \
|
||||
packm_ker_cast( BLIS_GENERAL, \
|
||||
0, \
|
||||
diagc, \
|
||||
BLIS_DENSE, \
|
||||
conjc, \
|
||||
schema, \
|
||||
invdiag, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p, \
|
||||
is_p_use, \
|
||||
cntx ); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ps_p; \
|
||||
} \
|
||||
\
|
||||
p_begin += p_inc; \
|
||||
\
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1 )
|
||||
|
||||
|
||||
|
||||
/*
|
||||
if ( row_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
|
||||
c_cast, rs_c, cs_c, "%4.1f", "" ); \
|
||||
if ( col_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
|
||||
c_cast, rs_c, cs_c, "%4.1f", "" ); \
|
||||
*/
|
||||
/*
|
||||
if ( row_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b packed", *m_panel_max, *n_panel_max, \
|
||||
p_use, rs_p, cs_p, "%5.2f", "" ); \
|
||||
else \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a packed", *m_panel_max, *n_panel_max, \
|
||||
p_use, rs_p, cs_p, "%5.2f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
/*
|
||||
if ( col_stored ) { \
|
||||
if ( bli_thread_work_id( thread ) == 0 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
fflush( stdout ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
|
||||
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_barrier( thread ); \
|
||||
if ( bli_thread_work_id( thread ) == 1 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
fflush( stdout ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
|
||||
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_barrier( thread ); \
|
||||
} \
|
||||
else { \
|
||||
if ( bli_thread_work_id( thread ) == 0 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
fflush( stdout ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
|
||||
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_barrier( thread ); \
|
||||
if ( bli_thread_work_id( thread ) == 1 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
fflush( stdout ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
|
||||
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_barrier( thread ); \
|
||||
} \
|
||||
*/
|
||||
/*
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
*/
|
||||
/*
|
||||
if ( row_stored ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
|
||||
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
inc_t is_b = rs_p * *m_panel_max; \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/
|
||||
/*
|
||||
if ( col_stored ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
|
||||
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/
|
||||
|
||||
59
frame/1m/packm/bli_packm_blk_var1.h
Normal file
59
frame/1m/packm/bli_packm_blk_var1.h
Normal file
@@ -0,0 +1,59 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
//
|
||||
// packm params types.
|
||||
//
|
||||
|
||||
// Parameter struct holding a table of packm kernel function pointers
// (packm_ker_vft), one entry per pair of floating-point datatypes.
// NOTE(review): presumably attached to an obj_t as the pack params read by
// bli_packm_blk_var1() -- confirm against the variant's implementation.
typedef struct
|
||||
{
|
||||
// Index order: first index is the type of C, second index is the type of P.
|
||||
packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES];
|
||||
} packm_blk_var1_params_t;
|
||||
|
||||
//
|
||||
// Prototype object-based interfaces.
|
||||
//
|
||||
|
||||
// Exported prototype of the object-based packm variant bli_packm_blk_var1().
// It takes two matrix objects (c and p), a context, a runtime object, a
// control tree node, and a thrinfo_t node, and returns nothing.
// NOTE(review): presumably c is the source matrix and p the packed
// destination -- confirm against the definition, which is not shown here.
BLIS_EXPORT_BLIS void bli_packm_blk_var1
|
||||
(
|
||||
obj_t* c,
|
||||
obj_t* p,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* t
|
||||
);
|
||||
|
||||
@@ -1,344 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#ifdef BLIS_ENABLE_GEMM_MD
|
||||
|
||||
// FUNCPTR_T is a file-local alias for the function pointer type packm_fp,
// which describes the signature of the typed packm_blk_var1_md
// implementations that bli_packm_blk_var1_md() dispatches to.
#define FUNCPTR_T packm_fp
|
||||
|
||||
// Signature of a typed mixed-datatype packm implementation: transposition
// and schema, the dimensions (m, n) and padded dimensions (m_max, n_max),
// the scalar kappa, the buffers and strides of c and p, the imaginary
// stride, panel dimension and panel stride of p, the context, and the
// thread info node.
typedef void (*FUNCPTR_T)(
|
||||
trans_t transc,
|
||||
pack_t schema,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t m_max,
|
||||
dim_t n_max,
|
||||
void* kappa,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* p, inc_t rs_p, inc_t cs_p,
|
||||
inc_t is_p,
|
||||
dim_t pd_p, inc_t ps_p,
|
||||
cntx_t* cntx,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
// Table of typed implementations; it is indexed as ftypes[dt_c][dt_p] by
// bli_packm_blk_var1_md().
// NOTE(review): presumably GENARRAY2_ALL expands to the declarator and
// initializer covering every datatype pair -- confirm against its definition.
static FUNCPTR_T GENARRAY2_ALL(ftypes,packm_blk_var1_md);
|
||||
|
||||
|
||||
void bli_packm_blk_var1_md
|
||||
(
|
||||
obj_t* c,
|
||||
obj_t* p,
|
||||
cntx_t* cntx,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* t
|
||||
)
|
||||
{
|
||||
num_t dt_c = bli_obj_dt( c );
|
||||
num_t dt_p = bli_obj_dt( p );
|
||||
|
||||
trans_t transc = bli_obj_conjtrans_status( c );
|
||||
pack_t schema = bli_obj_pack_schema( p );
|
||||
|
||||
dim_t m_p = bli_obj_length( p );
|
||||
dim_t n_p = bli_obj_width( p );
|
||||
dim_t m_max_p = bli_obj_padded_length( p );
|
||||
dim_t n_max_p = bli_obj_padded_width( p );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
void* buf_p = bli_obj_buffer_at_off( p );
|
||||
inc_t rs_p = bli_obj_row_stride( p );
|
||||
inc_t cs_p = bli_obj_col_stride( p );
|
||||
inc_t is_p = bli_obj_imag_stride( p );
|
||||
dim_t pd_p = bli_obj_panel_dim( p );
|
||||
inc_t ps_p = bli_obj_panel_stride( p );
|
||||
|
||||
obj_t kappa;
|
||||
void* buf_kappa;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
|
||||
// Treatment of kappa (ie: packing during scaling) depends on
|
||||
// whether we are executing an induced method.
|
||||
if ( bli_is_nat_packed( schema ) )
|
||||
{
|
||||
// This branch is for native execution, where we assume that
|
||||
// the micro-kernel will always apply the alpha scalar of the
|
||||
// higher-level operation. Thus, we use BLIS_ONE for kappa so
|
||||
// that the underlying packm implementation does not perform
|
||||
// any scaling during packing.
|
||||
buf_kappa = bli_obj_buffer_for_const( dt_p, &BLIS_ONE );
|
||||
}
|
||||
else // if ( bli_is_ind_packed( schema ) )
|
||||
{
|
||||
obj_t* kappa_p;
|
||||
|
||||
// The value for kappa we use will depend on whether the scalar
|
||||
// attached to A has a nonzero imaginary component. If it does,
|
||||
// then we will apply the scalar during packing to facilitate
|
||||
// implementing induced complex domain algorithms in terms of
|
||||
// real domain micro-kernels. (In the aforementioned situation,
|
||||
// applying a real scalar is easy, but applying a complex one is
|
||||
// harder, so we avoid the need altogether with the code below.)
|
||||
if ( bli_obj_scalar_has_nonzero_imag( p ) )
|
||||
{
|
||||
// Detach the scalar.
|
||||
bli_obj_scalar_detach( p, &kappa );
|
||||
|
||||
// Reset the attached scalar (to 1.0).
|
||||
bli_obj_scalar_reset( p );
|
||||
|
||||
kappa_p = κ
|
||||
}
|
||||
else
|
||||
{
|
||||
// If the internal scalar of A has only a real component, then
|
||||
// we will apply it later (in the micro-kernel), and so we will
|
||||
// use BLIS_ONE to indicate no scaling during packing.
|
||||
kappa_p = &BLIS_ONE;
|
||||
}
|
||||
|
||||
// Acquire the buffer to the kappa chosen above.
|
||||
buf_kappa = bli_obj_buffer_for_1x1( dt_p, kappa_p );
|
||||
}
|
||||
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_c][dt_p];
|
||||
|
||||
// Invoke the function.
|
||||
f(
|
||||
transc,
|
||||
schema,
|
||||
m_p,
|
||||
n_p,
|
||||
m_max_p,
|
||||
n_max_p,
|
||||
buf_kappa,
|
||||
buf_c, rs_c, cs_c,
|
||||
buf_p, rs_p, cs_p,
|
||||
is_p,
|
||||
pd_p, ps_p,
|
||||
cntx,
|
||||
t );
|
||||
}
|
||||
|
||||
|
||||
// Template for the typed, mixed-datatype packm_blk_var1_md implementations.
// For a given pair of types (ctype_c for the source c, ctype_p for the packed
// destination p) the generated function walks the matrix one micropanel at a
// time: n_iter panels of at most pd_p rows/columns each, advancing the source
// pointer by ic*vs_c and the destination pointer by ps_p per panel. Each
// panel for which bli_packm_my_iter() returns true for the calling thread is
// packed by the corresponding packm_struc_cxk_md function.
#undef GENTFUNC2
|
||||
#define GENTFUNC2( ctype_c, ctype_p, chc, chp, varname ) \
|
||||
\
|
||||
void PASTEMAC2(chc,chp,varname) \
|
||||
( \
|
||||
trans_t transc, \
|
||||
pack_t schema, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t is_p, \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
cntx_t* cntx, \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
ctype_p* restrict kappa_cast = kappa; \
|
||||
ctype_c* restrict c_cast = c; \
|
||||
ctype_p* restrict p_cast = p; \
|
||||
ctype_c* restrict c_begin; \
|
||||
ctype_p* restrict p_begin; \
|
||||
\
|
||||
dim_t iter_dim; \
|
||||
dim_t n_iter; \
|
||||
dim_t it, ic, ip; \
|
||||
doff_t ic_inc, ip_inc; \
|
||||
dim_t panel_len_full; \
|
||||
dim_t panel_len_i; \
|
||||
dim_t panel_len_max; \
|
||||
dim_t panel_len_max_i; \
|
||||
dim_t panel_dim_i; \
|
||||
dim_t panel_dim_max; \
|
||||
inc_t vs_c; \
|
||||
inc_t p_inc; \
|
||||
dim_t* m_panel_use; \
|
||||
dim_t* n_panel_use; \
|
||||
dim_t* m_panel_max; \
|
||||
dim_t* n_panel_max; \
|
||||
conj_t conjc; \
|
||||
bool row_stored; \
|
||||
bool col_stored; \
|
||||
\
|
||||
ctype_c* restrict c_use; \
|
||||
ctype_p* restrict p_use; \
|
||||
\
|
||||
\
|
||||
/* Extract the conjugation bit from the transposition argument. */ \
|
||||
conjc = bli_extract_conj( transc ); \
|
||||
\
|
||||
/* If c needs a transposition, induce it so that we can more simply
|
||||
express the remaining parameters and code. */ \
|
||||
if ( bli_does_trans( transc ) ) \
|
||||
{ \
|
||||
bli_swap_incs( &rs_c, &cs_c ); \
|
||||
bli_toggle_trans( &transc ); \
|
||||
} \
|
||||
\
|
||||
/* Create flags to indicate row or column storage. Note that the
|
||||
schema bit that encodes row or column is describing the form of
|
||||
micro-panel, not the storage in the micro-panel. Hence the
|
||||
mismatch in "row" and "column" semantics. */ \
|
||||
row_stored = bli_is_col_packed( schema ); \
|
||||
col_stored = bli_is_row_packed( schema ); \
|
||||
\
|
||||
( void )col_stored; \
|
||||
\
|
||||
/* If the row storage flag indicates row storage, then we are packing
|
||||
to column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( row_stored ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panels. */ \
|
||||
iter_dim = n; \
|
||||
panel_len_full = m; \
|
||||
panel_len_max = m_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
vs_c = cs_c; \
|
||||
m_panel_use = &panel_len_i; \
|
||||
n_panel_use = &panel_dim_i; \
|
||||
m_panel_max = &panel_len_max_i; \
|
||||
n_panel_max = &panel_dim_max; \
|
||||
} \
|
||||
else /* if ( col_stored ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panels. */ \
|
||||
iter_dim = m; \
|
||||
panel_len_full = n; \
|
||||
panel_len_max = n_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
vs_c = rs_c; \
|
||||
m_panel_use = &panel_dim_i; \
|
||||
n_panel_use = &panel_len_i; \
|
||||
m_panel_max = &panel_dim_max; \
|
||||
n_panel_max = &panel_len_max_i; \
|
||||
} \
|
||||
\
|
||||
/* Compute the total number of iterations we'll need. */ \
|
||||
n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
|
||||
\
|
||||
{ \
|
||||
ic_inc = panel_dim_max; \
|
||||
ip_inc = 1; \
|
||||
} \
|
||||
\
|
||||
p_begin = p_cast; \
|
||||
\
|
||||
/* Query the number of threads and thread ids from the current thread's
|
||||
packm thrinfo_t node. */ \
|
||||
const dim_t nt = bli_thread_n_way( thread ); \
|
||||
const dim_t tid = bli_thread_work_id( thread ); \
|
||||
\
|
||||
/* Suppress unused variable warnings when slab partitioning is enabled,
|
||||
since the slab-based definition of bli_packm_my_iter() does not
|
||||
actually use tid or nt. */ \
|
||||
( void )nt; ( void )tid; \
|
||||
\
|
||||
dim_t it_start, it_end, it_inc; \
|
||||
\
|
||||
/* Determine the thread range and increment using the current thread's
|
||||
packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
|
||||
will depend on whether slab or round-robin partitioning was requested
|
||||
at configure-time. */ \
|
||||
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
|
||||
\
|
||||
for ( ic = 0, ip = 0, it = 0; it < n_iter; \
|
||||
ic += ic_inc, ip += ip_inc, it += 1 ) \
|
||||
{ \
|
||||
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
|
||||
\
|
||||
c_begin = c_cast + (ic )*vs_c; \
|
||||
\
|
||||
{ \
|
||||
c_use = c_begin; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
|
||||
{ \
|
||||
PASTEMAC2(chc,chp,packm_struc_cxk_md) \
|
||||
( \
|
||||
conjc, \
|
||||
schema, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p, \
|
||||
is_p, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
p_inc = ps_p; \
|
||||
} \
|
||||
\
|
||||
/*
|
||||
if ( row_stored ) \
|
||||
PASTEMAC(chp,fprintm)( stdout, "packm_blk_var1_md: b packed", *m_panel_max, *n_panel_max, \
|
||||
p_use, rs_p, cs_p, "%5.2f", "" ); \
|
||||
else \
|
||||
PASTEMAC(chp,fprintm)( stdout, "packm_blk_var1_md: a packed", *m_panel_max, *n_panel_max, \
|
||||
p_use, rs_p, cs_p, "%5.2f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
p_begin += p_inc; \
|
||||
\
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC2_BASIC0( packm_blk_var1_md )
|
||||
INSERT_GENTFUNC2_MIXDP0( packm_blk_var1_md )
|
||||
|
||||
#endif
|
||||
@@ -1,67 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// Prototype of the object-based mixed-datatype packm variant. It takes two
// matrix objects (c and p), a context, a control tree node, and a thrinfo_t
// node, and returns nothing.
// NOTE(review): presumably c is the source matrix and p the packed
// destination -- confirm against the definition.
void bli_packm_blk_var1_md
|
||||
(
|
||||
obj_t* c,
|
||||
obj_t* p,
|
||||
cntx_t* cntx,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* t
|
||||
);
|
||||
|
||||
|
||||
// Prototype template for the typed packm_blk_var1_md functions: one
// prototype is generated per (chc, chp) type-character pair by the
// INSERT_GENTPROT2_* instantiations that follow the definition.
#undef GENTPROT2
|
||||
#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \
|
||||
\
|
||||
void PASTEMAC2(chc,chp,varname) \
|
||||
( \
|
||||
trans_t transc, \
|
||||
pack_t schema, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t is_p, \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
cntx_t* cntx, \
|
||||
thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT2_BASIC0( packm_blk_var1_md )
|
||||
INSERT_GENTPROT2_MIXDP0( packm_blk_var1_md )
|
||||
|
||||
@@ -35,11 +35,10 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
cntl_t* bli_packm_cntl_create_node
|
||||
BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node
|
||||
(
|
||||
rntm_t* rntm,
|
||||
void_fp var_func,
|
||||
void_fp packm_var_func,
|
||||
bszid_t bmid_m,
|
||||
bszid_t bmid_n,
|
||||
bool does_invert_diag,
|
||||
@@ -62,7 +61,6 @@ cntl_t* bli_packm_cntl_create_node
|
||||
|
||||
// Initialize the packm_params_t struct.
|
||||
params->size = sizeof( packm_params_t );
|
||||
params->var_func = packm_var_func;
|
||||
params->bmid_m = bmid_m;
|
||||
params->bmid_n = bmid_n;
|
||||
params->does_invert_diag = does_invert_diag;
|
||||
|
||||
@@ -36,7 +36,6 @@
|
||||
struct packm_params_s
|
||||
{
|
||||
uint64_t size; // size field must be present and come first.
|
||||
packm_var_oft var_func;
|
||||
bszid_t bmid_m;
|
||||
bszid_t bmid_n;
|
||||
bool does_invert_diag;
|
||||
@@ -47,11 +46,6 @@ struct packm_params_s
|
||||
};
|
||||
typedef struct packm_params_s packm_params_t;
|
||||
|
||||
BLIS_INLINE packm_var_oft bli_cntl_packm_params_var_func( cntl_t* cntl )
|
||||
{
|
||||
packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->var_func;
|
||||
}
|
||||
|
||||
BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl )
|
||||
{
|
||||
packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m;
|
||||
@@ -93,7 +87,6 @@ cntl_t* bli_packm_cntl_create_node
|
||||
(
|
||||
rntm_t* rntm,
|
||||
void_fp var_func,
|
||||
void_fp packm_var_func,
|
||||
bszid_t bmid_m,
|
||||
bszid_t bmid_n,
|
||||
bool does_invert_diag,
|
||||
|
||||
@@ -35,12 +35,14 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
siz_t bli_packm_init
|
||||
bool bli_packm_init
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* c,
|
||||
obj_t* p,
|
||||
cntx_t* cntx,
|
||||
cntl_t* cntl
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
bli_init_once();
|
||||
@@ -51,139 +53,27 @@ siz_t bli_packm_init
|
||||
// suitable block of memory from the memory allocator (if such a block
|
||||
// of memory has not already been allocated previously).
|
||||
|
||||
bszid_t bmult_id_m;
|
||||
bszid_t bmult_id_n;
|
||||
bool does_invert_diag;
|
||||
bool rev_iter_if_upper;
|
||||
bool rev_iter_if_lower;
|
||||
pack_t schema;
|
||||
//packbuf_t pack_buf_type;
|
||||
siz_t size_needed;
|
||||
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_packm_init_check( a, p, cntx );
|
||||
bli_packm_init_check( c, p, cntx );
|
||||
|
||||
// Extract various fields from the control tree.
|
||||
bmult_id_m = bli_cntl_packm_params_bmid_m( cntl );
|
||||
bmult_id_n = bli_cntl_packm_params_bmid_n( cntl );
|
||||
does_invert_diag = bli_cntl_packm_params_does_invert_diag( cntl );
|
||||
rev_iter_if_upper = bli_cntl_packm_params_rev_iter_if_upper( cntl );
|
||||
rev_iter_if_lower = bli_cntl_packm_params_rev_iter_if_lower( cntl );
|
||||
schema = bli_cntl_packm_params_pack_schema( cntl );
|
||||
//pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl );
|
||||
|
||||
#if 0
|
||||
// Let us now check to see if the object has already been packed. First
|
||||
// we check if it has been packed to an unspecified (row or column)
|
||||
// format, in which case we can alias the object and return.
|
||||
// NOTE: The reason we don't need to even look at the control tree in
|
||||
// this case is as follows: an object's pack status is only set to
|
||||
// BLIS_PACKED_UNSPEC for situations when the actual format used is
|
||||
// not important, as long as its packed into contiguous rows or
|
||||
// contiguous columns. A good example of this is packing for matrix
|
||||
// operands in the level-2 operations.
|
||||
if ( bli_obj_pack_schema( a ) == BLIS_PACKED_UNSPEC )
|
||||
{
|
||||
bli_obj_alias_to( a, p );
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Now we check if the object has already been packed to the desired
|
||||
// schema (as encoded in the control tree). If so, we can alias and
|
||||
// return 0.
|
||||
// NOTE: In most cases, an object's pack status will be BLIS_NOT_PACKED
|
||||
// and thus packing will be called for (but in some cases packing has
|
||||
// already taken place, or does not need to take place, and so that will
|
||||
// be indicated by the pack status). Also, not all combinations of
|
||||
// current pack status and desired pack schema are valid.
|
||||
if ( bli_obj_pack_schema( a ) == pack_schema )
|
||||
{
|
||||
bli_obj_alias_to( a, p );
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
// We begin by copying the fields of A.
|
||||
bli_obj_alias_to( c, p );
|
||||
|
||||
// If the object is marked as being filled with zeros, then we can skip
|
||||
// the packm operation entirely and alias.
|
||||
if ( bli_obj_is_zeros( a ) )
|
||||
{
|
||||
bli_obj_alias_to( a, p );
|
||||
return 0;
|
||||
}
|
||||
if ( bli_obj_is_zeros( c ) )
|
||||
return false;
|
||||
|
||||
// Prepare a few other variables based on properties of the control
|
||||
// tree.
|
||||
|
||||
invdiag_t invert_diag;
|
||||
packord_t pack_ord_if_up;
|
||||
packord_t pack_ord_if_lo;
|
||||
|
||||
if ( does_invert_diag ) invert_diag = BLIS_INVERT_DIAG;
|
||||
else invert_diag = BLIS_NO_INVERT_DIAG;
|
||||
|
||||
if ( rev_iter_if_upper ) pack_ord_if_up = BLIS_PACK_REV_IF_UPPER;
|
||||
else pack_ord_if_up = BLIS_PACK_FWD_IF_UPPER;
|
||||
|
||||
if ( rev_iter_if_lower ) pack_ord_if_lo = BLIS_PACK_REV_IF_LOWER;
|
||||
else pack_ord_if_lo = BLIS_PACK_FWD_IF_LOWER;
|
||||
|
||||
// Initialize object p for the final packed matrix.
|
||||
size_needed
|
||||
=
|
||||
bli_packm_init_pack
|
||||
(
|
||||
invert_diag,
|
||||
schema,
|
||||
pack_ord_if_up,
|
||||
pack_ord_if_lo,
|
||||
bmult_id_m,
|
||||
bmult_id_n,
|
||||
a,
|
||||
p,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Return the size needed for memory allocation of the packed buffer.
|
||||
return size_needed;
|
||||
}
|
||||
|
||||
|
||||
siz_t bli_packm_init_pack
|
||||
(
|
||||
invdiag_t invert_diag,
|
||||
pack_t schema,
|
||||
packord_t pack_ord_if_up,
|
||||
packord_t pack_ord_if_lo,
|
||||
bszid_t bmult_id_m,
|
||||
bszid_t bmult_id_n,
|
||||
obj_t* a,
|
||||
obj_t* p,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
bli_init_once();
|
||||
|
||||
num_t dt_tar = bli_obj_target_dt( a );
|
||||
num_t dt_scalar = bli_obj_scalar_dt( a );
|
||||
trans_t transa = bli_obj_onlytrans_status( a );
|
||||
dim_t m_a = bli_obj_length( a );
|
||||
dim_t n_a = bli_obj_width( a );
|
||||
dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx );
|
||||
dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx );
|
||||
dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx );
|
||||
dim_t bmult_n_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_n, cntx );
|
||||
|
||||
dim_t m_p, n_p;
|
||||
dim_t m_p_pad, n_p_pad;
|
||||
siz_t size_p;
|
||||
siz_t elem_size_p;
|
||||
inc_t rs_p, cs_p;
|
||||
inc_t is_p;
|
||||
|
||||
|
||||
// We begin by copying the fields of A.
|
||||
bli_obj_alias_to( a, p );
|
||||
// Extract various fields from the control tree.
|
||||
bszid_t bmult_id_m = bli_cntl_packm_params_bmid_m( cntl );
|
||||
bszid_t bmult_id_n = bli_cntl_packm_params_bmid_n( cntl );
|
||||
pack_t schema = bli_cntl_packm_params_pack_schema( cntl );
|
||||
num_t dt_tar = bli_obj_target_dt( c );
|
||||
num_t dt_scalar = bli_obj_scalar_dt( c );
|
||||
dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx );
|
||||
dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx );
|
||||
dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx );
|
||||
|
||||
// Typecast the internal scalar value to the target datatype.
|
||||
// Note that if the typecasting is needed, this must happen BEFORE we
|
||||
@@ -195,51 +85,21 @@ siz_t bli_packm_init_pack
|
||||
|
||||
// Update the storage datatype of P to be the target datatype of A.
|
||||
bli_obj_set_dt( dt_tar, p );
|
||||
bli_obj_set_elem_size( bli_dt_size( dt_tar ), p );
|
||||
|
||||
// Update the dimension fields to explicitly reflect a transposition,
|
||||
// if needed.
|
||||
// Then, clear the conjugation and transposition fields from the object
|
||||
// since matrix packing in BLIS is deemed to take care of all conjugation
|
||||
// and transposition necessary.
|
||||
// Then, we adjust the properties of P when A needs a transposition.
|
||||
// We negate the diagonal offset, and if A is upper- or lower-stored,
|
||||
// we toggle the uplo of P.
|
||||
// Finally, we mark P as dense since we assume that all matrices,
|
||||
// regardless of structure, will be densified.
|
||||
bli_obj_set_dims_with_trans( transa, m_a, n_a, p );
|
||||
bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, p );
|
||||
if ( bli_does_trans( transa ) )
|
||||
{
|
||||
bli_obj_negate_diag_offset( p );
|
||||
if ( bli_obj_is_upper_or_lower( a ) )
|
||||
bli_obj_toggle_uplo( p );
|
||||
}
|
||||
// Store the pack schema to the object.
|
||||
bli_obj_set_pack_schema( schema, p );
|
||||
|
||||
// If we are packing micropanels, mark P as dense. Otherwise, we are
|
||||
// probably being called in the context of a level-2 operation, in
|
||||
// which case we do not want to overwrite the uplo field of P (inherited
|
||||
// from A) with BLIS_DENSE because that information may be needed by
|
||||
// the level-2 operation's unblocked variant to decide whether to
|
||||
// execute a "lower" or "upper" branch of code.
|
||||
if ( bli_is_panel_packed( schema ) )
|
||||
{
|
||||
bli_obj_set_uplo( BLIS_DENSE, p );
|
||||
}
|
||||
// Clear the conjugation field from the object since matrix packing
|
||||
// in BLIS is deemed to take care of all conjugation necessary.
|
||||
bli_obj_set_conj( BLIS_NO_CONJUGATE, p );
|
||||
|
||||
// Since we are packing micropanels, mark P as dense.
|
||||
bli_obj_set_uplo( BLIS_DENSE, p );
|
||||
|
||||
// Reset the view offsets to (0,0).
|
||||
bli_obj_set_offs( 0, 0, p );
|
||||
|
||||
// Set the invert diagonal field.
|
||||
bli_obj_set_invert_diag( invert_diag, p );
|
||||
|
||||
// Set the pack status of P to the pack schema prescribed in the control
|
||||
// tree node.
|
||||
bli_obj_set_pack_schema( schema, p );
|
||||
|
||||
// Set the packing order bits.
|
||||
bli_obj_set_pack_order_if_upper( pack_ord_if_up, p );
|
||||
bli_obj_set_pack_order_if_lower( pack_ord_if_lo, p );
|
||||
|
||||
// Compute the dimensions padded by the dimension multiples. These
|
||||
// dimensions will be the dimensions of the packed matrices, including
|
||||
// zero-padding, and will be used by the macro- and micro-kernels.
|
||||
@@ -247,10 +107,10 @@ siz_t bli_packm_init_pack
|
||||
// in P) and aligning them to the dimension multiples (typically equal
|
||||
// to register blocksizes). This does waste a little bit of space for
|
||||
// level-2 operations, but that's okay with us.
|
||||
m_p = bli_obj_length( p );
|
||||
n_p = bli_obj_width( p );
|
||||
m_p_pad = bli_align_dim_to_mult( m_p, bmult_m_def );
|
||||
n_p_pad = bli_align_dim_to_mult( n_p, bmult_n_def );
|
||||
dim_t m_p = bli_obj_length( p );
|
||||
dim_t n_p = bli_obj_width( p );
|
||||
dim_t m_p_pad = bli_align_dim_to_mult( m_p, bmult_m_def );
|
||||
dim_t n_p_pad = bli_align_dim_to_mult( n_p, bmult_n_def );
|
||||
|
||||
// Save the padded dimensions into the packed object. It is important
|
||||
// to save these dimensions since they represent the actual dimensions
|
||||
@@ -258,177 +118,70 @@ siz_t bli_packm_init_pack
|
||||
bli_obj_set_padded_dims( m_p_pad, n_p_pad, p );
|
||||
|
||||
// Now we prepare to compute strides, align them, and compute the
|
||||
// total number of bytes needed for the packed buffer. The caller
|
||||
// will then use that value to acquire an appropriate block of memory
|
||||
// from the memory allocator.
|
||||
// total number of bytes needed for the packed buffer. Then we use
|
||||
// that value to acquire an appropriate block of memory from the
|
||||
// memory allocator.
|
||||
|
||||
// Extract the element size for the packed object.
|
||||
elem_size_p = bli_obj_elem_size( p );
|
||||
siz_t elem_size_p = bli_obj_elem_size( p );
|
||||
|
||||
// Set the row and column strides of p based on the pack schema.
|
||||
if ( bli_is_row_packed( schema ) &&
|
||||
!bli_is_panel_packed( schema ) )
|
||||
{
|
||||
// For regular row storage, the padded width of our matrix
|
||||
// should be used for the row stride, with the column stride set
|
||||
// to one. By using the WIDTH of the mem_t region, we allow for
|
||||
// zero-padding (if necessary/desired) along the right edge of
|
||||
// the matrix.
|
||||
rs_p = n_p_pad;
|
||||
cs_p = 1;
|
||||
// The panel dimension (for each datatype) should be equal to the
|
||||
// default (logical) blocksize multiple in the m dimension.
|
||||
dim_t m_panel = bmult_m_def;
|
||||
|
||||
// Align the leading dimension according to the heap stride
|
||||
// alignment size so that the second, third, etc rows begin at
|
||||
// aligned addresses.
|
||||
rs_p = bli_align_dim_to_size( rs_p, elem_size_p,
|
||||
BLIS_HEAP_STRIDE_ALIGN_SIZE );
|
||||
// The "column stride" of a row-micropanel packed object is interpreted
|
||||
// as the column stride WITHIN a micropanel. Thus, this is equal to the
|
||||
// packing (storage) blocksize multiple, which may be equal to the
|
||||
// default (logical) blocksize multiple.
|
||||
inc_t cs_p = bmult_m_pack;
|
||||
|
||||
// Store the strides in P.
|
||||
bli_obj_set_strides( rs_p, cs_p, p );
|
||||
// The "row stride" of a row-micropanel packed object is interpreted
|
||||
// as the row stride WITHIN a micropanel. Thus, it is unit.
|
||||
inc_t rs_p = 1;
|
||||
|
||||
// Compute the size of the packed buffer.
|
||||
size_p = m_p_pad * rs_p * elem_size_p;
|
||||
}
|
||||
else if ( bli_is_col_packed( schema ) &&
|
||||
!bli_is_panel_packed( schema ) )
|
||||
{
|
||||
// For regular column storage, the padded length of our matrix
|
||||
// should be used for the column stride, with the row stride set
|
||||
// to one. By using the LENGTH of the mem_t region, we allow for
|
||||
// zero-padding (if necessary/desired) along the bottom edge of
|
||||
// the matrix.
|
||||
cs_p = m_p_pad;
|
||||
rs_p = 1;
|
||||
// The "panel stride" of a micropanel packed object is interpreted as
|
||||
// the distance between the (0,0) element of panel k and the (0,0)
|
||||
// element of panel k+1. We use the padded width computed above to
|
||||
// allow for zero-padding (if necessary/desired) along the far end
|
||||
// of each micropanel (ie: the right edge of the matrix). Zero-padding
|
||||
// can also occur along the long edge of the last micropanel if the m
|
||||
// dimension of the matrix is not a whole multiple of MR.
|
||||
inc_t ps_p = cs_p * n_p_pad;
|
||||
|
||||
// Align the leading dimension according to the heap stride
|
||||
// alignment size so that the second, third, etc columns begin at
|
||||
// aligned addresses.
|
||||
cs_p = bli_align_dim_to_size( cs_p, elem_size_p,
|
||||
BLIS_HEAP_STRIDE_ALIGN_SIZE );
|
||||
// As a general rule, we don't want micropanel strides to be odd. There
|
||||
// are very few instances where this can happen, but we've seen it happen
|
||||
// more than zero times (such as for certain small problems), and so we
|
||||
// check for it here.
|
||||
if ( bli_is_odd( ps_p ) ) ps_p += 1;
|
||||
|
||||
// Store the strides in P.
|
||||
bli_obj_set_strides( rs_p, cs_p, p );
|
||||
// Set the imaginary stride (in units of fundamental elements).
|
||||
// This is the number of real elements that must be traversed before
|
||||
// reaching the imaginary part of the packed micropanel. NOTE: the
|
||||
// imaginary stride is mostly vestigial and left over from the 3m
|
||||
// and 4m implementations.
|
||||
inc_t is_p = 1;
|
||||
|
||||
// Compute the size of the packed buffer.
|
||||
size_p = cs_p * n_p_pad * elem_size_p;
|
||||
}
|
||||
else if ( bli_is_row_packed( schema ) &&
|
||||
bli_is_panel_packed( schema ) )
|
||||
{
|
||||
dim_t m_panel;
|
||||
dim_t ps_p;
|
||||
// Store the strides and panel dimension in P.
|
||||
bli_obj_set_strides( rs_p, cs_p, p );
|
||||
bli_obj_set_imag_stride( is_p, p );
|
||||
bli_obj_set_panel_dim( m_panel, p );
|
||||
bli_obj_set_panel_stride( ps_p, p );
|
||||
bli_obj_set_panel_length( m_panel, p );
|
||||
bli_obj_set_panel_width( n_p, p );
|
||||
|
||||
// The panel dimension (for each datatype) should be equal to the
|
||||
// default (logical) blocksize multiple in the m dimension.
|
||||
m_panel = bmult_m_def;
|
||||
// Compute the size of the packed buffer.
|
||||
siz_t size_p = ps_p * ( m_p_pad / m_panel ) * elem_size_p;
|
||||
|
||||
// The "column stride" of a row-micropanel packed object is interpreted
|
||||
// as the column stride WITHIN a micropanel. Thus, this is equal to the
|
||||
// packing (storage) blocksize multiple, which may be equal to the
|
||||
// default (logical) blocksize multiple.
|
||||
cs_p = bmult_m_pack;
|
||||
// If the requested size is zero, then we don't need to do any allocation.
|
||||
if ( size_p == 0 )
|
||||
return false;
|
||||
|
||||
// The "row stride" of a row-micropanel packed object is interpreted
|
||||
// as the row stride WITHIN a micropanel. Thus, it is unit.
|
||||
rs_p = 1;
|
||||
// Update the buffer address in p to point to the buffer associated
|
||||
// with the mem_t entry acquired from the memory broker (now cached in
|
||||
// the control tree node).
|
||||
void* buffer = bli_packm_alloc( size_p, rntm, cntl, thread );
|
||||
bli_obj_set_buffer( buffer, p );
|
||||
|
||||
// The "panel stride" of a micropanel packed object is interpreted as
|
||||
// the distance between the (0,0) element of panel k and the (0,0)
|
||||
// element of panel k+1. We use the padded width computed above to
|
||||
// allow for zero-padding (if necessary/desired) along the far end
|
||||
// of each micropanel (ie: the right edge of the matrix). Zero-padding
|
||||
// can also occur along the long edge of the last micropanel if the m
|
||||
// dimension of the matrix is not a whole multiple of MR.
|
||||
ps_p = cs_p * n_p_pad;
|
||||
|
||||
// As a general rule, we don't want micropanel strides to be odd.
|
||||
// NOTE: This safety feature *may* not be necessary anymore, but was
|
||||
// definitely needed to support certain variations of the 3m method.
|
||||
if ( bli_is_odd( ps_p ) ) ps_p += 1;
|
||||
|
||||
// Set the imaginary stride (in units of fundamental elements).
|
||||
// This is the number of real elements that must be traversed before
|
||||
// reaching the imaginary part of the packed micropanel. NOTE: the
|
||||
// imaginary stride is mostly vestigial and left over from the 3m
|
||||
// and 4m implementations.
|
||||
is_p = 1;
|
||||
|
||||
// Store the strides and panel dimension in P.
|
||||
bli_obj_set_strides( rs_p, cs_p, p );
|
||||
bli_obj_set_imag_stride( is_p, p );
|
||||
bli_obj_set_panel_dim( m_panel, p );
|
||||
bli_obj_set_panel_stride( ps_p, p );
|
||||
bli_obj_set_panel_length( m_panel, p );
|
||||
bli_obj_set_panel_width( n_p, p );
|
||||
|
||||
// Compute the size of the packed buffer.
|
||||
size_p = ps_p * ( m_p_pad / m_panel ) * elem_size_p;
|
||||
}
|
||||
else if ( bli_is_col_packed( schema ) &&
|
||||
bli_is_panel_packed( schema ) )
|
||||
{
|
||||
dim_t n_panel;
|
||||
dim_t ps_p;
|
||||
|
||||
// The panel dimension (for each datatype) should be equal to the
|
||||
// default (logical) blocksize multiple in the n dimension.
|
||||
n_panel = bmult_n_def;
|
||||
|
||||
// The "row stride" of a column-micropanel packed object is interpreted
|
||||
// as the row stride WITHIN a micropanel. Thus, this is equal to the
|
||||
// packing (storage) blocksize multiple (which may be equal to the
|
||||
// default (logical) blocksize multiple.
|
||||
rs_p = bmult_n_pack;
|
||||
|
||||
// The "column stride" of a column-micropanel packed object is
|
||||
// interpreted as the column stride WITHIN a micropanel. Thus, it is
|
||||
// unit.
|
||||
cs_p = 1;
|
||||
|
||||
// The "panel stride" of a micropanel packed object is interpreted as
|
||||
// the distance between the (0,0) element of panel k and the (0,0)
|
||||
// element of panel k+1. We use the padded length computed above to
|
||||
// allow for zero-padding (if necessary/desired) along the far end
|
||||
// of each micropanel (ie: the bottom edge of the matrix). Zero-padding
|
||||
// can also occur along the long edge of the last micropanel if the n
|
||||
// dimension of the matrix is not a whole multiple of NR.
|
||||
ps_p = m_p_pad * rs_p;
|
||||
|
||||
// As a general rule, we don't want micropanel strides to be odd.
|
||||
// NOTE: This safety feature *may* not be necessary anymore, but was
|
||||
// definitely needed to support certain variations of the 3m method.
|
||||
if ( bli_is_odd( ps_p ) ) ps_p += 1;
|
||||
|
||||
// Set the imaginary stride (in units of fundamental elements).
|
||||
// This is the number of real elements that must be traversed before
|
||||
// reaching the imaginary part of the packed micropanel. NOTE: the
|
||||
// imaginary stride is mostly vestigial and left over from the 3m
|
||||
// and 4m implementations.
|
||||
is_p = 1;
|
||||
|
||||
// Store the strides and panel dimension in P.
|
||||
bli_obj_set_strides( rs_p, cs_p, p );
|
||||
bli_obj_set_imag_stride( is_p, p );
|
||||
bli_obj_set_panel_dim( n_panel, p );
|
||||
bli_obj_set_panel_stride( ps_p, p );
|
||||
bli_obj_set_panel_length( m_p, p );
|
||||
bli_obj_set_panel_width( n_panel, p );
|
||||
|
||||
// Compute the size of the packed buffer.
|
||||
size_p = ps_p * ( n_p_pad / n_panel ) * elem_size_p;
|
||||
}
|
||||
else
|
||||
{
|
||||
// NOTE: When implementing block storage, we only need to implement
|
||||
// the following two cases:
|
||||
// - row-stored blocks in row-major order
|
||||
// - column-stored blocks in column-major order
|
||||
// The other two combinations coincide with that of packed row-panel
|
||||
// and packed column-panel storage.
|
||||
|
||||
size_p = 0;
|
||||
}
|
||||
|
||||
return size_p;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -32,24 +32,13 @@
|
||||
|
||||
*/
|
||||
|
||||
siz_t bli_packm_init
|
||||
BLIS_EXPORT_BLIS bool bli_packm_init
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* p,
|
||||
cntx_t* cntx,
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
BLIS_EXPORT_BLIS siz_t bli_packm_init_pack
|
||||
(
|
||||
invdiag_t invert_diag,
|
||||
pack_t schema,
|
||||
packord_t pack_ord_if_up,
|
||||
packord_t pack_ord_if_lo,
|
||||
bszid_t bmult_id_m,
|
||||
bszid_t bmult_id_n,
|
||||
obj_t* a,
|
||||
obj_t* p,
|
||||
cntx_t* cntx
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
|
||||
@@ -39,59 +39,19 @@ void bli_packm_int
|
||||
obj_t* a,
|
||||
obj_t* p,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
bli_init_once();
|
||||
|
||||
packm_var_oft f;
|
||||
// Extract the function pointer from the object.
|
||||
packm_var_oft f = bli_obj_pack_fn( a );
|
||||
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_packm_int_check( a, p, cntx );
|
||||
|
||||
// Sanity check; A should never have a zero dimension. If we must support
|
||||
// it, then we should fold it into the next alias-and-early-exit block.
|
||||
//if ( bli_obj_has_zero_dim( a ) ) bli_abort();
|
||||
|
||||
// Let us now check to see if the object has already been packed. First
|
||||
// we check if it has been packed to an unspecified (row or column)
|
||||
// format, in which case we can return, since by now aliasing has already
|
||||
// taken place in packm_init().
|
||||
// NOTE: The reason we don't need to even look at the control tree in
|
||||
// this case is as follows: an object's pack status is only set to
|
||||
// BLIS_PACKED_UNSPEC for situations when the actual format used is
|
||||
// not important, as long as it's packed into contiguous rows or
|
||||
// contiguous columns. A good example of this is packing for matrix
|
||||
// operands in the level-2 operations.
|
||||
if ( bli_obj_pack_schema( a ) == BLIS_PACKED_UNSPEC )
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// At this point, we can be assured that cntl is not NULL. Now we check
|
||||
// if the object has already been packed to the desired schema (as en-
|
||||
// coded in the control tree). If so, we can return, as above.
|
||||
// NOTE: In most cases, an object's pack status will be BLIS_NOT_PACKED
|
||||
// and thus packing will be called for (but in some cases packing has
|
||||
// already taken place, or does not need to take place, and so that will
|
||||
// be indicated by the pack status). Also, not all combinations of
|
||||
// current pack status and desired pack schema are valid.
|
||||
if ( bli_obj_pack_schema( a ) == bli_cntl_packm_params_pack_schema( cntl ) )
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// If the object is marked as being filled with zeros, then we can skip
|
||||
// the packm operation entirely.
|
||||
if ( bli_obj_is_zeros( a ) )
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// Extract the function pointer from the current control tree node.
|
||||
f = bli_cntl_packm_params_var_func( cntl );
|
||||
// Barrier so that we know threads are done with previous computation
|
||||
// with the same packing buffer before starting to pack.
|
||||
bli_thread_barrier( thread );
|
||||
|
||||
// Invoke the variant with kappa_use.
|
||||
f
|
||||
@@ -99,8 +59,12 @@ void bli_packm_int
|
||||
a,
|
||||
p,
|
||||
cntx,
|
||||
rntm,
|
||||
cntl,
|
||||
thread
|
||||
);
|
||||
|
||||
// Barrier so that packing is done before computation.
|
||||
bli_thread_barrier( thread );
|
||||
}
|
||||
|
||||
|
||||
@@ -37,6 +37,7 @@ void bli_packm_int
|
||||
obj_t* a,
|
||||
obj_t* p,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -34,83 +35,42 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
void bli_trsm_packa
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
void* bli_packm_scalar( obj_t* kappa, obj_t* p )
|
||||
{
|
||||
obj_t a_pack;
|
||||
num_t dt_p = bli_obj_dt( p );
|
||||
pack_t schema = bli_obj_pack_schema( p );
|
||||
|
||||
// Pack matrix A according to the control tree node.
|
||||
bli_l3_packm
|
||||
(
|
||||
a,
|
||||
&a_pack,
|
||||
cntx,
|
||||
rntm,
|
||||
cntl,
|
||||
thread
|
||||
);
|
||||
// The value for kappa we use will depend on whether the scalar
|
||||
// attached to A has a nonzero imaginary component. If it does,
|
||||
// then we will apply the scalar during packing to facilitate
|
||||
// implementing induced complex domain algorithms in terms of
|
||||
// real domain micro-kernels. (In the aforementioned situation,
|
||||
// applying a real scalar is easy, but applying a complex one is
|
||||
// harder, so we avoid the need altogether with the code below.)
|
||||
if ( bli_obj_scalar_has_nonzero_imag( p ) &&
|
||||
!bli_is_nat_packed( schema ) )
|
||||
{
|
||||
//printf( "applying non-zero imag kappa\n_p" );
|
||||
|
||||
// Proceed with execution using packed matrix A.
|
||||
bli_trsm_int
|
||||
(
|
||||
&BLIS_ONE,
|
||||
&a_pack,
|
||||
b,
|
||||
&BLIS_ONE,
|
||||
c,
|
||||
cntx,
|
||||
rntm,
|
||||
bli_cntl_sub_node( cntl ),
|
||||
bli_thrinfo_sub_node( thread )
|
||||
);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_trsm_packb
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
obj_t b_pack;
|
||||
|
||||
// Pack matrix B according to the control tree node.
|
||||
bli_l3_packm
|
||||
(
|
||||
b,
|
||||
&b_pack,
|
||||
cntx,
|
||||
rntm,
|
||||
cntl,
|
||||
thread
|
||||
);
|
||||
|
||||
// Proceed with execution using packed matrix B.
|
||||
bli_trsm_int
|
||||
(
|
||||
&BLIS_ONE,
|
||||
a,
|
||||
&b_pack,
|
||||
&BLIS_ONE,
|
||||
c,
|
||||
cntx,
|
||||
rntm,
|
||||
bli_cntl_sub_node( cntl ),
|
||||
bli_thrinfo_sub_node( thread )
|
||||
);
|
||||
// Detach the scalar.
|
||||
bli_obj_scalar_detach( p, kappa );
|
||||
|
||||
// Reset the attached scalar (to 1.0).
|
||||
bli_obj_scalar_reset( p );
|
||||
|
||||
return bli_obj_buffer_for_1x1( dt_p, kappa );
|
||||
}
|
||||
// This branch is also for native execution, where we assume that
|
||||
// the micro-kernel will always apply the alpha scalar of the
|
||||
// higher-level operation. Thus, we use BLIS_ONE for kappa so
|
||||
// that the underlying packm implementation does not perform
|
||||
// any scaling during packing.
|
||||
else
|
||||
{
|
||||
// If the internal scalar of A has only a real component, then
|
||||
// we will apply it later (in the micro-kernel), and so we will
|
||||
// use BLIS_ONE to indicate no scaling during packing.
|
||||
return bli_obj_buffer_for_1x1( dt_p, &BLIS_ONE );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -32,6 +32,5 @@
|
||||
|
||||
*/
|
||||
|
||||
#include "bli_packm_blk_var1_md.h"
|
||||
#include "bli_packm_struc_cxk_md.h"
|
||||
BLIS_EXPORT_BLIS void* bli_packm_scalar( obj_t* kappa, obj_t* p );
|
||||
|
||||
@@ -40,57 +40,24 @@
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
pack_t schema, \
|
||||
bool invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_len, \
|
||||
dim_t panel_dim_max, \
|
||||
dim_t panel_len_max, \
|
||||
dim_t panel_dim_off, \
|
||||
dim_t panel_len_off, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
ctype* restrict c, inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t ldp, \
|
||||
inc_t is_p, \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
dim_t panel_dim; \
|
||||
dim_t panel_dim_max; \
|
||||
dim_t panel_len; \
|
||||
dim_t panel_len_max; \
|
||||
inc_t incc, ldc; \
|
||||
inc_t ldp; \
|
||||
\
|
||||
\
|
||||
/* Determine the dimensions and relative strides of the micro-panel
|
||||
based on its pack schema. */ \
|
||||
if ( bli_is_col_packed( schema ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
panel_dim_max = n_panel_max; \
|
||||
panel_len = m_panel; \
|
||||
panel_len_max = m_panel_max; \
|
||||
incc = cs_c; \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
} \
|
||||
else /* if ( bli_is_row_packed( schema ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
panel_dim_max = m_panel_max; \
|
||||
panel_len = n_panel; \
|
||||
panel_len_max = n_panel_max; \
|
||||
incc = rs_c; \
|
||||
ldc = cs_c; \
|
||||
ldp = cs_p; \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* Handle micro-panel packing based on the structure of the matrix
|
||||
being packed. */ \
|
||||
if ( bli_is_general( strucc ) ) \
|
||||
@@ -118,23 +85,21 @@ void PASTEMAC(ch,varname) \
|
||||
PASTEMAC(ch,packm_herm_cxk) \
|
||||
( \
|
||||
strucc, \
|
||||
diagoffc, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
schema, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
m_panel_max, \
|
||||
n_panel_max, \
|
||||
invdiag, \
|
||||
panel_dim, \
|
||||
panel_dim_max, \
|
||||
panel_len, \
|
||||
panel_dim_max, \
|
||||
panel_len_max, \
|
||||
panel_dim_off, \
|
||||
panel_len_off, \
|
||||
kappa, \
|
||||
c, rs_c, cs_c, \
|
||||
incc, ldc, \
|
||||
p, rs_p, cs_p, \
|
||||
ldp, \
|
||||
c, incc, ldc, \
|
||||
p, ldp, \
|
||||
is_p, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
@@ -145,130 +110,24 @@ void PASTEMAC(ch,varname) \
|
||||
PASTEMAC(ch,packm_tri_cxk) \
|
||||
( \
|
||||
strucc, \
|
||||
diagoffc, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
schema, \
|
||||
invdiag, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
m_panel_max, \
|
||||
n_panel_max, \
|
||||
panel_dim, \
|
||||
panel_dim_max, \
|
||||
panel_len, \
|
||||
panel_dim_max, \
|
||||
panel_len_max, \
|
||||
panel_dim_off, \
|
||||
panel_len_off, \
|
||||
kappa, \
|
||||
c, rs_c, cs_c, \
|
||||
incc, ldc, \
|
||||
p, rs_p, cs_p, \
|
||||
ldp, \
|
||||
c, incc, ldc, \
|
||||
p, ldp, \
|
||||
is_p, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* If m_panel < m_panel_max, or n_panel < n_panel_max, we would normally
|
||||
fill the edge region (the bottom m_panel_max - m_panel rows or right-
|
||||
side n_panel_max - n_panel columns) of the micropanel with zeros.
|
||||
However, this responsibility has been moved to the packm microkernel.
|
||||
This change allows experts to use custom kernels that pack to custom
|
||||
packing formats when the problem size is not a nice multiple of the
|
||||
register blocksize. */ \
|
||||
\
|
||||
/*
|
||||
if ( m_panel != m_panel_max ) \
|
||||
{ \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
dim_t i = m_panel; \
|
||||
dim_t m_edge = m_panel_max - i; \
|
||||
dim_t n_edge = n_panel_max; \
|
||||
ctype* p_edge = p + (i )*rs_p; \
|
||||
\
|
||||
PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p, \
|
||||
cntx, \
|
||||
NULL \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
if ( n_panel != n_panel_max ) \
|
||||
{ \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_edge = m_panel_max; \
|
||||
dim_t n_edge = n_panel_max - j; \
|
||||
ctype* p_edge = p + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p, \
|
||||
cntx, \
|
||||
NULL \
|
||||
); \
|
||||
} \
|
||||
*/ \
|
||||
\
|
||||
\
|
||||
if ( bli_is_triangular( strucc ) ) \
|
||||
{ \
|
||||
/* If this panel is an edge case in both panel dimension and length,
|
||||
then it must be a bottom-right corner case. Set the part of the
|
||||
diagonal that extends into the zero-padded region to identity.
|
||||
NOTE: This is actually only necessary when packing for trsm, as
|
||||
it helps prevent NaNs and Infs from creeping into the computation.
|
||||
However, we set the region to identity for trmm as well. Those
|
||||
1.0's end up getting multiplied by the 0.0's in the zero-padded
|
||||
region of the other matrix, so there is no harm in this. */ \
|
||||
if ( m_panel != m_panel_max && \
|
||||
n_panel != n_panel_max ) \
|
||||
{ \
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
dim_t i = m_panel; \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_br = m_panel_max - i; \
|
||||
dim_t n_br = n_panel_max - j; \
|
||||
ctype* p_br = p + (i )*rs_p + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
0, \
|
||||
m_br, \
|
||||
n_br, \
|
||||
one, \
|
||||
p_br, rs_p, cs_p, \
|
||||
cntx, \
|
||||
NULL \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/*
|
||||
if ( bli_is_col_packed( schema ) ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_struc_cxk: bp copied", m_panel_max, n_panel_max, \
|
||||
p, rs_p, cs_p, "%4.1f", "" ); \
|
||||
else if ( bli_is_row_packed( schema ) ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_struc_cxk: ap copied", m_panel_max, n_panel_max, \
|
||||
p, rs_p, cs_p, "%4.1f", "" ); \
|
||||
*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( packm_struc_cxk, packm_cxk )
|
||||
@@ -282,42 +141,31 @@ INSERT_GENTFUNC_BASIC( packm_struc_cxk, packm_cxk )
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
pack_t schema, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
bool invdiag, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_dim_max, \
|
||||
dim_t panel_len, \
|
||||
dim_t panel_dim_max, \
|
||||
dim_t panel_len_max, \
|
||||
dim_t panel_dim_off, \
|
||||
dim_t panel_len_off, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t ldp, \
|
||||
ctype* restrict c, inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t ldp, \
|
||||
inc_t is_p, \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
doff_t diagoffc_abs; \
|
||||
dim_t i, j; \
|
||||
bool row_stored; \
|
||||
bool col_stored; \
|
||||
\
|
||||
\
|
||||
/* Create flags to indicate row or column storage. Note that the
|
||||
schema bit that encodes row or column is describing the form of
|
||||
micro-panel, not the storage in the micro-panel. Hence the
|
||||
mismatch in "row" and "column" semantics. */ \
|
||||
row_stored = bli_is_col_packed( schema ); \
|
||||
col_stored = bli_is_row_packed( schema ); \
|
||||
doff_t diagoffc = panel_dim_off - panel_len_off; \
|
||||
doff_t diagoffc_abs; \
|
||||
dim_t i, j; \
|
||||
\
|
||||
/* Handle the case where the micro-panel does NOT intersect the
|
||||
diagonal separately from the case where it does intersect. */ \
|
||||
if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \
|
||||
if ( !bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) \
|
||||
{ \
|
||||
/* If the current panel is unstored, we need to make a few
|
||||
adjustments so we refer to the data where it is actually
|
||||
@@ -325,10 +173,10 @@ void PASTEMAC(ch,varname) \
|
||||
implicitly assumes we are operating on a dense panel
|
||||
within a larger symmetric or Hermitian matrix, since a
|
||||
general matrix would not contain any unstored region.) */ \
|
||||
if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \
|
||||
if ( bli_is_unstored_subpart_n( diagoffc, uploc, panel_dim, panel_len ) ) \
|
||||
{ \
|
||||
c = c + diagoffc * ( doff_t )cs_c + \
|
||||
-diagoffc * ( doff_t )rs_c; \
|
||||
c = c + diagoffc * ( doff_t )ldc + \
|
||||
-diagoffc * ( doff_t )incc; \
|
||||
bli_swap_incs( &incc, &ldc ); \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
@@ -350,7 +198,7 @@ void PASTEMAC(ch,varname) \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \
|
||||
else /* if ( bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) */ \
|
||||
{ \
|
||||
ctype* restrict c10; \
|
||||
ctype* restrict p10; \
|
||||
@@ -370,14 +218,12 @@ void PASTEMAC(ch,varname) \
|
||||
a micro-panel. If they do, then somehow the constraints on
|
||||
cache blocksizes being a whole multiple of the register
|
||||
blocksizes was somehow violated. */ \
|
||||
if ( ( col_stored && diagoffc < 0 ) || \
|
||||
( row_stored && diagoffc > 0 ) ) \
|
||||
if ( diagoffc < 0 ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
diagoffc_abs = bli_abs( diagoffc ); \
|
||||
\
|
||||
if ( ( row_stored && bli_is_upper( uploc ) ) || \
|
||||
( col_stored && bli_is_lower( uploc ) ) ) \
|
||||
if ( bli_is_lower( uploc ) ) \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs; \
|
||||
@@ -393,8 +239,8 @@ void PASTEMAC(ch,varname) \
|
||||
diagoffc12 = diagoffc_abs - j; \
|
||||
p12 = p + (j )*ldp; \
|
||||
c12 = c + (j )*ldc; \
|
||||
c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
|
||||
-diagoffc12 * ( doff_t )rs_c; \
|
||||
c12 = c12 + diagoffc12 * ( doff_t )ldc + \
|
||||
-diagoffc12 * ( doff_t )incc; \
|
||||
incc12 = ldc; \
|
||||
ldc12 = incc; \
|
||||
conjc12 = conjc; \
|
||||
@@ -402,16 +248,15 @@ void PASTEMAC(ch,varname) \
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( &conjc12 ); \
|
||||
} \
|
||||
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
|
||||
( col_stored && bli_is_upper( uploc ) ) ) */ \
|
||||
else /* if ( bli_is_upper( uploc ) ) */ \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs + panel_dim; \
|
||||
diagoffc10 = diagoffc; \
|
||||
p10 = p; \
|
||||
c10 = c; \
|
||||
c10 = c10 + diagoffc10 * ( doff_t )cs_c + \
|
||||
-diagoffc10 * ( doff_t )rs_c; \
|
||||
c10 = c10 + diagoffc10 * ( doff_t )ldc + \
|
||||
-diagoffc10 * ( doff_t )incc; \
|
||||
incc10 = ldc; \
|
||||
ldc10 = incc; \
|
||||
conjc10 = conjc; \
|
||||
@@ -486,8 +331,8 @@ void PASTEMAC(ch,varname) \
|
||||
transc, \
|
||||
p11_m, \
|
||||
p11_n, \
|
||||
c11, rs_c, cs_c, \
|
||||
p11, rs_p, cs_p, \
|
||||
c11, incc, ldc, \
|
||||
p11, 1, ldp, \
|
||||
cntx, \
|
||||
NULL \
|
||||
); \
|
||||
@@ -503,7 +348,7 @@ void PASTEMAC(ch,varname) \
|
||||
{ \
|
||||
PASTEMAC(ch,seti0s)( *pi11 ); \
|
||||
\
|
||||
pi11 += rs_p + cs_p; \
|
||||
pi11 += 1 + ldp; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
@@ -519,7 +364,7 @@ void PASTEMAC(ch,varname) \
|
||||
p11_m, \
|
||||
p11_n, \
|
||||
kappa, \
|
||||
p11, rs_p, cs_p, \
|
||||
p11, 1, ldp, \
|
||||
cntx, \
|
||||
NULL \
|
||||
); \
|
||||
@@ -539,28 +384,26 @@ INSERT_GENTFUNC_BASIC( packm_herm_cxk, packm_cxk )
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffp, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
pack_t schema, \
|
||||
bool invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_dim_max, \
|
||||
dim_t panel_len, \
|
||||
dim_t panel_dim_max, \
|
||||
dim_t panel_len_max, \
|
||||
dim_t panel_dim_off, \
|
||||
dim_t panel_len_off, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t ldp, \
|
||||
ctype* restrict c, inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t ldp, \
|
||||
inc_t is_p, \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
doff_t diagoffc = panel_dim_off - panel_len_off; \
|
||||
\
|
||||
/* Pack the panel. */ \
|
||||
PASTEMAC(ch,kername) \
|
||||
( \
|
||||
@@ -584,11 +427,11 @@ void PASTEMAC(ch,varname) \
|
||||
PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
diagoffp, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
diagoffc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
p, rs_p, cs_p, \
|
||||
p, 1, ldp, \
|
||||
cntx, \
|
||||
NULL \
|
||||
); \
|
||||
@@ -599,10 +442,10 @@ void PASTEMAC(ch,varname) \
|
||||
{ \
|
||||
PASTEMAC2(ch,invertd,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
diagoffp, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
p, rs_p, cs_p, \
|
||||
diagoffc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
p, 1, ldp, \
|
||||
cntx, \
|
||||
NULL \
|
||||
); \
|
||||
@@ -621,23 +464,53 @@ void PASTEMAC(ch,varname) \
|
||||
uplo_t uplop = uploc; \
|
||||
\
|
||||
bli_toggle_uplo( &uplop ); \
|
||||
bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffp ); \
|
||||
bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffc ); \
|
||||
\
|
||||
PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
diagoffp, \
|
||||
diagoffc, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
zero, \
|
||||
p, rs_p, cs_p, \
|
||||
p, 1, ldp, \
|
||||
cntx, \
|
||||
NULL \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
/* If this panel is an edge case in both panel dimension and length,
|
||||
then it must be a bottom-right corner case. Set the part of the
|
||||
diagonal that extends into the zero-padded region to identity.
|
||||
NOTE: This is actually only necessary when packing for trsm, as
|
||||
it helps prevent NaNs and Infs from creeping into the computation.
|
||||
However, we set the region to identity for trmm as well. Those
|
||||
1.0's end up getting multiplied by the 0.0's in the zero-padded
|
||||
region of the other matrix, so there is no harm in this. */ \
|
||||
if ( panel_dim != panel_dim_max && \
|
||||
panel_len != panel_len_max ) \
|
||||
{ \
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
dim_t i = panel_dim; \
|
||||
dim_t j = panel_len; \
|
||||
dim_t m_br = panel_dim_max - i; \
|
||||
dim_t n_br = panel_len_max - j; \
|
||||
ctype* p_br = p + (i ) + (j )*ldp; \
|
||||
\
|
||||
PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
0, \
|
||||
m_br, \
|
||||
n_br, \
|
||||
one, \
|
||||
p_br, 1, ldp, \
|
||||
cntx, \
|
||||
NULL \
|
||||
); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( packm_tri_cxk, packm_cxk )
|
||||
|
||||
@@ -38,84 +38,25 @@
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffp, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
pack_t schema, \
|
||||
bool invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_len, \
|
||||
dim_t panel_dim_max, \
|
||||
dim_t panel_len_max, \
|
||||
dim_t panel_dim_off, \
|
||||
dim_t panel_len_off, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
ctype* restrict c, inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t ldp, \
|
||||
inc_t is_p, \
|
||||
cntx_t* cntx \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC0( packm_struc_cxk )
|
||||
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
pack_t schema, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_dim_max, \
|
||||
dim_t panel_len, \
|
||||
dim_t panel_len_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t ldp, \
|
||||
cntx_t* cntx \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC0( packm_herm_cxk )
|
||||
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
pack_t schema, \
|
||||
bool invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_dim_max, \
|
||||
dim_t panel_len, \
|
||||
dim_t panel_len_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t ldp, \
|
||||
cntx_t* cntx \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC0( packm_tri_cxk )
|
||||
|
||||
|
||||
@@ -40,57 +40,25 @@
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
pack_t schema, \
|
||||
bool invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_len, \
|
||||
dim_t panel_dim_max, \
|
||||
dim_t panel_len_max, \
|
||||
dim_t panel_dim_off, \
|
||||
dim_t panel_len_off, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
ctype* restrict c, inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t ldp, \
|
||||
inc_t is_p, \
|
||||
cntx_t* cntx \
|
||||
cntx_t* cntx, \
|
||||
void* params \
|
||||
) \
|
||||
{ \
|
||||
dim_t panel_dim; \
|
||||
dim_t panel_dim_max; \
|
||||
dim_t panel_len; \
|
||||
dim_t panel_len_max; \
|
||||
inc_t incc, ldc; \
|
||||
inc_t ldp; \
|
||||
\
|
||||
\
|
||||
/* Determine the dimensions and relative strides of the micro-panel
|
||||
based on its pack schema. */ \
|
||||
if ( bli_is_col_packed( schema ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
panel_dim_max = n_panel_max; \
|
||||
panel_len = m_panel; \
|
||||
panel_len_max = m_panel_max; \
|
||||
incc = cs_c; \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
} \
|
||||
else /* if ( bli_is_row_packed( schema ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
panel_dim_max = m_panel_max; \
|
||||
panel_len = n_panel; \
|
||||
panel_len_max = n_panel_max; \
|
||||
incc = rs_c; \
|
||||
ldc = cs_c; \
|
||||
ldp = cs_p; \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* Handle micro-panel packing based on the structure of the matrix
|
||||
being packed. */ \
|
||||
if ( bli_is_general( strucc ) ) \
|
||||
@@ -108,7 +76,7 @@ void PASTEMAC(ch,varname) \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, ldp, \
|
||||
cntx \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else if ( bli_is_herm_or_symm( strucc ) ) \
|
||||
@@ -118,24 +86,23 @@ void PASTEMAC(ch,varname) \
|
||||
PASTEMAC(ch,packm_herm_cxk_1er) \
|
||||
( \
|
||||
strucc, \
|
||||
diagoffc, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
schema, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
m_panel_max, \
|
||||
n_panel_max, \
|
||||
invdiag, \
|
||||
panel_dim, \
|
||||
panel_dim_max, \
|
||||
panel_len, \
|
||||
panel_dim_max, \
|
||||
panel_len_max, \
|
||||
panel_dim_off, \
|
||||
panel_len_off, \
|
||||
kappa, \
|
||||
c, rs_c, cs_c, \
|
||||
incc, ldc, \
|
||||
p, rs_p, cs_p, \
|
||||
ldp, \
|
||||
cntx \
|
||||
c, incc, ldc, \
|
||||
p, ldp, \
|
||||
is_p, \
|
||||
cntx, \
|
||||
params \
|
||||
); \
|
||||
} \
|
||||
else /* ( bli_is_triangular( strucc ) ) */ \
|
||||
@@ -145,125 +112,25 @@ void PASTEMAC(ch,varname) \
|
||||
PASTEMAC(ch,packm_tri_cxk_1er) \
|
||||
( \
|
||||
strucc, \
|
||||
diagoffc, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
schema, \
|
||||
invdiag, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
m_panel_max, \
|
||||
n_panel_max, \
|
||||
panel_dim, \
|
||||
panel_dim_max, \
|
||||
panel_len, \
|
||||
panel_dim_max, \
|
||||
panel_len_max, \
|
||||
panel_dim_off, \
|
||||
panel_len_off, \
|
||||
kappa, \
|
||||
c, rs_c, cs_c, \
|
||||
incc, ldc, \
|
||||
p, rs_p, cs_p, \
|
||||
ldp, \
|
||||
cntx \
|
||||
c, incc, ldc, \
|
||||
p, ldp, \
|
||||
is_p, \
|
||||
cntx, \
|
||||
params \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* If m_panel < m_panel_max, or n_panel < n_panel_max, we would normally
|
||||
fill the edge region (the bottom m_panel_max - m_panel rows or right-
|
||||
side n_panel_max - n_panel columns) of the micropanel with zeros.
|
||||
However, this responsibility has been moved to the packm microkernel.
|
||||
This change allows experts to use custom kernels that pack to custom
|
||||
packing formats when the problem size is not a nice multiple of the
|
||||
register blocksize. */ \
|
||||
/*
|
||||
if ( m_panel != m_panel_max ) \
|
||||
{ \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
dim_t offm = m_panel; \
|
||||
dim_t offn = 0; \
|
||||
dim_t m_edge = m_panel_max - m_panel; \
|
||||
dim_t n_edge = n_panel_max; \
|
||||
\
|
||||
PASTEMAC(ch,set1ms_mxn) \
|
||||
( \
|
||||
schema, \
|
||||
offm, \
|
||||
offn, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p, rs_p, cs_p, ldp \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
if ( n_panel != n_panel_max ) \
|
||||
{ \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
dim_t offm = 0; \
|
||||
dim_t offn = n_panel; \
|
||||
dim_t m_edge = m_panel_max; \
|
||||
dim_t n_edge = n_panel_max - n_panel; \
|
||||
\
|
||||
PASTEMAC(ch,set1ms_mxn) \
|
||||
( \
|
||||
schema, \
|
||||
offm, \
|
||||
offn, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p, rs_p, cs_p, ldp \
|
||||
); \
|
||||
} \
|
||||
*/ \
|
||||
\
|
||||
if ( bli_is_triangular( strucc ) ) \
|
||||
{ \
|
||||
/* If this micro-panel is an edge case in both panel dimension and
|
||||
length, then it must be a bottom-right corner case, which
|
||||
typically only happens for micro-panels being packed for trsm.
|
||||
(It also happens for trmm if kr > 1.) Here, we set the part of
|
||||
the diagonal that extends into the zero-padded region to
|
||||
identity. This prevents NaNs and Infs from creeping into the
|
||||
computation. If this code does execute for trmm, it is okay,
|
||||
because those 1.0's that extend into the bottom-right region
|
||||
end up getting multiplied by the 0.0's in the zero-padded region
|
||||
of the other matrix. */ \
|
||||
if ( m_panel != m_panel_max && \
|
||||
n_panel != n_panel_max ) \
|
||||
{ \
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
dim_t offm = m_panel; \
|
||||
dim_t offn = n_panel; \
|
||||
dim_t m_edge = m_panel_max - m_panel; \
|
||||
dim_t n_edge = n_panel_max - n_panel; \
|
||||
\
|
||||
PASTEMAC(ch,set1ms_mxn_diag) \
|
||||
( \
|
||||
schema, \
|
||||
offm, \
|
||||
offn, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
one, \
|
||||
p, rs_p, cs_p, ldp \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/*
|
||||
if ( bli_is_1r_packed( schema ) ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_1er (1r): bp", m_panel_max, 2*n_panel_max, \
|
||||
( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
\
|
||||
if ( bli_is_1e_packed( schema ) ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_1er (1e): ap", 2*m_panel_max, 2*n_panel_max, \
|
||||
( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_1er, packm_cxk_1er )
|
||||
@@ -277,42 +144,32 @@ INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_1er, packm_cxk_1er )
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
pack_t schema, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
bool invdiag, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_dim_max, \
|
||||
dim_t panel_len, \
|
||||
dim_t panel_dim_max, \
|
||||
dim_t panel_len_max, \
|
||||
dim_t panel_dim_off, \
|
||||
dim_t panel_len_off, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t ldp, \
|
||||
cntx_t* cntx \
|
||||
ctype* restrict c, inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t ldp, \
|
||||
inc_t is_p, \
|
||||
cntx_t* cntx, \
|
||||
void* params \
|
||||
) \
|
||||
{ \
|
||||
doff_t diagoffc_abs; \
|
||||
dim_t j; \
|
||||
bool row_stored; \
|
||||
bool col_stored; \
|
||||
\
|
||||
\
|
||||
/* Create flags to indicate row or column storage. Note that the
|
||||
schema bit that encodes row or column is describing the form of
|
||||
micro-panel, not the storage in the micro-panel. Hence the
|
||||
mismatch in "row" and "column" semantics. */ \
|
||||
row_stored = bli_is_col_packed( schema ); \
|
||||
col_stored = bli_is_row_packed( schema ); \
|
||||
doff_t diagoffc = panel_dim_off - panel_len_off; \
|
||||
doff_t diagoffc_abs; \
|
||||
dim_t j; \
|
||||
\
|
||||
/* Handle the case where the micro-panel does NOT intersect the
|
||||
diagonal separately from the case where it does intersect. */ \
|
||||
if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \
|
||||
if ( !bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) \
|
||||
{ \
|
||||
/* If the current panel is unstored, we need to make a few
|
||||
adjustments so we refer to the data where it is actually
|
||||
@@ -320,10 +177,10 @@ void PASTEMAC(ch,varname) \
|
||||
implicitly assumes we are operating on a dense panel
|
||||
within a larger symmetric or Hermitian matrix, since a
|
||||
general matrix would not contain any unstored region.) */ \
|
||||
if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \
|
||||
if ( bli_is_unstored_subpart_n( diagoffc, uploc, panel_dim, panel_len ) ) \
|
||||
{ \
|
||||
c = c + diagoffc * ( doff_t )cs_c + \
|
||||
-diagoffc * ( doff_t )rs_c; \
|
||||
c = c + diagoffc * ( doff_t )ldc + \
|
||||
-diagoffc * ( doff_t )incc; \
|
||||
bli_swap_incs( &incc, &ldc ); \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
@@ -345,7 +202,7 @@ void PASTEMAC(ch,varname) \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \
|
||||
else /* if ( bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) */ \
|
||||
{ \
|
||||
ctype* restrict c10; \
|
||||
ctype* restrict p10; \
|
||||
@@ -366,14 +223,12 @@ void PASTEMAC(ch,varname) \
|
||||
a micro-panel. If they do, then somehow the constraints on
|
||||
cache blocksizes being a whole multiple of the register
|
||||
blocksizes was somehow violated. */ \
|
||||
if ( ( col_stored && diagoffc < 0 ) || \
|
||||
( row_stored && diagoffc > 0 ) ) \
|
||||
if ( diagoffc < 0 ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
diagoffc_abs = bli_abs( diagoffc ); \
|
||||
\
|
||||
if ( ( row_stored && bli_is_upper( uploc ) ) || \
|
||||
( col_stored && bli_is_lower( uploc ) ) ) \
|
||||
if ( bli_is_lower( uploc ) ) \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs; \
|
||||
@@ -389,8 +244,8 @@ void PASTEMAC(ch,varname) \
|
||||
diagoffc12 = diagoffc_abs - j; \
|
||||
p12 = p + (j )*ldp; \
|
||||
c12 = c + (j )*ldc; \
|
||||
c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
|
||||
-diagoffc12 * ( doff_t )rs_c; \
|
||||
c12 = c12 + diagoffc12 * ( doff_t )ldc + \
|
||||
-diagoffc12 * ( doff_t )incc; \
|
||||
incc12 = ldc; \
|
||||
ldc12 = incc; \
|
||||
conjc12 = conjc; \
|
||||
@@ -398,16 +253,15 @@ void PASTEMAC(ch,varname) \
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( &conjc12 ); \
|
||||
} \
|
||||
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
|
||||
( col_stored && bli_is_upper( uploc ) ) ) */ \
|
||||
else /* if ( bli_is_upper( uploc ) ) */ \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs + panel_dim; \
|
||||
diagoffc10 = diagoffc; \
|
||||
p10 = p; \
|
||||
c10 = c; \
|
||||
c10 = c10 + diagoffc10 * ( doff_t )cs_c + \
|
||||
-diagoffc10 * ( doff_t )rs_c; \
|
||||
c10 = c10 + diagoffc10 * ( doff_t )ldc + \
|
||||
-diagoffc10 * ( doff_t )incc; \
|
||||
incc10 = ldc; \
|
||||
ldc10 = incc; \
|
||||
conjc10 = conjc; \
|
||||
@@ -478,8 +332,8 @@ void PASTEMAC(ch,varname) \
|
||||
conjc, \
|
||||
panel_dim, \
|
||||
kappa, \
|
||||
c11, rs_c, cs_c, \
|
||||
p11, rs_p, cs_p, ldp \
|
||||
c11, incc, ldc, \
|
||||
p11, 1, ldp, ldp \
|
||||
); \
|
||||
\
|
||||
/* If we are packing a micro-panel with Hermitian structure,
|
||||
@@ -495,8 +349,8 @@ void PASTEMAC(ch,varname) \
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
{ \
|
||||
ctype_r* restrict c11_r = ( ctype_r* )c11; \
|
||||
const dim_t rs_c2 = 2*rs_c; \
|
||||
const dim_t cs_c2 = 2*cs_c; \
|
||||
const dim_t incc2 = 2*incc; \
|
||||
const dim_t ldc2 = 2*ldc; \
|
||||
\
|
||||
PASTEMAC3(ch,chr,ch,scal21ms_mxn_diag) \
|
||||
( \
|
||||
@@ -504,8 +358,8 @@ void PASTEMAC(ch,varname) \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
kappa, \
|
||||
c11_r, rs_c2, cs_c2, \
|
||||
p11, rs_p, cs_p, ldp \
|
||||
c11_r, incc2, ldc2, \
|
||||
p11, 1, ldp, ldp \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
@@ -523,30 +377,28 @@ INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_1er, packm_cxk_1er )
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffp, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
pack_t schema, \
|
||||
bool invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_dim_max, \
|
||||
dim_t panel_len, \
|
||||
dim_t panel_dim_max, \
|
||||
dim_t panel_len_max, \
|
||||
dim_t panel_dim_off, \
|
||||
dim_t panel_len_off, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t ldp, \
|
||||
cntx_t* cntx \
|
||||
ctype* restrict c, inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t ldp, \
|
||||
inc_t is_p, \
|
||||
cntx_t* cntx, \
|
||||
void* params \
|
||||
) \
|
||||
{ \
|
||||
doff_t diagoffp_abs = bli_abs( diagoffp ); \
|
||||
ctype* p11 = p + (diagoffp_abs )*ldp; \
|
||||
doff_t diagoffc = panel_dim_off - panel_len_off; \
|
||||
doff_t diagoffc_abs = bli_abs( diagoffc ); \
|
||||
ctype* p11 = p + (diagoffc_abs )*ldp; \
|
||||
\
|
||||
\
|
||||
/* Pack the panel. */ \
|
||||
@@ -579,7 +431,7 @@ void PASTEMAC(ch,varname) \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
kappa, \
|
||||
p11, rs_p, cs_p, ldp \
|
||||
p11, 1, ldp, ldp \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
@@ -594,7 +446,7 @@ void PASTEMAC(ch,varname) \
|
||||
0, \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
p11, rs_p, cs_p, ldp \
|
||||
p11, 1, ldp, ldp \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
@@ -610,11 +462,11 @@ void PASTEMAC(ch,varname) \
|
||||
{ \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
uplo_t uplop = uploc; \
|
||||
doff_t diagoffp11_0 = 0; \
|
||||
doff_t diagoffc11_0 = 0; \
|
||||
dim_t p11_0_dim = panel_dim - 1; \
|
||||
\
|
||||
bli_toggle_uplo( &uplop ); \
|
||||
bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffp11_0 ); \
|
||||
bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffc11_0 ); \
|
||||
\
|
||||
/* Note that this macro works a little differently than the setm
|
||||
operation. Here, we pass in the dimensions of only p11, rather
|
||||
@@ -622,20 +474,51 @@ void PASTEMAC(ch,varname) \
|
||||
"shrunken" dimensions of p11, corresponding to the toggling
|
||||
and shrinking of the diagonal above. The macro will do the
|
||||
right thing, incrementing the pointer to p11 by the appropriate
|
||||
leading dimension (cs_p or rs_p), and setting only the lower
|
||||
leading dimension (ldp or rs_p), and setting only the lower
|
||||
or upper triangle to zero. */ \
|
||||
PASTEMAC(ch,set1ms_mxn_uplo) \
|
||||
( \
|
||||
schema, \
|
||||
diagoffp11_0, \
|
||||
diagoffc11_0, \
|
||||
uplop, \
|
||||
p11_0_dim, \
|
||||
p11_0_dim, \
|
||||
zero, \
|
||||
p11, rs_p, cs_p, ldp \
|
||||
p11, 1, ldp, ldp \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* If this micro-panel is an edge case in both panel dimension and
|
||||
length, then it must be a bottom-right corner case, which
|
||||
typically only happens for micro-panels being packed for trsm.
|
||||
(It also happens for trmm if kr > 1.) Here, we set the part of
|
||||
the diagonal that extends into the zero-padded region to
|
||||
identity. This prevents NaNs and Infs from creeping into the
|
||||
computation. If this code does execute for trmm, it is okay,
|
||||
because those 1.0's that extend into the bottom-right region
|
||||
end up getting multiplied by the 0.0's in the zero-padded region
|
||||
of the other matrix. */ \
|
||||
if ( panel_dim != panel_dim_max && \
|
||||
panel_len != panel_len_max ) \
|
||||
{ \
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
dim_t offm = panel_dim; \
|
||||
dim_t offn = panel_len; \
|
||||
dim_t m_edge = panel_dim_max - panel_dim; \
|
||||
dim_t n_edge = panel_len_max - panel_len; \
|
||||
\
|
||||
PASTEMAC(ch,set1ms_mxn_diag) \
|
||||
( \
|
||||
schema, \
|
||||
offm, \
|
||||
offn, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
one, \
|
||||
p, 1, ldp, ldp \
|
||||
); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_1er, packm_cxk_1er )
|
||||
|
||||
@@ -38,84 +38,26 @@
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffp, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
pack_t schema, \
|
||||
bool invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_len, \
|
||||
dim_t panel_dim_max, \
|
||||
dim_t panel_len_max, \
|
||||
dim_t panel_dim_off, \
|
||||
dim_t panel_len_off, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
ctype* restrict c, inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t ldp, \
|
||||
inc_t is_p, \
|
||||
cntx_t* cntx \
|
||||
cntx_t* cntx, \
|
||||
void* params \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er )
|
||||
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
pack_t schema, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_dim_max, \
|
||||
dim_t panel_len, \
|
||||
dim_t panel_len_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t ldp, \
|
||||
cntx_t* cntx \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er )
|
||||
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
pack_t schema, \
|
||||
bool invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_dim_max, \
|
||||
dim_t panel_len, \
|
||||
dim_t panel_len_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
inc_t incc, inc_t ldc, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
inc_t ldp, \
|
||||
cntx_t* cntx \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_1er )
|
||||
|
||||
|
||||
@@ -41,53 +41,26 @@
|
||||
\
|
||||
void PASTEMAC2(chc,chp,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
pack_t schema, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
bool invdiag, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_len, \
|
||||
dim_t panel_dim_max, \
|
||||
dim_t panel_len_max, \
|
||||
dim_t panel_dim_off, \
|
||||
dim_t panel_len_off, \
|
||||
ctype_p* restrict kappa, \
|
||||
ctype_c* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype_p* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
ctype_c* restrict c, inc_t incc, inc_t ldc, \
|
||||
ctype_p* restrict p, inc_t ldp, \
|
||||
inc_t is_p, \
|
||||
cntx_t* cntx \
|
||||
cntx_t* cntx, \
|
||||
void* params \
|
||||
) \
|
||||
{ \
|
||||
dim_t panel_dim; \
|
||||
dim_t panel_dim_max; \
|
||||
dim_t panel_len; \
|
||||
dim_t panel_len_max; \
|
||||
inc_t incc, ldc; \
|
||||
inc_t ldp; \
|
||||
\
|
||||
\
|
||||
/* Determine the dimensions and relative strides of the micro-panel
|
||||
based on its pack schema. */ \
|
||||
if ( bli_is_col_packed( schema ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
panel_dim_max = n_panel_max; \
|
||||
panel_len = m_panel; \
|
||||
panel_len_max = m_panel_max; \
|
||||
incc = cs_c; \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
} \
|
||||
else /* if ( bli_is_row_packed( schema ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
panel_dim_max = m_panel_max; \
|
||||
panel_len = n_panel; \
|
||||
panel_len_max = n_panel_max; \
|
||||
incc = rs_c; \
|
||||
ldc = cs_c; \
|
||||
ldp = cs_p; \
|
||||
} \
|
||||
\
|
||||
\
|
||||
if ( bli_is_nat_packed( schema ) ) \
|
||||
{ \
|
||||
/* Sanity check: Make sure that kappa is 1.0. Mixed-datatype alpha
|
||||
@@ -318,7 +291,7 @@ void PASTEMAC2(cha,chp,opname) \
|
||||
conj_t conja, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
ctype_p* restrict kappa, \
|
||||
ctype_p* restrict kappa, \
|
||||
ctype_a* restrict a, inc_t inca, inc_t lda, \
|
||||
ctype_p* restrict p, inc_t ldp \
|
||||
) \
|
||||
@@ -445,7 +418,7 @@ void PASTEMAC2(cha,chp,opname) \
|
||||
conj_t conja, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
ctype_p* restrict kappa, \
|
||||
ctype_p* restrict kappa, \
|
||||
ctype_a* restrict a, inc_t inca, inc_t lda, \
|
||||
ctype_p* restrict p, inc_t ldp \
|
||||
) \
|
||||
|
||||
@@ -37,17 +37,24 @@
|
||||
\
|
||||
void PASTEMAC2(chc,chp,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
pack_t schema, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
bool invdiag, \
|
||||
dim_t panel_dim, \
|
||||
dim_t panel_len, \
|
||||
dim_t panel_dim_max, \
|
||||
dim_t panel_len_max, \
|
||||
dim_t panel_dim_off, \
|
||||
dim_t panel_len_off, \
|
||||
ctype_p* restrict kappa, \
|
||||
ctype_c* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype_p* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
ctype_c* restrict c, inc_t incc, inc_t ldc, \
|
||||
ctype_p* restrict p, inc_t ldp, \
|
||||
inc_t is_p, \
|
||||
cntx_t* cntx \
|
||||
cntx_t* cntx, \
|
||||
void* params \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md )
|
||||
|
||||
@@ -1,297 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T packm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
struc_t strucc,
|
||||
doff_t diagoffc,
|
||||
diag_t diagc,
|
||||
uplo_t uploc,
|
||||
trans_t transc,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t m_max,
|
||||
dim_t n_max,
|
||||
void* kappa,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* p, inc_t rs_p, inc_t cs_p,
|
||||
cntx_t* cntx
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,packm_unb_var1);
|
||||
|
||||
|
||||
void bli_packm_unb_var1
|
||||
(
|
||||
obj_t* c,
|
||||
obj_t* p,
|
||||
cntx_t* cntx,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_cp = bli_obj_dt( c );
|
||||
|
||||
struc_t strucc = bli_obj_struc( c );
|
||||
doff_t diagoffc = bli_obj_diag_offset( c );
|
||||
diag_t diagc = bli_obj_diag( c );
|
||||
uplo_t uploc = bli_obj_uplo( c );
|
||||
trans_t transc = bli_obj_conjtrans_status( c );
|
||||
|
||||
dim_t m_p = bli_obj_length( p );
|
||||
dim_t n_p = bli_obj_width( p );
|
||||
dim_t m_max_p = bli_obj_padded_length( p );
|
||||
dim_t n_max_p = bli_obj_padded_width( p );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
void* buf_p = bli_obj_buffer_at_off( p );
|
||||
inc_t rs_p = bli_obj_row_stride( p );
|
||||
inc_t cs_p = bli_obj_col_stride( p );
|
||||
|
||||
void* buf_kappa;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
|
||||
// This variant assumes that the computational kernel will always apply
|
||||
// the alpha scalar of the higher-level operation. Thus, we use BLIS_ONE
|
||||
// for kappa so that the underlying packm implementation does not scale
|
||||
// during packing.
|
||||
buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_cp];
|
||||
|
||||
if( bli_thread_am_ochief( thread ) ) {
|
||||
// Invoke the function.
|
||||
f
|
||||
(
|
||||
strucc,
|
||||
diagoffc,
|
||||
diagc,
|
||||
uploc,
|
||||
transc,
|
||||
m_p,
|
||||
n_p,
|
||||
m_max_p,
|
||||
n_max_p,
|
||||
buf_kappa,
|
||||
buf_c, rs_c, cs_c,
|
||||
buf_p, rs_p, cs_p,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
trans_t transc, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
ctype* restrict kappa_cast = kappa; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict p_cast = p; \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
\
|
||||
/* We begin by packing the region indicated by the parameters. If
|
||||
matrix c is dense (either because the structure is general or
|
||||
because the structure has already been "densified"), this ends
|
||||
up being the only action we take. Note that if kappa is unit,
|
||||
the data is simply copied (rather than scaled by one). */ \
|
||||
PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
diagoffc, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
transc, \
|
||||
m, \
|
||||
n, \
|
||||
kappa_cast, \
|
||||
c_cast, rs_c, cs_c, \
|
||||
p_cast, rs_p, cs_p, \
|
||||
cntx, \
|
||||
NULL \
|
||||
); \
|
||||
\
|
||||
/* If uploc is upper or lower, then the structure of c is necessarily
|
||||
non-dense (ie: Hermitian, symmetric, or triangular, where part of the
|
||||
matrix is unstored). In these cases, we want to fill in the unstored
|
||||
part of the matrix. How this is done depends on the structure of c. */ \
|
||||
if ( bli_is_upper_or_lower( uploc ) ) \
|
||||
{ \
|
||||
/* The Hermitian and symmetric cases are almost identical, so we
|
||||
handle them in one conditional block. */ \
|
||||
if ( bli_is_hermitian( strucc ) || bli_is_symmetric( strucc ) ) \
|
||||
{ \
|
||||
/* First we must reflect the region referenced to the opposite
|
||||
side of the diagonal. */ \
|
||||
c_cast = c_cast + diagoffc * ( doff_t )cs_c + \
|
||||
-diagoffc * ( doff_t )rs_c; \
|
||||
bli_negate_diag_offset( &diagoffc ); \
|
||||
bli_toggle_trans( &transc ); \
|
||||
if ( bli_is_upper( uploc ) ) diagoffc += 1; \
|
||||
else if ( bli_is_lower( uploc ) ) diagoffc -= 1; \
|
||||
\
|
||||
/* If c is Hermitian, we need to apply a conjugation when
|
||||
copying the region opposite the diagonal. */ \
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
transc = bli_trans_toggled_conj( transc ); \
|
||||
\
|
||||
/* Copy the data from the region opposite the diagonal of c
|
||||
(as specified by the original value of diagoffc). Notice
|
||||
that we use a diag parameter of non-unit since we can
|
||||
assume nothing about the neighboring off-diagonal. */ \
|
||||
PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
diagoffc, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uploc, \
|
||||
transc, \
|
||||
m, \
|
||||
n, \
|
||||
kappa_cast, \
|
||||
c_cast, rs_c, cs_c, \
|
||||
p_cast, rs_p, cs_p, \
|
||||
cntx, \
|
||||
NULL \
|
||||
); \
|
||||
} \
|
||||
else /* if ( bli_is_triangular( strucc ) ) */ \
|
||||
{ \
|
||||
doff_t diagoffp = diagoffc; \
|
||||
uplo_t uplop = uploc; \
|
||||
\
|
||||
/* For this step we need the uplo and diagonal offset of p, which
|
||||
we can derive from the parameters given. */ \
|
||||
if ( bli_does_trans( transc ) ) \
|
||||
{ \
|
||||
bli_negate_diag_offset( &diagoffp ); \
|
||||
bli_toggle_uplo( &uplop ); \
|
||||
} \
|
||||
\
|
||||
/* For triangular matrices, we wish to reference the region
|
||||
strictly opposite the diagonal of C. This amounts to
|
||||
toggling uploc and then shifting the diagonal offset to
|
||||
shrink the stored region (by one diagonal). */ \
|
||||
bli_toggle_uplo( &uplop ); \
|
||||
bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffp ); \
|
||||
\
|
||||
/* Set the region opposite the diagonal of p to zero. */ \
|
||||
PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
diagoffp, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop, \
|
||||
m, \
|
||||
n, \
|
||||
zero, \
|
||||
p_cast, rs_p, cs_p, \
|
||||
cntx, \
|
||||
NULL \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* The packed memory region was acquired/allocated with "aligned"
|
||||
dimensions (ie: dimensions that were possibly inflated up to a
|
||||
multiple). When these dimensions are inflated, it creates empty
|
||||
regions along the bottom and/or right edges of the matrix. If
|
||||
either region exists, we set it to zero. This simplifies the
|
||||
register level micro kernel in that it does not need to support
|
||||
different register blockings for the edge cases. */ \
|
||||
if ( m != m_max ) \
|
||||
{ \
|
||||
ctype* p_edge = p_cast + (m )*rs_p; \
|
||||
\
|
||||
PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_max - m, \
|
||||
n_max, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p, \
|
||||
cntx, \
|
||||
NULL \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
if ( n != n_max ) \
|
||||
{ \
|
||||
ctype* p_edge = p_cast + (n )*cs_p; \
|
||||
\
|
||||
PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_max, \
|
||||
n_max - n, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p, \
|
||||
cntx, \
|
||||
NULL \
|
||||
); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( packm_unb_var1 )
|
||||
|
||||
@@ -1,66 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
void bli_packm_unb_var1
|
||||
(
|
||||
obj_t* c,
|
||||
obj_t* p,
|
||||
cntx_t* cntx,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
trans_t transc, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
cntx_t* cntx \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC0( packm_unb_var1 )
|
||||
|
||||
@@ -36,8 +36,6 @@
|
||||
#include "bli_unpackm_check.h"
|
||||
#include "bli_unpackm_int.h"
|
||||
|
||||
#include "bli_unpackm_unb_var1.h"
|
||||
|
||||
#include "bli_unpackm_blk_var1.h"
|
||||
|
||||
#include "bli_unpackm_cxk.h"
|
||||
|
||||
@@ -1,131 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T unpackm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
doff_t diagoffp,
|
||||
uplo_t uplop,
|
||||
trans_t transp,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
void* p, inc_t rs_p, inc_t cs_p,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,unpackm_unb_var1);
|
||||
|
||||
|
||||
void bli_unpackm_unb_var1
|
||||
(
|
||||
obj_t* p,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_pc = bli_obj_dt( p );
|
||||
|
||||
doff_t diagoffp = bli_obj_diag_offset( p );
|
||||
uplo_t uplop = bli_obj_uplo( p );
|
||||
trans_t transc = bli_obj_onlytrans_status( c );
|
||||
|
||||
dim_t m_c = bli_obj_length( c );
|
||||
dim_t n_c = bli_obj_width( c );
|
||||
|
||||
void* buf_p = bli_obj_buffer_at_off( p );
|
||||
inc_t rs_p = bli_obj_row_stride( p );
|
||||
inc_t cs_p = bli_obj_col_stride( p );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_pc];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffp,
|
||||
uplop,
|
||||
transc,
|
||||
m_c,
|
||||
n_c,
|
||||
buf_p, rs_p, cs_p,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
doff_t diagoffp, \
|
||||
uplo_t uplop, \
|
||||
trans_t transp, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx \
|
||||
) \
|
||||
{ \
|
||||
ctype* p_cast = p; \
|
||||
ctype* c_cast = c; \
|
||||
\
|
||||
PASTEMAC2(ch,copym,BLIS_TAPI_EX_SUF) \
|
||||
( \
|
||||
diagoffp,\
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop, \
|
||||
transp, \
|
||||
m, \
|
||||
n, \
|
||||
p_cast, rs_p, cs_p, \
|
||||
c_cast, rs_c, cs_c, \
|
||||
cntx, \
|
||||
NULL \
|
||||
); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( unpackm, unpackm_unb_var1 )
|
||||
|
||||
@@ -1,60 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
void bli_unpackm_unb_var1
|
||||
(
|
||||
obj_t* p,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
doff_t diagoffp, \
|
||||
uplo_t uplop, \
|
||||
trans_t transp, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC0( unpackm_unb_var1 )
|
||||
|
||||
@@ -35,6 +35,8 @@
|
||||
|
||||
#include "bli_l3_cntl.h"
|
||||
#include "bli_l3_check.h"
|
||||
#include "bli_l3_int.h"
|
||||
#include "bli_l3_packab.h"
|
||||
|
||||
// Define function types.
|
||||
//#include "bli_l3_ft_ex.h"
|
||||
@@ -45,7 +47,6 @@
|
||||
#include "bli_l3_blocksize.h"
|
||||
#include "bli_l3_direct.h"
|
||||
#include "bli_l3_prune.h"
|
||||
#include "bli_l3_packm.h"
|
||||
#include "bli_l3_schema.h"
|
||||
|
||||
// Prototype object APIs (basic and expert).
|
||||
|
||||
@@ -53,7 +53,7 @@ void bli_gemm_check
|
||||
// Check object structure.
|
||||
|
||||
// NOTE: Can't perform these checks as long as bli_gemm_check() is called
|
||||
// from bli_gemm_int(), which is in the execution path for structured
|
||||
// from bli_l3_int(), which is in the execution path for structured
|
||||
// level-3 operations such as hemm.
|
||||
|
||||
//e_val = bli_check_general_object( a );
|
||||
@@ -109,7 +109,7 @@ void bli_hemm_check
|
||||
}
|
||||
|
||||
void bli_herk_check
|
||||
(
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* beta,
|
||||
@@ -197,7 +197,7 @@ void bli_symm_check
|
||||
}
|
||||
|
||||
void bli_syrk_check
|
||||
(
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* beta,
|
||||
|
||||
@@ -34,7 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
void bli_trsm_int
|
||||
void bli_l3_int
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
@@ -47,10 +47,9 @@ void bli_trsm_int
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
obj_t a_local;
|
||||
obj_t b_local;
|
||||
obj_t c_local;
|
||||
trsm_var_oft f;
|
||||
obj_t a_local;
|
||||
obj_t b_local;
|
||||
obj_t c_local;
|
||||
|
||||
// Return early if the current control tree node is NULL.
|
||||
if ( bli_cntl_is_null( cntl ) ) return;
|
||||
@@ -60,72 +59,82 @@ void bli_trsm_int
|
||||
bli_gemm_basic_check( alpha, a, b, beta, c, cntx );
|
||||
|
||||
// If C has a zero dimension, return early.
|
||||
if ( bli_obj_has_zero_dim( c ) ) return;
|
||||
if ( bli_obj_has_zero_dim( c ) )
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// If A or B has a zero dimension, scale C by beta and return early.
|
||||
if ( bli_obj_has_zero_dim( a ) ||
|
||||
bli_obj_has_zero_dim( b ) )
|
||||
{
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
bli_scalm( beta, c );
|
||||
bli_thread_barrier( thread );
|
||||
return;
|
||||
}
|
||||
|
||||
// Alias A and B in case we need to update attached scalars.
|
||||
// If A or B is marked as being filled with zeros, scale C by beta and
|
||||
// return early.
|
||||
if ( bli_obj_is_zeros( a ) ||
|
||||
bli_obj_is_zeros( b ) )
|
||||
{
|
||||
// This should never execute.
|
||||
bli_abort();
|
||||
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
bli_thread_barrier( thread );
|
||||
return;
|
||||
}
|
||||
|
||||
// Alias A, B, and C in case we need to update attached scalars.
|
||||
bli_obj_alias_to( a, &a_local );
|
||||
bli_obj_alias_to( b, &b_local );
|
||||
|
||||
// Alias C in case we need to induce a transposition.
|
||||
bli_obj_alias_to( c, &c_local );
|
||||
|
||||
// Ensure that a valid packing function is set on A and B.
|
||||
if ( !bli_obj_pack_fn( &a_local ) )
|
||||
bli_obj_set_pack_fn( bli_packm_blk_var1, &a_local );
|
||||
|
||||
if ( !bli_obj_pack_fn( &b_local ) )
|
||||
bli_obj_set_pack_fn( bli_packm_blk_var1, &b_local );
|
||||
|
||||
// If we are about to call a leaf-level implementation, and matrix C
|
||||
// still needs a transposition, then we must induce one by swapping the
|
||||
// strides and dimensions. Note that this transposition would normally
|
||||
// be handled explicitly in the packing of C, but if C is not being
|
||||
// packed, this is our last chance to handle the transposition.
|
||||
if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( c ) )
|
||||
//if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( c ) )
|
||||
if ( bli_obj_has_trans( c ) )
|
||||
{
|
||||
bli_obj_induce_trans( &c_local );
|
||||
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &c_local );
|
||||
}
|
||||
|
||||
// If beta is non-unit, apply it to the scalar attached to C.
|
||||
if ( !bli_obj_equals( beta, &BLIS_ONE ) )
|
||||
// If alpha is non-unit, typecast and apply it to the scalar attached
|
||||
// to B, unless it happens to be triangular.
|
||||
if ( bli_obj_root_is_triangular( b ) )
|
||||
{
|
||||
bli_obj_scalar_apply_scalar( beta, &c_local );
|
||||
}
|
||||
|
||||
// Set two bools: one based on the implied side parameter (the structure
|
||||
// of the root object) and one based on the uplo field of the triangular
|
||||
// matrix's root object (whether that is matrix A or matrix B).
|
||||
if ( bli_obj_root_is_triangular( a ) )
|
||||
{
|
||||
// If alpha is non-unit, typecast and apply it to the scalar
|
||||
// attached to B (the non-triangular matrix).
|
||||
if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
|
||||
{
|
||||
bli_obj_scalar_apply_scalar( alpha, &b_local );
|
||||
}
|
||||
bli_obj_scalar_apply_scalar( alpha, &a_local );
|
||||
}
|
||||
else // if ( bli_obj_root_is_triangular( b ) )
|
||||
{
|
||||
// If alpha is non-unit, typecast and apply it to the scalar
|
||||
// attached to A (the non-triangular matrix).
|
||||
if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
|
||||
{
|
||||
bli_obj_scalar_apply_scalar( alpha, &a_local );
|
||||
}
|
||||
bli_obj_scalar_apply_scalar( alpha, &b_local );
|
||||
}
|
||||
|
||||
// FGVZ->TMS: Is this barrier still needed?
|
||||
bli_thread_barrier( thread );
|
||||
// If beta is non-unit, typecast and apply it to the scalar attached
|
||||
// to C.
|
||||
if ( !bli_obj_equals( beta, &BLIS_ONE ) )
|
||||
bli_obj_scalar_apply_scalar( beta, &c_local );
|
||||
|
||||
// Create the next node in the thrinfo_t structure.
|
||||
bli_thrinfo_grow( rntm, cntl, thread );
|
||||
|
||||
// Extract the function pointer from the current control tree node.
|
||||
f = bli_cntl_var_func( cntl );
|
||||
l3_var_oft f = bli_cntl_var_func( cntl );
|
||||
|
||||
// Invoke the variant.
|
||||
f
|
||||
@@ -32,7 +32,7 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_gemm_int
|
||||
void bli_l3_int
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
@@ -54,24 +54,7 @@ typedef void (*PASTECH(opname,_var_oft)) \
|
||||
thrinfo_t* thread \
|
||||
);
|
||||
|
||||
GENTDEF( gemm )
|
||||
|
||||
|
||||
#undef GENTDEF
|
||||
#define GENTDEF( opname ) \
|
||||
\
|
||||
typedef void (*PASTECH(opname,_var_oft)) \
|
||||
( \
|
||||
obj_t* a, \
|
||||
obj_t* b, \
|
||||
obj_t* c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
cntl_t* cntl, \
|
||||
thrinfo_t* thread \
|
||||
);
|
||||
|
||||
GENTDEF( trsm )
|
||||
GENTDEF( l3 )
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -34,7 +34,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
void bli_gemm_packa
|
||||
void bli_l3_packa
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
@@ -45,12 +45,19 @@ void bli_gemm_packa
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
obj_t a_pack;
|
||||
obj_t a_local, a_pack;
|
||||
|
||||
bli_obj_alias_to( a, &a_local );
|
||||
if ( bli_obj_has_trans( a ) )
|
||||
{
|
||||
bli_obj_induce_trans( &a_local );
|
||||
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local );
|
||||
}
|
||||
|
||||
// Pack matrix A according to the control tree node.
|
||||
bli_l3_packm
|
||||
bli_packm_int
|
||||
(
|
||||
a,
|
||||
&a_local,
|
||||
&a_pack,
|
||||
cntx,
|
||||
rntm,
|
||||
@@ -59,7 +66,7 @@ void bli_gemm_packa
|
||||
);
|
||||
|
||||
// Proceed with execution using packed matrix A.
|
||||
bli_gemm_int
|
||||
bli_l3_int
|
||||
(
|
||||
&BLIS_ONE,
|
||||
&a_pack,
|
||||
@@ -75,7 +82,7 @@ void bli_gemm_packa
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_gemm_packb
|
||||
void bli_l3_packb
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
@@ -86,25 +93,39 @@ void bli_gemm_packb
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
obj_t b_pack;
|
||||
obj_t bt_local, bt_pack;
|
||||
|
||||
// We always pass B^T to bli_packm_int().
|
||||
bli_obj_alias_to( b, &bt_local );
|
||||
if ( bli_obj_has_trans( b ) )
|
||||
{
|
||||
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &bt_local );
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_obj_induce_trans( &bt_local );
|
||||
}
|
||||
|
||||
// Pack matrix B according to the control tree node.
|
||||
bli_l3_packm
|
||||
bli_packm_int
|
||||
(
|
||||
b,
|
||||
&b_pack,
|
||||
&bt_local,
|
||||
&bt_pack,
|
||||
cntx,
|
||||
rntm,
|
||||
cntl,
|
||||
thread
|
||||
);
|
||||
|
||||
// Transpose packed object back to B.
|
||||
bli_obj_induce_trans( &bt_pack );
|
||||
|
||||
// Proceed with execution using packed matrix B.
|
||||
bli_gemm_int
|
||||
bli_l3_int
|
||||
(
|
||||
&BLIS_ONE,
|
||||
a,
|
||||
&b_pack,
|
||||
&bt_pack,
|
||||
&BLIS_ONE,
|
||||
c,
|
||||
cntx,
|
||||
@@ -32,12 +32,21 @@
|
||||
|
||||
*/
|
||||
|
||||
void bli_trsm_int
|
||||
void bli_l3_packa
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
void bli_l3_packb
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
@@ -1,187 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
void bli_l3_packm
|
||||
(
|
||||
obj_t* x,
|
||||
obj_t* x_pack,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
packbuf_t pack_buf_type;
|
||||
mem_t* cntl_mem_p;
|
||||
siz_t size_needed;
|
||||
|
||||
// FGVZ: Not sure why we need this barrier, but we do.
|
||||
bli_thread_barrier( thread );
|
||||
|
||||
// Every thread initializes x_pack and determines the size of memory
|
||||
// block needed (which gets embedded into the otherwise "blank" mem_t
|
||||
// entry in the control tree node).
|
||||
size_needed
|
||||
=
|
||||
bli_packm_init
|
||||
(
|
||||
x,
|
||||
x_pack,
|
||||
cntx,
|
||||
cntl
|
||||
);
|
||||
|
||||
// If zero was returned, no memory needs to be allocated and so we can
|
||||
// return early.
|
||||
if ( size_needed == 0 ) return;
|
||||
|
||||
// Query the pack buffer type from the control tree node.
|
||||
pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl );
|
||||
|
||||
// Query the address of the mem_t entry within the control tree node.
|
||||
cntl_mem_p = bli_cntl_pack_mem( cntl );
|
||||
|
||||
// Check the mem_t field in the control tree. If it is unallocated, then
|
||||
// we need to acquire a block from the memory broker and broadcast it to
|
||||
// all threads in the chief's thread group.
|
||||
if ( bli_mem_is_unalloc( cntl_mem_p ) )
|
||||
{
|
||||
mem_t* local_mem_p;
|
||||
mem_t local_mem_s;
|
||||
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
{
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_l3_packm(): acquiring mem pool block\n" );
|
||||
#endif
|
||||
|
||||
// The chief thread acquires a block from the memory broker
|
||||
// and saves the associated mem_t entry to local_mem_s.
|
||||
bli_pba_acquire_m
|
||||
(
|
||||
rntm,
|
||||
size_needed,
|
||||
pack_buf_type,
|
||||
&local_mem_s
|
||||
);
|
||||
}
|
||||
|
||||
// Broadcast the address of the chief thread's local mem_t entry to
|
||||
// all threads.
|
||||
local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
|
||||
|
||||
// Save the contents of the chief thread's local mem_t entry to the
|
||||
// mem_t field in this thread's control tree node.
|
||||
*cntl_mem_p = *local_mem_p;
|
||||
}
|
||||
else // ( bli_mem_is_alloc( cntl_mem_p ) )
|
||||
{
|
||||
mem_t* local_mem_p;
|
||||
mem_t local_mem_s;
|
||||
|
||||
// If the mem_t entry in the control tree does NOT contain a NULL
|
||||
// buffer, then a block has already been acquired from the memory
|
||||
// broker and cached in the control tree.
|
||||
|
||||
// As a sanity check, we should make sure that the mem_t object isn't
|
||||
// associated with a block that is too small compared to the size of
|
||||
// the packed matrix buffer that is needed, according to the return
|
||||
// value from packm_init().
|
||||
siz_t cntl_mem_size = bli_mem_size( cntl_mem_p );
|
||||
|
||||
if ( cntl_mem_size < size_needed )
|
||||
{
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
{
|
||||
// The chief thread releases the existing block associated with
|
||||
// the mem_t entry in the control tree, and then re-acquires a
|
||||
// new block, saving the associated mem_t entry to local_mem_s.
|
||||
bli_pba_release
|
||||
(
|
||||
rntm,
|
||||
cntl_mem_p
|
||||
);
|
||||
bli_pba_acquire_m
|
||||
(
|
||||
rntm,
|
||||
size_needed,
|
||||
pack_buf_type,
|
||||
&local_mem_s
|
||||
);
|
||||
}
|
||||
|
||||
// Broadcast the address of the chief thread's local mem_t entry to
|
||||
// all threads.
|
||||
local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
|
||||
|
||||
// Save the chief thread's local mem_t entry to the mem_t field in
|
||||
// this thread's control tree node.
|
||||
*cntl_mem_p = *local_mem_p;
|
||||
}
|
||||
else
|
||||
{
|
||||
// If the mem_t entry is already allocated and sufficiently large,
|
||||
// then we use it as-is. No action is needed, because all threads
|
||||
// will already have the cached values in their local control
|
||||
// trees' mem_t entries, currently pointed to by cntl_mem_p.
|
||||
|
||||
bli_thread_barrier( thread );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Update the buffer address in x_pack to point to the buffer associated
|
||||
// with the mem_t entry acquired from the memory broker (now cached in
|
||||
// the control tree node).
|
||||
void* buf = bli_mem_buffer( cntl_mem_p );
|
||||
bli_obj_set_buffer( buf, x_pack );
|
||||
|
||||
|
||||
// Pack the contents of object x to object x_pack.
|
||||
bli_packm_int
|
||||
(
|
||||
x,
|
||||
x_pack,
|
||||
cntx,
|
||||
cntl,
|
||||
thread
|
||||
);
|
||||
|
||||
// Barrier so that packing is done before computation.
|
||||
bli_thread_barrier( thread );
|
||||
}
|
||||
|
||||
@@ -34,7 +34,6 @@
|
||||
|
||||
#include "bli_gemm_cntl.h"
|
||||
#include "bli_gemm_front.h"
|
||||
#include "bli_gemm_int.h"
|
||||
|
||||
#include "bli_gemm_var.h"
|
||||
|
||||
|
||||
@@ -77,7 +77,7 @@ void bli_gemm_blk_var1
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Perform gemm subproblem.
|
||||
bli_gemm_int
|
||||
bli_l3_int
|
||||
(
|
||||
&BLIS_ONE,
|
||||
&a1,
|
||||
|
||||
@@ -77,7 +77,7 @@ void bli_gemm_blk_var2
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Perform gemm subproblem.
|
||||
bli_gemm_int
|
||||
bli_l3_int
|
||||
(
|
||||
&BLIS_ONE,
|
||||
a,
|
||||
|
||||
@@ -71,7 +71,7 @@ void bli_gemm_blk_var3
|
||||
i, b_alg, b, &b1 );
|
||||
|
||||
// Perform gemm subproblem.
|
||||
bli_gemm_int
|
||||
bli_l3_int
|
||||
(
|
||||
&BLIS_ONE,
|
||||
&a1,
|
||||
|
||||
@@ -57,8 +57,6 @@ cntl_t* bli_gemmbp_cntl_create
|
||||
)
|
||||
{
|
||||
void_fp macro_kernel_fp;
|
||||
void_fp packa_fp;
|
||||
void_fp packb_fp;
|
||||
|
||||
// Use the function pointers to the macrokernels that use slab
|
||||
// assignment of micropanels to threads in the jr and ir loops.
|
||||
@@ -67,9 +65,6 @@ cntl_t* bli_gemmbp_cntl_create
|
||||
else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2;
|
||||
else /* should never execute */ macro_kernel_fp = NULL;
|
||||
|
||||
packa_fp = bli_packm_blk_var1;
|
||||
packb_fp = bli_packm_blk_var1;
|
||||
|
||||
// Create two nodes for the macro-kernel.
|
||||
cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node
|
||||
(
|
||||
@@ -93,8 +88,7 @@ cntl_t* bli_gemmbp_cntl_create
|
||||
cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node
|
||||
(
|
||||
rntm,
|
||||
bli_gemm_packa, // pack the left-hand operand
|
||||
packa_fp,
|
||||
bli_l3_packa, // pack the left-hand operand
|
||||
BLIS_MR,
|
||||
BLIS_KR,
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -119,10 +113,9 @@ cntl_t* bli_gemmbp_cntl_create
|
||||
cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node
|
||||
(
|
||||
rntm,
|
||||
bli_gemm_packb, // pack the right-hand operand
|
||||
packb_fp,
|
||||
BLIS_KR,
|
||||
bli_l3_packb, // pack the right-hand operand
|
||||
BLIS_NR,
|
||||
BLIS_KR,
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
@@ -194,8 +187,8 @@ cntl_t* bli_gemmpb_cntl_create
|
||||
(
|
||||
bli_gemm_packb, // pack the right-hand operand
|
||||
bli_packm_blk_var1,
|
||||
BLIS_KR,
|
||||
BLIS_MR,
|
||||
BLIS_KR,
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
|
||||
@@ -87,13 +87,14 @@ void bli_gemm_front
|
||||
bli_obj_alias_to( b, &b_local );
|
||||
bli_obj_alias_to( c, &c_local );
|
||||
|
||||
#ifdef BLIS_ENABLE_GEMM_MD
|
||||
// Don't perform the following optimization for ccr or crc cases, as
|
||||
// those cases are sensitive to the ukernel storage preference (ie:
|
||||
// transposing the operation would break them).
|
||||
if ( !bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) &&
|
||||
!bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) )
|
||||
#endif
|
||||
// Set the obj_t buffer field to the location currently implied by the row
|
||||
// and column offsets and then zero the offsets. If any of the original
|
||||
// obj_t's were views into larger matrices, this step effectively makes
|
||||
// those obj_t's "forget" their lineage.
|
||||
bli_obj_reset_origin( &a_local );
|
||||
bli_obj_reset_origin( &b_local );
|
||||
bli_obj_reset_origin( &c_local );
|
||||
|
||||
// An optimization: If C is stored by rows and the micro-kernel prefers
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
@@ -251,7 +252,7 @@ void bli_gemm_front
|
||||
// Invoke the internal back-end via the thread handler.
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_gemm_int,
|
||||
bli_l3_int,
|
||||
BLIS_GEMM, // operation family id
|
||||
alpha,
|
||||
&a_local,
|
||||
|
||||
@@ -1,127 +0,0 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
void bli_gemm_int
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
obj_t a_local;
|
||||
obj_t b_local;
|
||||
obj_t c_local;
|
||||
gemm_var_oft f;
|
||||
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_gemm_basic_check( alpha, a, b, beta, c, cntx );
|
||||
|
||||
// If C has a zero dimension, return early.
|
||||
if ( bli_obj_has_zero_dim( c ) )
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// If A or B has a zero dimension, scale C by beta and return early.
|
||||
if ( bli_obj_has_zero_dim( a ) ||
|
||||
bli_obj_has_zero_dim( b ) )
|
||||
{
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
bli_thread_barrier( thread );
|
||||
return;
|
||||
}
|
||||
|
||||
// If A or B is marked as being filled with zeros, scale C by beta and
|
||||
// return early.
|
||||
if ( bli_obj_is_zeros( a ) ||
|
||||
bli_obj_is_zeros( b ) )
|
||||
{
|
||||
// This should never execute.
|
||||
bli_abort();
|
||||
|
||||
if ( bli_thread_am_ochief( thread ) )
|
||||
bli_scalm( beta, c );
|
||||
bli_thread_barrier( thread );
|
||||
return;
|
||||
}
|
||||
|
||||
// Alias A, B, and C in case we need to update attached scalars.
|
||||
bli_obj_alias_to( a, &a_local );
|
||||
bli_obj_alias_to( b, &b_local );
|
||||
bli_obj_alias_to( c, &c_local );
|
||||
|
||||
// If alpha is non-unit, typecast and apply it to the scalar attached
|
||||
// to B.
|
||||
if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
|
||||
{
|
||||
bli_obj_scalar_apply_scalar( alpha, &b_local );
|
||||
}
|
||||
|
||||
// If beta is non-unit, typecast and apply it to the scalar attached
|
||||
// to C.
|
||||
if ( !bli_obj_equals( beta, &BLIS_ONE ) )
|
||||
{
|
||||
bli_obj_scalar_apply_scalar( beta, &c_local );
|
||||
}
|
||||
|
||||
// Create the next node in the thrinfo_t structure.
|
||||
bli_thrinfo_grow( rntm, cntl, thread );
|
||||
|
||||
// Extract the function pointer from the current control tree node.
|
||||
f = bli_cntl_var_func( cntl );
|
||||
|
||||
// Invoke the variant.
|
||||
f
|
||||
(
|
||||
&a_local,
|
||||
&b_local,
|
||||
&c_local,
|
||||
cntx,
|
||||
rntm,
|
||||
cntl,
|
||||
thread
|
||||
);
|
||||
}
|
||||
|
||||
@@ -55,11 +55,8 @@ void PASTEMAC0(opname) \
|
||||
GENPROT( gemm_blk_var1 )
|
||||
GENPROT( gemm_blk_var2 )
|
||||
GENPROT( gemm_blk_var3 )
|
||||
GENPROT( gemm_packa )
|
||||
GENPROT( gemm_packb )
|
||||
|
||||
GENPROT( gemm_ker_var1 )
|
||||
|
||||
GENPROT( gemm_ker_var2 )
|
||||
|
||||
|
||||
|
||||
@@ -73,7 +73,14 @@ void bli_gemmt_front
|
||||
bli_obj_alias_to( a, &a_local );
|
||||
bli_obj_alias_to( b, &b_local );
|
||||
bli_obj_alias_to( c, &c_local );
|
||||
bli_obj_set_as_root( &c_local );
|
||||
|
||||
// Set the obj_t buffer field to the location currently implied by the row
|
||||
// and column offsets and then zero the offsets. If any of the original
|
||||
// obj_t's were views into larger matrices, this step effectively makes
|
||||
// those obj_t's "forget" their lineage.
|
||||
bli_obj_reset_origin( &a_local );
|
||||
bli_obj_reset_origin( &b_local );
|
||||
bli_obj_reset_origin( &c_local );
|
||||
|
||||
// An optimization: If C is stored by rows and the micro-kernel prefers
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
@@ -107,7 +114,7 @@ void bli_gemmt_front
|
||||
// Invoke the internal back-end via the thread handler.
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_gemm_int,
|
||||
bli_l3_int,
|
||||
BLIS_GEMMT, // operation family id
|
||||
alpha,
|
||||
&a_local,
|
||||
|
||||
@@ -35,7 +35,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
static gemm_var_oft vars[2] =
|
||||
static l3_var_oft vars[2] =
|
||||
{
|
||||
bli_gemmt_l_ker_var2, bli_gemmt_u_ker_var2,
|
||||
};
|
||||
@@ -51,8 +51,8 @@ void bli_gemmt_x_ker_var2
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
dim_t uplo;
|
||||
gemm_var_oft f;
|
||||
dim_t uplo;
|
||||
l3_var_oft f;
|
||||
|
||||
// Set a bool based on the uplo field of C's root object.
|
||||
if ( bli_obj_root_is_lower( c ) ) uplo = 0;
|
||||
|
||||
@@ -65,6 +65,14 @@ void bli_hemm_front
|
||||
bli_obj_alias_to( b, &b_local );
|
||||
bli_obj_alias_to( c, &c_local );
|
||||
|
||||
// Set the obj_t buffer field to the location currently implied by the row
|
||||
// and column offsets and then zero the offsets. If any of the original
|
||||
// obj_t's were views into larger matrices, this step effectively makes
|
||||
// those obj_t's "forget" their lineage.
|
||||
bli_obj_reset_origin( &a_local );
|
||||
bli_obj_reset_origin( &b_local );
|
||||
bli_obj_reset_origin( &c_local );
|
||||
|
||||
#ifdef BLIS_DISABLE_HEMM_RIGHT
|
||||
// NOTE: This case casts right-side hemm in terms of left side. This is
|
||||
// necessary when the current subconfiguration uses a gemm microkernel
|
||||
@@ -129,13 +137,6 @@ void bli_hemm_front
|
||||
// Set the pack schemas within the objects.
|
||||
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
|
||||
|
||||
// Set each alias as the root object.
|
||||
// NOTE: We MUST wait until we are done potentially swapping the objects
|
||||
// before setting the root fields!
|
||||
bli_obj_set_as_root( &a_local );
|
||||
bli_obj_set_as_root( &b_local );
|
||||
bli_obj_set_as_root( &c_local );
|
||||
|
||||
// Parse and interpret the contents of the rntm_t object to properly
|
||||
// set the ways of parallelism for each loop, and then make any
|
||||
// additional modifications necessary for the current operation.
|
||||
@@ -152,7 +153,7 @@ void bli_hemm_front
|
||||
// Invoke the internal back-end.
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_gemm_int,
|
||||
bli_l3_int,
|
||||
BLIS_GEMM, // operation family id
|
||||
alpha,
|
||||
&a_local,
|
||||
|
||||
@@ -65,6 +65,14 @@ void bli_symm_front
|
||||
bli_obj_alias_to( b, &b_local );
|
||||
bli_obj_alias_to( c, &c_local );
|
||||
|
||||
// Set the obj_t buffer field to the location currently implied by the row
|
||||
// and column offsets and then zero the offsets. If any of the original
|
||||
// obj_t's were views into larger matrices, this step effectively makes
|
||||
// those obj_t's "forget" their lineage.
|
||||
bli_obj_reset_origin( &a_local );
|
||||
bli_obj_reset_origin( &b_local );
|
||||
bli_obj_reset_origin( &c_local );
|
||||
|
||||
#ifdef BLIS_DISABLE_SYMM_RIGHT
|
||||
// NOTE: This case casts right-side symm in terms of left side. This is
|
||||
// necessary when the current subconfiguration uses a gemm microkernel
|
||||
@@ -128,13 +136,6 @@ void bli_symm_front
|
||||
// Set the pack schemas within the objects.
|
||||
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
|
||||
|
||||
// Set each alias as the root object.
|
||||
// NOTE: We MUST wait until we are done potentially swapping the objects
|
||||
// before setting the root fields!
|
||||
bli_obj_set_as_root( &a_local );
|
||||
bli_obj_set_as_root( &b_local );
|
||||
bli_obj_set_as_root( &c_local );
|
||||
|
||||
// Parse and interpret the contents of the rntm_t object to properly
|
||||
// set the ways of parallelism for each loop, and then make any
|
||||
// additional modifications necessary for the current operation.
|
||||
@@ -151,7 +152,7 @@ void bli_symm_front
|
||||
// Invoke the internal back-end.
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_gemm_int,
|
||||
bli_l3_int,
|
||||
BLIS_GEMM, // operation family id
|
||||
alpha,
|
||||
&a_local,
|
||||
|
||||
@@ -64,6 +64,14 @@ void bli_trmm_front
|
||||
bli_obj_alias_to( b, &b_local );
|
||||
bli_obj_alias_to( b, &c_local );
|
||||
|
||||
// Set the obj_t buffer field to the location currently implied by the row
|
||||
// and column offsets and then zero the offsets. If any of the original
|
||||
// obj_t's were views into larger matrices, this step effectively makes
|
||||
// those obj_t's "forget" their lineage.
|
||||
bli_obj_reset_origin( &a_local );
|
||||
bli_obj_reset_origin( &b_local );
|
||||
bli_obj_reset_origin( &c_local );
|
||||
|
||||
// We do not explicitly implement the cases where A is transposed.
|
||||
// However, we can still handle them. Specifically, if A is marked as
|
||||
// needing a transposition, we simply induce a transposition. This
|
||||
@@ -147,13 +155,6 @@ void bli_trmm_front
|
||||
// Set the pack schemas within the objects.
|
||||
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
|
||||
|
||||
// Set each alias as the root object.
|
||||
// NOTE: We MUST wait until we are done potentially swapping the objects
|
||||
// before setting the root fields!
|
||||
bli_obj_set_as_root( &a_local );
|
||||
bli_obj_set_as_root( &b_local );
|
||||
bli_obj_set_as_root( &c_local );
|
||||
|
||||
// Parse and interpret the contents of the rntm_t object to properly
|
||||
// set the ways of parallelism for each loop, and then make any
|
||||
// additional modifications necessary for the current operation.
|
||||
@@ -170,7 +171,7 @@ void bli_trmm_front
|
||||
// Invoke the internal back-end.
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_gemm_int,
|
||||
bli_l3_int,
|
||||
BLIS_TRMM, // operation family id
|
||||
alpha,
|
||||
&a_local,
|
||||
|
||||
@@ -35,7 +35,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
static gemm_var_oft vars[2][2] =
|
||||
static l3_var_oft vars[2][2] =
|
||||
{
|
||||
{ bli_trmm_ll_ker_var2, bli_trmm_lu_ker_var2 },
|
||||
{ bli_trmm_rl_ker_var2, bli_trmm_ru_ker_var2 }
|
||||
@@ -52,9 +52,9 @@ void bli_trmm_xx_ker_var2
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
dim_t side;
|
||||
dim_t uplo;
|
||||
gemm_var_oft f;
|
||||
dim_t side;
|
||||
dim_t uplo;
|
||||
l3_var_oft f;
|
||||
|
||||
// Set two bools: one based on the implied side parameter (the structure
|
||||
// of the root object) and one based on the uplo field of the triangular
|
||||
|
||||
@@ -65,6 +65,14 @@ void bli_trmm3_front
|
||||
bli_obj_alias_to( b, &b_local );
|
||||
bli_obj_alias_to( c, &c_local );
|
||||
|
||||
// Set the obj_t buffer field to the location currently implied by the row
|
||||
// and column offsets and then zero the offsets. If any of the original
|
||||
// obj_t's were views into larger matrices, this step effectively makes
|
||||
// those obj_t's "forget" their lineage.
|
||||
bli_obj_reset_origin( &a_local );
|
||||
bli_obj_reset_origin( &b_local );
|
||||
bli_obj_reset_origin( &c_local );
|
||||
|
||||
// We do not explicitly implement the cases where A is transposed.
|
||||
// However, we can still handle them. Specifically, if A is marked as
|
||||
// needing a transposition, we simply induce a transposition. This
|
||||
@@ -139,13 +147,6 @@ void bli_trmm3_front
|
||||
// Set the pack schemas within the objects.
|
||||
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
|
||||
|
||||
// Set each alias as the root object.
|
||||
// NOTE: We MUST wait until we are done potentially swapping the objects
|
||||
// before setting the root fields!
|
||||
bli_obj_set_as_root( &a_local );
|
||||
bli_obj_set_as_root( &b_local );
|
||||
bli_obj_set_as_root( &c_local );
|
||||
|
||||
// Parse and interpret the contents of the rntm_t object to properly
|
||||
// set the ways of parallelism for each loop, and then make any
|
||||
// additional modifications necessary for the current operation.
|
||||
@@ -162,7 +163,7 @@ void bli_trmm3_front
|
||||
// Invoke the internal back-end.
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_gemm_int,
|
||||
bli_l3_int,
|
||||
BLIS_TRMM, // operation family id
|
||||
alpha,
|
||||
&a_local,
|
||||
|
||||
@@ -34,7 +34,5 @@
|
||||
|
||||
#include "bli_trsm_cntl.h"
|
||||
#include "bli_trsm_front.h"
|
||||
#include "bli_trsm_int.h"
|
||||
|
||||
#include "bli_trsm_var.h"
|
||||
|
||||
|
||||
@@ -58,7 +58,7 @@ void bli_trsm_blk_var1
|
||||
bli_l3_prune_unref_mparts_m( a, b, c, cntl );
|
||||
|
||||
// Isolate the diagonal block A11 and its corresponding row panel C1.
|
||||
const dim_t kc = bli_obj_width( a );
|
||||
const dim_t kc = bli_obj_width_after_trans( a );
|
||||
obj_t a11, c1;
|
||||
bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
|
||||
0, kc, a, &a11 );
|
||||
@@ -96,7 +96,7 @@ void bli_trsm_blk_var1
|
||||
#endif
|
||||
|
||||
// Perform trsm subproblem.
|
||||
bli_trsm_int
|
||||
bli_l3_int
|
||||
(
|
||||
&BLIS_ONE,
|
||||
&a11_1,
|
||||
@@ -169,7 +169,7 @@ void bli_trsm_blk_var1
|
||||
|
||||
// Perform gemm subproblem. (Note that we use the same backend
|
||||
// function as before, since we're calling the same macrokernel.)
|
||||
bli_trsm_int
|
||||
bli_l3_int
|
||||
(
|
||||
&BLIS_ONE,
|
||||
&a11,
|
||||
|
||||
@@ -60,7 +60,7 @@ void bli_trsm_blk_var2
|
||||
bli_thread_range_ndim
|
||||
(
|
||||
direct, thread, a, b, c, cntl, cntx,
|
||||
&my_start, &my_end
|
||||
&my_start, &my_end
|
||||
);
|
||||
|
||||
// Partition along the n dimension.
|
||||
@@ -77,7 +77,7 @@ void bli_trsm_blk_var2
|
||||
i, b_alg, c, &c1 );
|
||||
|
||||
// Perform trsm subproblem.
|
||||
bli_trsm_int
|
||||
bli_l3_int
|
||||
(
|
||||
&BLIS_ONE,
|
||||
a,
|
||||
|
||||
@@ -71,7 +71,7 @@ void bli_trsm_blk_var3
|
||||
i, b_alg, b, &b1 );
|
||||
|
||||
// Perform trsm subproblem.
|
||||
bli_trsm_int
|
||||
bli_l3_int
|
||||
(
|
||||
&BLIS_ONE,
|
||||
&a1,
|
||||
|
||||
@@ -57,16 +57,11 @@ cntl_t* bli_trsm_l_cntl_create
|
||||
)
|
||||
{
|
||||
void_fp macro_kernel_p;
|
||||
void_fp packa_fp;
|
||||
void_fp packb_fp;
|
||||
|
||||
// Use the function pointer to the macrokernels that use slab
|
||||
// assignment of micropanels to threads in the jr and ir loops.
|
||||
macro_kernel_p = bli_trsm_xx_ker_var2;
|
||||
|
||||
packa_fp = bli_packm_blk_var1;
|
||||
packb_fp = bli_packm_blk_var1;
|
||||
|
||||
const opid_t family = BLIS_TRSM;
|
||||
|
||||
//
|
||||
@@ -95,8 +90,7 @@ cntl_t* bli_trsm_l_cntl_create
|
||||
cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node
|
||||
(
|
||||
rntm,
|
||||
bli_trsm_packa, // trsm operation's packm function for A.
|
||||
packa_fp,
|
||||
bli_l3_packa, // trsm operation's packm function for A.
|
||||
BLIS_MR,
|
||||
BLIS_MR,
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -133,8 +127,7 @@ cntl_t* bli_trsm_l_cntl_create
|
||||
cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node
|
||||
(
|
||||
rntm,
|
||||
bli_trsm_packa, // trsm operation's packm function for A.
|
||||
packa_fp,
|
||||
bli_l3_packa, // trsm operation's packm function for A.
|
||||
BLIS_MR,
|
||||
BLIS_MR,
|
||||
#ifdef BLIS_ENABLE_TRSM_PREINVERSION
|
||||
@@ -171,10 +164,9 @@ cntl_t* bli_trsm_l_cntl_create
|
||||
cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node
|
||||
(
|
||||
rntm,
|
||||
bli_trsm_packb,
|
||||
packb_fp,
|
||||
BLIS_MR,
|
||||
bli_l3_packb,
|
||||
BLIS_NR,
|
||||
BLIS_MR,
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
@@ -208,7 +200,7 @@ cntl_t* bli_trsm_l_cntl_create
|
||||
|
||||
cntl_t* bli_trsm_r_cntl_create
|
||||
(
|
||||
rntm_t* rntm,
|
||||
rntm_t* rntm,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b
|
||||
)
|
||||
@@ -216,9 +208,6 @@ cntl_t* bli_trsm_r_cntl_create
|
||||
// NOTE: trsm macrokernels are presently disabled for right-side execution.
|
||||
void_fp macro_kernel_p = bli_trsm_xx_ker_var2;
|
||||
|
||||
void_fp packa_fp = bli_packm_blk_var1;
|
||||
void_fp packb_fp = bli_packm_blk_var1;
|
||||
|
||||
const opid_t family = BLIS_TRSM;
|
||||
|
||||
// Create two nodes for the macro-kernel.
|
||||
@@ -244,8 +233,7 @@ cntl_t* bli_trsm_r_cntl_create
|
||||
cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node
|
||||
(
|
||||
rntm,
|
||||
bli_trsm_packa,
|
||||
packa_fp,
|
||||
bli_l3_packa,
|
||||
BLIS_NR,
|
||||
BLIS_MR,
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -270,8 +258,7 @@ cntl_t* bli_trsm_r_cntl_create
|
||||
cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node
|
||||
(
|
||||
rntm,
|
||||
bli_trsm_packb,
|
||||
packb_fp,
|
||||
bli_l3_packb,
|
||||
BLIS_MR,
|
||||
BLIS_MR,
|
||||
TRUE, // invert diagonal
|
||||
|
||||
@@ -71,6 +71,14 @@ void bli_trsm_front
|
||||
bli_obj_alias_to( b, &b_local );
|
||||
bli_obj_alias_to( b, &c_local );
|
||||
|
||||
// Set the obj_t buffer field to the location currently implied by the row
|
||||
// and column offsets and then zero the offsets. If any of the original
|
||||
// obj_t's were views into larger matrices, this step effectively makes
|
||||
// those obj_t's "forget" their lineage.
|
||||
bli_obj_reset_origin( &a_local );
|
||||
bli_obj_reset_origin( &b_local );
|
||||
bli_obj_reset_origin( &c_local );
|
||||
|
||||
// We do not explicitly implement the cases where A is transposed.
|
||||
// However, we can still handle them. Specifically, if A is marked as
|
||||
// needing a transposition, we simply induce a transposition. This
|
||||
@@ -121,13 +129,6 @@ void bli_trsm_front
|
||||
// Set the pack schemas within the objects.
|
||||
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
|
||||
|
||||
// Set each alias as the root object.
|
||||
// NOTE: We MUST wait until we are done potentially swapping the objects
|
||||
// before setting the root fields!
|
||||
bli_obj_set_as_root( &a_local );
|
||||
bli_obj_set_as_root( &b_local );
|
||||
bli_obj_set_as_root( &c_local );
|
||||
|
||||
// Parse and interpret the contents of the rntm_t object to properly
|
||||
// set the ways of parallelism for each loop, and then make any
|
||||
// additional modifications necessary for the current operation.
|
||||
@@ -144,7 +145,7 @@ void bli_trsm_front
|
||||
// Invoke the internal back-end.
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_trsm_int,
|
||||
bli_l3_int,
|
||||
BLIS_TRSM, // operation family id
|
||||
alpha,
|
||||
&a_local,
|
||||
|
||||
@@ -55,8 +55,6 @@ void PASTEMAC0(opname) \
|
||||
GENPROT( trsm_blk_var1 )
|
||||
GENPROT( trsm_blk_var2 )
|
||||
GENPROT( trsm_blk_var3 )
|
||||
GENPROT( trsm_packa )
|
||||
GENPROT( trsm_packb )
|
||||
|
||||
GENPROT( trsm_xx_ker_var2 )
|
||||
|
||||
|
||||
@@ -35,7 +35,7 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
static trsm_var_oft vars[2][2] =
|
||||
static l3_var_oft vars[2][2] =
|
||||
{
|
||||
{ bli_trsm_ll_ker_var2, bli_trsm_lu_ker_var2 },
|
||||
{ bli_trsm_rl_ker_var2, bli_trsm_ru_ker_var2 }
|
||||
@@ -52,9 +52,9 @@ void bli_trsm_xx_ker_var2
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
dim_t side;
|
||||
dim_t uplo;
|
||||
trsm_var_oft f;
|
||||
dim_t side;
|
||||
dim_t uplo;
|
||||
l3_var_oft f;
|
||||
|
||||
// Set two bools: one based on the implied side parameter (the structure
|
||||
// of the root object) and one based on the uplo field of the triangular
|
||||
|
||||
@@ -118,6 +118,11 @@ void bli_obj_create_without_buffer
|
||||
bli_obj_set_offs( 0, 0, obj );
|
||||
bli_obj_set_diag_offset( 0, obj );
|
||||
|
||||
bli_obj_set_pack_fn( NULL, obj );
|
||||
bli_obj_set_pack_params( NULL, obj );
|
||||
bli_obj_set_ker_fn( NULL, obj );
|
||||
bli_obj_set_ker_params( NULL, obj );
|
||||
|
||||
// Set the internal scalar to 1.0.
|
||||
bli_obj_set_scalar_dt( dt, obj );
|
||||
s = bli_obj_internal_scalar_buffer( obj );
|
||||
@@ -356,7 +361,7 @@ void bli_obj_free
|
||||
|
||||
buf_a = bli_obj_buffer_at_off( a );
|
||||
|
||||
bli_zzsets( 0.0, 0.0, value );
|
||||
bli_zzsets( 0.0, 0.0, value );
|
||||
|
||||
if ( bli_obj_is_float( a ) )
|
||||
{
|
||||
@@ -500,7 +505,7 @@ void bli_adjust_strides
|
||||
// Set the column stride to indicate that this is a column vector
|
||||
// stored in column-major order. This is done for legacy reasons,
|
||||
// because at one time we had to satisfy the error checking
|
||||
// in the underlying BLAS library, which expects the leading
|
||||
// in the underlying BLAS library, which expects the leading
|
||||
// dimension to be set to at least m, even if it will never be
|
||||
// used for indexing since it is a vector and thus only has one
|
||||
// column of data.
|
||||
|
||||
@@ -282,17 +282,6 @@ void bli_pba_acquire_v
|
||||
#endif
|
||||
|
||||
|
||||
void bli_pba_rntm_set_pba
|
||||
(
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
pba_t* pba = bli_pba_query();
|
||||
|
||||
bli_rntm_set_pba( pba, rntm );
|
||||
}
|
||||
|
||||
|
||||
siz_t bli_pba_pool_size
|
||||
(
|
||||
pba_t* pba,
|
||||
|
||||
@@ -119,7 +119,7 @@ BLIS_INLINE void bli_pba_unlock( pba_t* pba )
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
pba_t* bli_pba_query( void );
|
||||
BLIS_EXPORT_BLIS pba_t* bli_pba_query( void );
|
||||
|
||||
void bli_pba_init
|
||||
(
|
||||
@@ -144,10 +144,15 @@ void bli_pba_release
|
||||
mem_t* mem
|
||||
);
|
||||
|
||||
void bli_pba_rntm_set_pba
|
||||
BLIS_INLINE void bli_pba_rntm_set_pba
|
||||
(
|
||||
rntm_t* rntm
|
||||
);
|
||||
)
|
||||
{
|
||||
pba_t* pba = bli_pba_query();
|
||||
|
||||
bli_rntm_set_pba( pba, rntm );
|
||||
}
|
||||
|
||||
siz_t bli_pba_pool_size
|
||||
(
|
||||
|
||||
@@ -76,24 +76,39 @@ void* bli_sba_acquire
|
||||
// Query the small block pool from the rntm.
|
||||
pool_t* restrict pool = bli_rntm_sba_pool( rntm );
|
||||
|
||||
// Query the block_size of the pool_t so that we can request the exact
|
||||
// size present.
|
||||
const siz_t block_size = bli_pool_block_size( pool );
|
||||
|
||||
// Sanity check: Make sure the requested size is no larger than the
|
||||
// block_size field of the pool.
|
||||
if ( block_size < req_size )
|
||||
// We don't expect NULL sba_pool pointers in the normal course of BLIS
|
||||
// operation. However, there are rare instances where it is convenient
|
||||
// to support use of bli_sba_acquire() without having to pass in a valid
|
||||
// sba pool data structure. The case that inspired this branch was the
|
||||
// gemm_ukr and related test modules in the BLIS testsuite. (There, it
|
||||
// is convenient to not have to checkout an array_t from the sba, and it
|
||||
// does no harm since the malloc() happens outside of the region that
|
||||
// would be timed.)
|
||||
if ( pool == NULL )
|
||||
{
|
||||
printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n",
|
||||
( int )block_size, ( int )req_size );
|
||||
bli_abort();
|
||||
block = bli_malloc_intl( req_size, &r_val );
|
||||
}
|
||||
else
|
||||
{
|
||||
// Query the block_size of the pool_t so that we can request the exact
|
||||
// size present.
|
||||
const siz_t block_size = bli_pool_block_size( pool );
|
||||
|
||||
// Check out a block using the block_size queried above.
|
||||
bli_pool_checkout_block( block_size, &pblk, pool );
|
||||
// Sanity check: Make sure the requested size is no larger than the
|
||||
// block_size field of the pool.
|
||||
if ( block_size < req_size )
|
||||
{
|
||||
printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n",
|
||||
( int )block_size, ( int )req_size );
|
||||
bli_abort();
|
||||
}
|
||||
|
||||
// The block address is stored within the pblk_t.
|
||||
block = bli_pblk_buf( &pblk );
|
||||
// Check out a block using the block_size queried above.
|
||||
bli_pool_checkout_block( block_size, &pblk, pool );
|
||||
|
||||
// The block address is stored within the pblk_t.
|
||||
block = bli_pblk_buf( &pblk );
|
||||
}
|
||||
}
|
||||
#else
|
||||
|
||||
@@ -123,21 +138,28 @@ void bli_sba_release
|
||||
// Query the small block pool from the rntm.
|
||||
pool_t* restrict pool = bli_rntm_sba_pool( rntm );
|
||||
|
||||
// Query the block_size field from the pool. This is not super-important
|
||||
// for this particular application of the pool_t (that is, the "leaf"
|
||||
// component of the sba), but it seems like good housekeeping to maintain
|
||||
// the block_size field of the pblk_t in case it's ever needed/read.
|
||||
const siz_t block_size = bli_pool_block_size( pool );
|
||||
if ( pool == NULL )
|
||||
{
|
||||
bli_free_intl( block );
|
||||
}
|
||||
else
|
||||
{
|
||||
// Query the block_size field from the pool. This is not super-important
|
||||
// for this particular application of the pool_t (that is, the "leaf"
|
||||
// component of the sba), but it seems like good housekeeping to maintain
|
||||
// the block_size field of the pblk_t in case it's ever needed/read.
|
||||
const siz_t block_size = bli_pool_block_size( pool );
|
||||
|
||||
// Embed the block's memory address into a pblk_t, along with the
|
||||
// block_size queried from the pool.
|
||||
bli_pblk_set_buf( block, &pblk );
|
||||
bli_pblk_set_block_size( block_size, &pblk );
|
||||
// Embed the block's memory address into a pblk_t, along with the
|
||||
// block_size queried from the pool.
|
||||
bli_pblk_set_buf( block, &pblk );
|
||||
bli_pblk_set_block_size( block_size, &pblk );
|
||||
|
||||
// Check the pblk_t back into the pool_t. (It's okay that the pblk_t is
|
||||
// a local variable since its contents are copied into the pool's internal
|
||||
// data structure--an array of pblk_t.)
|
||||
bli_pool_checkin_block( &pblk, pool );
|
||||
// Check the pblk_t back into the pool_t. (It's okay that the pblk_t is
|
||||
// a local variable since its contents are copied into the pool's internal
|
||||
// data structure--an array of pblk_t.)
|
||||
bli_pool_checkin_block( &pblk, pool );
|
||||
}
|
||||
}
|
||||
#else
|
||||
|
||||
|
||||
@@ -1189,52 +1189,48 @@ BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b )
|
||||
|
||||
// -- User-provided information macros --
|
||||
|
||||
// User data query
|
||||
|
||||
BLIS_INLINE void* bli_obj_user_data( obj_t* obj )
|
||||
{
|
||||
return obj->user_data;
|
||||
}
|
||||
|
||||
// User data modification
|
||||
|
||||
BLIS_INLINE void bli_obj_set_user_data( void* data, obj_t* obj )
|
||||
{
|
||||
obj->user_data = data;
|
||||
}
|
||||
|
||||
// Function pointer query
|
||||
|
||||
BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj )
|
||||
{
|
||||
return obj->pack;
|
||||
return obj->pack_fn;
|
||||
}
|
||||
|
||||
BLIS_INLINE void* bli_obj_pack_params( obj_t* obj )
|
||||
{
|
||||
return obj->pack_params;
|
||||
}
|
||||
|
||||
BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj )
|
||||
{
|
||||
return obj->ker;
|
||||
return obj->ker_fn;
|
||||
}
|
||||
|
||||
BLIS_INLINE obj_ukr_fn_t bli_obj_ukr_fn( obj_t* obj )
|
||||
BLIS_INLINE void* bli_obj_ker_params( obj_t* obj )
|
||||
{
|
||||
return obj->ukr;
|
||||
return obj->ker_params;
|
||||
}
|
||||
|
||||
// Function pointer modification
|
||||
|
||||
BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack, obj_t* obj )
|
||||
BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack_fn, obj_t* obj )
|
||||
{
|
||||
obj->pack = pack;
|
||||
obj->pack_fn = pack_fn;
|
||||
}
|
||||
|
||||
BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker, obj_t* obj )
|
||||
BLIS_INLINE void bli_obj_set_pack_params( void* params, obj_t* obj )
|
||||
{
|
||||
obj->ker = ker;
|
||||
obj->pack_params = params;
|
||||
}
|
||||
|
||||
BLIS_INLINE void bli_obj_set_ukr_fn( obj_ukr_fn_t ukr, obj_t* obj )
|
||||
BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker_fn, obj_t* obj )
|
||||
{
|
||||
obj->ukr = ukr;
|
||||
obj->ker_fn = ker_fn;
|
||||
}
|
||||
|
||||
BLIS_INLINE void bli_obj_set_ker_params( void* params, obj_t* obj )
|
||||
{
|
||||
obj->ker_params = params;
|
||||
}
|
||||
|
||||
|
||||
@@ -1357,6 +1353,18 @@ BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj )
|
||||
);
|
||||
}
|
||||
|
||||
// Adjust the pointer based on current offsets, zero the offsets, and then
|
||||
// set the current object as the root. For obj_t's with at least one non-zero
|
||||
// offset, this effectively makes the obj_t "forget" that it was ever a view
|
||||
// into a larger matrix.
|
||||
|
||||
BLIS_INLINE void bli_obj_reset_origin( obj_t* obj )
|
||||
{
|
||||
bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj );
|
||||
bli_obj_set_offs( 0, 0, obj );
|
||||
bli_obj_set_as_root( obj );
|
||||
}
|
||||
|
||||
// Make a full alias (shallow copy).
|
||||
|
||||
BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b )
|
||||
@@ -1482,7 +1490,13 @@ BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t*
|
||||
|
||||
BLIS_INLINE void bli_obj_swap( obj_t* a, obj_t* b )
|
||||
{
|
||||
bool a_root_is_self = ( bli_obj_root( a ) == a );
|
||||
bool b_root_is_self = ( bli_obj_root( b ) == b );
|
||||
|
||||
obj_t t = *b; *b = *a; *a = t;
|
||||
|
||||
if ( a_root_is_self ) bli_obj_set_as_root( b );
|
||||
if ( b_root_is_self ) bli_obj_set_as_root( a );
|
||||
}
|
||||
|
||||
// Swap object pack schemas.
|
||||
|
||||
@@ -1174,12 +1174,11 @@ struct thrinfo_s;
|
||||
|
||||
typedef void (*obj_pack_fn_t)
|
||||
(
|
||||
mdim_t mat,
|
||||
mem_t* mem,
|
||||
struct obj_s* a,
|
||||
struct obj_s* ap,
|
||||
struct cntx_s* cntx,
|
||||
struct rntm_s* rntm,
|
||||
struct cntl_s* cntl,
|
||||
struct thrinfo_s* thread
|
||||
);
|
||||
|
||||
@@ -1190,23 +1189,10 @@ typedef void (*obj_ker_fn_t)
|
||||
struct obj_s* c,
|
||||
struct cntx_s* cntx,
|
||||
struct rntm_s* rntm,
|
||||
struct cntl_s* cntl,
|
||||
struct thrinfo_s* thread
|
||||
);
|
||||
|
||||
typedef void (*obj_ukr_fn_t)
|
||||
(
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* restrict alpha,
|
||||
void* restrict a, inc_t rs_a, inc_t cs_a,
|
||||
void* restrict b, inc_t rs_b, inc_t cs_b,
|
||||
void* restrict beta,
|
||||
void* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* restrict data,
|
||||
struct cntx_s* restrict cntx
|
||||
);
|
||||
|
||||
typedef struct obj_s
|
||||
{
|
||||
// Basic fields
|
||||
@@ -1237,13 +1223,11 @@ typedef struct obj_s
|
||||
dim_t m_panel; // m dimension of a "full" panel
|
||||
dim_t n_panel; // n dimension of a "full" panel
|
||||
|
||||
// User data pointer
|
||||
void* user_data;
|
||||
|
||||
// Function pointers
|
||||
obj_pack_fn_t pack;
|
||||
obj_ker_fn_t ker;
|
||||
obj_ukr_fn_t ukr;
|
||||
// User-customizable fields
|
||||
obj_pack_fn_t pack_fn;
|
||||
void* pack_params;
|
||||
obj_ker_fn_t ker_fn;
|
||||
void* ker_params;
|
||||
|
||||
} obj_t;
|
||||
|
||||
@@ -1258,70 +1242,68 @@ typedef struct obj_s
|
||||
|
||||
#define BLIS_OBJECT_INITIALIZER \
|
||||
{ \
|
||||
.root = NULL, \
|
||||
.root = NULL, \
|
||||
\
|
||||
.off = { 0, 0 }, \
|
||||
.dim = { 0, 0 }, \
|
||||
.diag_off = 0, \
|
||||
.off = { 0, 0 }, \
|
||||
.dim = { 0, 0 }, \
|
||||
.diag_off = 0, \
|
||||
\
|
||||
.info = 0x0 | BLIS_BITVAL_DENSE | \
|
||||
BLIS_BITVAL_GENERAL, \
|
||||
.info2 = 0x0, \
|
||||
.elem_size = sizeof( float ), /* this is changed later. */ \
|
||||
.info = 0x0 | BLIS_BITVAL_DENSE | \
|
||||
BLIS_BITVAL_GENERAL, \
|
||||
.info2 = 0x0, \
|
||||
.elem_size = sizeof( float ), /* this is changed later. */ \
|
||||
\
|
||||
.buffer = NULL, \
|
||||
.rs = 0, \
|
||||
.cs = 0, \
|
||||
.is = 1, \
|
||||
.buffer = NULL, \
|
||||
.rs = 0, \
|
||||
.cs = 0, \
|
||||
.is = 1, \
|
||||
\
|
||||
.scalar = { 0.0, 0.0 }, \
|
||||
.scalar = { 0.0, 0.0 }, \
|
||||
\
|
||||
.m_padded = 0, \
|
||||
.n_padded = 0, \
|
||||
.ps = 0, \
|
||||
.pd = 0, \
|
||||
.m_panel = 0, \
|
||||
.n_panel = 0, \
|
||||
.m_padded = 0, \
|
||||
.n_padded = 0, \
|
||||
.ps = 0, \
|
||||
.pd = 0, \
|
||||
.m_panel = 0, \
|
||||
.n_panel = 0, \
|
||||
\
|
||||
.user_data = NULL, \
|
||||
\
|
||||
.pack = NULL, \
|
||||
.ker = NULL, \
|
||||
.ukr = NULL \
|
||||
.pack_fn = NULL, \
|
||||
.pack_params = NULL, \
|
||||
.ker_fn = NULL, \
|
||||
.ker_params = NULL \
|
||||
}
|
||||
|
||||
#define BLIS_OBJECT_INITIALIZER_1X1 \
|
||||
{ \
|
||||
.root = NULL, \
|
||||
.root = NULL, \
|
||||
\
|
||||
.off = { 0, 0 }, \
|
||||
.dim = { 1, 1 }, \
|
||||
.diag_off = 0, \
|
||||
.off = { 0, 0 }, \
|
||||
.dim = { 1, 1 }, \
|
||||
.diag_off = 0, \
|
||||
\
|
||||
.info = 0x0 | BLIS_BITVAL_DENSE | \
|
||||
BLIS_BITVAL_GENERAL, \
|
||||
.info2 = 0x0, \
|
||||
.elem_size = sizeof( float ), /* this is changed later. */ \
|
||||
.info = 0x0 | BLIS_BITVAL_DENSE | \
|
||||
BLIS_BITVAL_GENERAL, \
|
||||
.info2 = 0x0, \
|
||||
.elem_size = sizeof( float ), /* this is changed later. */ \
|
||||
\
|
||||
.buffer = NULL, \
|
||||
.rs = 0, \
|
||||
.cs = 0, \
|
||||
.is = 1, \
|
||||
.buffer = NULL, \
|
||||
.rs = 0, \
|
||||
.cs = 0, \
|
||||
.is = 1, \
|
||||
\
|
||||
.scalar = { 0.0, 0.0 }, \
|
||||
.scalar = { 0.0, 0.0 }, \
|
||||
\
|
||||
.m_padded = 0, \
|
||||
.n_padded = 0, \
|
||||
.ps = 0, \
|
||||
.pd = 0, \
|
||||
.m_panel = 0, \
|
||||
.n_panel = 0, \
|
||||
.m_padded = 0, \
|
||||
.n_padded = 0, \
|
||||
.ps = 0, \
|
||||
.pd = 0, \
|
||||
.m_panel = 0, \
|
||||
.n_panel = 0, \
|
||||
\
|
||||
.user_data = NULL, \
|
||||
\
|
||||
.pack = NULL, \
|
||||
.ker = NULL, \
|
||||
.ukr = NULL \
|
||||
.pack_fn = NULL, \
|
||||
.pack_params = NULL, \
|
||||
.ker_fn = NULL, \
|
||||
.ker_params = NULL \
|
||||
}
|
||||
|
||||
// Define these macros here since they must be updated if contents of
|
||||
@@ -1329,77 +1311,75 @@ typedef struct obj_s
|
||||
|
||||
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
|
||||
{
|
||||
b->root = a->root;
|
||||
b->root = a->root;
|
||||
|
||||
b->off[0] = a->off[0];
|
||||
b->off[1] = a->off[1];
|
||||
b->dim[0] = a->dim[0];
|
||||
b->dim[1] = a->dim[1];
|
||||
b->diag_off = a->diag_off;
|
||||
b->off[0] = a->off[0];
|
||||
b->off[1] = a->off[1];
|
||||
b->dim[0] = a->dim[0];
|
||||
b->dim[1] = a->dim[1];
|
||||
b->diag_off = a->diag_off;
|
||||
|
||||
b->info = a->info;
|
||||
b->info2 = a->info2;
|
||||
b->elem_size = a->elem_size;
|
||||
b->info = a->info;
|
||||
b->info2 = a->info2;
|
||||
b->elem_size = a->elem_size;
|
||||
|
||||
b->buffer = a->buffer;
|
||||
b->rs = a->rs;
|
||||
b->cs = a->cs;
|
||||
b->is = a->is;
|
||||
b->buffer = a->buffer;
|
||||
b->rs = a->rs;
|
||||
b->cs = a->cs;
|
||||
b->is = a->is;
|
||||
|
||||
b->scalar = a->scalar;
|
||||
b->scalar = a->scalar;
|
||||
|
||||
//b->pack_mem = a->pack_mem;
|
||||
b->m_padded = a->m_padded;
|
||||
b->n_padded = a->n_padded;
|
||||
b->ps = a->ps;
|
||||
b->pd = a->pd;
|
||||
b->m_panel = a->m_panel;
|
||||
b->n_panel = a->n_panel;
|
||||
//b->pack_mem = a->pack_mem;
|
||||
b->m_padded = a->m_padded;
|
||||
b->n_padded = a->n_padded;
|
||||
b->ps = a->ps;
|
||||
b->pd = a->pd;
|
||||
b->m_panel = a->m_panel;
|
||||
b->n_panel = a->n_panel;
|
||||
|
||||
b->user_data = a->user_data;
|
||||
|
||||
b->pack = a->pack;
|
||||
b->ker = a->ker;
|
||||
b->ukr = a->ukr;
|
||||
b->pack_fn = a->pack_fn;
|
||||
b->pack_params = a->pack_params;
|
||||
b->ker_fn = a->ker_fn;
|
||||
b->ker_params = a->ker_params;
|
||||
}
|
||||
|
||||
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
|
||||
{
|
||||
b->root = a->root;
|
||||
b->root = a->root;
|
||||
|
||||
b->off[0] = a->off[0];
|
||||
b->off[1] = a->off[1];
|
||||
b->off[0] = a->off[0];
|
||||
b->off[1] = a->off[1];
|
||||
// Avoid copying m and n since they will be overwritten.
|
||||
//b->dim[0] = a->dim[0];
|
||||
//b->dim[1] = a->dim[1];
|
||||
b->diag_off = a->diag_off;
|
||||
//b->dim[0] = a->dim[0];
|
||||
//b->dim[1] = a->dim[1];
|
||||
b->diag_off = a->diag_off;
|
||||
|
||||
b->info = a->info;
|
||||
b->info2 = a->info2;
|
||||
b->elem_size = a->elem_size;
|
||||
b->info = a->info;
|
||||
b->info2 = a->info2;
|
||||
b->elem_size = a->elem_size;
|
||||
|
||||
b->buffer = a->buffer;
|
||||
b->rs = a->rs;
|
||||
b->cs = a->cs;
|
||||
b->is = a->is;
|
||||
b->buffer = a->buffer;
|
||||
b->rs = a->rs;
|
||||
b->cs = a->cs;
|
||||
b->is = a->is;
|
||||
|
||||
b->scalar = a->scalar;
|
||||
b->scalar = a->scalar;
|
||||
|
||||
// Avoid copying pack_mem entry.
|
||||
// FGVZ: You should probably make sure this is right.
|
||||
//b->pack_mem = a->pack_mem;
|
||||
b->m_padded = a->m_padded;
|
||||
b->n_padded = a->n_padded;
|
||||
b->ps = a->ps;
|
||||
b->pd = a->pd;
|
||||
b->m_panel = a->m_panel;
|
||||
b->n_panel = a->n_panel;
|
||||
//b->pack_mem = a->pack_mem;
|
||||
b->m_padded = a->m_padded;
|
||||
b->n_padded = a->n_padded;
|
||||
b->ps = a->ps;
|
||||
b->pd = a->pd;
|
||||
b->m_panel = a->m_panel;
|
||||
b->n_panel = a->n_panel;
|
||||
|
||||
b->user_data = a->user_data;
|
||||
|
||||
b->pack = a->pack;
|
||||
b->ker = a->ker;
|
||||
b->ukr = a->ukr;
|
||||
b->pack_fn = a->pack_fn;
|
||||
b->pack_params = a->pack_params;
|
||||
b->ker_fn = a->ker_fn;
|
||||
b->ker_params = a->ker_params;
|
||||
}
|
||||
|
||||
// Initializers for global scalar constants.
|
||||
|
||||
@@ -169,7 +169,6 @@ void libblis_test_gemm_ukr_experiment
|
||||
num_t datatype;
|
||||
|
||||
dim_t m, n, k;
|
||||
inc_t ldap, ldbp;
|
||||
|
||||
char sc_a = 'c';
|
||||
char sc_b = 'r';
|
||||
@@ -194,11 +193,6 @@ void libblis_test_gemm_ukr_experiment
|
||||
m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, cntx );
|
||||
n = bli_cntx_get_blksz_def_dt( datatype, BLIS_NR, cntx );
|
||||
|
||||
// Also query PACKMR and PACKNR as the leading dimensions to ap and bp,
|
||||
// respectively.
|
||||
ldap = bli_cntx_get_blksz_max_dt( datatype, BLIS_MR, cntx );
|
||||
ldbp = bli_cntx_get_blksz_max_dt( datatype, BLIS_NR, cntx );
|
||||
|
||||
// Store the register blocksizes so that the driver can retrieve the
|
||||
// values later when printing results.
|
||||
op->dim_aux[0] = m;
|
||||
@@ -237,7 +231,13 @@ void libblis_test_gemm_ukr_experiment
|
||||
libblis_test_mobj_randomize( params, TRUE, &c );
|
||||
bli_copym( &c, &c_save );
|
||||
|
||||
#if 0
|
||||
rntm_t rntm;
|
||||
bli_rntm_init( &rntm );
|
||||
bli_pba_rntm_set_pba( &rntm );
|
||||
|
||||
// Transpose B to B^T for packing.
|
||||
bli_obj_induce_trans( &b );
|
||||
|
||||
// Create pack objects for a and b, and pack them to ap and bp,
|
||||
// respectively.
|
||||
cntl_t* cntl_a = libblis_test_pobj_create
|
||||
@@ -248,56 +248,26 @@ void libblis_test_gemm_ukr_experiment
|
||||
BLIS_PACKED_ROW_PANELS,
|
||||
BLIS_BUFFER_FOR_A_BLOCK,
|
||||
&a, &ap,
|
||||
cntx
|
||||
cntx,
|
||||
&rntm
|
||||
);
|
||||
cntl_t* cntl_b = libblis_test_pobj_create
|
||||
(
|
||||
BLIS_KR,
|
||||
BLIS_NR,
|
||||
BLIS_KR,
|
||||
BLIS_NO_INVERT_DIAG,
|
||||
BLIS_PACKED_COL_PANELS,
|
||||
BLIS_BUFFER_FOR_B_PANEL,
|
||||
&b, &bp,
|
||||
cntx
|
||||
cntx,
|
||||
&rntm
|
||||
);
|
||||
#endif
|
||||
|
||||
// Create the packed objects. Use packmr and packnr as the leading
|
||||
// dimensions of ap and bp, respectively. Note that we use the ldims
|
||||
// instead of the matrix dimensions for allocation purposes here.
|
||||
// This is a little hacky and was prompted when trying to support
|
||||
// configurations such as power9 that employ duplication/broadcasting
|
||||
// of elements in one of the packed matrix objects. Thankfully, packm
|
||||
// doesn't care about those dimensions and instead relies on
|
||||
// information taken from the source object. Thus, this is merely
|
||||
// about coaxing bli_obj_create() into allocating enough space for our
|
||||
// purposes.
|
||||
bli_obj_create( datatype, ldap, k, 1, ldap, &ap );
|
||||
bli_obj_create( datatype, k, ldbp, ldbp, 1, &bp );
|
||||
// Transpose B^T back to B and Bp^T back to Bp.
|
||||
bli_obj_induce_trans( &b );
|
||||
bli_obj_induce_trans( &bp );
|
||||
|
||||
// Set up the objects for packing. Calling packm_init_pack() does everything
|
||||
// except checkout a memory pool block and save its address to the obj_t's.
|
||||
// However, it does overwrite the buffer field of packed object with that of
|
||||
// the source object (as a side-effect of bli_obj_alias_to(); that buffer
|
||||
// field would normally be overwritten yet again by the address from the
|
||||
// memory pool block). So, we have to save the buffer address that was
|
||||
// allocated so we can re-store it to the object afterward.
|
||||
void* buf_ap = bli_obj_buffer( &ap );
|
||||
void* buf_bp = bli_obj_buffer( &bp );
|
||||
bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_ROW_PANELS,
|
||||
BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER,
|
||||
BLIS_MR, BLIS_KR, &a, &ap, cntx );
|
||||
bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS,
|
||||
BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER,
|
||||
BLIS_KR, BLIS_NR, &b, &bp, cntx );
|
||||
bli_obj_set_buffer( buf_ap, &ap );
|
||||
bli_obj_set_buffer( buf_bp, &bp );
|
||||
|
||||
// Pack the data from the source objects.
|
||||
bli_packm_blk_var1( &a, &ap, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
|
||||
|
||||
// Repeat the experiment n_repeats times and record results.
|
||||
// Repeat the experiment n_repeats times and record results.
|
||||
for ( i = 0; i < n_repeats; ++i )
|
||||
{
|
||||
bli_copym( &c_save, &c );
|
||||
@@ -321,16 +291,10 @@ void libblis_test_gemm_ukr_experiment
|
||||
// Zero out performance and residual if output matrix is empty.
|
||||
libblis_test_check_empty_problem( &c, perf, resid );
|
||||
|
||||
#if 0
|
||||
// Free the control tree nodes and release their cached mem_t entries
|
||||
// back to the memory broker.
|
||||
bli_cntl_free( cntl_a, &BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED );
|
||||
#endif
|
||||
|
||||
// Free the packed objects.
|
||||
bli_obj_free( &ap );
|
||||
bli_obj_free( &bp );
|
||||
// back to the pba.
|
||||
bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
|
||||
|
||||
// Free the test objects.
|
||||
bli_obj_free( &a );
|
||||
|
||||
@@ -283,7 +283,10 @@ void libblis_test_gemmtrsm_ukr_experiment
|
||||
bli_copym( &b11, &c11 );
|
||||
bli_copym( &c11, &c11_save );
|
||||
|
||||
#if 0
|
||||
rntm_t rntm;
|
||||
bli_rntm_init( &rntm );
|
||||
bli_pba_rntm_set_pba( &rntm );
|
||||
|
||||
// Create pack objects for a and b, and pack them to ap and bp,
|
||||
// respectively.
|
||||
cntl_t* cntl_a = libblis_test_pobj_create
|
||||
@@ -294,59 +297,9 @@ void libblis_test_gemmtrsm_ukr_experiment
|
||||
BLIS_PACKED_ROW_PANELS,
|
||||
BLIS_BUFFER_FOR_A_BLOCK,
|
||||
&a, &ap,
|
||||
&cntx
|
||||
cntx,
|
||||
&rntm
|
||||
);
|
||||
cntl_t* cntl_b = libblis_test_pobj_create
|
||||
(
|
||||
BLIS_MR,
|
||||
BLIS_NR,
|
||||
BLIS_NO_INVERT_DIAG,
|
||||
BLIS_PACKED_COL_PANELS,
|
||||
BLIS_BUFFER_FOR_B_PANEL,
|
||||
&b, &bp,
|
||||
&cntx
|
||||
);
|
||||
#endif
|
||||
|
||||
// Create the packed objects. Use packmr and packnr as the leading
|
||||
// dimensions of ap and bp, respectively. Note that we use the ldims
|
||||
// instead of the matrix dimensions for allocation purposes here.
|
||||
// This is a little hacky and was prompted when trying to support
|
||||
// configurations such as power9 that employ duplication/broadcasting
|
||||
// of elements in one of the packed matrix objects. Thankfully, packm
|
||||
// doesn't care about those dimensions and instead relies on
|
||||
// information taken from the source object. Thus, this is merely
|
||||
// about coaxing bli_obj_create() into allocating enough space for our
|
||||
// purposes.
|
||||
bli_obj_create( datatype, ldap, k+m, 1, ldap, &ap );
|
||||
bli_obj_create( datatype, k+m, ldbp, ldbp, 1, &bp );
|
||||
|
||||
// We overwrite the m dimension of ap and n dimension of bp with
|
||||
// m and n, respectively, so that these objects contain the correct
|
||||
// logical dimensions. Recall that ldap and ldbp were used only to
|
||||
// induce bli_obj_create() to allocate sufficient memory for the
|
||||
// duplication in rare instances where the subconfig uses a gemm
|
||||
// ukernel that duplicates elements in one of the operands.
|
||||
bli_obj_set_length( m, &ap );
|
||||
bli_obj_set_width( n, &bp );
|
||||
|
||||
// Set up the objects for packing. Calling packm_init_pack() does everything
|
||||
// except checkout a memory pool block and save its address to the obj_t's.
|
||||
// However, it does overwrite the buffer field of packed object with that of
|
||||
// the source object (as a side-effect of bli_obj_alias_to(); that buffer
|
||||
// field would normally be overwritten yet again by the address from the
|
||||
// memory pool block). So, we have to save the buffer address that was
|
||||
// allocated so we can re-store it to the object afterward.
|
||||
void* buf_ap = bli_obj_buffer( &ap );
|
||||
void* buf_bp = bli_obj_buffer( &bp );
|
||||
bli_packm_init_pack( BLIS_INVERT_DIAG, BLIS_PACKED_ROW_PANELS,
|
||||
BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER,
|
||||
BLIS_MR, BLIS_KR, &a, &ap, cntx );
|
||||
bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS,
|
||||
BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER,
|
||||
BLIS_KR, BLIS_NR, &b, &bp, cntx );
|
||||
bli_obj_set_buffer( buf_ap, &ap );
|
||||
bli_obj_set_buffer( buf_bp, &bp );
|
||||
|
||||
// Set the diagonal offset of ap.
|
||||
if ( bli_is_lower( uploa ) ) { bli_obj_set_diag_offset( k, &ap ); }
|
||||
@@ -357,32 +310,45 @@ void libblis_test_gemmtrsm_ukr_experiment
|
||||
// to know how to initialize the subpartitions.
|
||||
bli_obj_set_uplo( uploa, &ap );
|
||||
|
||||
// Pack the data from the source objects.
|
||||
bli_packm_blk_var1( &a, &ap, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
|
||||
|
||||
// Create subpartitions from the a and b panels.
|
||||
bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp,
|
||||
&a1xp, &a11p, &bx1p, &b11p );
|
||||
|
||||
// Set the uplo field of a11p since the default for packed objects is
|
||||
// BLIS_DENSE, and the _ukernel() wrapper needs this information to
|
||||
// know which set of micro-kernels (lower or upper) to choose from.
|
||||
bli_obj_set_uplo( uploa, &a11p );
|
||||
|
||||
#if 0
|
||||
bli_printm( "a", &a, "%5.2f", "" );
|
||||
bli_printm( "ap", &ap, "%5.2f", "" );
|
||||
#endif
|
||||
|
||||
// Repeat the experiment n_repeats times and record results.
|
||||
cntl_t* cntl_b = NULL;
|
||||
|
||||
// Repeat the experiment n_repeats times and record results.
|
||||
for ( i = 0; i < n_repeats; ++i )
|
||||
{
|
||||
bli_copym( &c11_save, &c11 );
|
||||
|
||||
// Re-pack (restore) the contents of b to bp.
|
||||
//bli_packm_blk_var1( &b, &bp, &cntx, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
|
||||
// Transpose B to B^T for packing.
|
||||
bli_obj_induce_trans( &b );
|
||||
|
||||
cntl_b = libblis_test_pobj_create
|
||||
(
|
||||
BLIS_NR,
|
||||
BLIS_MR,
|
||||
BLIS_NO_INVERT_DIAG,
|
||||
BLIS_PACKED_COL_PANELS,
|
||||
BLIS_BUFFER_FOR_B_PANEL,
|
||||
&b, &bp,
|
||||
cntx,
|
||||
&rntm
|
||||
);
|
||||
|
||||
// Transpose B^T back to B and Bp^T back to Bp.
|
||||
bli_obj_induce_trans( &b );
|
||||
bli_obj_induce_trans( &bp );
|
||||
|
||||
// Create subpartitions from the a and b panels.
|
||||
bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp,
|
||||
&a1xp, &a11p, &bx1p, &b11p );
|
||||
|
||||
// Set the uplo field of a11p since the default for packed objects is
|
||||
// BLIS_DENSE, and the _ukernel() wrapper needs this information to
|
||||
// know which set of micro-kernels (lower or upper) to choose from.
|
||||
bli_obj_set_uplo( uploa, &a11p );
|
||||
|
||||
time = bli_clock();
|
||||
|
||||
@@ -391,6 +357,15 @@ bli_printm( "ap", &ap, "%5.2f", "" );
|
||||
cntx );
|
||||
|
||||
time_min = bli_clock_min_diff( time_min, time );
|
||||
|
||||
// On the last pass, we must keep the packed B buffer checked out in order
|
||||
// to perform the correctness check later.
|
||||
if ( i < n_repeats - 1 )
|
||||
{
|
||||
// Free the control tree nodes and release their cached mem_t entries
|
||||
// back to the memory broker.
|
||||
bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
|
||||
}
|
||||
}
|
||||
|
||||
// Estimate the performance of the best experiment repeat.
|
||||
@@ -426,16 +401,11 @@ bli_printm( "ap", &ap, "%5.2f", "" );
|
||||
// Zero out performance and residual if output matrix is empty.
|
||||
//libblis_test_check_empty_problem( &c11, perf, resid );
|
||||
|
||||
#if 0
|
||||
// Free the control tree nodes and release their cached mem_t entries
|
||||
// back to the memory broker.
|
||||
bli_cntl_free( cntl_a, &BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED );
|
||||
#endif
|
||||
|
||||
// Free the packed objects.
|
||||
bli_obj_free( &ap );
|
||||
bli_obj_free( &bp );
|
||||
// back to the pba.
|
||||
bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED );
|
||||
if ( cntl_b )
|
||||
bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
|
||||
|
||||
// Free the test objects.
|
||||
bli_obj_free( &a_big );
|
||||
|
||||
@@ -636,7 +636,7 @@ void libblis_test_read_op_info( test_ops_t* ops,
|
||||
int i, p;
|
||||
|
||||
// Initialize the operation type field.
|
||||
op->opid = opid;
|
||||
op->opid = opid;
|
||||
|
||||
// Read the line for the overall operation switch.
|
||||
libblis_test_read_next_line( buffer, input_stream );
|
||||
@@ -671,7 +671,7 @@ void libblis_test_read_op_info( test_ops_t* ops,
|
||||
//printf( "buffer[p]: %s\n", &buffer[p] );
|
||||
|
||||
// Advance until we hit non-whitespace (ie: the next number).
|
||||
for ( ; isspace( buffer[p] ); ++p ) ;
|
||||
for ( ; isspace( buffer[p] ); ++p ) ;
|
||||
|
||||
//printf( "buffer[p] after: %s\n", &buffer[p] );
|
||||
|
||||
@@ -680,7 +680,7 @@ void libblis_test_read_op_info( test_ops_t* ops,
|
||||
//printf( "dim[%d] = %d\n", i, op->dim_spec[i] );
|
||||
|
||||
// Advance until we hit whitespace (ie: the space before the next number).
|
||||
for ( ; !isspace( buffer[p] ); ++p ) ;
|
||||
for ( ; !isspace( buffer[p] ); ++p ) ;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -778,11 +778,11 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
|
||||
// convert these values into strings, with "unset" being used if the
|
||||
// value returned was -1 (indicating the environment variable was unset).
|
||||
dim_t nt = bli_thread_get_num_threads();
|
||||
dim_t jc_nt = bli_thread_get_jc_nt();
|
||||
dim_t pc_nt = bli_thread_get_pc_nt();
|
||||
dim_t ic_nt = bli_thread_get_ic_nt();
|
||||
dim_t jr_nt = bli_thread_get_jr_nt();
|
||||
dim_t ir_nt = bli_thread_get_ir_nt();
|
||||
dim_t jc_nt = bli_thread_get_jc_nt();
|
||||
dim_t pc_nt = bli_thread_get_pc_nt();
|
||||
dim_t ic_nt = bli_thread_get_ic_nt();
|
||||
dim_t jr_nt = bli_thread_get_jr_nt();
|
||||
dim_t ir_nt = bli_thread_get_ir_nt();
|
||||
|
||||
if ( nt == -1 ) sprintf( nt_str, "unset" );
|
||||
else sprintf( nt_str, "%d", ( int ) nt );
|
||||
@@ -1739,7 +1739,7 @@ void libblis_test_op_driver
|
||||
= ( char* ) malloc( ( n_operands + 1 ) * sizeof( char ) );
|
||||
|
||||
for ( o = 0; o < n_operands; ++o )
|
||||
{
|
||||
{
|
||||
unsigned int ij;
|
||||
operand_t operand_type
|
||||
= libblis_test_get_operand_type_for_char( o_types[o] );
|
||||
@@ -2181,7 +2181,7 @@ void libblis_test_op_driver
|
||||
ind_str = bli_ind_oper_get_avail_impl_string( op->opid, datatype );
|
||||
|
||||
// Loop over the requested parameter combinations.
|
||||
for ( pci = 0; pci < n_param_combos; ++pci )
|
||||
for ( pci = 0; pci < n_param_combos; ++pci )
|
||||
{
|
||||
// Loop over the requested problem sizes.
|
||||
for ( p_cur = p_first, pi = 1; p_cur <= p_max; p_cur += p_inc, ++pi )
|
||||
@@ -2403,7 +2403,7 @@ void libblis_test_build_function_string
|
||||
if ( strlen( funcname_str ) > MAX_FUNC_STRING_LENGTH )
|
||||
libblis_test_printf_error( "Function name string length (%d) exceeds maximum (%d).\n",
|
||||
strlen( funcname_str ), MAX_FUNC_STRING_LENGTH );
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -2545,7 +2545,7 @@ void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, c
|
||||
dim_t n_trans = n;
|
||||
dim_t rs = 1; // Initialization avoids a compiler warning.
|
||||
dim_t cs = 1; // Initialization avoids a compiler warning.
|
||||
|
||||
|
||||
// Apply the trans parameter to the dimensions (if needed).
|
||||
bli_set_dims_with_trans( trans, m, n, &m_trans, &n_trans );
|
||||
|
||||
@@ -2591,12 +2591,9 @@ void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, c
|
||||
}
|
||||
|
||||
|
||||
|
||||
#if 0
|
||||
cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx )
|
||||
cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm )
|
||||
{
|
||||
bool does_inv_diag;
|
||||
rntm_t rntm;
|
||||
|
||||
if ( inv_diag == BLIS_NO_INVERT_DIAG ) does_inv_diag = FALSE;
|
||||
else does_inv_diag = TRUE;
|
||||
@@ -2606,7 +2603,6 @@ cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdia
|
||||
(
|
||||
NULL, // we don't need the small block allocator from the runtime.
|
||||
NULL, // func ptr is not referenced b/c we don't call via l3 _int().
|
||||
bli_packm_blk_var1,
|
||||
bmult_id_m,
|
||||
bmult_id_n,
|
||||
does_inv_diag,
|
||||
@@ -2617,20 +2613,13 @@ cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdia
|
||||
NULL // no child node needed
|
||||
);
|
||||
|
||||
// Initialize a local-to-BLIS rntm_t. This is simply so we have something
|
||||
// to pass into bli_l3_packm(). The function doesn't (currently) use the
|
||||
// runtime object, and even if it did, one with default values would work
|
||||
// fine here.
|
||||
bli_rntm_init( &rntm );
|
||||
|
||||
// Pack the contents of A to P.
|
||||
bli_l3_packm( a, p, cntx, &rntm, cntl, &BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packm_blk_var1( a, p, cntx, rntm, cntl, &BLIS_PACKM_SINGLE_THREADED );
|
||||
|
||||
// Return the control tree pointer so the caller can free the cntl_t and its
|
||||
// mem_t entry later on.
|
||||
return cntl;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
void libblis_test_vobj_create( test_params_t* params, num_t dt, char storage, dim_t m, obj_t* x )
|
||||
@@ -2975,7 +2964,7 @@ void libblis_test_parse_message( FILE* output_stream, char* message, va_list arg
|
||||
char* the_string;
|
||||
char the_char;
|
||||
|
||||
// Begin looping over message to insert variables wherever there are
|
||||
// Begin looping over message to insert variables wherever there are
|
||||
// format specifiers.
|
||||
for ( c = 0; message[c] != '\0'; )
|
||||
{
|
||||
|
||||
@@ -418,7 +418,7 @@ void fill_string_with_n_spaces( char* str, unsigned int n_spaces );
|
||||
// --- Create object ---
|
||||
|
||||
void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, char storage, dim_t m, dim_t n, obj_t* a );
|
||||
cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx );
|
||||
cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm );
|
||||
void libblis_test_vobj_create( test_params_t* params, num_t dt, char storage, dim_t m, obj_t* x );
|
||||
|
||||
// --- Randomize/initialize object ---
|
||||
|
||||
@@ -171,7 +171,6 @@ void libblis_test_trsm_ukr_experiment
|
||||
num_t datatype;
|
||||
|
||||
dim_t m, n;
|
||||
inc_t ldap, ldbp;
|
||||
|
||||
char sc_a = 'c';
|
||||
char sc_b = 'r';
|
||||
@@ -196,11 +195,6 @@ void libblis_test_trsm_ukr_experiment
|
||||
m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, cntx );
|
||||
n = bli_cntx_get_blksz_def_dt( datatype, BLIS_NR, cntx );
|
||||
|
||||
// Also query PACKMR and PACKNR as the leading dimensions to ap and bp,
|
||||
// respectively.
|
||||
ldap = bli_cntx_get_blksz_max_dt( datatype, BLIS_MR, cntx );
|
||||
ldbp = bli_cntx_get_blksz_max_dt( datatype, BLIS_NR, cntx );
|
||||
|
||||
// Store the register blocksizes so that the driver can retrieve the
|
||||
// values later when printing results.
|
||||
op->dim_aux[0] = m;
|
||||
@@ -238,7 +232,10 @@ void libblis_test_trsm_ukr_experiment
|
||||
libblis_test_mobj_randomize( params, TRUE, &c );
|
||||
bli_copym( &c, &c_save );
|
||||
|
||||
#if 0
|
||||
rntm_t rntm;
|
||||
bli_rntm_init( &rntm );
|
||||
bli_pba_rntm_set_pba( &rntm );
|
||||
|
||||
// Create pack objects for a and b, and pack them to ap and bp,
|
||||
// respectively.
|
||||
cntl_t* cntl_a = libblis_test_pobj_create
|
||||
@@ -249,50 +246,9 @@ void libblis_test_trsm_ukr_experiment
|
||||
BLIS_PACKED_ROW_PANELS,
|
||||
BLIS_BUFFER_FOR_A_BLOCK,
|
||||
&a, &ap,
|
||||
cntx
|
||||
cntx,
|
||||
&rntm
|
||||
);
|
||||
cntl_t* cntl_b = libblis_test_pobj_create
|
||||
(
|
||||
BLIS_MR,
|
||||
BLIS_NR,
|
||||
BLIS_NO_INVERT_DIAG,
|
||||
BLIS_PACKED_COL_PANELS,
|
||||
BLIS_BUFFER_FOR_B_PANEL,
|
||||
&b, &bp,
|
||||
cntx
|
||||
);
|
||||
#endif
|
||||
|
||||
// Create the packed objects. Use packmr and packnr as the leading
|
||||
// dimensions of ap and bp, respectively. Note that we use the ldims
|
||||
// instead of the matrix dimensions for allocation purposes here.
|
||||
// This is a little hacky and was prompted when trying to support
|
||||
// configurations such as power9 that employ duplication/broadcasting
|
||||
// of elements in one of the packed matrix objects. Thankfully, packm
|
||||
// doesn't care about those dimensions and instead relies on
|
||||
// information taken from the source object. Thus, this is merely
|
||||
// about coaxing bli_obj_create() in allocating enough space for our
|
||||
// purposes.
|
||||
bli_obj_create( datatype, ldap, m, 1, ldap, &ap );
|
||||
bli_obj_create( datatype, m, ldbp, ldbp, 1, &bp );
|
||||
|
||||
// Set up the objects for packing. Calling packm_init_pack() does everything
|
||||
// except checkout a memory pool block and save its address to the obj_t's.
|
||||
// However, it does overwrite the buffer field of packed object with that of
|
||||
// the source object (as a side-effect of bli_obj_alias_to(); that buffer
|
||||
// field would normally be overwritten yet again by the address from the
|
||||
// memory pool block). So, we have to save the buffer address that was
|
||||
// allocated so we can re-store it to the object afterward.
|
||||
void* buf_ap = bli_obj_buffer( &ap );
|
||||
void* buf_bp = bli_obj_buffer( &bp );
|
||||
bli_packm_init_pack( BLIS_INVERT_DIAG, BLIS_PACKED_ROW_PANELS,
|
||||
BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER,
|
||||
BLIS_MR, BLIS_KR, &a, &ap, cntx );
|
||||
bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS,
|
||||
BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER,
|
||||
BLIS_KR, BLIS_NR, &b, &bp, cntx );
|
||||
bli_obj_set_buffer( buf_ap, &ap );
|
||||
bli_obj_set_buffer( buf_bp, &bp );
|
||||
|
||||
// Set the diagonal offset of ap.
|
||||
bli_obj_set_diag_offset( 0, &ap );
|
||||
@@ -302,24 +258,35 @@ void libblis_test_trsm_ukr_experiment
|
||||
// know which set of micro-kernels (lower or upper) to choose from.
|
||||
bli_obj_set_uplo( uploa, &ap );
|
||||
|
||||
// Pack the data from the source objects.
|
||||
bli_packm_blk_var1( &a, &ap, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
|
||||
|
||||
#if 0
|
||||
bli_printm( "a", &a, "%5.2f", "" );
|
||||
bli_printm( "ap", &ap, "%5.2f", "" );
|
||||
#endif
|
||||
|
||||
// Repeat the experiment n_repeats times and record results.
|
||||
// Repeat the experiment n_repeats times and record results.
|
||||
for ( i = 0; i < n_repeats; ++i )
|
||||
{
|
||||
// Re-pack the contents of b to bp.
|
||||
//bli_packm_blk_var1( &b, &bp, cntx, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
|
||||
|
||||
bli_copym( &c_save, &c );
|
||||
|
||||
// Transpose B to B^T for packing.
|
||||
bli_obj_induce_trans( &b );
|
||||
|
||||
cntl_t* cntl_b = libblis_test_pobj_create
|
||||
(
|
||||
BLIS_NR,
|
||||
BLIS_MR,
|
||||
BLIS_NO_INVERT_DIAG,
|
||||
BLIS_PACKED_COL_PANELS,
|
||||
BLIS_BUFFER_FOR_B_PANEL,
|
||||
&b, &bp,
|
||||
cntx,
|
||||
&rntm
|
||||
);
|
||||
|
||||
// Transpose B^T back to B and Bp^T back to Bp.
|
||||
bli_obj_induce_trans( &b );
|
||||
bli_obj_induce_trans( &bp );
|
||||
|
||||
time = bli_clock();
|
||||
|
||||
libblis_test_trsm_ukr_impl( iface, side,
|
||||
@@ -327,6 +294,10 @@ bli_printm( "ap", &ap, "%5.2f", "" );
|
||||
cntx );
|
||||
|
||||
time_min = bli_clock_min_diff( time_min, time );
|
||||
|
||||
// Free the control tree nodes and release their cached mem_t entries
|
||||
// back to the memory broker.
|
||||
bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
|
||||
}
|
||||
|
||||
// Estimate the performance of the best experiment repeat.
|
||||
@@ -339,16 +310,9 @@ bli_printm( "ap", &ap, "%5.2f", "" );
|
||||
// Zero out performance and residual if output matrix is empty.
|
||||
//libblis_test_check_empty_problem( &c, perf, resid );
|
||||
|
||||
#if 0
|
||||
// Free the control tree nodes and release their cached mem_t entries
|
||||
// back to the memory broker.
|
||||
bli_cntl_free( NULL, cntl_a, &BLIS_PACKM_SINGLE_THREADED );
|
||||
bli_cntl_free( NULL, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
|
||||
#endif
|
||||
|
||||
// Free the packed objects.
|
||||
bli_obj_free( &ap );
|
||||
bli_obj_free( &bp );
|
||||
bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED );
|
||||
|
||||
// Free the test objects.
|
||||
bli_obj_free( &a );
|
||||
|
||||
Reference in New Issue
Block a user