Enable user-customized packm ukernel/variant. (#549)

Details:
- Added four new fields to obj_t: .pack_fn, .pack_params, .ker_fn, and
  .ker_params. These fields store pointers to functions and data that
  will allow the user to more flexibly create custom operations while  
  recycling BLIS's existing partitioning infrastructure.
- Updated the typed API for the packm variant and structure-aware
  kernels to replace the diagonal offset with panel offsets, and to
  change the strides of both C and P to inc/ldim semantics. Updated the
  object API for the packm variant to include a rntm_t* argument.
- Removed the packm variant function pointer from the packm cntl_t node
  definition since it has been replaced by the .pack_fn pointer in the 
  obj_t.
- Updated bli_packm_int() to read the new packm variant function pointer
  from the obj_t and call it instead of from the cntl_t node.
- Moved some of the logic of bli_l3_packm.c to a new file,
  bli_packm_alloc.c.
- Rewrote bli_packm_blk_var1.c so that it uses byte (char*) pointers
  instead of typed pointers, allowing a single function to be used
  regardless of datatype. This obviated having a separate implementation
  in bli_packm_blk_var1_md.c. Also relegated handling of scalars to a 
  new function, bli_packm_scalar().
- Employed a new standard whereby right-hand matrix operands ("B") are
  always packed as column-stored row panels -- that is, identically to 
  that of left-hand matrix operands ("A"). This means that while we pack
  matrix A normally, we actually pack B in a transposed state. This
  allowed us to simplify a lot of code throughout the framework, and
  also affected some of the logic in bli_l3_packa() and _packb().
- Simplified bli_packm_init.c in light of the new B^T convention
  described above. bli_packm_init()--which is now called from within
  bli_packm_blk_var1()--also now calls bli_packm_alloc() and returns
  a bool that indicates whether packing should be performed (or
  skipped).
- Consolidated bli_gemm_int() and bli_trsm_int() into a bli_l3_int(),
  which, among other things, defaults the new .pack_fn field of the 
  obj_t to bli_packm_blk_var1() if the field is NULL.
- Defined a new function, bli_obj_reset_origin(), which permanently
  refocuses the view of an object so that it "forgets" any offsets from 
  its original pointer. This function also sets the object's root field 
  to itself. Calls to bli_obj_reset_origin() for each matrix operand
  appear in the _front() functions, after the obj_t's are aliased. This
  resetting of the underlying matrices' origins is needed in preparation
  for more advanced features from within custom packm kernels.
- Redefined bli_pba_rntm_set_pba() from a regular function to a static 
  inline function.
- Updated gemm_ukr, gemmtrsm_ukr, and trsm_ukr testsuite modules to use
  libblis_test_pobj_create() to create local packed objects. Previously,
  these packed objects were created by calling lower-level functions.
This commit is contained in:
Devin Matthews
2021-12-02 17:10:03 -06:00
committed by GitHub
parent e229e049ca
commit cf7d616a2f
71 changed files with 1268 additions and 3692 deletions

View File

@@ -1307,7 +1307,6 @@ bli_pba_init_pools
bli_pba_pool_size
bli_pba_query
bli_pba_release
bli_pba_rntm_set_pba
bli_memsys_finalize
bli_memsys_init
bli_mkherm

View File

@@ -50,21 +50,23 @@
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
bool invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_len, \
dim_t panel_dim_max, \
dim_t panel_len_max, \
dim_t panel_dim_off, \
dim_t panel_len_off, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
ctype* restrict c, inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t ldp, \
inc_t is_p, \
cntx_t* cntx \
cntx_t* cntx, \
void* params \
);
INSERT_GENTDEF( packm )

View File

@@ -48,6 +48,7 @@ typedef void (*PASTECH(opname,_var_oft)) \
obj_t* a, \
obj_t* p, \
cntx_t* cntx, \
rntm_t* rntm, \
cntl_t* cntl, \
thrinfo_t* thread \
);

View File

@@ -33,15 +33,15 @@
*/
#include "bli_packm_alloc.h"
#include "bli_packm_cntl.h"
#include "bli_packm_check.h"
#include "bli_packm_init.h"
#include "bli_packm_int.h"
#include "bli_packm_scalar.h"
#include "bli_packm_part.h"
#include "bli_packm_var.h"
#include "bli_packm_struc_cxk.h"
#include "bli_packm_struc_cxk_1er.h"
@@ -50,6 +50,8 @@
// Mixed datatype support.
#ifdef BLIS_ENABLE_GEMM_MD
#include "bli_packm_md.h"
#include "bli_packm_struc_cxk_md.h"
#endif
#include "bli_packm_blk_var1.h"

View File

@@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -33,78 +33,67 @@
*/
//
// Prototype object-based interfaces.
//
#include "blis.h"
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* c, \
obj_t* p, \
cntx_t* cntx, \
cntl_t* cntl, \
thrinfo_t* t \
);
void* bli_packm_alloc
(
siz_t size_needed,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
// Query the pack buffer type from the control tree node.
packbuf_t pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl );
GENPROT( packm_unb_var1 )
GENPROT( packm_blk_var1 )
// Query the address of the mem_t entry within the control tree node.
mem_t* cntl_mem_p = bli_cntl_pack_mem( cntl );
//
// Prototype BLAS-like interfaces with void pointer operands.
//
mem_t* local_mem_p;
mem_t local_mem_s;
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
trans_t transc, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
cntx_t* cntx \
);
siz_t cntl_mem_size = 0;
INSERT_GENTPROT_BASIC0( packm_unb_var1 )
if ( bli_mem_is_alloc( cntl_mem_p ) )
cntl_mem_size = bli_mem_size( cntl_mem_p );
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
trans_t transc, \
pack_t schema, \
bool invdiag, \
bool revifup, \
bool reviflo, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, \
dim_t pd_p, inc_t ps_p, \
void_fp packm_ker, \
cntx_t* cntx, \
thrinfo_t* thread \
);
if ( cntl_mem_size < size_needed )
{
if ( bli_thread_am_ochief( thread ) )
{
// The chief thread releases the existing block associated with
// the mem_t entry in the control tree, and then re-acquires a
// new block, saving the associated mem_t entry to local_mem_s.
if ( bli_mem_is_alloc( cntl_mem_p ) )
{
bli_pba_release
(
rntm,
cntl_mem_p
);
}
bli_pba_acquire_m
(
rntm,
size_needed,
pack_buf_type,
&local_mem_s
);
}
INSERT_GENTPROT_BASIC0( packm_blk_var1 )
// Broadcast the address of the chief thread's local mem_t entry to
// all threads.
local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
// Save the chief thread's local mem_t entry to the mem_t field in
// this thread's control tree node.
*cntl_mem_p = *local_mem_p;
// Barrier so that the master thread doesn't return from the function
// before we are done reading.
bli_thread_barrier( thread );
}
return bli_mem_buffer( cntl_mem_p );
}

View File

@@ -5,7 +5,6 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -33,13 +32,11 @@
*/
void bli_l3_packm
(
obj_t* x,
obj_t* x_pack,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
);
BLIS_EXPORT_BLIS void* bli_packm_alloc
(
siz_t size_needed,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
);

View File

@@ -35,35 +35,6 @@
#include "blis.h"
#define FUNCPTR_T packm_fp
typedef void (*FUNCPTR_T)
(
struc_t strucc,
doff_t diagoffc,
diag_t diagc,
uplo_t uploc,
trans_t transc,
pack_t schema,
bool invdiag,
bool revifup,
bool reviflo,
dim_t m,
dim_t n,
dim_t m_max,
dim_t n_max,
void* kappa,
void* c, inc_t rs_c, inc_t cs_c,
void* p, inc_t rs_p, inc_t cs_p,
inc_t is_p,
dim_t pd_p, inc_t ps_p,
void_fp packm_ker,
cntx_t* cntx,
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,packm_blk_var1);
static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
{
@@ -79,614 +50,265 @@ static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
NULL, bli_zpackm_struc_cxk_1er, } },
};
static void_fp GENARRAY2_ALL(packm_struc_cxk_md,packm_struc_cxk_md);
void bli_packm_blk_var1
(
obj_t* c,
obj_t* p,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* t
thrinfo_t* thread
)
{
#ifdef BLIS_ENABLE_GEMM_MD
// Call a different packm implementation when the storage and target
// datatypes differ.
if ( bli_obj_dt( c ) != bli_obj_target_dt( c ) )
{
bli_packm_blk_var1_md( c, p, cntx, cntl, t );
// Extract various fields from the control tree.
pack_t schema = bli_cntl_packm_params_pack_schema( cntl );
bool invdiag = bli_cntl_packm_params_does_invert_diag( cntl );
bool revifup = bli_cntl_packm_params_rev_iter_if_upper( cntl );
bool reviflo = bli_cntl_packm_params_rev_iter_if_lower( cntl );
// Every thread initializes p and determines the size of memory
// block needed (which gets embedded into the otherwise "blank" mem_t
// entry in the control tree node). Return early if no packing is required.
if ( !bli_packm_init( c, p, cntx, rntm, cntl, thread ) )
return;
}
#endif
num_t dt_p = bli_obj_dt( p );
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_packm_int_check( c, p, cntx );
struc_t strucc = bli_obj_struc( c );
doff_t diagoffc = bli_obj_diag_offset( c );
diag_t diagc = bli_obj_diag( c );
uplo_t uploc = bli_obj_uplo( c );
trans_t transc = bli_obj_conjtrans_status( c );
pack_t schema = bli_obj_pack_schema( p );
bool invdiag = bli_obj_has_inverted_diag( p );
bool revifup = bli_obj_is_pack_rev_if_upper( p );
bool reviflo = bli_obj_is_pack_rev_if_lower( p );
num_t dt_c = bli_obj_dt( c );
dim_t dt_c_size = bli_dt_size( dt_c );
dim_t m_p = bli_obj_length( p );
dim_t n_p = bli_obj_width( p );
dim_t m_max_p = bli_obj_padded_length( p );
dim_t n_max_p = bli_obj_padded_width( p );
num_t dt_p = bli_obj_dt( p );
dim_t dt_p_size = bli_dt_size( dt_p );
void* buf_c = bli_obj_buffer_at_off( c );
inc_t rs_c = bli_obj_row_stride( c );
inc_t cs_c = bli_obj_col_stride( c );
struc_t strucc = bli_obj_struc( c );
doff_t diagoffc = bli_obj_diag_offset( c );
diag_t diagc = bli_obj_diag( c );
uplo_t uploc = bli_obj_uplo( c );
conj_t conjc = bli_obj_conj_status( c );
void* buf_p = bli_obj_buffer_at_off( p );
inc_t rs_p = bli_obj_row_stride( p );
inc_t cs_p = bli_obj_col_stride( p );
inc_t is_p = bli_obj_imag_stride( p );
dim_t pd_p = bli_obj_panel_dim( p );
inc_t ps_p = bli_obj_panel_stride( p );
dim_t iter_dim = bli_obj_length( p );
dim_t panel_len_full = bli_obj_width( p );
dim_t panel_len_max = bli_obj_padded_width( p );
obj_t kappa;
void* buf_kappa;
char* c_cast = bli_obj_buffer_at_off( c );
inc_t incc = bli_obj_row_stride( c );
inc_t ldc = bli_obj_col_stride( c );
dim_t panel_dim_off = bli_obj_row_off( c );
dim_t panel_len_off = bli_obj_col_off( c );
func_t* packm_kers;
void_fp packm_ker;
char* p_cast = bli_obj_buffer( p );
inc_t ldp = bli_obj_col_stride( p );
inc_t is_p = bli_obj_imag_stride( p );
dim_t panel_dim_max = bli_obj_panel_dim( p );
inc_t ps_p = bli_obj_panel_stride( p );
FUNCPTR_T f;
doff_t diagoffc_inc = ( doff_t )panel_dim_max;
obj_t kappa_local;
char* kappa_cast = bli_packm_scalar( &kappa_local, p );
// Treatment of kappa (ie: packing during scaling) depends on
// whether we are executing an induced method.
if ( bli_is_nat_packed( schema ) )
// we use the default lookup table to determine the right func_t
// for the current schema.
func_t* packm_kers = &packm_struc_cxk_kers[ bli_pack_schema_index( schema ) ];
// Query the datatype-specific function pointer from the func_t object.
packm_ker_vft packm_ker_cast = bli_func_get_dt( dt_p, packm_kers );
// For mixed-precision gemm, select the proper kernel (only dense panels).
if ( dt_c != dt_p )
{
// This branch is for native execution, where we assume that
// the micro-kernel will always apply the alpha scalar of the
// higher-level operation. Thus, we use BLIS_ONE for kappa so
// that the underlying packm implementation does not perform
// any scaling during packing.
buf_kappa = bli_obj_buffer_for_const( dt_p, &BLIS_ONE );
packm_ker_cast = packm_struc_cxk_md[ dt_c ][ dt_p ];
}
else // if ( bli_is_ind_packed( schema ) )
{
obj_t* kappa_p;
// The value for kappa we use will depend on whether the scalar
// attached to A has a nonzero imaginary component. If it does,
// then we will apply the scalar during packing to facilitate
// implementing induced complex domain algorithms in terms of
// real domain micro-kernels. (In the aforementioned situation,
// applying a real scalar is easy, but applying a complex one is
// harder, so we avoid the need altogether with the code below.)
if ( bli_obj_scalar_has_nonzero_imag( p ) )
// Query the address of the packm params field of the obj_t. The user might
// have set this field in order to specify a custom packm kernel.
packm_blk_var1_params_t* params = bli_obj_pack_params( c );
if ( params && params->ukr_fn[ dt_c ][ dt_p ] )
{
// Query the user-provided packing kernel from the obj_t. If provided,
// this overrides the kernel determined above.
packm_ker_cast = params->ukr_fn[ dt_c ][ dt_p ];
}
/* Compute the total number of iterations we'll need. */
dim_t n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 );
/* Set the initial values and increments for indices related to C and P
based on whether reverse iteration was requested. */
dim_t ic0, ip0;
doff_t ic_inc, ip_inc;
if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) ||
( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) )
{
ic0 = (n_iter - 1) * panel_dim_max;
ic_inc = -panel_dim_max;
ip0 = n_iter - 1;
ip_inc = -1;
}
else
{
ic0 = 0;
ic_inc = panel_dim_max;
ip0 = 0;
ip_inc = 1;
}
// Query the number of threads and thread ids from the current thread's
// packm thrinfo_t node.
const dim_t nt = bli_thread_n_way( thread );
const dim_t tid = bli_thread_work_id( thread );
// Determine the thread range and increment using the current thread's
// packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
// will depend on whether slab or round-robin partitioning was requested
// at configure-time.
dim_t it_start, it_end, it_inc;
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc );
char* p_begin = p_cast;
// Iterate over every logical micropanel in the source matrix.
for ( dim_t ic = ic0, ip = ip0, it = 0; it < n_iter;
ic += ic_inc, ip += ip_inc, it += 1 )
{
dim_t panel_dim_i = bli_min( panel_dim_max, iter_dim - ic );
dim_t panel_dim_off_i = panel_dim_off + ic;
doff_t diagoffc_i = diagoffc + (ip )*diagoffc_inc;
char* c_begin = c_cast + (ic )*incc*dt_c_size;
inc_t p_inc = ps_p;
// NOTE: We MUST use round-robin partitioning when packing
// micropanels of a triangular matrix. Hermitian/symmetric
// and general packing may use slab or round-robin, depending
// on which was selected at configure-time.
// The definition of bli_packm_my_iter() will depend on whether slab
// or round-robin partitioning was requested at configure-time.
bool my_iter = bli_is_triangular( strucc )
? bli_packm_my_iter_rr( it, it_start, it_end, tid, nt )
: bli_packm_my_iter ( it, it_start, it_end, tid, nt );
if ( bli_is_triangular( strucc ) &&
bli_is_unstored_subpart_n( diagoffc_i, uploc, panel_dim_i, panel_len_full ) )
{
//printf( "applying non-zero imag kappa\n" );
// This case executes if the panel belongs to a triangular
// matrix AND is completely unstored (ie: zero). If the panel
// is unstored, we do nothing. (Notice that we don't even
// increment p_begin.)
// Detach the scalar.
bli_obj_scalar_detach( p, &kappa );
continue;
}
else if ( bli_is_triangular( strucc ) &&
bli_intersects_diag_n( diagoffc_i, panel_dim_i, panel_len_full ) )
{
// This case executes if the panel belongs to a triangular
// matrix AND is diagonal-intersecting. Notice that we
// cannot bury the following conditional logic into
// packm_struc_cxk() because we need to know the value of
// panel_len_max_i so we can properly increment p_inc.
// Reset the attached scalar (to 1.0).
bli_obj_scalar_reset( p );
// Sanity check. Diagonals should not intersect the short end of
// a micro-panel. If they do, then somehow the constraints on
// cache blocksizes being a whole multiple of the register
// blocksizes was somehow violated.
if ( diagoffc_i < 0 )
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
kappa_p = &kappa;
dim_t panel_off_i;
dim_t panel_len_i;
dim_t panel_len_max_i;
if ( bli_is_lower( uploc ) )
{
panel_off_i = 0;
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i;
panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max,
panel_len_max );
}
else // if ( bli_is_upper( uploc ) )
{
panel_off_i = bli_abs( diagoffc_i );
panel_len_i = panel_len_full - panel_off_i;
panel_len_max_i = panel_len_max - panel_off_i;
}
dim_t panel_len_off_i = panel_off_i + panel_len_off;
char* c_use = c_begin + (panel_off_i )*ldc*dt_c_size;
char* p_use = p_begin;
// We need to re-compute the imaginary stride as a function of
// panel_len_max_i since triangular packed matrices have panels
// of varying lengths. NOTE: This imaginary stride value is
// only referenced by the packm kernels for induced methods.
inc_t is_p_use = ldp * panel_len_max_i;
// We nudge the imaginary stride up by one if it is odd.
is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 );
if ( my_iter )
{
packm_ker_cast( strucc,
diagc,
uploc,
conjc,
schema,
invdiag,
panel_dim_i,
panel_len_i,
panel_dim_max,
panel_len_max_i,
panel_dim_off_i,
panel_len_off_i,
kappa_cast,
c_use, incc, ldc,
p_use, ldp,
is_p_use,
cntx,
params );
}
// NOTE: This value is usually LESS than ps_p because triangular
// matrices usually have several micro-panels that are shorter
// than a "full" micro-panel.
p_inc = is_p_use;
}
else
{
// If the internal scalar of A has only a real component, then
// we will apply it later (in the micro-kernel), and so we will
// use BLIS_ONE to indicate no scaling during packing.
kappa_p = &BLIS_ONE;
// This case executes if the panel is either dense, or belongs
// to a Hermitian or symmetric matrix, which includes stored,
// unstored, and diagonal-intersecting panels.
if ( my_iter )
{
packm_ker_cast( bli_is_triangular( strucc ) ? BLIS_GENERAL : strucc,
diagc,
uploc,
conjc,
schema,
invdiag,
panel_dim_i,
panel_len_full,
panel_dim_max,
panel_len_max,
panel_dim_off_i,
panel_len_off,
kappa_cast,
c_begin, incc, ldc,
p_begin, ldp, is_p,
cntx,
params );
}
}
// Acquire the buffer to the kappa chosen above.
buf_kappa = bli_obj_buffer_for_1x1( dt_p, kappa_p );
p_begin += p_inc*dt_p_size;
}
// The original idea here was to read the packm_ukr from the context
// if it is non-NULL. The problem is, it requires that we be able to
// assume that the packm_ukr field is initialized to NULL, which it
// currently is not.
//func_t* cntx_packm_kers = bli_cntx_get_packm_ukr( cntx );
//if ( bli_func_is_null_dt( dt_c, cntx_packm_kers ) )
{
// If the packm structure-aware kernel func_t in the context is
// NULL (which is the default value after the context is created),
// we use the default lookup table to determine the right func_t
// for the current schema.
const dim_t i = bli_pack_schema_index( schema );
packm_kers = &packm_struc_cxk_kers[ i ];
}
#if 0
else // cntx's packm func_t overrides
{
// If the packm structure-aware kernel func_t in the context is
// non-NULL (ie: assumed to be valid), we use that instead.
//packm_kers = bli_cntx_packm_ukrs( cntx );
packm_kers = cntx_packm_kers;
}
#endif
// Query the datatype-specific function pointer from the func_t object.
packm_ker = bli_func_get_dt( dt_p, packm_kers );
// Index into the type combination array to extract the correct
// function pointer.
f = ftypes[dt_p];
// Invoke the function.
f( strucc,
diagoffc,
diagc,
uploc,
transc,
schema,
invdiag,
revifup,
reviflo,
m_p,
n_p,
m_max_p,
n_max_p,
buf_kappa,
buf_c, rs_c, cs_c,
buf_p, rs_p, cs_p,
is_p,
pd_p, ps_p,
packm_ker,
cntx,
t );
}
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
trans_t transc, \
pack_t schema, \
bool invdiag, \
bool revifup, \
bool reviflo, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, \
dim_t pd_p, inc_t ps_p, \
void_fp packm_ker, \
cntx_t* cntx, \
thrinfo_t* thread \
) \
{ \
PASTECH2(ch,opname,_ker_ft) packm_ker_cast = packm_ker; \
\
ctype* restrict kappa_cast = kappa; \
ctype* restrict c_cast = c; \
ctype* restrict p_cast = p; \
ctype* restrict c_begin; \
ctype* restrict p_begin; \
\
dim_t iter_dim; \
dim_t n_iter; \
dim_t it, ic, ip; \
dim_t ic0, ip0; \
doff_t ic_inc, ip_inc; \
doff_t diagoffc_i; \
doff_t diagoffc_inc; \
dim_t panel_len_full; \
dim_t panel_len_i; \
dim_t panel_len_max; \
dim_t panel_len_max_i; \
dim_t panel_dim_i; \
dim_t panel_dim_max; \
dim_t panel_off_i; \
inc_t vs_c; \
inc_t ldc; \
inc_t ldp, p_inc; \
dim_t* m_panel_full; \
dim_t* n_panel_full; \
dim_t* m_panel_use; \
dim_t* n_panel_use; \
dim_t* m_panel_max; \
dim_t* n_panel_max; \
conj_t conjc; \
bool row_stored; \
bool col_stored; \
inc_t is_p_use; \
\
ctype* restrict c_use; \
ctype* restrict p_use; \
doff_t diagoffp_i; \
\
\
/* If C is zeros and part of a triangular matrix, then we don't need
to pack it. */ \
if ( bli_is_zeros( uploc ) && \
bli_is_triangular( strucc ) ) return; \
\
/* Extract the conjugation bit from the transposition argument. */ \
conjc = bli_extract_conj( transc ); \
\
/* If c needs a transposition, induce it so that we can more simply
express the remaining parameters and code. */ \
if ( bli_does_trans( transc ) ) \
{ \
bli_swap_incs( &rs_c, &cs_c ); \
bli_negate_diag_offset( &diagoffc ); \
bli_toggle_uplo( &uploc ); \
bli_toggle_trans( &transc ); \
} \
\
/* Create flags to incidate row or column storage. Note that the
schema bit that encodes row or column is describing the form of
micro-panel, not the storage in the micro-panel. Hence the
mismatch in "row" and "column" semantics. */ \
row_stored = bli_is_col_packed( schema ); \
col_stored = bli_is_row_packed( schema ); \
\
/* If the row storage flag indicates row storage, then we are packing
to column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( row_stored ) \
{ \
/* Prepare to pack to row-stored column panels. */ \
iter_dim = n; \
panel_len_full = m; \
panel_len_max = m_max; \
panel_dim_max = pd_p; \
ldc = rs_c; \
vs_c = cs_c; \
diagoffc_inc = -( doff_t )panel_dim_max; \
ldp = rs_p; \
m_panel_full = &m; \
n_panel_full = &panel_dim_i; \
m_panel_use = &panel_len_i; \
n_panel_use = &panel_dim_i; \
m_panel_max = &panel_len_max_i; \
n_panel_max = &panel_dim_max; \
} \
else /* if ( col_stored ) */ \
{ \
/* Prepare to pack to column-stored row panels. */ \
iter_dim = m; \
panel_len_full = n; \
panel_len_max = n_max; \
panel_dim_max = pd_p; \
ldc = cs_c; \
vs_c = rs_c; \
diagoffc_inc = ( doff_t )panel_dim_max; \
ldp = cs_p; \
m_panel_full = &panel_dim_i; \
n_panel_full = &n; \
m_panel_use = &panel_dim_i; \
n_panel_use = &panel_len_i; \
m_panel_max = &panel_dim_max; \
n_panel_max = &panel_len_max_i; \
} \
\
/* Compute the total number of iterations we'll need. */ \
n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
\
/* Set the initial values and increments for indices related to C and P
based on whether reverse iteration was requested. */ \
if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
{ \
ic0 = (n_iter - 1) * panel_dim_max; \
ic_inc = -panel_dim_max; \
ip0 = n_iter - 1; \
ip_inc = -1; \
} \
else \
{ \
ic0 = 0; \
ic_inc = panel_dim_max; \
ip0 = 0; \
ip_inc = 1; \
} \
\
p_begin = p_cast; \
\
/* Query the number of threads and thread ids from the current thread's
packm thrinfo_t node. */ \
const dim_t nt = bli_thread_n_way( thread ); \
const dim_t tid = bli_thread_work_id( thread ); \
\
dim_t it_start, it_end, it_inc; \
\
/* Determine the thread range and increment using the current thread's
packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
will depend on whether slab or round-robin partitioning was requested
at configure-time. */ \
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
\
/* Iterate over every logical micropanel in the source matrix. */ \
for ( ic = ic0, ip = ip0, it = 0; it < n_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
{ \
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
c_begin = c_cast + (ic )*vs_c; \
\
if ( bli_is_triangular( strucc ) && \
bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
{ \
/* This case executes if the panel belongs to a triangular
matrix AND is completely unstored (ie: zero). If the panel
is unstored, we do nothing. (Notice that we don't even
increment p_begin.) */ \
\
continue; \
} \
else if ( bli_is_triangular( strucc ) && \
bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
{ \
/* This case executes if the panel belongs to a triangular
matrix AND is diagonal-intersecting. Notice that we
cannot bury the following conditional logic into
packm_struc_cxk() because we need to know the value of
panel_len_max_i so we can properly increment p_inc. */ \
\
/* Sanity check. Diagonals should not intersect the short end of
a micro-panel. If they do, then somehow the constraints on
cache blocksizes being a whole multiple of the register
blocksizes was somehow violated. */ \
if ( ( col_stored && diagoffc_i < 0 ) || \
( row_stored && diagoffc_i > 0 ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
if ( ( row_stored && bli_is_upper( uploc ) ) || \
( col_stored && bli_is_lower( uploc ) ) ) \
{ \
panel_off_i = 0; \
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max, \
panel_len_max ); \
diagoffp_i = diagoffc_i; \
} \
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
( col_stored && bli_is_upper( uploc ) ) ) */ \
{ \
panel_off_i = bli_abs( diagoffc_i ); \
panel_len_i = panel_len_full - panel_off_i; \
panel_len_max_i = panel_len_max - panel_off_i; \
diagoffp_i = 0; \
} \
\
c_use = c_begin + (panel_off_i )*ldc; \
p_use = p_begin; \
\
/* We need to re-compute the imaginary stride as a function of
panel_len_max_i since triangular packed matrices have panels
of varying lengths. NOTE: This imaginary stride value is
only referenced by the packm kernels for induced methods. */ \
is_p_use = ldp * panel_len_max_i; \
\
/* We nudge the imaginary stride up by one if it is odd. */ \
is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); \
\
/* NOTE: We MUST use round-robin partitioning when packing
micropanels of a triangular matrix. Hermitian/symmetric
and general packing may use slab or round-robin, depending
on which was selected at configure-time. */ \
if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) \
{ \
packm_ker_cast( strucc, \
diagoffp_i, \
diagc, \
uploc, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
/* NOTE: This value is usually LESS than ps_p because triangular
matrices usually have several micro-panels that are shorter
than a "full" micro-panel. */ \
p_inc = is_p_use; \
} \
else if ( bli_is_herm_or_symm( strucc ) ) \
{ \
/* This case executes if the panel belongs to a Hermitian or
symmetric matrix, which includes stored, unstored, and
diagonal-intersecting panels. */ \
\
c_use = c_begin; \
p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
is_p_use = is_p; \
\
/* The definition of bli_packm_my_iter() will depend on whether slab
or round-robin partitioning was requested at configure-time. */ \
if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
{ \
packm_ker_cast( strucc, \
diagoffc_i, \
diagc, \
uploc, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
p_inc = ps_p; \
} \
else \
{ \
/* This case executes if the panel is general, or, if the
panel is part of a triangular matrix and is neither unstored
(ie: zero) nor diagonal-intersecting. */ \
\
c_use = c_begin; \
p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
is_p_use = is_p; \
\
/* The definition of bli_packm_my_iter() will depend on whether slab
or round-robin partitioning was requested at configure-time. */ \
if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
{ \
packm_ker_cast( BLIS_GENERAL, \
0, \
diagc, \
BLIS_DENSE, \
conjc, \
schema, \
invdiag, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p_use, \
cntx ); \
} \
\
/* NOTE: This value is equivalent to ps_p. */ \
p_inc = ps_p; \
} \
\
p_begin += p_inc; \
\
} \
}
INSERT_GENTFUNCR_BASIC( packm, packm_blk_var1 )
/*
if ( row_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
if ( col_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
*/
/*
if ( row_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b packed", *m_panel_max, *n_panel_max, \
p_use, rs_p, cs_p, "%5.2f", "" ); \
else \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a packed", *m_panel_max, *n_panel_max, \
p_use, rs_p, cs_p, "%5.2f", "" ); \
*/ \
\
/*
if ( col_stored ) { \
if ( bli_thread_work_id( thread ) == 0 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_barrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_barrier( thread ); \
} \
else { \
if ( bli_thread_work_id( thread ) == 0 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_barrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_barrier( thread ); \
} \
*/
/*
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
*/
/*
if ( row_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
inc_t is_b = rs_p * *m_panel_max; \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
} \
*/
/*
if ( col_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
} \
*/

View File

@@ -0,0 +1,59 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// packm params types.
//
typedef struct
{
// Table of user-selectable packing ukernel function pointers. The table
// is indexed first by the storage datatype of C (the source matrix) and
// second by the storage datatype of P (the packed destination), per the
// column labels below.
// Type of C Type of P
packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES];
} packm_blk_var1_params_t;
//
// Prototype object-based interfaces.
//
// Datatype-agnostic blocked packm variant: packs object c into object p
// using the context, runtime, control tree, and thread info provided.
BLIS_EXPORT_BLIS void bli_packm_blk_var1
(
obj_t* c,
obj_t* p,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* t
);

View File

@@ -1,344 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifdef BLIS_ENABLE_GEMM_MD
// Function-pointer type for the typed (per datatype pair) implementations
// of packm_blk_var1_md that are generated by the GENTFUNC2 macro below.
#define FUNCPTR_T packm_fp
typedef void (*FUNCPTR_T)(
trans_t transc,
pack_t schema,
dim_t m,
dim_t n,
dim_t m_max,
dim_t n_max,
void* kappa,
void* c, inc_t rs_c, inc_t cs_c,
void* p, inc_t rs_p, inc_t cs_p,
inc_t is_p,
dim_t pd_p, inc_t ps_p,
cntx_t* cntx,
thrinfo_t* thread
);
// 2D dispatch table of typed implementations, indexed by the datatypes of
// C and P (all combinations, including mixed-domain/precision pairings).
static FUNCPTR_T GENARRAY2_ALL(ftypes,packm_blk_var1_md);
void bli_packm_blk_var1_md
     (
       obj_t*     c,
       obj_t*     p,
       cntx_t*    cntx,
       cntl_t*    cntl,
       thrinfo_t* t
     )
{
	// Object-based wrapper for the mixed-datatype blocked packm variant.
	// Extracts the raw fields from c and p, resolves the kappa scalar to
	// apply during packing, and dispatches to the typed implementation
	// selected by the (dt_c, dt_p) datatype pair.

	// Query the storage datatypes of the source and packed objects; these
	// jointly select the typed implementation invoked at the end.
	num_t   dt_c   = bli_obj_dt( c );
	num_t   dt_p   = bli_obj_dt( p );

	// Query the trans/conj status of c and the pack schema of p.
	trans_t transc = bli_obj_conjtrans_status( c );
	pack_t  schema = bli_obj_pack_schema( p );

	// Query the actual and padded dimensions of p.
	dim_t   m_p     = bli_obj_length( p );
	dim_t   n_p     = bli_obj_width( p );
	dim_t   m_max_p = bli_obj_padded_length( p );
	dim_t   n_max_p = bli_obj_padded_width( p );

	// Query the buffer and strides of c.
	void*   buf_c  = bli_obj_buffer_at_off( c );
	inc_t   rs_c   = bli_obj_row_stride( c );
	inc_t   cs_c   = bli_obj_col_stride( c );

	// Query the buffer, strides, imaginary stride, and panel fields of p.
	void*   buf_p  = bli_obj_buffer_at_off( p );
	inc_t   rs_p   = bli_obj_row_stride( p );
	inc_t   cs_p   = bli_obj_col_stride( p );
	inc_t   is_p   = bli_obj_imag_stride( p );
	dim_t   pd_p   = bli_obj_panel_dim( p );
	inc_t   ps_p   = bli_obj_panel_stride( p );

	// NOTE: kappa must live at function scope since buf_kappa may refer to
	// it (via bli_obj_buffer_for_1x1()) when the scalar is detached below.
	obj_t   kappa;
	void*   buf_kappa;

	// Treatment of kappa (ie: packing during scaling) depends on
	// whether we are executing an induced method.
	if ( bli_is_nat_packed( schema ) )
	{
		// Native execution: we assume the micro-kernel will always apply
		// the alpha scalar of the higher-level operation, and so we pack
		// with BLIS_ONE for kappa so that the underlying packm
		// implementation performs no scaling during packing.
		buf_kappa = bli_obj_buffer_for_const( dt_p, &BLIS_ONE );
	}
	else // if ( bli_is_ind_packed( schema ) )
	{
		obj_t* kappa_p;

		// Induced complex-domain algorithms are implemented in terms of
		// real-domain micro-kernels, for which applying a real scalar is
		// easy but applying a complex one is hard. So, if the scalar
		// attached to p has a nonzero imaginary component, we apply it
		// here (during packing) and reset the attached scalar to 1.0;
		// otherwise we defer the (purely real) scalar to the micro-kernel
		// by using BLIS_ONE here.
		if ( bli_obj_scalar_has_nonzero_imag( p ) )
		{
			// Detach the scalar, then reset the attached scalar (to 1.0).
			bli_obj_scalar_detach( p, &kappa );
			bli_obj_scalar_reset( p );

			kappa_p = &kappa;
		}
		else
		{
			kappa_p = &BLIS_ONE;
		}

		// Acquire the buffer to whichever kappa was chosen above.
		buf_kappa = bli_obj_buffer_for_1x1( dt_p, kappa_p );
	}

	// Index into the type combination array to extract the correct
	// function pointer, then invoke it.
	FUNCPTR_T f = ftypes[dt_c][dt_p];

	f
	(
	  transc,
	  schema,
	  m_p,
	  n_p,
	  m_max_p,
	  n_max_p,
	  buf_kappa,
	  buf_c, rs_c, cs_c,
	  buf_p, rs_p, cs_p,
	  is_p,
	  pd_p, ps_p,
	  cntx,
	  t
	);
}
// Typed implementation of the mixed-datatype blocked packm variant. One
// function is generated per (ctype_c, ctype_p) datatype pair; each walks
// the source matrix c panel by panel and calls the structure-aware
// mixed-datatype packing kernel (packm_struc_cxk_md) on the panels
// assigned to the current thread.
#undef GENTFUNC2
#define GENTFUNC2( ctype_c, ctype_p, chc, chp, varname ) \
\
void PASTEMAC2(chc,chp,varname) \
( \
trans_t transc, \
pack_t schema, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, \
dim_t pd_p, inc_t ps_p, \
cntx_t* cntx, \
thrinfo_t* thread \
) \
{ \
/* Typed views of the kappa scalar, the source matrix c, and the packed
   destination p. Note that c and p may have different datatypes. */ \
ctype_p* restrict kappa_cast = kappa; \
ctype_c* restrict c_cast = c; \
ctype_p* restrict p_cast = p; \
ctype_c* restrict c_begin; \
ctype_p* restrict p_begin; \
\
dim_t iter_dim; \
dim_t n_iter; \
dim_t it, ic, ip; \
doff_t ic_inc, ip_inc; \
dim_t panel_len_full; \
dim_t panel_len_i; \
dim_t panel_len_max; \
dim_t panel_len_max_i; \
dim_t panel_dim_i; \
dim_t panel_dim_max; \
inc_t vs_c; \
inc_t p_inc; \
dim_t* m_panel_use; \
dim_t* n_panel_use; \
dim_t* m_panel_max; \
dim_t* n_panel_max; \
conj_t conjc; \
bool row_stored; \
bool col_stored; \
\
ctype_c* restrict c_use; \
ctype_p* restrict p_use; \
\
\
/* Extract the conjugation bit from the transposition argument. */ \
conjc = bli_extract_conj( transc ); \
\
/* If c needs a transposition, induce it so that we can more simply
express the remaining parameters and code. */ \
if ( bli_does_trans( transc ) ) \
{ \
bli_swap_incs( &rs_c, &cs_c ); \
bli_toggle_trans( &transc ); \
} \
\
/* Create flags to indicate row or column storage. Note that the
schema bit that encodes row or column is describing the form of
micro-panel, not the storage in the micro-panel. Hence the
mismatch in "row" and "column" semantics. */ \
row_stored = bli_is_col_packed( schema ); \
col_stored = bli_is_row_packed( schema ); \
\
( void )col_stored; \
\
/* If the row storage flag indicates row storage, then we are packing
to column panels; otherwise, if the strides indicate column storage,
we are packing to row panels. */ \
if ( row_stored ) \
{ \
/* Prepare to pack to row-stored column panels. */ \
iter_dim = n; \
panel_len_full = m; \
panel_len_max = m_max; \
panel_dim_max = pd_p; \
vs_c = cs_c; \
m_panel_use = &panel_len_i; \
n_panel_use = &panel_dim_i; \
m_panel_max = &panel_len_max_i; \
n_panel_max = &panel_dim_max; \
} \
else /* if ( col_stored ) */ \
{ \
/* Prepare to pack to column-stored row panels. */ \
iter_dim = m; \
panel_len_full = n; \
panel_len_max = n_max; \
panel_dim_max = pd_p; \
vs_c = rs_c; \
m_panel_use = &panel_dim_i; \
n_panel_use = &panel_len_i; \
m_panel_max = &panel_dim_max; \
n_panel_max = &panel_len_max_i; \
} \
\
/* Compute the total number of iterations we'll need. */ \
n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
\
/* Iterate forward over the panels, one full panel width per iteration
   (reverse/diagonal-aware iteration is not used in this variant). */ \
{ \
ic_inc = panel_dim_max; \
ip_inc = 1; \
} \
\
p_begin = p_cast; \
\
/* Query the number of threads and thread ids from the current thread's
packm thrinfo_t node. */ \
const dim_t nt = bli_thread_n_way( thread ); \
const dim_t tid = bli_thread_work_id( thread ); \
\
/* Suppress unused variable warnings when slab partitioning is enabled,
since the slab-based definition of bli_packm_my_iter() does not
actually use tid or nt. */ \
( void )nt; ( void )tid; \
\
dim_t it_start, it_end, it_inc; \
\
/* Determine the thread range and increment using the current thread's
packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
will depend on whether slab or round-robin partitioning was requested
at configure-time. */ \
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
\
for ( ic = 0, ip = 0, it = 0; it < n_iter; \
ic += ic_inc, ip += ip_inc, it += 1 ) \
{ \
/* The final panel may be narrower than panel_dim_max. */ \
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
/* Locate the current panel within c. */ \
c_begin = c_cast + (ic )*vs_c; \
\
{ \
c_use = c_begin; \
p_use = p_begin; \
\
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
/* Pack this panel only if it is assigned to the current thread. */ \
if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
{ \
PASTEMAC2(chc,chp,packm_struc_cxk_md) \
( \
conjc, \
schema, \
*m_panel_use, \
*n_panel_use, \
*m_panel_max, \
*n_panel_max, \
kappa_cast, \
c_use, rs_c, cs_c, \
p_use, rs_p, cs_p, \
is_p, \
cntx \
); \
} \
\
/* Advance p by one panel stride per iteration. */ \
p_inc = ps_p; \
} \
\
/*
if ( row_stored ) \
PASTEMAC(chp,fprintm)( stdout, "packm_blk_var1_md: b packed", *m_panel_max, *n_panel_max, \
p_use, rs_p, cs_p, "%5.2f", "" ); \
else \
PASTEMAC(chp,fprintm)( stdout, "packm_blk_var1_md: a packed", *m_panel_max, *n_panel_max, \
p_use, rs_p, cs_p, "%5.2f", "" ); \
*/ \
\
p_begin += p_inc; \
\
} \
}
INSERT_GENTFUNC2_BASIC0( packm_blk_var1_md )
INSERT_GENTFUNC2_MIXDP0( packm_blk_var1_md )
#endif

View File

@@ -1,67 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based prototype for the mixed-datatype blocked packm variant.
void bli_packm_blk_var1_md
(
obj_t* c,
obj_t* p,
cntx_t* cntx,
cntl_t* cntl,
thrinfo_t* t
);
// Typed prototypes, generated once per (datatype of C, datatype of P)
// pair via the BASIC0/MIXDP0 instantiation macros below.
#undef GENTPROT2
#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \
\
void PASTEMAC2(chc,chp,varname) \
( \
trans_t transc, \
pack_t schema, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
inc_t is_p, \
dim_t pd_p, inc_t ps_p, \
cntx_t* cntx, \
thrinfo_t* thread \
);
INSERT_GENTPROT2_BASIC0( packm_blk_var1_md )
INSERT_GENTPROT2_MIXDP0( packm_blk_var1_md )

View File

@@ -35,11 +35,10 @@
#include "blis.h"
cntl_t* bli_packm_cntl_create_node
BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node
(
rntm_t* rntm,
void_fp var_func,
void_fp packm_var_func,
bszid_t bmid_m,
bszid_t bmid_n,
bool does_invert_diag,
@@ -62,7 +61,6 @@ cntl_t* bli_packm_cntl_create_node
// Initialize the packm_params_t struct.
params->size = sizeof( packm_params_t );
params->var_func = packm_var_func;
params->bmid_m = bmid_m;
params->bmid_n = bmid_n;
params->does_invert_diag = does_invert_diag;

View File

@@ -36,7 +36,6 @@
struct packm_params_s
{
uint64_t size; // size field must be present and come first.
packm_var_oft var_func;
bszid_t bmid_m;
bszid_t bmid_n;
bool does_invert_diag;
@@ -47,11 +46,6 @@ struct packm_params_s
};
typedef struct packm_params_s packm_params_t;
BLIS_INLINE packm_var_oft bli_cntl_packm_params_var_func( cntl_t* cntl )
{
packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->var_func;
}
BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl )
{
packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m;
@@ -93,7 +87,6 @@ cntl_t* bli_packm_cntl_create_node
(
rntm_t* rntm,
void_fp var_func,
void_fp packm_var_func,
bszid_t bmid_m,
bszid_t bmid_n,
bool does_invert_diag,

View File

@@ -35,12 +35,14 @@
#include "blis.h"
siz_t bli_packm_init
bool bli_packm_init
(
obj_t* a,
obj_t* c,
obj_t* p,
cntx_t* cntx,
cntl_t* cntl
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
bli_init_once();
@@ -51,139 +53,27 @@ siz_t bli_packm_init
// suitable block of memory from the memory allocator (if such a block
// of memory has not already been allocated previously).
bszid_t bmult_id_m;
bszid_t bmult_id_n;
bool does_invert_diag;
bool rev_iter_if_upper;
bool rev_iter_if_lower;
pack_t schema;
//packbuf_t pack_buf_type;
siz_t size_needed;
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_packm_init_check( a, p, cntx );
bli_packm_init_check( c, p, cntx );
// Extract various fields from the control tree.
bmult_id_m = bli_cntl_packm_params_bmid_m( cntl );
bmult_id_n = bli_cntl_packm_params_bmid_n( cntl );
does_invert_diag = bli_cntl_packm_params_does_invert_diag( cntl );
rev_iter_if_upper = bli_cntl_packm_params_rev_iter_if_upper( cntl );
rev_iter_if_lower = bli_cntl_packm_params_rev_iter_if_lower( cntl );
schema = bli_cntl_packm_params_pack_schema( cntl );
//pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl );
#if 0
// Let us now check to see if the object has already been packed. First
// we check if it has been packed to an unspecified (row or column)
// format, in which case we can alias the object and return.
// NOTE: The reason we don't need to even look at the control tree in
// this case is as follows: an object's pack status is only set to
// BLIS_PACKED_UNSPEC for situations when the actual format used is
// not important, as long as its packed into contiguous rows or
// contiguous columns. A good example of this is packing for matrix
// operands in the level-2 operations.
if ( bli_obj_pack_schema( a ) == BLIS_PACKED_UNSPEC )
{
bli_obj_alias_to( a, p );
return 0;
}
// Now we check if the object has already been packed to the desired
// schema (as encoded in the control tree). If so, we can alias and
// return 0.
// NOTE: In most cases, an object's pack status will be BLIS_NOT_PACKED
// and thus packing will be called for (but in some cases packing has
// already taken place, or does not need to take place, and so that will
// be indicated by the pack status). Also, not all combinations of
// current pack status and desired pack schema are valid.
if ( bli_obj_pack_schema( a ) == pack_schema )
{
bli_obj_alias_to( a, p );
return 0;
}
#endif
// We begin by copying the fields of A.
bli_obj_alias_to( c, p );
// If the object is marked as being filled with zeros, then we can skip
// the packm operation entirely and alias.
if ( bli_obj_is_zeros( a ) )
{
bli_obj_alias_to( a, p );
return 0;
}
if ( bli_obj_is_zeros( c ) )
return false;
// Prepare a few other variables based on properties of the control
// tree.
invdiag_t invert_diag;
packord_t pack_ord_if_up;
packord_t pack_ord_if_lo;
if ( does_invert_diag ) invert_diag = BLIS_INVERT_DIAG;
else invert_diag = BLIS_NO_INVERT_DIAG;
if ( rev_iter_if_upper ) pack_ord_if_up = BLIS_PACK_REV_IF_UPPER;
else pack_ord_if_up = BLIS_PACK_FWD_IF_UPPER;
if ( rev_iter_if_lower ) pack_ord_if_lo = BLIS_PACK_REV_IF_LOWER;
else pack_ord_if_lo = BLIS_PACK_FWD_IF_LOWER;
// Initialize object p for the final packed matrix.
size_needed
=
bli_packm_init_pack
(
invert_diag,
schema,
pack_ord_if_up,
pack_ord_if_lo,
bmult_id_m,
bmult_id_n,
a,
p,
cntx
);
// Return the size needed for memory allocation of the packed buffer.
return size_needed;
}
siz_t bli_packm_init_pack
(
invdiag_t invert_diag,
pack_t schema,
packord_t pack_ord_if_up,
packord_t pack_ord_if_lo,
bszid_t bmult_id_m,
bszid_t bmult_id_n,
obj_t* a,
obj_t* p,
cntx_t* cntx
)
{
bli_init_once();
num_t dt_tar = bli_obj_target_dt( a );
num_t dt_scalar = bli_obj_scalar_dt( a );
trans_t transa = bli_obj_onlytrans_status( a );
dim_t m_a = bli_obj_length( a );
dim_t n_a = bli_obj_width( a );
dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx );
dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx );
dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx );
dim_t bmult_n_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_n, cntx );
dim_t m_p, n_p;
dim_t m_p_pad, n_p_pad;
siz_t size_p;
siz_t elem_size_p;
inc_t rs_p, cs_p;
inc_t is_p;
// We begin by copying the fields of A.
bli_obj_alias_to( a, p );
// Extract various fields from the control tree.
bszid_t bmult_id_m = bli_cntl_packm_params_bmid_m( cntl );
bszid_t bmult_id_n = bli_cntl_packm_params_bmid_n( cntl );
pack_t schema = bli_cntl_packm_params_pack_schema( cntl );
num_t dt_tar = bli_obj_target_dt( c );
num_t dt_scalar = bli_obj_scalar_dt( c );
dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx );
dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx );
dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx );
// Typecast the internal scalar value to the target datatype.
// Note that if the typecasting is needed, this must happen BEFORE we
@@ -195,51 +85,21 @@ siz_t bli_packm_init_pack
// Update the storage datatype of P to be the target datatype of A.
bli_obj_set_dt( dt_tar, p );
bli_obj_set_elem_size( bli_dt_size( dt_tar ), p );
// Update the dimension fields to explicitly reflect a transposition,
// if needed.
// Then, clear the conjugation and transposition fields from the object
// since matrix packing in BLIS is deemed to take care of all conjugation
// and transposition necessary.
// Then, we adjust the properties of P when A needs a transposition.
// We negate the diagonal offset, and if A is upper- or lower-stored,
// we toggle the uplo of P.
// Finally, we mark P as dense since we assume that all matrices,
// regardless of structure, will be densified.
bli_obj_set_dims_with_trans( transa, m_a, n_a, p );
bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, p );
if ( bli_does_trans( transa ) )
{
bli_obj_negate_diag_offset( p );
if ( bli_obj_is_upper_or_lower( a ) )
bli_obj_toggle_uplo( p );
}
// Store the pack schema to the object.
bli_obj_set_pack_schema( schema, p );
// If we are packing micropanels, mark P as dense. Otherwise, we are
// probably being called in the context of a level-2 operation, in
// which case we do not want to overwrite the uplo field of P (inherited
// from A) with BLIS_DENSE because that information may be needed by
// the level-2 operation's unblocked variant to decide whether to
// execute a "lower" or "upper" branch of code.
if ( bli_is_panel_packed( schema ) )
{
bli_obj_set_uplo( BLIS_DENSE, p );
}
// Clear the conjugation field from the object since matrix packing
// in BLIS is deemed to take care of all conjugation necessary.
bli_obj_set_conj( BLIS_NO_CONJUGATE, p );
// Since we are packing micropanels, mark P as dense.
bli_obj_set_uplo( BLIS_DENSE, p );
// Reset the view offsets to (0,0).
bli_obj_set_offs( 0, 0, p );
// Set the invert diagonal field.
bli_obj_set_invert_diag( invert_diag, p );
// Set the pack status of P to the pack schema prescribed in the control
// tree node.
bli_obj_set_pack_schema( schema, p );
// Set the packing order bits.
bli_obj_set_pack_order_if_upper( pack_ord_if_up, p );
bli_obj_set_pack_order_if_lower( pack_ord_if_lo, p );
// Compute the dimensions padded by the dimension multiples. These
// dimensions will be the dimensions of the packed matrices, including
// zero-padding, and will be used by the macro- and micro-kernels.
@@ -247,10 +107,10 @@ siz_t bli_packm_init_pack
// in P) and aligning them to the dimension multiples (typically equal
// to register blocksizes). This does waste a little bit of space for
// level-2 operations, but that's okay with us.
m_p = bli_obj_length( p );
n_p = bli_obj_width( p );
m_p_pad = bli_align_dim_to_mult( m_p, bmult_m_def );
n_p_pad = bli_align_dim_to_mult( n_p, bmult_n_def );
dim_t m_p = bli_obj_length( p );
dim_t n_p = bli_obj_width( p );
dim_t m_p_pad = bli_align_dim_to_mult( m_p, bmult_m_def );
dim_t n_p_pad = bli_align_dim_to_mult( n_p, bmult_n_def );
// Save the padded dimensions into the packed object. It is important
// to save these dimensions since they represent the actual dimensions
@@ -258,177 +118,70 @@ siz_t bli_packm_init_pack
bli_obj_set_padded_dims( m_p_pad, n_p_pad, p );
// Now we prepare to compute strides, align them, and compute the
// total number of bytes needed for the packed buffer. The caller
// will then use that value to acquire an appropriate block of memory
// from the memory allocator.
// total number of bytes needed for the packed buffer. Then we use
// that value to acquire an appropriate block of memory from the
// memory allocator.
// Extract the element size for the packed object.
elem_size_p = bli_obj_elem_size( p );
siz_t elem_size_p = bli_obj_elem_size( p );
// Set the row and column strides of p based on the pack schema.
if ( bli_is_row_packed( schema ) &&
!bli_is_panel_packed( schema ) )
{
// For regular row storage, the padded width of our matrix
// should be used for the row stride, with the column stride set
// to one. By using the WIDTH of the mem_t region, we allow for
// zero-padding (if necessary/desired) along the right edge of
// the matrix.
rs_p = n_p_pad;
cs_p = 1;
// The panel dimension (for each datatype) should be equal to the
// default (logical) blocksize multiple in the m dimension.
dim_t m_panel = bmult_m_def;
// Align the leading dimension according to the heap stride
// alignment size so that the second, third, etc rows begin at
// aligned addresses.
rs_p = bli_align_dim_to_size( rs_p, elem_size_p,
BLIS_HEAP_STRIDE_ALIGN_SIZE );
// The "column stride" of a row-micropanel packed object is interpreted
// as the column stride WITHIN a micropanel. Thus, this is equal to the
// packing (storage) blocksize multiple, which may be equal to the
// default (logical) blocksize multiple.
inc_t cs_p = bmult_m_pack;
// Store the strides in P.
bli_obj_set_strides( rs_p, cs_p, p );
// The "row stride" of a row-micropanel packed object is interpreted
// as the row stride WITHIN a micropanel. Thus, it is unit.
inc_t rs_p = 1;
// Compute the size of the packed buffer.
size_p = m_p_pad * rs_p * elem_size_p;
}
else if ( bli_is_col_packed( schema ) &&
!bli_is_panel_packed( schema ) )
{
// For regular column storage, the padded length of our matrix
// should be used for the column stride, with the row stride set
// to one. By using the LENGTH of the mem_t region, we allow for
// zero-padding (if necessary/desired) along the bottom edge of
// the matrix.
cs_p = m_p_pad;
rs_p = 1;
// The "panel stride" of a micropanel packed object is interpreted as
// the distance between the (0,0) element of panel k and the (0,0)
// element of panel k+1. We use the padded width computed above to
// allow for zero-padding (if necessary/desired) along the far end
// of each micropanel (ie: the right edge of the matrix). Zero-padding
// can also occur along the long edge of the last micropanel if the m
// dimension of the matrix is not a whole multiple of MR.
inc_t ps_p = cs_p * n_p_pad;
// Align the leading dimension according to the heap stride
// alignment size so that the second, third, etc columns begin at
// aligned addresses.
cs_p = bli_align_dim_to_size( cs_p, elem_size_p,
BLIS_HEAP_STRIDE_ALIGN_SIZE );
// As a general rule, we don't want micropanel strides to be odd. There
// are very few instances where this can happen, but we've seen it happen
// more than zero times (such as for certain small problems), and so we
// check for it here.
if ( bli_is_odd( ps_p ) ) ps_p += 1;
// Store the strides in P.
bli_obj_set_strides( rs_p, cs_p, p );
// Set the imaginary stride (in units of fundamental elements).
// This is the number of real elements that must be traversed before
// reaching the imaginary part of the packed micropanel. NOTE: the
// imaginary stride is mostly vestigial and left over from the 3m
// and 4m implementations.
inc_t is_p = 1;
// Compute the size of the packed buffer.
size_p = cs_p * n_p_pad * elem_size_p;
}
else if ( bli_is_row_packed( schema ) &&
bli_is_panel_packed( schema ) )
{
dim_t m_panel;
dim_t ps_p;
// Store the strides and panel dimension in P.
bli_obj_set_strides( rs_p, cs_p, p );
bli_obj_set_imag_stride( is_p, p );
bli_obj_set_panel_dim( m_panel, p );
bli_obj_set_panel_stride( ps_p, p );
bli_obj_set_panel_length( m_panel, p );
bli_obj_set_panel_width( n_p, p );
// The panel dimension (for each datatype) should be equal to the
// default (logical) blocksize multiple in the m dimension.
m_panel = bmult_m_def;
// Compute the size of the packed buffer.
siz_t size_p = ps_p * ( m_p_pad / m_panel ) * elem_size_p;
// The "column stride" of a row-micropanel packed object is interpreted
// as the column stride WITHIN a micropanel. Thus, this is equal to the
// packing (storage) blocksize multiple, which may be equal to the
// default (logical) blocksize multiple.
cs_p = bmult_m_pack;
// If the requested size is zero, then we don't need to do any allocation.
if ( size_p == 0 )
return false;
// The "row stride" of a row-micropanel packed object is interpreted
// as the row stride WITHIN a micropanel. Thus, it is unit.
rs_p = 1;
// Update the buffer address in p to point to the buffer associated
// with the mem_t entry acquired from the memory broker (now cached in
// the control tree node).
void* buffer = bli_packm_alloc( size_p, rntm, cntl, thread );
bli_obj_set_buffer( buffer, p );
// The "panel stride" of a micropanel packed object is interpreted as
// the distance between the (0,0) element of panel k and the (0,0)
// element of panel k+1. We use the padded width computed above to
// allow for zero-padding (if necessary/desired) along the far end
// of each micropanel (ie: the right edge of the matrix). Zero-padding
// can also occur along the long edge of the last micropanel if the m
// dimension of the matrix is not a whole multiple of MR.
ps_p = cs_p * n_p_pad;
// As a general rule, we don't want micropanel strides to be odd.
// NOTE: This safety feature *may* not be necessary anymore, but was
// definitely needed to support certain variations of the 3m method.
if ( bli_is_odd( ps_p ) ) ps_p += 1;
// Set the imaginary stride (in units of fundamental elements).
// This is the number of real elements that must be traversed before
// reaching the imaginary part of the packed micropanel. NOTE: the
// imaginary stride is mostly vestigial and left over from the 3m
// and 4m implementations.
is_p = 1;
// Store the strides and panel dimension in P.
bli_obj_set_strides( rs_p, cs_p, p );
bli_obj_set_imag_stride( is_p, p );
bli_obj_set_panel_dim( m_panel, p );
bli_obj_set_panel_stride( ps_p, p );
bli_obj_set_panel_length( m_panel, p );
bli_obj_set_panel_width( n_p, p );
// Compute the size of the packed buffer.
size_p = ps_p * ( m_p_pad / m_panel ) * elem_size_p;
}
else if ( bli_is_col_packed( schema ) &&
bli_is_panel_packed( schema ) )
{
dim_t n_panel;
dim_t ps_p;
// The panel dimension (for each datatype) should be equal to the
// default (logical) blocksize multiple in the n dimension.
n_panel = bmult_n_def;
// The "row stride" of a column-micropanel packed object is interpreted
// as the row stride WITHIN a micropanel. Thus, this is equal to the
// packing (storage) blocksize multiple (which may be equal to the
// default (logical) blocksize multiple).
rs_p = bmult_n_pack;
// The "column stride" of a column-micropanel packed object is
// interpreted as the column stride WITHIN a micropanel. Thus, it is
// unit.
cs_p = 1;
// The "panel stride" of a micropanel packed object is interpreted as
// the distance between the (0,0) element of panel k and the (0,0)
// element of panel k+1. We use the padded length computed above to
// allow for zero-padding (if necessary/desired) along the far end
// of each micropanel (ie: the bottom edge of the matrix). Zero-padding
// can also occur along the long edge of the last micropanel if the n
// dimension of the matrix is not a whole multiple of NR.
ps_p = m_p_pad * rs_p;
// As a general rule, we don't want micropanel strides to be odd.
// NOTE: This safety feature *may* not be necessary anymore, but was
// definitely needed to support certain variations of the 3m method.
if ( bli_is_odd( ps_p ) ) ps_p += 1;
// Set the imaginary stride (in units of fundamental elements).
// This is the number of real elements that must be traversed before
// reaching the imaginary part of the packed micropanel. NOTE: the
// imaginary stride is mostly vestigial and left over from the 3m
// and 4m implementations.
is_p = 1;
// Store the strides and panel dimension in P.
bli_obj_set_strides( rs_p, cs_p, p );
bli_obj_set_imag_stride( is_p, p );
bli_obj_set_panel_dim( n_panel, p );
bli_obj_set_panel_stride( ps_p, p );
bli_obj_set_panel_length( m_p, p );
bli_obj_set_panel_width( n_panel, p );
// Compute the size of the packed buffer.
size_p = ps_p * ( n_p_pad / n_panel ) * elem_size_p;
}
else
{
// NOTE: When implementing block storage, we only need to implement
// the following two cases:
// - row-stored blocks in row-major order
// - column-stored blocks in column-major order
// The other two combinations coincide with that of packed row-panel
// and packed column- panel storage.
size_p = 0;
}
return size_p;
return true;
}

View File

@@ -32,24 +32,13 @@
*/
siz_t bli_packm_init
BLIS_EXPORT_BLIS bool bli_packm_init
(
obj_t* a,
obj_t* p,
cntx_t* cntx,
cntl_t* cntl
);
BLIS_EXPORT_BLIS siz_t bli_packm_init_pack
(
invdiag_t invert_diag,
pack_t schema,
packord_t pack_ord_if_up,
packord_t pack_ord_if_lo,
bszid_t bmult_id_m,
bszid_t bmult_id_n,
obj_t* a,
obj_t* p,
cntx_t* cntx
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
);

View File

@@ -39,59 +39,19 @@ void bli_packm_int
obj_t* a,
obj_t* p,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
bli_init_once();
packm_var_oft f;
// Extract the function pointer from the object.
packm_var_oft f = bli_obj_pack_fn( a );
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_packm_int_check( a, p, cntx );
// Sanity check; A should never have a zero dimension. If we must support
// it, then we should fold it into the next alias-and-early-exit block.
//if ( bli_obj_has_zero_dim( a ) ) bli_abort();
// Let us now check to see if the object has already been packed. First
// we check if it has been packed to an unspecified (row or column)
// format, in which case we can return, since by now aliasing has already
// taken place in packm_init().
// NOTE: The reason we don't need to even look at the control tree in
// this case is as follows: an object's pack status is only set to
// BLIS_PACKED_UNSPEC for situations when the actual format used is
// not important, as long as its packed into contiguous rows or
// contiguous columns. A good example of this is packing for matrix
// operands in the level-2 operations.
if ( bli_obj_pack_schema( a ) == BLIS_PACKED_UNSPEC )
{
return;
}
// At this point, we can be assured that cntl is not NULL. Now we check
// if the object has already been packed to the desired schema (as en-
// coded in the control tree). If so, we can return, as above.
// NOTE: In most cases, an object's pack status will be BLIS_NOT_PACKED
// and thus packing will be called for (but in some cases packing has
// already taken place, or does not need to take place, and so that will
// be indicated by the pack status). Also, not all combinations of
// current pack status and desired pack schema are valid.
if ( bli_obj_pack_schema( a ) == bli_cntl_packm_params_pack_schema( cntl ) )
{
return;
}
// If the object is marked as being filled with zeros, then we can skip
// the packm operation entirely.
if ( bli_obj_is_zeros( a ) )
{
return;
}
// Extract the function pointer from the current control tree node.
f = bli_cntl_packm_params_var_func( cntl );
// Barrier so that we know threads are done with previous computation
// with the same packing buffer before starting to pack.
bli_thread_barrier( thread );
// Invoke the variant with kappa_use.
f
@@ -99,8 +59,12 @@ void bli_packm_int
a,
p,
cntx,
rntm,
cntl,
thread
);
// Barrier so that packing is done before computation.
bli_thread_barrier( thread );
}

View File

@@ -37,6 +37,7 @@ void bli_packm_int
obj_t* a,
obj_t* p,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
);

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -34,83 +35,42 @@
#include "blis.h"
void bli_trsm_packa
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
void* bli_packm_scalar( obj_t* kappa, obj_t* p )
{
obj_t a_pack;
num_t dt_p = bli_obj_dt( p );
pack_t schema = bli_obj_pack_schema( p );
// Pack matrix A according to the control tree node.
bli_l3_packm
(
a,
&a_pack,
cntx,
rntm,
cntl,
thread
);
// The value for kappa we use will depends on whether the scalar
// attached to A has a nonzero imaginary component. If it does,
// then we will apply the scalar during packing to facilitate
// implementing induced complex domain algorithms in terms of
// real domain micro-kernels. (In the aforementioned situation,
// applying a real scalar is easy, but applying a complex one is
// harder, so we avoid the need altogether with the code below.)
if ( bli_obj_scalar_has_nonzero_imag( p ) &&
!bli_is_nat_packed( schema ) )
{
//printf( "applying non-zero imag kappa\n_p" );
// Proceed with execution using packed matrix A.
bli_trsm_int
(
&BLIS_ONE,
&a_pack,
b,
&BLIS_ONE,
c,
cntx,
rntm,
bli_cntl_sub_node( cntl ),
bli_thrinfo_sub_node( thread )
);
}
// -----------------------------------------------------------------------------
void bli_trsm_packb
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
obj_t b_pack;
// Pack matrix B according to the control tree node.
bli_l3_packm
(
b,
&b_pack,
cntx,
rntm,
cntl,
thread
);
// Proceed with execution using packed matrix B.
bli_trsm_int
(
&BLIS_ONE,
a,
&b_pack,
&BLIS_ONE,
c,
cntx,
rntm,
bli_cntl_sub_node( cntl ),
bli_thrinfo_sub_node( thread )
);
// Detach the scalar.
bli_obj_scalar_detach( p, kappa );
// Reset the attached scalar (to 1.0).
bli_obj_scalar_reset( p );
return bli_obj_buffer_for_1x1( dt_p, kappa );
}
// This branch is also for native execution, where we assume that
// the micro-kernel will always apply the alpha scalar of the
// higher-level operation. Thus, we use BLIS_ONE for kappa so
// that the underlying packm implementation does not perform
// any scaling during packing.
else
{
// If the internal scalar of A has only a real component, then
// we will apply it later (in the micro-kernel), and so we will
// use BLIS_ONE to indicate no scaling during packing.
return bli_obj_buffer_for_1x1( dt_p, &BLIS_ONE );
}
}

View File

@@ -32,6 +32,5 @@
*/
#include "bli_packm_blk_var1_md.h"
#include "bli_packm_struc_cxk_md.h"
BLIS_EXPORT_BLIS void* bli_packm_scalar( obj_t* kappa, obj_t* p );

View File

@@ -40,57 +40,24 @@
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
bool invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_len, \
dim_t panel_dim_max, \
dim_t panel_len_max, \
dim_t panel_dim_off, \
dim_t panel_len_off, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
ctype* restrict c, inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t ldp, \
inc_t is_p, \
cntx_t* cntx \
) \
{ \
dim_t panel_dim; \
dim_t panel_dim_max; \
dim_t panel_len; \
dim_t panel_len_max; \
inc_t incc, ldc; \
inc_t ldp; \
\
\
/* Determine the dimensions and relative strides of the micro-panel
based on its pack schema. */ \
if ( bli_is_col_packed( schema ) ) \
{ \
/* Prepare to pack to row-stored column panel. */ \
panel_dim = n_panel; \
panel_dim_max = n_panel_max; \
panel_len = m_panel; \
panel_len_max = m_panel_max; \
incc = cs_c; \
ldc = rs_c; \
ldp = rs_p; \
} \
else /* if ( bli_is_row_packed( schema ) ) */ \
{ \
/* Prepare to pack to column-stored row panel. */ \
panel_dim = m_panel; \
panel_dim_max = m_panel_max; \
panel_len = n_panel; \
panel_len_max = n_panel_max; \
incc = rs_c; \
ldc = cs_c; \
ldp = cs_p; \
} \
\
\
/* Handle micro-panel packing based on the structure of the matrix
being packed. */ \
if ( bli_is_general( strucc ) ) \
@@ -118,23 +85,21 @@ void PASTEMAC(ch,varname) \
PASTEMAC(ch,packm_herm_cxk) \
( \
strucc, \
diagoffc, \
diagc, \
uploc, \
conjc, \
schema, \
m_panel, \
n_panel, \
m_panel_max, \
n_panel_max, \
invdiag, \
panel_dim, \
panel_dim_max, \
panel_len, \
panel_dim_max, \
panel_len_max, \
panel_dim_off, \
panel_len_off, \
kappa, \
c, rs_c, cs_c, \
incc, ldc, \
p, rs_p, cs_p, \
ldp, \
c, incc, ldc, \
p, ldp, \
is_p, \
cntx \
); \
} \
@@ -145,130 +110,24 @@ void PASTEMAC(ch,varname) \
PASTEMAC(ch,packm_tri_cxk) \
( \
strucc, \
diagoffc, \
diagc, \
uploc, \
conjc, \
schema, \
invdiag, \
m_panel, \
n_panel, \
m_panel_max, \
n_panel_max, \
panel_dim, \
panel_dim_max, \
panel_len, \
panel_dim_max, \
panel_len_max, \
panel_dim_off, \
panel_len_off, \
kappa, \
c, rs_c, cs_c, \
incc, ldc, \
p, rs_p, cs_p, \
ldp, \
c, incc, ldc, \
p, ldp, \
is_p, \
cntx \
); \
} \
\
\
/* If m_panel < m_panel_max, or n_panel < n_panel_max, we would normally
fill the edge region (the bottom m_panel_max - m_panel rows or right-
side n_panel_max - n_panel columns) of the micropanel with zeros.
However, this responsibility has been moved to the packm microkernel.
This change allows experts to use custom kernels that pack to custom
packing formats when the problem size is not a nice multiple of the
register blocksize. */ \
\
/*
if ( m_panel != m_panel_max ) \
{ \
ctype* restrict zero = PASTEMAC(ch,0); \
dim_t i = m_panel; \
dim_t m_edge = m_panel_max - i; \
dim_t n_edge = n_panel_max; \
ctype* p_edge = p + (i )*rs_p; \
\
PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero, \
p_edge, rs_p, cs_p, \
cntx, \
NULL \
); \
} \
\
if ( n_panel != n_panel_max ) \
{ \
ctype* restrict zero = PASTEMAC(ch,0); \
dim_t j = n_panel; \
dim_t m_edge = m_panel_max; \
dim_t n_edge = n_panel_max - j; \
ctype* p_edge = p + (j )*cs_p; \
\
PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_edge, \
n_edge, \
zero, \
p_edge, rs_p, cs_p, \
cntx, \
NULL \
); \
} \
*/ \
\
\
if ( bli_is_triangular( strucc ) ) \
{ \
/* If this panel is an edge case in both panel dimension and length,
then it must be a bottom-right corner case. Set the part of the
diagonal that extends into the zero-padded region to identity.
NOTE: This is actually only necessary when packing for trsm, as
it helps prevent NaNs and Infs from creeping into the computation.
However, we set the region to identity for trmm as well. Those
1.0's end up getting muliplied by the 0.0's in the zero-padded
region of the other matrix, so there is no harm in this. */ \
if ( m_panel != m_panel_max && \
n_panel != n_panel_max ) \
{ \
ctype* restrict one = PASTEMAC(ch,1); \
dim_t i = m_panel; \
dim_t j = n_panel; \
dim_t m_br = m_panel_max - i; \
dim_t n_br = n_panel_max - j; \
ctype* p_br = p + (i )*rs_p + (j )*cs_p; \
\
PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
0, \
m_br, \
n_br, \
one, \
p_br, rs_p, cs_p, \
cntx, \
NULL \
); \
} \
} \
\
\
/*
if ( bli_is_col_packed( schema ) ) \
PASTEMAC(ch,fprintm)( stdout, "packm_struc_cxk: bp copied", m_panel_max, n_panel_max, \
p, rs_p, cs_p, "%4.1f", "" ); \
else if ( bli_is_row_packed( schema ) ) \
PASTEMAC(ch,fprintm)( stdout, "packm_struc_cxk: ap copied", m_panel_max, n_panel_max, \
p, rs_p, cs_p, "%4.1f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC( packm_struc_cxk, packm_cxk )
@@ -282,42 +141,31 @@ INSERT_GENTFUNC_BASIC( packm_struc_cxk, packm_cxk )
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
bool invdiag, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \
dim_t panel_dim_max, \
dim_t panel_len_max, \
dim_t panel_dim_off, \
dim_t panel_len_off, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t ldp, \
ctype* restrict c, inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t ldp, \
inc_t is_p, \
cntx_t* cntx \
) \
{ \
doff_t diagoffc_abs; \
dim_t i, j; \
bool row_stored; \
bool col_stored; \
\
\
/* Create flags to incidate row or column storage. Note that the
schema bit that encodes row or column is describing the form of
micro-panel, not the storage in the micro-panel. Hence the
mismatch in "row" and "column" semantics. */ \
row_stored = bli_is_col_packed( schema ); \
col_stored = bli_is_row_packed( schema ); \
doff_t diagoffc = panel_dim_off - panel_len_off; \
doff_t diagoffc_abs; \
dim_t i, j; \
\
/* Handle the case where the micro-panel does NOT intersect the
diagonal separately from the case where it does intersect. */ \
if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \
if ( !bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) \
{ \
/* If the current panel is unstored, we need to make a few
adjustments so we refer to the data where it is actually
@@ -325,10 +173,10 @@ void PASTEMAC(ch,varname) \
implicitly assumes we are operating on a dense panel
within a larger symmetric or Hermitian matrix, since a
general matrix would not contain any unstored region.) */ \
if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \
if ( bli_is_unstored_subpart_n( diagoffc, uploc, panel_dim, panel_len ) ) \
{ \
c = c + diagoffc * ( doff_t )cs_c + \
-diagoffc * ( doff_t )rs_c; \
c = c + diagoffc * ( doff_t )ldc + \
-diagoffc * ( doff_t )incc; \
bli_swap_incs( &incc, &ldc ); \
\
if ( bli_is_hermitian( strucc ) ) \
@@ -350,7 +198,7 @@ void PASTEMAC(ch,varname) \
cntx \
); \
} \
else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \
else /* if ( bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) */ \
{ \
ctype* restrict c10; \
ctype* restrict p10; \
@@ -370,14 +218,12 @@ void PASTEMAC(ch,varname) \
a micro-panel. If they do, then somehow the constraints on
cache blocksizes being a whole multiple of the register
blocksizes was somehow violated. */ \
if ( ( col_stored && diagoffc < 0 ) || \
( row_stored && diagoffc > 0 ) ) \
if ( diagoffc < 0 ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
diagoffc_abs = bli_abs( diagoffc ); \
\
if ( ( row_stored && bli_is_upper( uploc ) ) || \
( col_stored && bli_is_lower( uploc ) ) ) \
if ( bli_is_lower( uploc ) ) \
{ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs; \
@@ -393,8 +239,8 @@ void PASTEMAC(ch,varname) \
diagoffc12 = diagoffc_abs - j; \
p12 = p + (j )*ldp; \
c12 = c + (j )*ldc; \
c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
-diagoffc12 * ( doff_t )rs_c; \
c12 = c12 + diagoffc12 * ( doff_t )ldc + \
-diagoffc12 * ( doff_t )incc; \
incc12 = ldc; \
ldc12 = incc; \
conjc12 = conjc; \
@@ -402,16 +248,15 @@ void PASTEMAC(ch,varname) \
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( &conjc12 ); \
} \
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
( col_stored && bli_is_upper( uploc ) ) ) */ \
else /* if ( bli_is_upper( uploc ) ) */ \
{ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs + panel_dim; \
diagoffc10 = diagoffc; \
p10 = p; \
c10 = c; \
c10 = c10 + diagoffc10 * ( doff_t )cs_c + \
-diagoffc10 * ( doff_t )rs_c; \
c10 = c10 + diagoffc10 * ( doff_t )ldc + \
-diagoffc10 * ( doff_t )incc; \
incc10 = ldc; \
ldc10 = incc; \
conjc10 = conjc; \
@@ -486,8 +331,8 @@ void PASTEMAC(ch,varname) \
transc, \
p11_m, \
p11_n, \
c11, rs_c, cs_c, \
p11, rs_p, cs_p, \
c11, incc, ldc, \
p11, 1, ldp, \
cntx, \
NULL \
); \
@@ -503,7 +348,7 @@ void PASTEMAC(ch,varname) \
{ \
PASTEMAC(ch,seti0s)( *pi11 ); \
\
pi11 += rs_p + cs_p; \
pi11 += 1 + ldp; \
} \
} \
\
@@ -519,7 +364,7 @@ void PASTEMAC(ch,varname) \
p11_m, \
p11_n, \
kappa, \
p11, rs_p, cs_p, \
p11, 1, ldp, \
cntx, \
NULL \
); \
@@ -539,28 +384,26 @@ INSERT_GENTFUNC_BASIC( packm_herm_cxk, packm_cxk )
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffp, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
bool invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \
dim_t panel_dim_max, \
dim_t panel_len_max, \
dim_t panel_dim_off, \
dim_t panel_len_off, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t ldp, \
ctype* restrict c, inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t ldp, \
inc_t is_p, \
cntx_t* cntx \
) \
{ \
doff_t diagoffc = panel_dim_off - panel_len_off; \
\
/* Pack the panel. */ \
PASTEMAC(ch,kername) \
( \
@@ -584,11 +427,11 @@ void PASTEMAC(ch,varname) \
PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
diagoffp, \
m_panel, \
n_panel, \
diagoffc, \
panel_dim, \
panel_len, \
kappa, \
p, rs_p, cs_p, \
p, 1, ldp, \
cntx, \
NULL \
); \
@@ -599,10 +442,10 @@ void PASTEMAC(ch,varname) \
{ \
PASTEMAC2(ch,invertd,BLIS_TAPI_EX_SUF) \
( \
diagoffp, \
m_panel, \
n_panel, \
p, rs_p, cs_p, \
diagoffc, \
panel_dim, \
panel_len, \
p, 1, ldp, \
cntx, \
NULL \
); \
@@ -621,23 +464,53 @@ void PASTEMAC(ch,varname) \
uplo_t uplop = uploc; \
\
bli_toggle_uplo( &uplop ); \
bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffp ); \
bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffc ); \
\
PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
diagoffp, \
diagoffc, \
BLIS_NONUNIT_DIAG, \
uplop, \
m_panel, \
n_panel, \
panel_dim, \
panel_len, \
zero, \
p, rs_p, cs_p, \
p, 1, ldp, \
cntx, \
NULL \
); \
} \
\
/* If this panel is an edge case in both panel dimension and length,
then it must be a bottom-right corner case. Set the part of the
diagonal that extends into the zero-padded region to identity.
NOTE: This is actually only necessary when packing for trsm, as
it helps prevent NaNs and Infs from creeping into the computation.
However, we set the region to identity for trmm as well. Those
1.0's end up getting muliplied by the 0.0's in the zero-padded
region of the other matrix, so there is no harm in this. */ \
if ( panel_dim != panel_dim_max && \
panel_len != panel_len_max ) \
{ \
ctype* restrict one = PASTEMAC(ch,1); \
dim_t i = panel_dim; \
dim_t j = panel_len; \
dim_t m_br = panel_dim_max - i; \
dim_t n_br = panel_len_max - j; \
ctype* p_br = p + (i ) + (j )*ldp; \
\
PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
0, \
m_br, \
n_br, \
one, \
p_br, 1, ldp, \
cntx, \
NULL \
); \
} \
}
INSERT_GENTFUNC_BASIC( packm_tri_cxk, packm_cxk )

View File

@@ -38,84 +38,25 @@
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffp, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
bool invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_len, \
dim_t panel_dim_max, \
dim_t panel_len_max, \
dim_t panel_dim_off, \
dim_t panel_len_off, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
ctype* restrict c, inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t ldp, \
inc_t is_p, \
cntx_t* cntx \
);
INSERT_GENTPROT_BASIC0( packm_struc_cxk )
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \
dim_t panel_len_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t ldp, \
cntx_t* cntx \
);
INSERT_GENTPROT_BASIC0( packm_herm_cxk )
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
bool invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \
dim_t panel_len_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t ldp, \
cntx_t* cntx \
);
INSERT_GENTPROT_BASIC0( packm_tri_cxk )

View File

@@ -40,57 +40,25 @@
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
bool invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_len, \
dim_t panel_dim_max, \
dim_t panel_len_max, \
dim_t panel_dim_off, \
dim_t panel_len_off, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
ctype* restrict c, inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t ldp, \
inc_t is_p, \
cntx_t* cntx \
cntx_t* cntx, \
void* params \
) \
{ \
dim_t panel_dim; \
dim_t panel_dim_max; \
dim_t panel_len; \
dim_t panel_len_max; \
inc_t incc, ldc; \
inc_t ldp; \
\
\
/* Determine the dimensions and relative strides of the micro-panel
based on its pack schema. */ \
if ( bli_is_col_packed( schema ) ) \
{ \
/* Prepare to pack to row-stored column panel. */ \
panel_dim = n_panel; \
panel_dim_max = n_panel_max; \
panel_len = m_panel; \
panel_len_max = m_panel_max; \
incc = cs_c; \
ldc = rs_c; \
ldp = rs_p; \
} \
else /* if ( bli_is_row_packed( schema ) ) */ \
{ \
/* Prepare to pack to column-stored row panel. */ \
panel_dim = m_panel; \
panel_dim_max = m_panel_max; \
panel_len = n_panel; \
panel_len_max = n_panel_max; \
incc = rs_c; \
ldc = cs_c; \
ldp = cs_p; \
} \
\
\
/* Handle micro-panel packing based on the structure of the matrix
being packed. */ \
if ( bli_is_general( strucc ) ) \
@@ -108,7 +76,7 @@ void PASTEMAC(ch,varname) \
kappa, \
c, incc, ldc, \
p, ldp, \
cntx \
cntx \
); \
} \
else if ( bli_is_herm_or_symm( strucc ) ) \
@@ -118,24 +86,23 @@ void PASTEMAC(ch,varname) \
PASTEMAC(ch,packm_herm_cxk_1er) \
( \
strucc, \
diagoffc, \
diagc, \
uploc, \
conjc, \
schema, \
m_panel, \
n_panel, \
m_panel_max, \
n_panel_max, \
invdiag, \
panel_dim, \
panel_dim_max, \
panel_len, \
panel_dim_max, \
panel_len_max, \
panel_dim_off, \
panel_len_off, \
kappa, \
c, rs_c, cs_c, \
incc, ldc, \
p, rs_p, cs_p, \
ldp, \
cntx \
c, incc, ldc, \
p, ldp, \
is_p, \
cntx, \
params \
); \
} \
else /* ( bli_is_triangular( strucc ) ) */ \
@@ -145,125 +112,25 @@ void PASTEMAC(ch,varname) \
PASTEMAC(ch,packm_tri_cxk_1er) \
( \
strucc, \
diagoffc, \
diagc, \
uploc, \
conjc, \
schema, \
invdiag, \
m_panel, \
n_panel, \
m_panel_max, \
n_panel_max, \
panel_dim, \
panel_dim_max, \
panel_len, \
panel_dim_max, \
panel_len_max, \
panel_dim_off, \
panel_len_off, \
kappa, \
c, rs_c, cs_c, \
incc, ldc, \
p, rs_p, cs_p, \
ldp, \
cntx \
c, incc, ldc, \
p, ldp, \
is_p, \
cntx, \
params \
); \
} \
\
\
/* If m_panel < m_panel_max, or n_panel < n_panel_max, we would normally
fill the edge region (the bottom m_panel_max - m_panel rows or right-
side n_panel_max - n_panel columns) of the micropanel with zeros.
However, this responsibility has been moved to the packm microkernel.
This change allows experts to use custom kernels that pack to custom
packing formats when the problem size is not a nice multiple of the
register blocksize. */ \
/*
if ( m_panel != m_panel_max ) \
{ \
ctype* restrict zero = PASTEMAC(ch,0); \
dim_t offm = m_panel; \
dim_t offn = 0; \
dim_t m_edge = m_panel_max - m_panel; \
dim_t n_edge = n_panel_max; \
\
PASTEMAC(ch,set1ms_mxn) \
( \
schema, \
offm, \
offn, \
m_edge, \
n_edge, \
zero, \
p, rs_p, cs_p, ldp \
); \
} \
\
if ( n_panel != n_panel_max ) \
{ \
ctype* restrict zero = PASTEMAC(ch,0); \
dim_t offm = 0; \
dim_t offn = n_panel; \
dim_t m_edge = m_panel_max; \
dim_t n_edge = n_panel_max - n_panel; \
\
PASTEMAC(ch,set1ms_mxn) \
( \
schema, \
offm, \
offn, \
m_edge, \
n_edge, \
zero, \
p, rs_p, cs_p, ldp \
); \
} \
*/ \
\
if ( bli_is_triangular( strucc ) ) \
{ \
/* If this micro-panel is an edge case in both panel dimension and
length, then it must be a bottom-right corner case, which
typically only happens for micro-panels being packed for trsm.
(It also happens for trmm if kr > 1.) Here, we set the part of
the diagonal that extends into the zero-padded region to
identity. This prevents NaNs and Infs from creeping into the
computation. If this code does execute for trmm, it is okay,
because those 1.0's that extend into the bottom-right region
end up getting muliplied by the 0.0's in the zero-padded region
of the other matrix. */ \
if ( m_panel != m_panel_max && \
n_panel != n_panel_max ) \
{ \
ctype* restrict one = PASTEMAC(ch,1); \
dim_t offm = m_panel; \
dim_t offn = n_panel; \
dim_t m_edge = m_panel_max - m_panel; \
dim_t n_edge = n_panel_max - n_panel; \
\
PASTEMAC(ch,set1ms_mxn_diag) \
( \
schema, \
offm, \
offn, \
m_edge, \
n_edge, \
one, \
p, rs_p, cs_p, ldp \
); \
} \
} \
\
\
/*
if ( bli_is_1r_packed( schema ) ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_1er (1r): bp", m_panel_max, 2*n_panel_max, \
( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \
} \
\
if ( bli_is_1e_packed( schema ) ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_struc_cxk_1er (1e): ap", 2*m_panel_max, 2*n_panel_max, \
( ctype_r* )p, rs_p, cs_p, "%4.1f", "" ); \
} \
*/ \
}
INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_1er, packm_cxk_1er )
@@ -277,42 +144,32 @@ INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_1er, packm_cxk_1er )
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
bool invdiag, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \
dim_t panel_dim_max, \
dim_t panel_len_max, \
dim_t panel_dim_off, \
dim_t panel_len_off, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t ldp, \
cntx_t* cntx \
ctype* restrict c, inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t ldp, \
inc_t is_p, \
cntx_t* cntx, \
void* params \
) \
{ \
doff_t diagoffc_abs; \
dim_t j; \
bool row_stored; \
bool col_stored; \
\
\
/* Create flags to incidate row or column storage. Note that the
schema bit that encodes row or column is describing the form of
micro-panel, not the storage in the micro-panel. Hence the
mismatch in "row" and "column" semantics. */ \
row_stored = bli_is_col_packed( schema ); \
col_stored = bli_is_row_packed( schema ); \
doff_t diagoffc = panel_dim_off - panel_len_off; \
doff_t diagoffc_abs; \
dim_t j; \
\
/* Handle the case where the micro-panel does NOT intersect the
diagonal separately from the case where it does intersect. */ \
if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \
if ( !bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) \
{ \
/* If the current panel is unstored, we need to make a few
adjustments so we refer to the data where it is actually
@@ -320,10 +177,10 @@ void PASTEMAC(ch,varname) \
implicitly assumes we are operating on a dense panel
within a larger symmetric or Hermitian matrix, since a
general matrix would not contain any unstored region.) */ \
if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \
if ( bli_is_unstored_subpart_n( diagoffc, uploc, panel_dim, panel_len ) ) \
{ \
c = c + diagoffc * ( doff_t )cs_c + \
-diagoffc * ( doff_t )rs_c; \
c = c + diagoffc * ( doff_t )ldc + \
-diagoffc * ( doff_t )incc; \
bli_swap_incs( &incc, &ldc ); \
\
if ( bli_is_hermitian( strucc ) ) \
@@ -345,7 +202,7 @@ void PASTEMAC(ch,varname) \
cntx \
); \
} \
else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \
else /* if ( bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) */ \
{ \
ctype* restrict c10; \
ctype* restrict p10; \
@@ -366,14 +223,12 @@ void PASTEMAC(ch,varname) \
a micro-panel. If they do, then somehow the constraints on
cache blocksizes being a whole multiple of the register
blocksizes was somehow violated. */ \
if ( ( col_stored && diagoffc < 0 ) || \
( row_stored && diagoffc > 0 ) ) \
if ( diagoffc < 0 ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
diagoffc_abs = bli_abs( diagoffc ); \
\
if ( ( row_stored && bli_is_upper( uploc ) ) || \
( col_stored && bli_is_lower( uploc ) ) ) \
if ( bli_is_lower( uploc ) ) \
{ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs; \
@@ -389,8 +244,8 @@ void PASTEMAC(ch,varname) \
diagoffc12 = diagoffc_abs - j; \
p12 = p + (j )*ldp; \
c12 = c + (j )*ldc; \
c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
-diagoffc12 * ( doff_t )rs_c; \
c12 = c12 + diagoffc12 * ( doff_t )ldc + \
-diagoffc12 * ( doff_t )incc; \
incc12 = ldc; \
ldc12 = incc; \
conjc12 = conjc; \
@@ -398,16 +253,15 @@ void PASTEMAC(ch,varname) \
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( &conjc12 ); \
} \
else /* if ( ( row_stored && bli_is_lower( uploc ) ) || \
( col_stored && bli_is_upper( uploc ) ) ) */ \
else /* if ( bli_is_upper( uploc ) ) */ \
{ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs + panel_dim; \
diagoffc10 = diagoffc; \
p10 = p; \
c10 = c; \
c10 = c10 + diagoffc10 * ( doff_t )cs_c + \
-diagoffc10 * ( doff_t )rs_c; \
c10 = c10 + diagoffc10 * ( doff_t )ldc + \
-diagoffc10 * ( doff_t )incc; \
incc10 = ldc; \
ldc10 = incc; \
conjc10 = conjc; \
@@ -478,8 +332,8 @@ void PASTEMAC(ch,varname) \
conjc, \
panel_dim, \
kappa, \
c11, rs_c, cs_c, \
p11, rs_p, cs_p, ldp \
c11, incc, ldc, \
p11, 1, ldp, ldp \
); \
\
/* If we are packing a micro-panel with Hermitian structure,
@@ -495,8 +349,8 @@ void PASTEMAC(ch,varname) \
if ( bli_is_hermitian( strucc ) ) \
{ \
ctype_r* restrict c11_r = ( ctype_r* )c11; \
const dim_t rs_c2 = 2*rs_c; \
const dim_t cs_c2 = 2*cs_c; \
const dim_t incc2 = 2*incc; \
const dim_t ldc2 = 2*ldc; \
\
PASTEMAC3(ch,chr,ch,scal21ms_mxn_diag) \
( \
@@ -504,8 +358,8 @@ void PASTEMAC(ch,varname) \
panel_dim, \
panel_dim, \
kappa, \
c11_r, rs_c2, cs_c2, \
p11, rs_p, cs_p, ldp \
c11_r, incc2, ldc2, \
p11, 1, ldp, ldp \
); \
} \
} \
@@ -523,30 +377,28 @@ INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_1er, packm_cxk_1er )
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffp, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
bool invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \
dim_t panel_dim_max, \
dim_t panel_len_max, \
dim_t panel_dim_off, \
dim_t panel_len_off, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t ldp, \
cntx_t* cntx \
ctype* restrict c, inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t ldp, \
inc_t is_p, \
cntx_t* cntx, \
void* params \
) \
{ \
doff_t diagoffp_abs = bli_abs( diagoffp ); \
ctype* p11 = p + (diagoffp_abs )*ldp; \
doff_t diagoffc = panel_dim_off - panel_len_off; \
doff_t diagoffc_abs = bli_abs( diagoffc ); \
ctype* p11 = p + (diagoffc_abs )*ldp; \
\
\
/* Pack the panel. */ \
@@ -579,7 +431,7 @@ void PASTEMAC(ch,varname) \
panel_dim, \
panel_dim, \
kappa, \
p11, rs_p, cs_p, ldp \
p11, 1, ldp, ldp \
); \
} \
\
@@ -594,7 +446,7 @@ void PASTEMAC(ch,varname) \
0, \
panel_dim, \
panel_dim, \
p11, rs_p, cs_p, ldp \
p11, 1, ldp, ldp \
); \
} \
\
@@ -610,11 +462,11 @@ void PASTEMAC(ch,varname) \
{ \
ctype* restrict zero = PASTEMAC(ch,0); \
uplo_t uplop = uploc; \
doff_t diagoffp11_0 = 0; \
doff_t diagoffc11_0 = 0; \
dim_t p11_0_dim = panel_dim - 1; \
\
bli_toggle_uplo( &uplop ); \
bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffp11_0 ); \
bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffc11_0 ); \
\
/* Note that this macro works a little differently than the setm
operation. Here, we pass in the dimensions of only p11, rather
@@ -622,20 +474,51 @@ void PASTEMAC(ch,varname) \
"shrunken" dimensions of p11, corresponding to the toggling
and shrinking of the diagonal above. The macro will do the
right thing, incrementing the pointer to p11 by the appropriate
leading dimension (cs_p or rs_p), and setting only the lower
leading dimension (ldp or rs_p), and setting only the lower
or upper triangle to zero. */ \
PASTEMAC(ch,set1ms_mxn_uplo) \
( \
schema, \
diagoffp11_0, \
diagoffc11_0, \
uplop, \
p11_0_dim, \
p11_0_dim, \
zero, \
p11, rs_p, cs_p, ldp \
p11, 1, ldp, ldp \
); \
} \
} \
\
/* If this micro-panel is an edge case in both panel dimension and
length, then it must be a bottom-right corner case, which
typically only happens for micro-panels being packed for trsm.
(It also happens for trmm if kr > 1.) Here, we set the part of
the diagonal that extends into the zero-padded region to
identity. This prevents NaNs and Infs from creeping into the
computation. If this code does execute for trmm, it is okay,
because those 1.0's that extend into the bottom-right region
end up getting muliplied by the 0.0's in the zero-padded region
of the other matrix. */ \
if ( panel_dim != panel_dim_max && \
panel_len != panel_len_max ) \
{ \
ctype* restrict one = PASTEMAC(ch,1); \
dim_t offm = panel_dim; \
dim_t offn = panel_len; \
dim_t m_edge = panel_dim_max - panel_dim; \
dim_t n_edge = panel_len_max - panel_len; \
\
PASTEMAC(ch,set1ms_mxn_diag) \
( \
schema, \
offm, \
offn, \
m_edge, \
n_edge, \
one, \
p, 1, ldp, ldp \
); \
} \
}
INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_1er, packm_cxk_1er )

View File

@@ -38,84 +38,26 @@
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffp, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
bool invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_len, \
dim_t panel_dim_max, \
dim_t panel_len_max, \
dim_t panel_dim_off, \
dim_t panel_len_off, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
ctype* restrict c, inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t ldp, \
inc_t is_p, \
cntx_t* cntx \
cntx_t* cntx, \
void* params \
);
INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er )
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \
dim_t panel_len_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t ldp, \
cntx_t* cntx \
);
INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er )
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
bool invdiag, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \
dim_t panel_len_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
inc_t ldp, \
cntx_t* cntx \
);
INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_1er )

View File

@@ -41,53 +41,26 @@
\
void PASTEMAC2(chc,chp,varname) \
( \
struc_t strucc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
bool invdiag, \
dim_t panel_dim, \
dim_t panel_len, \
dim_t panel_dim_max, \
dim_t panel_len_max, \
dim_t panel_dim_off, \
dim_t panel_len_off, \
ctype_p* restrict kappa, \
ctype_c* restrict c, inc_t rs_c, inc_t cs_c, \
ctype_p* restrict p, inc_t rs_p, inc_t cs_p, \
ctype_c* restrict c, inc_t incc, inc_t ldc, \
ctype_p* restrict p, inc_t ldp, \
inc_t is_p, \
cntx_t* cntx \
cntx_t* cntx, \
void* params \
) \
{ \
dim_t panel_dim; \
dim_t panel_dim_max; \
dim_t panel_len; \
dim_t panel_len_max; \
inc_t incc, ldc; \
inc_t ldp; \
\
\
/* Determine the dimensions and relative strides of the micro-panel
based on its pack schema. */ \
if ( bli_is_col_packed( schema ) ) \
{ \
/* Prepare to pack to row-stored column panel. */ \
panel_dim = n_panel; \
panel_dim_max = n_panel_max; \
panel_len = m_panel; \
panel_len_max = m_panel_max; \
incc = cs_c; \
ldc = rs_c; \
ldp = rs_p; \
} \
else /* if ( bli_is_row_packed( schema ) ) */ \
{ \
/* Prepare to pack to column-stored row panel. */ \
panel_dim = m_panel; \
panel_dim_max = m_panel_max; \
panel_len = n_panel; \
panel_len_max = n_panel_max; \
incc = rs_c; \
ldc = cs_c; \
ldp = cs_p; \
} \
\
\
if ( bli_is_nat_packed( schema ) ) \
{ \
/* Sanity check: Make sure that kappa is 1.0. Mixed-datatype alpha
@@ -318,7 +291,7 @@ void PASTEMAC2(cha,chp,opname) \
conj_t conja, \
dim_t m, \
dim_t n, \
ctype_p* restrict kappa, \
ctype_p* restrict kappa, \
ctype_a* restrict a, inc_t inca, inc_t lda, \
ctype_p* restrict p, inc_t ldp \
) \
@@ -445,7 +418,7 @@ void PASTEMAC2(cha,chp,opname) \
conj_t conja, \
dim_t m, \
dim_t n, \
ctype_p* restrict kappa, \
ctype_p* restrict kappa, \
ctype_a* restrict a, inc_t inca, inc_t lda, \
ctype_p* restrict p, inc_t ldp \
) \

View File

@@ -37,17 +37,24 @@
\
void PASTEMAC2(chc,chp,varname) \
( \
struc_t strucc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
dim_t m_panel, \
dim_t n_panel, \
dim_t m_panel_max, \
dim_t n_panel_max, \
bool invdiag, \
dim_t panel_dim, \
dim_t panel_len, \
dim_t panel_dim_max, \
dim_t panel_len_max, \
dim_t panel_dim_off, \
dim_t panel_len_off, \
ctype_p* restrict kappa, \
ctype_c* restrict c, inc_t rs_c, inc_t cs_c, \
ctype_p* restrict p, inc_t rs_p, inc_t cs_p, \
ctype_c* restrict c, inc_t incc, inc_t ldc, \
ctype_p* restrict p, inc_t ldp, \
inc_t is_p, \
cntx_t* cntx \
cntx_t* cntx, \
void* params \
);
INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md )

View File

@@ -1,297 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T packm_fp
typedef void (*FUNCPTR_T)(
struc_t strucc,
doff_t diagoffc,
diag_t diagc,
uplo_t uploc,
trans_t transc,
dim_t m,
dim_t n,
dim_t m_max,
dim_t n_max,
void* kappa,
void* c, inc_t rs_c, inc_t cs_c,
void* p, inc_t rs_p, inc_t cs_p,
cntx_t* cntx
);
static FUNCPTR_T GENARRAY(ftypes,packm_unb_var1);
// bli_packm_unb_var1(): object-based wrapper for the unblocked variant-1
// packm implementation. Extracts the structure, diagonal offset/type,
// uplo, conj/trans status, dimensions, buffers, and strides from the
// obj_t operands and dispatches to the typed implementation selected by
// the datatype of c. Only the chief thread of the thread group invokes
// the (single-threaded) typed function.
void bli_packm_unb_var1
(
obj_t* c,
obj_t* p,
cntx_t* cntx,
cntl_t* cntl,
thrinfo_t* thread
)
{
num_t dt_cp = bli_obj_dt( c );
struc_t strucc = bli_obj_struc( c );
doff_t diagoffc = bli_obj_diag_offset( c );
diag_t diagc = bli_obj_diag( c );
uplo_t uploc = bli_obj_uplo( c );
trans_t transc = bli_obj_conjtrans_status( c );
// Dimensions and padded ("max") dimensions are taken from p, the
// packed destination object.
dim_t m_p = bli_obj_length( p );
dim_t n_p = bli_obj_width( p );
dim_t m_max_p = bli_obj_padded_length( p );
dim_t n_max_p = bli_obj_padded_width( p );
void* buf_c = bli_obj_buffer_at_off( c );
inc_t rs_c = bli_obj_row_stride( c );
inc_t cs_c = bli_obj_col_stride( c );
void* buf_p = bli_obj_buffer_at_off( p );
inc_t rs_p = bli_obj_row_stride( p );
inc_t cs_p = bli_obj_col_stride( p );
void* buf_kappa;
FUNCPTR_T f;
// This variant assumes that the computational kernel will always apply
// the alpha scalar of the higher-level operation. Thus, we use BLIS_ONE
// for kappa so that the underlying packm implementation does not scale
// during packing.
buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE );
// Index into the type combination array to extract the correct
// function pointer.
f = ftypes[dt_cp];
// Only the chief thread performs the packing; other threads in the
// group fall through without doing any work.
if( bli_thread_am_ochief( thread ) ) {
// Invoke the function.
f
(
strucc,
diagoffc,
diagc,
uploc,
transc,
m_p,
n_p,
m_max_p,
n_max_p,
buf_kappa,
buf_c, rs_c, cs_c,
buf_p, rs_p, cs_p,
cntx
);
}
}
// Define the typed unblocked packm (variant 1) implementation for each
// datatype via macro instantiation. The matrix region indicated by the
// parameters is scaled by kappa into p; for structured (Hermitian,
// symmetric, triangular) matrices the unstored region is then filled in
// or zeroed, and any padding implied by m_max/n_max is set to zero.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
trans_t transc, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
cntx_t* cntx \
) \
{ \
ctype* restrict kappa_cast = kappa; \
ctype* restrict c_cast = c; \
ctype* restrict p_cast = p; \
ctype* restrict zero = PASTEMAC(ch,0); \
\
	/* We begin by packing the region indicated by the parameters. If
	   matrix c is dense (either because the structure is general or
	   because the structure has already been "densified"), this ends
	   up being the only action we take. Note that if kappa is unit,
	   the data is simply copied (rather than scaled by one). */ \
PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
( \
diagoffc, \
diagc, \
uploc, \
transc, \
m, \
n, \
kappa_cast, \
c_cast, rs_c, cs_c, \
p_cast, rs_p, cs_p, \
cntx, \
NULL \
); \
\
	/* If uploc is upper or lower, then the structure of c is necessarily
	   non-dense (ie: Hermitian, symmetric, or triangular, where part of the
	   matrix is unstored). In these cases, we want to fill in the unstored
	   part of the matrix. How this is done depends on the structure of c. */ \
if ( bli_is_upper_or_lower( uploc ) ) \
{ \
	/* The Hermitian and symmetric cases are almost identical, so we
	   handle them in one conditional block. */ \
if ( bli_is_hermitian( strucc ) || bli_is_symmetric( strucc ) ) \
{ \
	/* First we must reflect the region referenced to the opposite
	   side of the diagonal. */ \
c_cast = c_cast + diagoffc * ( doff_t )cs_c + \
-diagoffc * ( doff_t )rs_c; \
bli_negate_diag_offset( &diagoffc ); \
bli_toggle_trans( &transc ); \
if ( bli_is_upper( uploc ) ) diagoffc += 1; \
else if ( bli_is_lower( uploc ) ) diagoffc -= 1; \
\
	/* If c is Hermitian, we need to apply a conjugation when
	   copying the region opposite the diagonal. */ \
if ( bli_is_hermitian( strucc ) ) \
transc = bli_trans_toggled_conj( transc ); \
\
	/* Copy the data from the region opposite the diagonal of c
	   (as specified by the original value of diagoffc). Notice
	   that we use a diag parameter of non-unit since we can
	   assume nothing about the neighboring off-diagonal. */ \
PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
( \
diagoffc, \
BLIS_NONUNIT_DIAG, \
uploc, \
transc, \
m, \
n, \
kappa_cast, \
c_cast, rs_c, cs_c, \
p_cast, rs_p, cs_p, \
cntx, \
NULL \
); \
} \
else /* if ( bli_is_triangular( strucc ) ) */ \
{ \
doff_t diagoffp = diagoffc; \
uplo_t uplop = uploc; \
\
	/* For this step we need the uplo and diagonal offset of p, which
	   we can derive from the parameters given. */ \
if ( bli_does_trans( transc ) ) \
{ \
bli_negate_diag_offset( &diagoffp ); \
bli_toggle_uplo( &uplop ); \
} \
\
	/* For triangular matrices, we wish to reference the region
	   strictly opposite the diagonal of C. This amounts to
	   toggling uploc and then shifting the diagonal offset to
	   shrink the stored region (by one diagonal). */ \
bli_toggle_uplo( &uplop ); \
bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffp ); \
\
	/* Set the region opposite the diagonal of p to zero. */ \
PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
diagoffp, \
BLIS_NONUNIT_DIAG, \
uplop, \
m, \
n, \
zero, \
p_cast, rs_p, cs_p, \
cntx, \
NULL \
); \
} \
} \
\
	/* The packed memory region was acquired/allocated with "aligned"
	   dimensions (ie: dimensions that were possibly inflated up to a
	   multiple). When these dimensions are inflated, it creates empty
	   regions along the bottom and/or right edges of the matrix. If
	   either region exists, we set them to zero. This simplifies the
	   register level micro kernel in that it does not need to support
	   different register blockings for the edge cases. */ \
if ( m != m_max ) \
{ \
ctype* p_edge = p_cast + (m )*rs_p; \
\
PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_max - m, \
n_max, \
zero, \
p_edge, rs_p, cs_p, \
cntx, \
NULL \
); \
} \
\
if ( n != n_max ) \
{ \
ctype* p_edge = p_cast + (n )*cs_p; \
\
PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m_max, \
n_max - n, \
zero, \
p_edge, rs_p, cs_p, \
cntx, \
NULL \
); \
} \
}
INSERT_GENTFUNC_BASIC0( packm_unb_var1 )

View File

@@ -1,66 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
void bli_packm_unb_var1
(
obj_t* c,
obj_t* p,
cntx_t* cntx,
cntl_t* cntl,
thrinfo_t* thread
);
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
trans_t transc, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
void* kappa, \
void* c, inc_t rs_c, inc_t cs_c, \
void* p, inc_t rs_p, inc_t cs_p, \
cntx_t* cntx \
);
INSERT_GENTPROT_BASIC0( packm_unb_var1 )

View File

@@ -36,8 +36,6 @@
#include "bli_unpackm_check.h"
#include "bli_unpackm_int.h"
#include "bli_unpackm_unb_var1.h"
#include "bli_unpackm_blk_var1.h"
#include "bli_unpackm_cxk.h"

View File

@@ -1,131 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T unpackm_fp
typedef void (*FUNCPTR_T)(
doff_t diagoffp,
uplo_t uplop,
trans_t transp,
dim_t m,
dim_t n,
void* p, inc_t rs_p, inc_t cs_p,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx
);
static FUNCPTR_T GENARRAY(ftypes,unpackm_unb_var1);
// bli_unpackm_unb_var1(): object-based wrapper for the unblocked variant-1
// unpackm implementation. Extracts dimensions, buffers, and strides from
// the obj_t operands and dispatches to the typed implementation selected
// by the datatype of p. Note that the cntl and thread arguments are not
// used by this variant.
void bli_unpackm_unb_var1
(
obj_t* p,
obj_t* c,
cntx_t* cntx,
cntl_t* cntl,
thrinfo_t* thread
)
{
num_t dt_pc = bli_obj_dt( p );
doff_t diagoffp = bli_obj_diag_offset( p );
uplo_t uplop = bli_obj_uplo( p );
trans_t transc = bli_obj_onlytrans_status( c );
dim_t m_c = bli_obj_length( c );
dim_t n_c = bli_obj_width( c );
void* buf_p = bli_obj_buffer_at_off( p );
inc_t rs_p = bli_obj_row_stride( p );
inc_t cs_p = bli_obj_col_stride( p );
void* buf_c = bli_obj_buffer_at_off( c );
inc_t rs_c = bli_obj_row_stride( c );
inc_t cs_c = bli_obj_col_stride( c );
FUNCPTR_T f;
// Index into the type combination array to extract the correct
// function pointer.
f = ftypes[dt_pc];
// Invoke the function.
f( diagoffp,
uplop,
transc,
m_c,
n_c,
buf_p, rs_p, cs_p,
buf_c, rs_c, cs_c,
cntx
);
}
// Define the typed unblocked unpackm (variant 1) implementation for each
// datatype via macro instantiation. The packed data in p is simply copied
// back into c via copym, honoring the diagonal offset, uplo, and
// trans/conj parameters.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, varname ) \
\
void PASTEMAC(ch,varname)( \
doff_t diagoffp, \
uplo_t uplop, \
trans_t transp, \
dim_t m, \
dim_t n, \
void* p, inc_t rs_p, inc_t cs_p, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx \
) \
{ \
ctype* p_cast = p; \
ctype* c_cast = c; \
\
	/* Copy the stored region of p back to c. */ \
PASTEMAC2(ch,copym,BLIS_TAPI_EX_SUF) \
( \
diagoffp,\
BLIS_NONUNIT_DIAG, \
uplop, \
transp, \
m, \
n, \
p_cast, rs_p, cs_p, \
c_cast, rs_c, cs_c, \
cntx, \
NULL \
); \
}
INSERT_GENTFUNC_BASIC( unpackm, unpackm_unb_var1 )

View File

@@ -1,60 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
void bli_unpackm_unb_var1
(
obj_t* p,
obj_t* c,
cntx_t* cntx,
cntl_t* cntl,
thrinfo_t* thread
);
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffp, \
uplo_t uplop, \
trans_t transp, \
dim_t m, \
dim_t n, \
void* p, inc_t rs_p, inc_t cs_p, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx \
);
INSERT_GENTPROT_BASIC0( unpackm_unb_var1 )

View File

@@ -35,6 +35,8 @@
#include "bli_l3_cntl.h"
#include "bli_l3_check.h"
#include "bli_l3_int.h"
#include "bli_l3_packab.h"
// Define function types.
//#include "bli_l3_ft_ex.h"
@@ -45,7 +47,6 @@
#include "bli_l3_blocksize.h"
#include "bli_l3_direct.h"
#include "bli_l3_prune.h"
#include "bli_l3_packm.h"
#include "bli_l3_schema.h"
// Prototype object APIs (basic and expert).

View File

@@ -53,7 +53,7 @@ void bli_gemm_check
// Check object structure.
// NOTE: Can't perform these checks as long as bli_gemm_check() is called
// from bli_gemm_int(), which is in the execution path for structured
// from bli_l3_int(), which is in the execution path for structured
// level-3 operations such as hemm.
//e_val = bli_check_general_object( a );
@@ -109,7 +109,7 @@ void bli_hemm_check
}
void bli_herk_check
(
(
obj_t* alpha,
obj_t* a,
obj_t* beta,
@@ -197,7 +197,7 @@ void bli_symm_check
}
void bli_syrk_check
(
(
obj_t* alpha,
obj_t* a,
obj_t* beta,

View File

@@ -34,7 +34,7 @@
#include "blis.h"
void bli_trsm_int
void bli_l3_int
(
obj_t* alpha,
obj_t* a,
@@ -47,10 +47,9 @@ void bli_trsm_int
thrinfo_t* thread
)
{
obj_t a_local;
obj_t b_local;
obj_t c_local;
trsm_var_oft f;
obj_t a_local;
obj_t b_local;
obj_t c_local;
// Return early if the current control tree node is NULL.
if ( bli_cntl_is_null( cntl ) ) return;
@@ -60,72 +59,82 @@ void bli_trsm_int
bli_gemm_basic_check( alpha, a, b, beta, c, cntx );
// If C has a zero dimension, return early.
if ( bli_obj_has_zero_dim( c ) ) return;
if ( bli_obj_has_zero_dim( c ) )
{
return;
}
// If A or B has a zero dimension, scale C by beta and return early.
if ( bli_obj_has_zero_dim( a ) ||
bli_obj_has_zero_dim( b ) )
{
if ( bli_thread_am_ochief( thread ) )
bli_scalm( beta, c );
bli_scalm( beta, c );
bli_thread_barrier( thread );
return;
}
// Alias A and B in case we need to update attached scalars.
// If A or B is marked as being filled with zeros, scale C by beta and
// return early.
if ( bli_obj_is_zeros( a ) ||
bli_obj_is_zeros( b ) )
{
// This should never execute.
bli_abort();
if ( bli_thread_am_ochief( thread ) )
bli_scalm( beta, c );
bli_thread_barrier( thread );
return;
}
// Alias A, B, and C in case we need to update attached scalars.
bli_obj_alias_to( a, &a_local );
bli_obj_alias_to( b, &b_local );
// Alias C in case we need to induce a transposition.
bli_obj_alias_to( c, &c_local );
// Ensure that a valid packing function is set on A and B.
if ( !bli_obj_pack_fn( &a_local ) )
bli_obj_set_pack_fn( bli_packm_blk_var1, &a_local );
if ( !bli_obj_pack_fn( &b_local ) )
bli_obj_set_pack_fn( bli_packm_blk_var1, &b_local );
// If we are about to call a leaf-level implementation, and matrix C
// still needs a transposition, then we must induce one by swapping the
// strides and dimensions. Note that this transposition would normally
// be handled explicitly in the packing of C, but if C is not being
// packed, this is our last chance to handle the transposition.
if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( c ) )
//if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( c ) )
if ( bli_obj_has_trans( c ) )
{
bli_obj_induce_trans( &c_local );
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &c_local );
}
// If beta is non-unit, apply it to the scalar attached to C.
if ( !bli_obj_equals( beta, &BLIS_ONE ) )
// If alpha is non-unit, typecast and apply it to the scalar attached
// to B, unless it happens to be triangular.
if ( bli_obj_root_is_triangular( b ) )
{
bli_obj_scalar_apply_scalar( beta, &c_local );
}
// Set two bools: one based on the implied side parameter (the structure
// of the root object) and one based on the uplo field of the triangular
// matrix's root object (whether that is matrix A or matrix B).
if ( bli_obj_root_is_triangular( a ) )
{
// If alpha is non-unit, typecast and apply it to the scalar
// attached to B (the non-triangular matrix).
if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
{
bli_obj_scalar_apply_scalar( alpha, &b_local );
}
bli_obj_scalar_apply_scalar( alpha, &a_local );
}
else // if ( bli_obj_root_is_triangular( b ) )
{
// If alpha is non-unit, typecast and apply it to the scalar
// attached to A (the non-triangular matrix).
if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
{
bli_obj_scalar_apply_scalar( alpha, &a_local );
}
bli_obj_scalar_apply_scalar( alpha, &b_local );
}
// FGVZ->TMS: Is this barrier still needed?
bli_thread_barrier( thread );
// If beta is non-unit, typecast and apply it to the scalar attached
// to C.
if ( !bli_obj_equals( beta, &BLIS_ONE ) )
bli_obj_scalar_apply_scalar( beta, &c_local );
// Create the next node in the thrinfo_t structure.
bli_thrinfo_grow( rntm, cntl, thread );
// Extract the function pointer from the current control tree node.
f = bli_cntl_var_func( cntl );
l3_var_oft f = bli_cntl_var_func( cntl );
// Invoke the variant.
f

View File

@@ -32,7 +32,7 @@
*/
void bli_gemm_int
void bli_l3_int
(
obj_t* alpha,
obj_t* a,

View File

@@ -54,24 +54,7 @@ typedef void (*PASTECH(opname,_var_oft)) \
thrinfo_t* thread \
);
GENTDEF( gemm )
#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_var_oft)) \
( \
obj_t* a, \
obj_t* b, \
obj_t* c, \
cntx_t* cntx, \
rntm_t* rntm, \
cntl_t* cntl, \
thrinfo_t* thread \
);
GENTDEF( trsm )
GENTDEF( l3 )

View File

@@ -34,7 +34,7 @@
#include "blis.h"
void bli_gemm_packa
void bli_l3_packa
(
obj_t* a,
obj_t* b,
@@ -45,12 +45,19 @@ void bli_gemm_packa
thrinfo_t* thread
)
{
obj_t a_pack;
obj_t a_local, a_pack;
bli_obj_alias_to( a, &a_local );
if ( bli_obj_has_trans( a ) )
{
bli_obj_induce_trans( &a_local );
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local );
}
// Pack matrix A according to the control tree node.
bli_l3_packm
bli_packm_int
(
a,
&a_local,
&a_pack,
cntx,
rntm,
@@ -59,7 +66,7 @@ void bli_gemm_packa
);
// Proceed with execution using packed matrix A.
bli_gemm_int
bli_l3_int
(
&BLIS_ONE,
&a_pack,
@@ -75,7 +82,7 @@ void bli_gemm_packa
// -----------------------------------------------------------------------------
void bli_gemm_packb
void bli_l3_packb
(
obj_t* a,
obj_t* b,
@@ -86,25 +93,39 @@ void bli_gemm_packb
thrinfo_t* thread
)
{
obj_t b_pack;
obj_t bt_local, bt_pack;
// We always pass B^T to bli_l3_packm.
bli_obj_alias_to( b, &bt_local );
if ( bli_obj_has_trans( b ) )
{
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &bt_local );
}
else
{
bli_obj_induce_trans( &bt_local );
}
// Pack matrix B according to the control tree node.
bli_l3_packm
bli_packm_int
(
b,
&b_pack,
&bt_local,
&bt_pack,
cntx,
rntm,
cntl,
thread
);
// Transpose packed object back to B.
bli_obj_induce_trans( &bt_pack );
// Proceed with execution using packed matrix B.
bli_gemm_int
bli_l3_int
(
&BLIS_ONE,
a,
&b_pack,
&bt_pack,
&BLIS_ONE,
c,
cntx,

View File

@@ -32,12 +32,21 @@
*/
void bli_trsm_int
void bli_l3_packa
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
);
void bli_l3_packb
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,

View File

@@ -1,187 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// bli_l3_packm(): prepares the packed object x_pack from x for use in a
// level-3 operation. Every thread initializes x_pack via bli_packm_init()
// to learn the buffer size needed; the chief thread then acquires (or, if
// the block cached in the control tree is too small, releases and
// re-acquires) a block from the memory broker, broadcasts it to all
// threads in its group, and caches it in the cntl node's mem_t field.
// Finally, all threads cooperate in the actual packing via
// bli_packm_int(), followed by a barrier so packing completes before any
// computation that consumes x_pack.
void bli_l3_packm
(
obj_t* x,
obj_t* x_pack,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
packbuf_t pack_buf_type;
mem_t* cntl_mem_p;
siz_t size_needed;
// FGVZ: Not sure why we need this barrier, but we do.
bli_thread_barrier( thread );
// Every thread initializes x_pack and determines the size of memory
// block needed (which gets embedded into the otherwise "blank" mem_t
// entry in the control tree node).
size_needed
=
bli_packm_init
(
x,
x_pack,
cntx,
cntl
);
// If zero was returned, no memory needs to be allocated and so we can
// return early.
if ( size_needed == 0 ) return;
// Query the pack buffer type from the control tree node.
pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl );
// Query the address of the mem_t entry within the control tree node.
cntl_mem_p = bli_cntl_pack_mem( cntl );
// Check the mem_t field in the control tree. If it is unallocated, then
// we need to acquire a block from the memory broker and broadcast it to
// all threads in the chief's thread group.
if ( bli_mem_is_unalloc( cntl_mem_p ) )
{
mem_t* local_mem_p;
mem_t local_mem_s;
if ( bli_thread_am_ochief( thread ) )
{
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_l3_packm(): acquiring mem pool block\n" );
#endif
// The chief thread acquires a block from the memory broker
// and saves the associated mem_t entry to local_mem_s.
bli_pba_acquire_m
(
rntm,
size_needed,
pack_buf_type,
&local_mem_s
);
}
// Broadcast the address of the chief thread's local mem_t entry to
// all threads.
local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
// Save the contents of the chief thread's local mem_t entry to the
// mem_t field in this thread's control tree node.
*cntl_mem_p = *local_mem_p;
}
else // ( bli_mem_is_alloc( cntl_mem_p ) )
{
mem_t* local_mem_p;
mem_t local_mem_s;
// If the mem_t entry in the control tree does NOT contain a NULL
// buffer, then a block has already been acquired from the memory
// broker and cached in the control tree.
// As a sanity check, we should make sure that the mem_t object isn't
// associated with a block that is too small compared to the size of
// the packed matrix buffer that is needed, according to the return
// value from packm_init().
siz_t cntl_mem_size = bli_mem_size( cntl_mem_p );
if ( cntl_mem_size < size_needed )
{
if ( bli_thread_am_ochief( thread ) )
{
// The chief thread releases the existing block associated with
// the mem_t entry in the control tree, and then re-acquires a
// new block, saving the associated mem_t entry to local_mem_s.
bli_pba_release
(
rntm,
cntl_mem_p
);
bli_pba_acquire_m
(
rntm,
size_needed,
pack_buf_type,
&local_mem_s
);
}
// Broadcast the address of the chief thread's local mem_t entry to
// all threads.
local_mem_p = bli_thread_broadcast( thread, &local_mem_s );
// Save the chief thread's local mem_t entry to the mem_t field in
// this thread's control tree node.
*cntl_mem_p = *local_mem_p;
}
else
{
// If the mem_t entry is already allocated and sufficiently large,
// then we use it as-is. No action is needed, because all threads
// will already have the cached values in their local control
// trees' mem_t entries, currently pointed to by cntl_mem_p.
bli_thread_barrier( thread );
}
}
// Update the buffer address in x_pack to point to the buffer associated
// with the mem_t entry acquired from the memory broker (now cached in
// the control tree node).
void* buf = bli_mem_buffer( cntl_mem_p );
bli_obj_set_buffer( buf, x_pack );
// Pack the contents of object x to object x_pack.
bli_packm_int
(
x,
x_pack,
cntx,
cntl,
thread
);
// Barrier so that packing is done before computation.
bli_thread_barrier( thread );
}

View File

@@ -34,7 +34,6 @@
#include "bli_gemm_cntl.h"
#include "bli_gemm_front.h"
#include "bli_gemm_int.h"
#include "bli_gemm_var.h"

View File

@@ -77,7 +77,7 @@ void bli_gemm_blk_var1
i, b_alg, c, &c1 );
// Perform gemm subproblem.
bli_gemm_int
bli_l3_int
(
&BLIS_ONE,
&a1,

View File

@@ -77,7 +77,7 @@ void bli_gemm_blk_var2
i, b_alg, c, &c1 );
// Perform gemm subproblem.
bli_gemm_int
bli_l3_int
(
&BLIS_ONE,
a,

View File

@@ -71,7 +71,7 @@ void bli_gemm_blk_var3
i, b_alg, b, &b1 );
// Perform gemm subproblem.
bli_gemm_int
bli_l3_int
(
&BLIS_ONE,
&a1,

View File

@@ -57,8 +57,6 @@ cntl_t* bli_gemmbp_cntl_create
)
{
void_fp macro_kernel_fp;
void_fp packa_fp;
void_fp packb_fp;
// Use the function pointers to the macrokernels that use slab
// assignment of micropanels to threads in the jr and ir loops.
@@ -67,9 +65,6 @@ cntl_t* bli_gemmbp_cntl_create
else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2;
else /* should never execute */ macro_kernel_fp = NULL;
packa_fp = bli_packm_blk_var1;
packb_fp = bli_packm_blk_var1;
// Create two nodes for the macro-kernel.
cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node
(
@@ -93,8 +88,7 @@ cntl_t* bli_gemmbp_cntl_create
cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node
(
rntm,
bli_gemm_packa, // pack the left-hand operand
packa_fp,
bli_l3_packa, // pack the left-hand operand
BLIS_MR,
BLIS_KR,
FALSE, // do NOT invert diagonal
@@ -119,10 +113,9 @@ cntl_t* bli_gemmbp_cntl_create
cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node
(
rntm,
bli_gemm_packb, // pack the right-hand operand
packb_fp,
BLIS_KR,
bli_l3_packb, // pack the right-hand operand
BLIS_NR,
BLIS_KR,
FALSE, // do NOT invert diagonal
FALSE, // reverse iteration if upper?
FALSE, // reverse iteration if lower?
@@ -194,8 +187,8 @@ cntl_t* bli_gemmpb_cntl_create
(
bli_gemm_packb, // pack the right-hand operand
bli_packm_blk_var1,
BLIS_KR,
BLIS_MR,
BLIS_KR,
FALSE, // do NOT invert diagonal
FALSE, // reverse iteration if upper?
FALSE, // reverse iteration if lower?

View File

@@ -87,13 +87,14 @@ void bli_gemm_front
bli_obj_alias_to( b, &b_local );
bli_obj_alias_to( c, &c_local );
#ifdef BLIS_ENABLE_GEMM_MD
// Don't perform the following optimization for ccr or crc cases, as
// those cases are sensitive to the ukernel storage preference (ie:
// transposing the operation would break them).
if ( !bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) &&
!bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) )
#endif
// Set the obj_t buffer field to the location currently implied by the row
// and column offsets and then zero the offsets. If any of the original
// obj_t's were views into larger matrices, this step effectively makes
// those obj_t's "forget" their lineage.
bli_obj_reset_origin( &a_local );
bli_obj_reset_origin( &b_local );
bli_obj_reset_origin( &c_local );
// An optimization: If C is stored by rows and the micro-kernel prefers
// contiguous columns, or if C is stored by columns and the micro-kernel
// prefers contiguous rows, transpose the entire operation to allow the
@@ -251,7 +252,7 @@ void bli_gemm_front
// Invoke the internal back-end via the thread handler.
bli_l3_thread_decorator
(
bli_gemm_int,
bli_l3_int,
BLIS_GEMM, // operation family id
alpha,
&a_local,

View File

@@ -1,127 +0,0 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Internal back-end for gemm-like operations. Performs parameter checking,
// short-circuits degenerate cases, folds non-unit alpha/beta into the
// scalars attached to B and C, grows the thrinfo_t tree, and then invokes
// the variant function stored in the current control tree node.
//
// NOTE: This function executes within a parallel region; the barrier
// placement in the early-return paths must be preserved so that all
// threads exit in lockstep.
void bli_gemm_int
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_gemm_basic_check( alpha, a, b, beta, c, cntx );

	// A gemm with no output elements is a no-op; no barrier is needed
	// since every thread observes the same dimensions.
	if ( bli_obj_has_zero_dim( c ) ) return;

	// An empty A or B reduces the operation to C := beta * C, which the
	// chief thread performs on behalf of the team.
	if ( bli_obj_has_zero_dim( a ) || bli_obj_has_zero_dim( b ) )
	{
		if ( bli_thread_am_ochief( thread ) ) bli_scalm( beta, c );
		bli_thread_barrier( thread );
		return;
	}

	// Operands flagged as structurally zero also reduce to scaling C.
	if ( bli_obj_is_zeros( a ) || bli_obj_is_zeros( b ) )
	{
		// This should never execute.
		bli_abort();

		if ( bli_thread_am_ochief( thread ) ) bli_scalm( beta, c );
		bli_thread_barrier( thread );
		return;
	}

	// Work on aliases so that attached scalars can be modified without
	// disturbing the caller's objects.
	obj_t a_use;
	obj_t b_use;
	obj_t c_use;

	bli_obj_alias_to( a, &a_use );
	bli_obj_alias_to( b, &b_use );
	bli_obj_alias_to( c, &c_use );

	// Fold a non-unit alpha into the scalar attached to B...
	if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
		bli_obj_scalar_apply_scalar( alpha, &b_use );

	// ...and a non-unit beta into the scalar attached to C.
	if ( !bli_obj_equals( beta, &BLIS_ONE ) )
		bli_obj_scalar_apply_scalar( beta, &c_use );

	// Create the next node in the thrinfo_t structure.
	bli_thrinfo_grow( rntm, cntl, thread );

	// Fetch the variant function pointer from the current control tree
	// node and invoke it.
	gemm_var_oft f = bli_cntl_var_func( cntl );

	f
	(
	  &a_use,
	  &b_use,
	  &c_use,
	  cntx,
	  rntm,
	  cntl,
	  thread
	);
}

View File

@@ -55,11 +55,8 @@ void PASTEMAC0(opname) \
GENPROT( gemm_blk_var1 )
GENPROT( gemm_blk_var2 )
GENPROT( gemm_blk_var3 )
GENPROT( gemm_packa )
GENPROT( gemm_packb )
GENPROT( gemm_ker_var1 )
GENPROT( gemm_ker_var2 )

View File

@@ -73,7 +73,14 @@ void bli_gemmt_front
bli_obj_alias_to( a, &a_local );
bli_obj_alias_to( b, &b_local );
bli_obj_alias_to( c, &c_local );
bli_obj_set_as_root( &c_local );
// Set the obj_t buffer field to the location currently implied by the row
// and column offsets and then zero the offsets. If any of the original
// obj_t's were views into larger matrices, this step effectively makes
// those obj_t's "forget" their lineage.
bli_obj_reset_origin( &a_local );
bli_obj_reset_origin( &b_local );
bli_obj_reset_origin( &c_local );
// An optimization: If C is stored by rows and the micro-kernel prefers
// contiguous columns, or if C is stored by columns and the micro-kernel
@@ -107,7 +114,7 @@ void bli_gemmt_front
// Invoke the internal back-end via the thread handler.
bli_l3_thread_decorator
(
bli_gemm_int,
bli_l3_int,
BLIS_GEMMT, // operation family id
alpha,
&a_local,

View File

@@ -35,7 +35,7 @@
#include "blis.h"
static gemm_var_oft vars[2] =
static l3_var_oft vars[2] =
{
bli_gemmt_l_ker_var2, bli_gemmt_u_ker_var2,
};
@@ -51,8 +51,8 @@ void bli_gemmt_x_ker_var2
thrinfo_t* thread
)
{
dim_t uplo;
gemm_var_oft f;
dim_t uplo;
l3_var_oft f;
// Set a bool based on the uplo field of C's root object.
if ( bli_obj_root_is_lower( c ) ) uplo = 0;

View File

@@ -65,6 +65,14 @@ void bli_hemm_front
bli_obj_alias_to( b, &b_local );
bli_obj_alias_to( c, &c_local );
// Set the obj_t buffer field to the location currently implied by the row
// and column offsets and then zero the offsets. If any of the original
// obj_t's were views into larger matrices, this step effectively makes
// those obj_t's "forget" their lineage.
bli_obj_reset_origin( &a_local );
bli_obj_reset_origin( &b_local );
bli_obj_reset_origin( &c_local );
#ifdef BLIS_DISABLE_HEMM_RIGHT
// NOTE: This case casts right-side hemm in terms of left side. This is
// necessary when the current subconfiguration uses a gemm microkernel
@@ -129,13 +137,6 @@ void bli_hemm_front
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
// Set each alias as the root object.
// NOTE: We MUST wait until we are done potentially swapping the objects
// before setting the root fields!
bli_obj_set_as_root( &a_local );
bli_obj_set_as_root( &b_local );
bli_obj_set_as_root( &c_local );
// Parse and interpret the contents of the rntm_t object to properly
// set the ways of parallelism for each loop, and then make any
// additional modifications necessary for the current operation.
@@ -152,7 +153,7 @@ void bli_hemm_front
// Invoke the internal back-end.
bli_l3_thread_decorator
(
bli_gemm_int,
bli_l3_int,
BLIS_GEMM, // operation family id
alpha,
&a_local,

View File

@@ -65,6 +65,14 @@ void bli_symm_front
bli_obj_alias_to( b, &b_local );
bli_obj_alias_to( c, &c_local );
// Set the obj_t buffer field to the location currently implied by the row
// and column offsets and then zero the offsets. If any of the original
// obj_t's were views into larger matrices, this step effectively makes
// those obj_t's "forget" their lineage.
bli_obj_reset_origin( &a_local );
bli_obj_reset_origin( &b_local );
bli_obj_reset_origin( &c_local );
#ifdef BLIS_DISABLE_SYMM_RIGHT
// NOTE: This case casts right-side symm in terms of left side. This is
// necessary when the current subconfiguration uses a gemm microkernel
@@ -128,13 +136,6 @@ void bli_symm_front
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
// Set each alias as the root object.
// NOTE: We MUST wait until we are done potentially swapping the objects
// before setting the root fields!
bli_obj_set_as_root( &a_local );
bli_obj_set_as_root( &b_local );
bli_obj_set_as_root( &c_local );
// Parse and interpret the contents of the rntm_t object to properly
// set the ways of parallelism for each loop, and then make any
// additional modifications necessary for the current operation.
@@ -151,7 +152,7 @@ void bli_symm_front
// Invoke the internal back-end.
bli_l3_thread_decorator
(
bli_gemm_int,
bli_l3_int,
BLIS_GEMM, // operation family id
alpha,
&a_local,

View File

@@ -64,6 +64,14 @@ void bli_trmm_front
bli_obj_alias_to( b, &b_local );
bli_obj_alias_to( b, &c_local );
// Set the obj_t buffer field to the location currently implied by the row
// and column offsets and then zero the offsets. If any of the original
// obj_t's were views into larger matrices, this step effectively makes
// those obj_t's "forget" their lineage.
bli_obj_reset_origin( &a_local );
bli_obj_reset_origin( &b_local );
bli_obj_reset_origin( &c_local );
// We do not explicitly implement the cases where A is transposed.
// However, we can still handle them. Specifically, if A is marked as
// needing a transposition, we simply induce a transposition. This
@@ -147,13 +155,6 @@ void bli_trmm_front
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
// Set each alias as the root object.
// NOTE: We MUST wait until we are done potentially swapping the objects
// before setting the root fields!
bli_obj_set_as_root( &a_local );
bli_obj_set_as_root( &b_local );
bli_obj_set_as_root( &c_local );
// Parse and interpret the contents of the rntm_t object to properly
// set the ways of parallelism for each loop, and then make any
// additional modifications necessary for the current operation.
@@ -170,7 +171,7 @@ void bli_trmm_front
// Invoke the internal back-end.
bli_l3_thread_decorator
(
bli_gemm_int,
bli_l3_int,
BLIS_TRMM, // operation family id
alpha,
&a_local,

View File

@@ -35,7 +35,7 @@
#include "blis.h"
static gemm_var_oft vars[2][2] =
static l3_var_oft vars[2][2] =
{
{ bli_trmm_ll_ker_var2, bli_trmm_lu_ker_var2 },
{ bli_trmm_rl_ker_var2, bli_trmm_ru_ker_var2 }
@@ -52,9 +52,9 @@ void bli_trmm_xx_ker_var2
thrinfo_t* thread
)
{
dim_t side;
dim_t uplo;
gemm_var_oft f;
dim_t side;
dim_t uplo;
l3_var_oft f;
// Set two bools: one based on the implied side parameter (the structure
// of the root object) and one based on the uplo field of the triangular

View File

@@ -65,6 +65,14 @@ void bli_trmm3_front
bli_obj_alias_to( b, &b_local );
bli_obj_alias_to( c, &c_local );
// Set the obj_t buffer field to the location currently implied by the row
// and column offsets and then zero the offsets. If any of the original
// obj_t's were views into larger matrices, this step effectively makes
// those obj_t's "forget" their lineage.
bli_obj_reset_origin( &a_local );
bli_obj_reset_origin( &b_local );
bli_obj_reset_origin( &c_local );
// We do not explicitly implement the cases where A is transposed.
// However, we can still handle them. Specifically, if A is marked as
// needing a transposition, we simply induce a transposition. This
@@ -139,13 +147,6 @@ void bli_trmm3_front
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
// Set each alias as the root object.
// NOTE: We MUST wait until we are done potentially swapping the objects
// before setting the root fields!
bli_obj_set_as_root( &a_local );
bli_obj_set_as_root( &b_local );
bli_obj_set_as_root( &c_local );
// Parse and interpret the contents of the rntm_t object to properly
// set the ways of parallelism for each loop, and then make any
// additional modifications necessary for the current operation.
@@ -162,7 +163,7 @@ void bli_trmm3_front
// Invoke the internal back-end.
bli_l3_thread_decorator
(
bli_gemm_int,
bli_l3_int,
BLIS_TRMM, // operation family id
alpha,
&a_local,

View File

@@ -34,7 +34,5 @@
#include "bli_trsm_cntl.h"
#include "bli_trsm_front.h"
#include "bli_trsm_int.h"
#include "bli_trsm_var.h"

View File

@@ -58,7 +58,7 @@ void bli_trsm_blk_var1
bli_l3_prune_unref_mparts_m( a, b, c, cntl );
// Isolate the diagonal block A11 and its corresponding row panel C1.
const dim_t kc = bli_obj_width( a );
const dim_t kc = bli_obj_width_after_trans( a );
obj_t a11, c1;
bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
0, kc, a, &a11 );
@@ -96,7 +96,7 @@ void bli_trsm_blk_var1
#endif
// Perform trsm subproblem.
bli_trsm_int
bli_l3_int
(
&BLIS_ONE,
&a11_1,
@@ -169,7 +169,7 @@ void bli_trsm_blk_var1
// Perform gemm subproblem. (Note that we use the same backend
// function as before, since we're calling the same macrokernel.)
bli_trsm_int
bli_l3_int
(
&BLIS_ONE,
&a11,

View File

@@ -60,7 +60,7 @@ void bli_trsm_blk_var2
bli_thread_range_ndim
(
direct, thread, a, b, c, cntl, cntx,
&my_start, &my_end
&my_start, &my_end
);
// Partition along the n dimension.
@@ -77,7 +77,7 @@ void bli_trsm_blk_var2
i, b_alg, c, &c1 );
// Perform trsm subproblem.
bli_trsm_int
bli_l3_int
(
&BLIS_ONE,
a,

View File

@@ -71,7 +71,7 @@ void bli_trsm_blk_var3
i, b_alg, b, &b1 );
// Perform trsm subproblem.
bli_trsm_int
bli_l3_int
(
&BLIS_ONE,
&a1,

View File

@@ -57,16 +57,11 @@ cntl_t* bli_trsm_l_cntl_create
)
{
void_fp macro_kernel_p;
void_fp packa_fp;
void_fp packb_fp;
// Use the function pointer to the macrokernels that use slab
// assignment of micropanels to threads in the jr and ir loops.
macro_kernel_p = bli_trsm_xx_ker_var2;
packa_fp = bli_packm_blk_var1;
packb_fp = bli_packm_blk_var1;
const opid_t family = BLIS_TRSM;
//
@@ -95,8 +90,7 @@ cntl_t* bli_trsm_l_cntl_create
cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node
(
rntm,
bli_trsm_packa, // trsm operation's packm function for A.
packa_fp,
bli_l3_packa, // trsm operation's packm function for A.
BLIS_MR,
BLIS_MR,
FALSE, // do NOT invert diagonal
@@ -133,8 +127,7 @@ cntl_t* bli_trsm_l_cntl_create
cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node
(
rntm,
bli_trsm_packa, // trsm operation's packm function for A.
packa_fp,
bli_l3_packa, // trsm operation's packm function for A.
BLIS_MR,
BLIS_MR,
#ifdef BLIS_ENABLE_TRSM_PREINVERSION
@@ -171,10 +164,9 @@ cntl_t* bli_trsm_l_cntl_create
cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node
(
rntm,
bli_trsm_packb,
packb_fp,
BLIS_MR,
bli_l3_packb,
BLIS_NR,
BLIS_MR,
FALSE, // do NOT invert diagonal
FALSE, // reverse iteration if upper?
FALSE, // reverse iteration if lower?
@@ -208,7 +200,7 @@ cntl_t* bli_trsm_l_cntl_create
cntl_t* bli_trsm_r_cntl_create
(
rntm_t* rntm,
rntm_t* rntm,
pack_t schema_a,
pack_t schema_b
)
@@ -216,9 +208,6 @@ cntl_t* bli_trsm_r_cntl_create
// NOTE: trsm macrokernels are presently disabled for right-side execution.
void_fp macro_kernel_p = bli_trsm_xx_ker_var2;
void_fp packa_fp = bli_packm_blk_var1;
void_fp packb_fp = bli_packm_blk_var1;
const opid_t family = BLIS_TRSM;
// Create two nodes for the macro-kernel.
@@ -244,8 +233,7 @@ cntl_t* bli_trsm_r_cntl_create
cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node
(
rntm,
bli_trsm_packa,
packa_fp,
bli_l3_packa,
BLIS_NR,
BLIS_MR,
FALSE, // do NOT invert diagonal
@@ -270,8 +258,7 @@ cntl_t* bli_trsm_r_cntl_create
cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node
(
rntm,
bli_trsm_packb,
packb_fp,
bli_l3_packb,
BLIS_MR,
BLIS_MR,
TRUE, // invert diagonal

View File

@@ -71,6 +71,14 @@ void bli_trsm_front
bli_obj_alias_to( b, &b_local );
bli_obj_alias_to( b, &c_local );
// Set the obj_t buffer field to the location currently implied by the row
// and column offsets and then zero the offsets. If any of the original
// obj_t's were views into larger matrices, this step effectively makes
// those obj_t's "forget" their lineage.
bli_obj_reset_origin( &a_local );
bli_obj_reset_origin( &b_local );
bli_obj_reset_origin( &c_local );
// We do not explicitly implement the cases where A is transposed.
// However, we can still handle them. Specifically, if A is marked as
// needing a transposition, we simply induce a transposition. This
@@ -121,13 +129,6 @@ void bli_trsm_front
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
// Set each alias as the root object.
// NOTE: We MUST wait until we are done potentially swapping the objects
// before setting the root fields!
bli_obj_set_as_root( &a_local );
bli_obj_set_as_root( &b_local );
bli_obj_set_as_root( &c_local );
// Parse and interpret the contents of the rntm_t object to properly
// set the ways of parallelism for each loop, and then make any
// additional modifications necessary for the current operation.
@@ -144,7 +145,7 @@ void bli_trsm_front
// Invoke the internal back-end.
bli_l3_thread_decorator
(
bli_trsm_int,
bli_l3_int,
BLIS_TRSM, // operation family id
alpha,
&a_local,

View File

@@ -55,8 +55,6 @@ void PASTEMAC0(opname) \
GENPROT( trsm_blk_var1 )
GENPROT( trsm_blk_var2 )
GENPROT( trsm_blk_var3 )
GENPROT( trsm_packa )
GENPROT( trsm_packb )
GENPROT( trsm_xx_ker_var2 )

View File

@@ -35,7 +35,7 @@
#include "blis.h"
static trsm_var_oft vars[2][2] =
static l3_var_oft vars[2][2] =
{
{ bli_trsm_ll_ker_var2, bli_trsm_lu_ker_var2 },
{ bli_trsm_rl_ker_var2, bli_trsm_ru_ker_var2 }
@@ -52,9 +52,9 @@ void bli_trsm_xx_ker_var2
thrinfo_t* thread
)
{
dim_t side;
dim_t uplo;
trsm_var_oft f;
dim_t side;
dim_t uplo;
l3_var_oft f;
// Set two bools: one based on the implied side parameter (the structure
// of the root object) and one based on the uplo field of the triangular

View File

@@ -118,6 +118,11 @@ void bli_obj_create_without_buffer
bli_obj_set_offs( 0, 0, obj );
bli_obj_set_diag_offset( 0, obj );
bli_obj_set_pack_fn( NULL, obj );
bli_obj_set_pack_params( NULL, obj );
bli_obj_set_ker_fn( NULL, obj );
bli_obj_set_ker_params( NULL, obj );
// Set the internal scalar to 1.0.
bli_obj_set_scalar_dt( dt, obj );
s = bli_obj_internal_scalar_buffer( obj );
@@ -356,7 +361,7 @@ void bli_obj_free
buf_a = bli_obj_buffer_at_off( a );
bli_zzsets( 0.0, 0.0, value );
bli_zzsets( 0.0, 0.0, value );
if ( bli_obj_is_float( a ) )
{
@@ -500,7 +505,7 @@ void bli_adjust_strides
// Set the column stride to indicate that this is a column vector
// stored in column-major order. This is done for legacy reasons,
// because at one time we had to satisfy the error checking
// in the underlying BLAS library, which expects the leading
// in the underlying BLAS library, which expects the leading
// dimension to be set to at least m, even if it will never be
// used for indexing since it is a vector and thus only has one
// column of data.

View File

@@ -282,17 +282,6 @@ void bli_pba_acquire_v
#endif
// Install the global packed-block allocator (pba) into the given rntm_t
// so that downstream packing code can acquire buffers from it.
void bli_pba_rntm_set_pba
     (
       rntm_t* rntm
     )
{
	bli_rntm_set_pba( bli_pba_query(), rntm );
}
siz_t bli_pba_pool_size
(
pba_t* pba,

View File

@@ -119,7 +119,7 @@ BLIS_INLINE void bli_pba_unlock( pba_t* pba )
// -----------------------------------------------------------------------------
pba_t* bli_pba_query( void );
BLIS_EXPORT_BLIS pba_t* bli_pba_query( void );
void bli_pba_init
(
@@ -144,10 +144,15 @@ void bli_pba_release
mem_t* mem
);
void bli_pba_rntm_set_pba
BLIS_INLINE void bli_pba_rntm_set_pba
(
rntm_t* rntm
);
)
{
pba_t* pba = bli_pba_query();
bli_rntm_set_pba( pba, rntm );
}
siz_t bli_pba_pool_size
(

View File

@@ -76,24 +76,39 @@ void* bli_sba_acquire
// Query the small block pool from the rntm.
pool_t* restrict pool = bli_rntm_sba_pool( rntm );
// Query the block_size of the pool_t so that we can request the exact
// size present.
const siz_t block_size = bli_pool_block_size( pool );
// Sanity check: Make sure the requested size is no larger than the
// block_size field of the pool.
if ( block_size < req_size )
// We don't expect NULL sba_pool pointers in the normal course of BLIS
// operation. However, there are rare instances where it is convenient
// to support use of bli_sba_acquire() without having to pass in a valid
// sba pool data structure. The case that inspired this branch was the
// gemm_ukr and related test modules in the BLIS testsuite. (There, it
// is convenient to not have to checkout an array_t from the sba, and it
// does no harm since the malloc() happens outside of the region that
// would be timed.)
if ( pool == NULL )
{
printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n",
( int )block_size, ( int )req_size );
bli_abort();
block = bli_malloc_intl( req_size, &r_val );
}
else
{
// Query the block_size of the pool_t so that we can request the exact
// size present.
const siz_t block_size = bli_pool_block_size( pool );
// Check out a block using the block_size queried above.
bli_pool_checkout_block( block_size, &pblk, pool );
// Sanity check: Make sure the requested size is no larger than the
// block_size field of the pool.
if ( block_size < req_size )
{
printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n",
( int )block_size, ( int )req_size );
bli_abort();
}
// The block address is stored within the pblk_t.
block = bli_pblk_buf( &pblk );
// Check out a block using the block_size queried above.
bli_pool_checkout_block( block_size, &pblk, pool );
// The block address is stored within the pblk_t.
block = bli_pblk_buf( &pblk );
}
}
#else
@@ -123,21 +138,28 @@ void bli_sba_release
// Query the small block pool from the rntm.
pool_t* restrict pool = bli_rntm_sba_pool( rntm );
// Query the block_size field from the pool. This is not super-important
// for this particular application of the pool_t (that is, the "leaf"
// component of the sba), but it seems like good housekeeping to maintain
// the block_size field of the pblk_t in case its ever needed/read.
const siz_t block_size = bli_pool_block_size( pool );
if ( pool == NULL )
{
bli_free_intl( block );
}
else
{
// Query the block_size field from the pool. This is not super-important
// for this particular application of the pool_t (that is, the "leaf"
// component of the sba), but it seems like good housekeeping to maintain
// the block_size field of the pblk_t in case its ever needed/read.
const siz_t block_size = bli_pool_block_size( pool );
// Embed the block's memory address into a pblk_t, along with the
// block_size queried from the pool.
bli_pblk_set_buf( block, &pblk );
bli_pblk_set_block_size( block_size, &pblk );
// Embed the block's memory address into a pblk_t, along with the
// block_size queried from the pool.
bli_pblk_set_buf( block, &pblk );
bli_pblk_set_block_size( block_size, &pblk );
// Check the pblk_t back into the pool_t. (It's okay that the pblk_t is
// a local variable since its contents are copied into the pool's internal
// data structure--an array of pblk_t.)
bli_pool_checkin_block( &pblk, pool );
// Check the pblk_t back into the pool_t. (It's okay that the pblk_t is
// a local variable since its contents are copied into the pool's internal
// data structure--an array of pblk_t.)
bli_pool_checkin_block( &pblk, pool );
}
}
#else

View File

@@ -1189,52 +1189,48 @@ BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b )
// -- User-provided information macros --
// User data query
// Query the caller-supplied user data pointer stored in the object.
// Returns NULL if bli_obj_set_user_data() was never called on it.
BLIS_INLINE void* bli_obj_user_data( obj_t* obj )
{
return obj->user_data;
}
// User data modification
// Attach an arbitrary caller-owned pointer to the object. BLIS does not
// interpret or free this pointer; it is carried along with the obj_t.
BLIS_INLINE void bli_obj_set_user_data( void* data, obj_t* obj )
{
obj->user_data = data;
}
// Function pointer query
BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj )
{
return obj->pack;
return obj->pack_fn;
}
// Query the user-customizable parameter pointer associated with the
// object's packing function (.pack_params); may be NULL if unset.
BLIS_INLINE void* bli_obj_pack_params( obj_t* obj )
{
return obj->pack_params;
}
BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj )
{
return obj->ker;
return obj->ker_fn;
}
BLIS_INLINE obj_ukr_fn_t bli_obj_ukr_fn( obj_t* obj )
BLIS_INLINE void* bli_obj_ker_params( obj_t* obj )
{
return obj->ukr;
return obj->ker_params;
}
// Function pointer modification
BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack, obj_t* obj )
BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack_fn, obj_t* obj )
{
obj->pack = pack;
obj->pack_fn = pack_fn;
}
BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker, obj_t* obj )
BLIS_INLINE void bli_obj_set_pack_params( void* params, obj_t* obj )
{
obj->ker = ker;
obj->pack_params = params;
}
BLIS_INLINE void bli_obj_set_ukr_fn( obj_ukr_fn_t ukr, obj_t* obj )
BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker_fn, obj_t* obj )
{
obj->ukr = ukr;
obj->ker_fn = ker_fn;
}
// Set the user-customizable parameter pointer consumed by the object's
// kernel function (.ker_params). BLIS does not interpret this pointer.
BLIS_INLINE void bli_obj_set_ker_params( void* params, obj_t* obj )
{
obj->ker_params = params;
}
@@ -1357,6 +1353,18 @@ BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj )
);
}
// Fold the object's current row/column offsets into its buffer pointer,
// clear the offsets, and mark the object as its own root. For an obj_t
// with at least one non-zero offset, this effectively makes it "forget"
// that it was ever a view into a larger matrix.
BLIS_INLINE void bli_obj_reset_origin( obj_t* obj )
{
	// Resolve the element currently addressed by the offsets BEFORE the
	// offsets are zeroed out.
	void* buf = bli_obj_buffer_at_off( obj );

	bli_obj_set_buffer( buf, obj );
	bli_obj_set_offs( 0, 0, obj );
	bli_obj_set_as_root( obj );
}
// Make a full alias (shallow copy).
BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b )
@@ -1482,7 +1490,13 @@ BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t*
// Exchange the full contents of two obj_t's (shallow swap). If either
// object was its own root beforehand, that self-rootedness is restored
// afterward so the swapped objects don't point at each other's storage
// as their root.
BLIS_INLINE void bli_obj_swap( obj_t* a, obj_t* b )
{
	// Record self-rootedness before the contents are exchanged.
	const bool a_was_self_rooted = ( bli_obj_root( a ) == a );
	const bool b_was_self_rooted = ( bli_obj_root( b ) == b );

	obj_t tmp = *a; *a = *b; *b = tmp;

	if ( a_was_self_rooted ) bli_obj_set_as_root( b );
	if ( b_was_self_rooted ) bli_obj_set_as_root( a );
}
// Swap object pack schemas.

View File

@@ -1174,12 +1174,11 @@ struct thrinfo_s;
typedef void (*obj_pack_fn_t)
(
mdim_t mat,
mem_t* mem,
struct obj_s* a,
struct obj_s* ap,
struct cntx_s* cntx,
struct rntm_s* rntm,
struct cntl_s* cntl,
struct thrinfo_s* thread
);
@@ -1190,23 +1189,10 @@ typedef void (*obj_ker_fn_t)
struct obj_s* c,
struct cntx_s* cntx,
struct rntm_s* rntm,
struct cntl_s* cntl,
struct thrinfo_s* thread
);
typedef void (*obj_ukr_fn_t)
(
dim_t m,
dim_t n,
dim_t k,
void* restrict alpha,
void* restrict a, inc_t rs_a, inc_t cs_a,
void* restrict b, inc_t rs_b, inc_t cs_b,
void* restrict beta,
void* restrict c, inc_t rs_c, inc_t cs_c,
auxinfo_t* restrict data,
struct cntx_s* restrict cntx
);
typedef struct obj_s
{
// Basic fields
@@ -1237,13 +1223,11 @@ typedef struct obj_s
dim_t m_panel; // m dimension of a "full" panel
dim_t n_panel; // n dimension of a "full" panel
// User data pointer
void* user_data;
// Function pointers
obj_pack_fn_t pack;
obj_ker_fn_t ker;
obj_ukr_fn_t ukr;
// User-customizable fields
obj_pack_fn_t pack_fn;
void* pack_params;
obj_ker_fn_t ker_fn;
void* ker_params;
} obj_t;
@@ -1258,70 +1242,68 @@ typedef struct obj_s
#define BLIS_OBJECT_INITIALIZER \
{ \
.root = NULL, \
.root = NULL, \
\
.off = { 0, 0 }, \
.dim = { 0, 0 }, \
.diag_off = 0, \
.off = { 0, 0 }, \
.dim = { 0, 0 }, \
.diag_off = 0, \
\
.info = 0x0 | BLIS_BITVAL_DENSE | \
BLIS_BITVAL_GENERAL, \
.info2 = 0x0, \
.elem_size = sizeof( float ), /* this is changed later. */ \
.info = 0x0 | BLIS_BITVAL_DENSE | \
BLIS_BITVAL_GENERAL, \
.info2 = 0x0, \
.elem_size = sizeof( float ), /* this is changed later. */ \
\
.buffer = NULL, \
.rs = 0, \
.cs = 0, \
.is = 1, \
.buffer = NULL, \
.rs = 0, \
.cs = 0, \
.is = 1, \
\
.scalar = { 0.0, 0.0 }, \
.scalar = { 0.0, 0.0 }, \
\
.m_padded = 0, \
.n_padded = 0, \
.ps = 0, \
.pd = 0, \
.m_panel = 0, \
.n_panel = 0, \
.m_padded = 0, \
.n_padded = 0, \
.ps = 0, \
.pd = 0, \
.m_panel = 0, \
.n_panel = 0, \
\
.user_data = NULL, \
\
.pack = NULL, \
.ker = NULL, \
.ukr = NULL \
.pack_fn = NULL, \
.pack_params = NULL, \
.ker_fn = NULL, \
.ker_params = NULL \
}
#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
.root = NULL, \
.root = NULL, \
\
.off = { 0, 0 }, \
.dim = { 1, 1 }, \
.diag_off = 0, \
.off = { 0, 0 }, \
.dim = { 1, 1 }, \
.diag_off = 0, \
\
.info = 0x0 | BLIS_BITVAL_DENSE | \
BLIS_BITVAL_GENERAL, \
.info2 = 0x0, \
.elem_size = sizeof( float ), /* this is changed later. */ \
.info = 0x0 | BLIS_BITVAL_DENSE | \
BLIS_BITVAL_GENERAL, \
.info2 = 0x0, \
.elem_size = sizeof( float ), /* this is changed later. */ \
\
.buffer = NULL, \
.rs = 0, \
.cs = 0, \
.is = 1, \
.buffer = NULL, \
.rs = 0, \
.cs = 0, \
.is = 1, \
\
.scalar = { 0.0, 0.0 }, \
.scalar = { 0.0, 0.0 }, \
\
.m_padded = 0, \
.n_padded = 0, \
.ps = 0, \
.pd = 0, \
.m_panel = 0, \
.n_panel = 0, \
.m_padded = 0, \
.n_padded = 0, \
.ps = 0, \
.pd = 0, \
.m_panel = 0, \
.n_panel = 0, \
\
.user_data = NULL, \
\
.pack = NULL, \
.ker = NULL, \
.ukr = NULL \
.pack_fn = NULL, \
.pack_params = NULL, \
.ker_fn = NULL, \
.ker_params = NULL \
}
// Define these macros here since they must be updated if contents of
@@ -1329,77 +1311,75 @@ typedef struct obj_s
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
b->root = a->root;
b->root = a->root;
b->off[0] = a->off[0];
b->off[1] = a->off[1];
b->dim[0] = a->dim[0];
b->dim[1] = a->dim[1];
b->diag_off = a->diag_off;
b->off[0] = a->off[0];
b->off[1] = a->off[1];
b->dim[0] = a->dim[0];
b->dim[1] = a->dim[1];
b->diag_off = a->diag_off;
b->info = a->info;
b->info2 = a->info2;
b->elem_size = a->elem_size;
b->info = a->info;
b->info2 = a->info2;
b->elem_size = a->elem_size;
b->buffer = a->buffer;
b->rs = a->rs;
b->cs = a->cs;
b->is = a->is;
b->buffer = a->buffer;
b->rs = a->rs;
b->cs = a->cs;
b->is = a->is;
b->scalar = a->scalar;
b->scalar = a->scalar;
//b->pack_mem = a->pack_mem;
b->m_padded = a->m_padded;
b->n_padded = a->n_padded;
b->ps = a->ps;
b->pd = a->pd;
b->m_panel = a->m_panel;
b->n_panel = a->n_panel;
//b->pack_mem = a->pack_mem;
b->m_padded = a->m_padded;
b->n_padded = a->n_padded;
b->ps = a->ps;
b->pd = a->pd;
b->m_panel = a->m_panel;
b->n_panel = a->n_panel;
b->user_data = a->user_data;
b->pack = a->pack;
b->ker = a->ker;
b->ukr = a->ukr;
b->pack_fn = a->pack_fn;
b->pack_params = a->pack_params;
b->ker_fn = a->ker_fn;
b->ker_params = a->ker_params;
}
// Initialize object b as a subpartition of object a: copy all fields
// except the m and n dimensions, which the caller is expected to set
// afterward (they describe the subpartition, not the parent object).
// Like the full shallow copy above, this aliases a's buffer and copies
// the new user-customization fields (.pack_fn/.pack_params/.ker_fn/
// .ker_params) in place of the removed .pack/.ker/.ukr fields.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
	b->root        = a->root;

	b->off[0]      = a->off[0];
	b->off[1]      = a->off[1];
	// Avoid copying m and n since they will be overwritten.
	//b->dim[0]    = a->dim[0];
	//b->dim[1]    = a->dim[1];
	b->diag_off    = a->diag_off;

	b->info        = a->info;
	b->info2       = a->info2;
	b->elem_size   = a->elem_size;

	b->buffer      = a->buffer;
	b->rs          = a->rs;
	b->cs          = a->cs;
	b->is          = a->is;

	b->scalar      = a->scalar;

	// Avoid copying pack_mem entry.
	// FGVZ: You should probably make sure this is right.
	//b->pack_mem  = a->pack_mem;
	b->m_padded    = a->m_padded;
	b->n_padded    = a->n_padded;
	b->ps          = a->ps;
	b->pd          = a->pd;
	b->m_panel     = a->m_panel;
	b->n_panel     = a->n_panel;

	b->user_data   = a->user_data;

	b->pack_fn     = a->pack_fn;
	b->pack_params = a->pack_params;
	b->ker_fn      = a->ker_fn;
	b->ker_params  = a->ker_params;
}
// Initializors for global scalar constants.

View File

@@ -169,7 +169,6 @@ void libblis_test_gemm_ukr_experiment
num_t datatype;
dim_t m, n, k;
inc_t ldap, ldbp;
char sc_a = 'c';
char sc_b = 'r';
@@ -194,11 +193,6 @@ void libblis_test_gemm_ukr_experiment
m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, cntx );
n = bli_cntx_get_blksz_def_dt( datatype, BLIS_NR, cntx );
// Also query PACKMR and PACKNR as the leading dimensions to ap and bp,
// respectively.
ldap = bli_cntx_get_blksz_max_dt( datatype, BLIS_MR, cntx );
ldbp = bli_cntx_get_blksz_max_dt( datatype, BLIS_NR, cntx );
// Store the register blocksizes so that the driver can retrieve the
// values later when printing results.
op->dim_aux[0] = m;
@@ -237,7 +231,13 @@ void libblis_test_gemm_ukr_experiment
libblis_test_mobj_randomize( params, TRUE, &c );
bli_copym( &c, &c_save );
#if 0
rntm_t rntm;
bli_rntm_init( &rntm );
bli_pba_rntm_set_pba( &rntm );
// Transpose B to B^T for packing.
bli_obj_induce_trans( &b );
// Create pack objects for a and b, and pack them to ap and bp,
// respectively.
cntl_t* cntl_a = libblis_test_pobj_create
@@ -248,56 +248,26 @@ void libblis_test_gemm_ukr_experiment
BLIS_PACKED_ROW_PANELS,
BLIS_BUFFER_FOR_A_BLOCK,
&a, &ap,
cntx
cntx,
&rntm
);
cntl_t* cntl_b = libblis_test_pobj_create
(
BLIS_KR,
BLIS_NR,
BLIS_KR,
BLIS_NO_INVERT_DIAG,
BLIS_PACKED_COL_PANELS,
BLIS_BUFFER_FOR_B_PANEL,
&b, &bp,
cntx
cntx,
&rntm
);
#endif
// Create the packed objects. Use packmr and packnr as the leading
// dimensions of ap and bp, respectively. Note that we use the ldims
// instead of the matrix dimensions for allocation purposes here.
// This is a little hacky and was prompted when trying to support
// configurations such as power9 that employ duplication/broadcasting
// of elements in one of the packed matrix objects. Thankfully, packm
// doesn't care about those dimensions and instead relies on
// information taken from the source object. Thus, this is merely
// about coaxing bli_obj_create() in allocating enough space for our
// purposes.
bli_obj_create( datatype, ldap, k, 1, ldap, &ap );
bli_obj_create( datatype, k, ldbp, ldbp, 1, &bp );
// Transpose B^T back to B and Bp^T back to Bp.
bli_obj_induce_trans( &b );
bli_obj_induce_trans( &bp );
// Set up the objects for packing. Calling packm_init_pack() does everything
// except checkout a memory pool block and save its address to the obj_t's.
// However, it does overwrite the buffer field of packed object with that of
// the source object (as a side-effect of bli_obj_alias_to(); that buffer
// field would normally be overwritten yet again by the address from the
// memory pool block). So, we have to save the buffer address that was
// allocated so we can re-store it to the object afterward.
void* buf_ap = bli_obj_buffer( &ap );
void* buf_bp = bli_obj_buffer( &bp );
bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_ROW_PANELS,
BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER,
BLIS_MR, BLIS_KR, &a, &ap, cntx );
bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS,
BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER,
BLIS_KR, BLIS_NR, &b, &bp, cntx );
bli_obj_set_buffer( buf_ap, &ap );
bli_obj_set_buffer( buf_bp, &bp );
// Pack the data from the source objects.
bli_packm_blk_var1( &a, &ap, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
// Repeat the experiment n_repeats times and record results.
// Repeat the experiment n_repeats times and record results.
for ( i = 0; i < n_repeats; ++i )
{
bli_copym( &c_save, &c );
@@ -321,16 +291,10 @@ void libblis_test_gemm_ukr_experiment
// Zero out performance and residual if output matrix is empty.
libblis_test_check_empty_problem( &c, perf, resid );
#if 0
// Free the control tree nodes and release their cached mem_t entries
// back to the memory broker.
bli_cntl_free( cntl_a, &BLIS_PACKM_SINGLE_THREADED );
bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED );
#endif
// Free the packed objects.
bli_obj_free( &ap );
bli_obj_free( &bp );
// back to the pba.
bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED );
bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
// Free the test objects.
bli_obj_free( &a );

View File

@@ -283,7 +283,10 @@ void libblis_test_gemmtrsm_ukr_experiment
bli_copym( &b11, &c11 );
bli_copym( &c11, &c11_save );
#if 0
rntm_t rntm;
bli_rntm_init( &rntm );
bli_pba_rntm_set_pba( &rntm );
// Create pack objects for a and b, and pack them to ap and bp,
// respectively.
cntl_t* cntl_a = libblis_test_pobj_create
@@ -294,59 +297,9 @@ void libblis_test_gemmtrsm_ukr_experiment
BLIS_PACKED_ROW_PANELS,
BLIS_BUFFER_FOR_A_BLOCK,
&a, &ap,
&cntx
cntx,
&rntm
);
cntl_t* cntl_b = libblis_test_pobj_create
(
BLIS_MR,
BLIS_NR,
BLIS_NO_INVERT_DIAG,
BLIS_PACKED_COL_PANELS,
BLIS_BUFFER_FOR_B_PANEL,
&b, &bp,
&cntx
);
#endif
// Create the packed objects. Use packmr and packnr as the leading
// dimensions of ap and bp, respectively. Note that we use the ldims
// instead of the matrix dimensions for allocation purposes here.
// This is a little hacky and was prompted when trying to support
// configurations such as power9 that employ duplication/broadcasting
// of elements in one of the packed matrix objects. Thankfully, packm
// doesn't care about those dimensions and instead relies on
// information taken from the source object. Thus, this is merely
// about coaxing bli_obj_create() in allocating enough space for our
// purposes.
bli_obj_create( datatype, ldap, k+m, 1, ldap, &ap );
bli_obj_create( datatype, k+m, ldbp, ldbp, 1, &bp );
// We overwrite the m dimension of ap and n dimension of bp with
// m and n, respectively, so that these objects contain the correct
// logical dimensions. Recall that ldap and ldbp were used only to
// induce bli_obj_create() to allocate sufficient memory for the
// duplication in rare instances where the subconfig uses a gemm
// ukernel that duplicates elements in one of the operands.
bli_obj_set_length( m, &ap );
bli_obj_set_width( n, &bp );
// Set up the objects for packing. Calling packm_init_pack() does everything
// except checkout a memory pool block and save its address to the obj_t's.
// However, it does overwrite the buffer field of packed object with that of
// the source object (as a side-effect of bli_obj_alias_to(); that buffer
// field would normally be overwritten yet again by the address from the
// memory pool block). So, we have to save the buffer address that was
// allocated so we can re-store it to the object afterward.
void* buf_ap = bli_obj_buffer( &ap );
void* buf_bp = bli_obj_buffer( &bp );
bli_packm_init_pack( BLIS_INVERT_DIAG, BLIS_PACKED_ROW_PANELS,
BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER,
BLIS_MR, BLIS_KR, &a, &ap, cntx );
bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS,
BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER,
BLIS_KR, BLIS_NR, &b, &bp, cntx );
bli_obj_set_buffer( buf_ap, &ap );
bli_obj_set_buffer( buf_bp, &bp );
// Set the diagonal offset of ap.
if ( bli_is_lower( uploa ) ) { bli_obj_set_diag_offset( k, &ap ); }
@@ -357,32 +310,45 @@ void libblis_test_gemmtrsm_ukr_experiment
// to know how to initialize the subpartitions.
bli_obj_set_uplo( uploa, &ap );
// Pack the data from the source objects.
bli_packm_blk_var1( &a, &ap, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
// Create subpartitions from the a and b panels.
bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp,
&a1xp, &a11p, &bx1p, &b11p );
// Set the uplo field of a11p since the default for packed objects is
// BLIS_DENSE, and the _ukernel() wrapper needs this information to
// know which set of micro-kernels (lower or upper) to choose from.
bli_obj_set_uplo( uploa, &a11p );
#if 0
bli_printm( "a", &a, "%5.2f", "" );
bli_printm( "ap", &ap, "%5.2f", "" );
#endif
// Repeat the experiment n_repeats times and record results.
cntl_t* cntl_b = NULL;
// Repeat the experiment n_repeats times and record results.
for ( i = 0; i < n_repeats; ++i )
{
bli_copym( &c11_save, &c11 );
// Re-pack (restore) the contents of b to bp.
//bli_packm_blk_var1( &b, &bp, &cntx, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
// Transpose B to B^T for packing.
bli_obj_induce_trans( &b );
cntl_b = libblis_test_pobj_create
(
BLIS_NR,
BLIS_MR,
BLIS_NO_INVERT_DIAG,
BLIS_PACKED_COL_PANELS,
BLIS_BUFFER_FOR_B_PANEL,
&b, &bp,
cntx,
&rntm
);
// Transpose B^T back to B and Bp^T back to Bp.
bli_obj_induce_trans( &b );
bli_obj_induce_trans( &bp );
// Create subpartitions from the a and b panels.
bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp,
&a1xp, &a11p, &bx1p, &b11p );
// Set the uplo field of a11p since the default for packed objects is
// BLIS_DENSE, and the _ukernel() wrapper needs this information to
// know which set of micro-kernels (lower or upper) to choose from.
bli_obj_set_uplo( uploa, &a11p );
time = bli_clock();
@@ -391,6 +357,15 @@ bli_printm( "ap", &ap, "%5.2f", "" );
cntx );
time_min = bli_clock_min_diff( time_min, time );
// On the last pass, we must keep the packed B buffer checked out in order
// to perform the correctness check later.
if ( i < n_repeats - 1 )
{
// Free the control tree nodes and release their cached mem_t entries
// back to the memory broker.
bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
}
}
// Estimate the performance of the best experiment repeat.
@@ -426,16 +401,11 @@ bli_printm( "ap", &ap, "%5.2f", "" );
// Zero out performance and residual if output matrix is empty.
//libblis_test_check_empty_problem( &c11, perf, resid );
#if 0
// Free the control tree nodes and release their cached mem_t entries
// back to the memory broker.
bli_cntl_free( cntl_a, &BLIS_PACKM_SINGLE_THREADED );
bli_cntl_free( cntl_b, &BLIS_PACKM_SINGLE_THREADED );
#endif
// Free the packed objects.
bli_obj_free( &ap );
bli_obj_free( &bp );
// back to the pba.
bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED );
if ( cntl_b )
bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
// Free the test objects.
bli_obj_free( &a_big );

View File

@@ -636,7 +636,7 @@ void libblis_test_read_op_info( test_ops_t* ops,
int i, p;
// Initialize the operation type field.
op->opid = opid;
op->opid = opid;
// Read the line for the overall operation switch.
libblis_test_read_next_line( buffer, input_stream );
@@ -671,7 +671,7 @@ void libblis_test_read_op_info( test_ops_t* ops,
//printf( "buffer[p]: %s\n", &buffer[p] );
// Advance until we hit non-whitespace (ie: the next number).
for ( ; isspace( buffer[p] ); ++p ) ;
for ( ; isspace( buffer[p] ); ++p ) ;
//printf( "buffer[p] after: %s\n", &buffer[p] );
@@ -680,7 +680,7 @@ void libblis_test_read_op_info( test_ops_t* ops,
//printf( "dim[%d] = %d\n", i, op->dim_spec[i] );
// Advance until we hit whitespace (ie: the space before the next number).
for ( ; !isspace( buffer[p] ); ++p ) ;
for ( ; !isspace( buffer[p] ); ++p ) ;
}
}
@@ -778,11 +778,11 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
// convert these values into strings, with "unset" being used if the
// value returned was -1 (indicating the environment variable was unset).
dim_t nt = bli_thread_get_num_threads();
dim_t jc_nt = bli_thread_get_jc_nt();
dim_t pc_nt = bli_thread_get_pc_nt();
dim_t ic_nt = bli_thread_get_ic_nt();
dim_t jr_nt = bli_thread_get_jr_nt();
dim_t ir_nt = bli_thread_get_ir_nt();
dim_t jc_nt = bli_thread_get_jc_nt();
dim_t pc_nt = bli_thread_get_pc_nt();
dim_t ic_nt = bli_thread_get_ic_nt();
dim_t jr_nt = bli_thread_get_jr_nt();
dim_t ir_nt = bli_thread_get_ir_nt();
if ( nt == -1 ) sprintf( nt_str, "unset" );
else sprintf( nt_str, "%d", ( int ) nt );
@@ -1739,7 +1739,7 @@ void libblis_test_op_driver
= ( char* ) malloc( ( n_operands + 1 ) * sizeof( char ) );
for ( o = 0; o < n_operands; ++o )
{
{
unsigned int ij;
operand_t operand_type
= libblis_test_get_operand_type_for_char( o_types[o] );
@@ -2181,7 +2181,7 @@ void libblis_test_op_driver
ind_str = bli_ind_oper_get_avail_impl_string( op->opid, datatype );
// Loop over the requested parameter combinations.
for ( pci = 0; pci < n_param_combos; ++pci )
for ( pci = 0; pci < n_param_combos; ++pci )
{
// Loop over the requested problem sizes.
for ( p_cur = p_first, pi = 1; p_cur <= p_max; p_cur += p_inc, ++pi )
@@ -2403,7 +2403,7 @@ void libblis_test_build_function_string
if ( strlen( funcname_str ) > MAX_FUNC_STRING_LENGTH )
libblis_test_printf_error( "Function name string length (%d) exceeds maximum (%d).\n",
strlen( funcname_str ), MAX_FUNC_STRING_LENGTH );
}
@@ -2545,7 +2545,7 @@ void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, c
dim_t n_trans = n;
dim_t rs = 1; // Initialization avoids a compiler warning.
dim_t cs = 1; // Initialization avoids a compiler warning.
// Apply the trans parameter to the dimensions (if needed).
bli_set_dims_with_trans( trans, m, n, &m_trans, &n_trans );
@@ -2591,12 +2591,9 @@ void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, c
}
#if 0
cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx )
cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm )
{
bool does_inv_diag;
rntm_t rntm;
if ( inv_diag == BLIS_NO_INVERT_DIAG ) does_inv_diag = FALSE;
else does_inv_diag = TRUE;
@@ -2606,7 +2603,6 @@ cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdia
(
NULL, // we don't need the small block allocator from the runtime.
NULL, // func ptr is not referenced b/c we don't call via l3 _int().
bli_packm_blk_var1,
bmult_id_m,
bmult_id_n,
does_inv_diag,
@@ -2617,20 +2613,13 @@ cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdia
NULL // no child node needed
);
// Initialize a local-to-BLIS rntm_t. This is simply so we have something
// to pass into bli_l3_packm(). The function doesn't (currently) use the
// runtime object, and even if it did, one with default values would work
// fine here.
bli_rntm_init( &rntm );
// Pack the contents of A to P.
bli_l3_packm( a, p, cntx, &rntm, cntl, &BLIS_PACKM_SINGLE_THREADED );
bli_packm_blk_var1( a, p, cntx, rntm, cntl, &BLIS_PACKM_SINGLE_THREADED );
// Return the control tree pointer so the caller can free the cntl_t and its
// mem_t entry later on.
return cntl;
}
#endif
void libblis_test_vobj_create( test_params_t* params, num_t dt, char storage, dim_t m, obj_t* x )
@@ -2975,7 +2964,7 @@ void libblis_test_parse_message( FILE* output_stream, char* message, va_list arg
char* the_string;
char the_char;
// Begin looping over message to insert variables wherever there are
// Begin looping over message to insert variables wherever there are
// format specifiers.
for ( c = 0; message[c] != '\0'; )
{

View File

@@ -418,7 +418,7 @@ void fill_string_with_n_spaces( char* str, unsigned int n_spaces );
// --- Create object ---
void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, char storage, dim_t m, dim_t n, obj_t* a );
cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx );
cntl_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm );
void libblis_test_vobj_create( test_params_t* params, num_t dt, char storage, dim_t m, obj_t* x );
// --- Randomize/initialize object ---

View File

@@ -171,7 +171,6 @@ void libblis_test_trsm_ukr_experiment
num_t datatype;
dim_t m, n;
inc_t ldap, ldbp;
char sc_a = 'c';
char sc_b = 'r';
@@ -196,11 +195,6 @@ void libblis_test_trsm_ukr_experiment
m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, cntx );
n = bli_cntx_get_blksz_def_dt( datatype, BLIS_NR, cntx );
// Also query PACKMR and PACKNR as the leading dimensions to ap and bp,
// respectively.
ldap = bli_cntx_get_blksz_max_dt( datatype, BLIS_MR, cntx );
ldbp = bli_cntx_get_blksz_max_dt( datatype, BLIS_NR, cntx );
// Store the register blocksizes so that the driver can retrieve the
// values later when printing results.
op->dim_aux[0] = m;
@@ -238,7 +232,10 @@ void libblis_test_trsm_ukr_experiment
libblis_test_mobj_randomize( params, TRUE, &c );
bli_copym( &c, &c_save );
#if 0
rntm_t rntm;
bli_rntm_init( &rntm );
bli_pba_rntm_set_pba( &rntm );
// Create pack objects for a and b, and pack them to ap and bp,
// respectively.
cntl_t* cntl_a = libblis_test_pobj_create
@@ -249,50 +246,9 @@ void libblis_test_trsm_ukr_experiment
BLIS_PACKED_ROW_PANELS,
BLIS_BUFFER_FOR_A_BLOCK,
&a, &ap,
cntx
cntx,
&rntm
);
cntl_t* cntl_b = libblis_test_pobj_create
(
BLIS_MR,
BLIS_NR,
BLIS_NO_INVERT_DIAG,
BLIS_PACKED_COL_PANELS,
BLIS_BUFFER_FOR_B_PANEL,
&b, &bp,
cntx
);
#endif
// Create the packed objects. Use packmr and packnr as the leading
// dimensions of ap and bp, respectively. Note that we use the ldims
// instead of the matrix dimensions for allocation purposes here.
// This is a little hacky and was prompted when trying to support
// configurations such as power9 that employ duplication/broadcasting
// of elements in one of the packed matrix objects. Thankfully, packm
// doesn't care about those dimensions and instead relies on
// information taken from the source object. Thus, this is merely
// about coaxing bli_obj_create() in allocating enough space for our
// purposes.
bli_obj_create( datatype, ldap, m, 1, ldap, &ap );
bli_obj_create( datatype, m, ldbp, ldbp, 1, &bp );
// Set up the objects for packing. Calling packm_init_pack() does everything
// except checkout a memory pool block and save its address to the obj_t's.
// However, it does overwrite the buffer field of packed object with that of
// the source object (as a side-effect of bli_obj_alias_to(); that buffer
// field would normally be overwritten yet again by the address from the
// memory pool block). So, we have to save the buffer address that was
// allocated so we can re-store it to the object afterward.
void* buf_ap = bli_obj_buffer( &ap );
void* buf_bp = bli_obj_buffer( &bp );
bli_packm_init_pack( BLIS_INVERT_DIAG, BLIS_PACKED_ROW_PANELS,
BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER,
BLIS_MR, BLIS_KR, &a, &ap, cntx );
bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS,
BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER,
BLIS_KR, BLIS_NR, &b, &bp, cntx );
bli_obj_set_buffer( buf_ap, &ap );
bli_obj_set_buffer( buf_bp, &bp );
// Set the diagonal offset of ap.
bli_obj_set_diag_offset( 0, &ap );
@@ -302,24 +258,35 @@ void libblis_test_trsm_ukr_experiment
// know which set of micro-kernels (lower or upper) to choose from.
bli_obj_set_uplo( uploa, &ap );
// Pack the data from the source objects.
bli_packm_blk_var1( &a, &ap, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
#if 0
bli_printm( "a", &a, "%5.2f", "" );
bli_printm( "ap", &ap, "%5.2f", "" );
#endif
// Repeat the experiment n_repeats times and record results.
// Repeat the experiment n_repeats times and record results.
for ( i = 0; i < n_repeats; ++i )
{
// Re-pack the contents of b to bp.
//bli_packm_blk_var1( &b, &bp, cntx, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
bli_copym( &c_save, &c );
// Transpose B to B^T for packing.
bli_obj_induce_trans( &b );
cntl_t* cntl_b = libblis_test_pobj_create
(
BLIS_NR,
BLIS_MR,
BLIS_NO_INVERT_DIAG,
BLIS_PACKED_COL_PANELS,
BLIS_BUFFER_FOR_B_PANEL,
&b, &bp,
cntx,
&rntm
);
// Transpose B^T back to B and Bp^T back to Bp.
bli_obj_induce_trans( &b );
bli_obj_induce_trans( &bp );
time = bli_clock();
libblis_test_trsm_ukr_impl( iface, side,
@@ -327,6 +294,10 @@ bli_printm( "ap", &ap, "%5.2f", "" );
cntx );
time_min = bli_clock_min_diff( time_min, time );
// Free the control tree nodes and release their cached mem_t entries
// back to the memory broker.
bli_cntl_free( &rntm, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
}
// Estimate the performance of the best experiment repeat.
@@ -339,16 +310,9 @@ bli_printm( "ap", &ap, "%5.2f", "" );
// Zero out performance and residual if output matrix is empty.
//libblis_test_check_empty_problem( &c, perf, resid );
#if 0
// Free the control tree nodes and release their cached mem_t entries
// back to the memory broker.
bli_cntl_free( NULL, cntl_a, &BLIS_PACKM_SINGLE_THREADED );
bli_cntl_free( NULL, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
#endif
// Free the packed objects.
bli_obj_free( &ap );
bli_obj_free( &bp );
bli_cntl_free( &rntm, cntl_a, &BLIS_PACKM_SINGLE_THREADED );
// Free the test objects.
bli_obj_free( &a );