Files
blis/frame/1m/packm/bli_packm_init.c
Field G. Van Zee c2b2ab6270 Deprecated panel stride alignment in bli_config.h.
Details:
- Removed BLIS_CONTIG_STRIDE_ALIGN_SIZE from bli_config.h of all
  configurations. It was already going unused in packm_init() since the
  recent 4m/3m commit. This setting was rarely, if ever, useful, and its
  existence only posed a potential risk for 4m/3m-based implementations.
- Removed BLIS_CONTIG_STRIDE_ALIGN_SIZE usage from mem_pool_macro_defs.h.
- Updated comments regarding CONTIG_STRIDE_ALIGN_SIZE in template
  micro-kernels.
2014-02-26 12:46:45 -06:00

487 lines
18 KiB
C

/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_packm_init( obj_t* a,
obj_t* p,
packm_t* cntl )
{
// The purpose of packm_init() is to initialize an object P so that
// a source object A can be packed into P via one of the packm
// implementations. This initialization includes acquiring a suitable
// block of memory from the memory allocator, if such a block of memory
// has not already been allocated previously.
bool_t needs_densify;
invdiag_t invert_diag;
pack_t pack_schema;
packord_t pack_ord_if_up;
packord_t pack_ord_if_lo;
packbuf_t pack_buf_type;
blksz_t* mr;
blksz_t* nr;
obj_t c;
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_packm_init_check( a, p, cntl );
// First check if we are to skip this operation because the control tree
// is NULL, and if so, simply alias the object to its packed counterpart.
if ( cntl_is_noop( cntl ) )
{
bli_obj_alias_to( *a, *p );
return;
}
// Let us now check to see if the object has already been packed. First
// we check if it has been packed to an unspecified (row or column)
// format, in which case we can alias the object and return.
// NOTE: The reason we don't need to even look at the control tree in
// this case is as follows: an object's pack status is only set to
// BLIS_PACKED_UNSPEC for situations when the actual format used is
// not important, as long as its packed into contiguous rows or
// contiguous columns. A good example of this is packing for matrix
// operands in the level-2 operations.
if ( bli_obj_pack_status( *a ) == BLIS_PACKED_UNSPEC )
{
bli_obj_alias_to( *a, *p );
return;
}
// At this point, we can be assured that cntl is not NULL. Now we check
// if the object has already been packed to the desired schema (as en-
// coded in the control tree). If so, we can alias and return, as above.
// NOTE: In most cases, an object's pack status will be BLIS_NOT_PACKED
// and thus packing will be called for (but in some cases packing has
// already taken place, or does not need to take place, and so that will
// be indicated by the pack status). Also, not all combinations of
// current pack status and desired pack schema are valid.
if ( bli_obj_pack_status( *a ) == cntl_pack_schema( cntl ) )
{
bli_obj_alias_to( *a, *p );
return;
}
// If the object is marked as being filled with zeros, then we can skip
// the packm operation entirely and alias.
if ( bli_obj_is_zeros( *a ) )
{
bli_obj_alias_to( *a, *p );
return;
}
// Now, if we are not skipping the pack operation, then the only question
// left is whether we are to typecast matrix a before packing.
if ( bli_obj_datatype( *a ) != bli_obj_target_datatype( *a ) )
bli_abort();
/*
{
// Initialize an object c for the intermediate typecast matrix.
bli_packm_init_cast( a,
p,
&c );
// Copy/typecast matrix a to matrix c.
bli_copym( a,
&c );
}
else
*/
{
// If no cast is needed, then aliasing object c to the original
// matrix serves as a minor optimization. This causes the packm
// implementation to pack directly from matrix a.
bli_obj_alias_to( *a, c );
}
// Extract various fields from the control tree and pass them in
// explicitly into _init_pack(). This allows external code generators
// the option of bypassing usage of control trees altogether.
needs_densify = cntl_does_densify( cntl );
pack_schema = cntl_pack_schema( cntl );
pack_buf_type = cntl_pack_buf_type( cntl );
mr = cntl_mr( cntl );
nr = cntl_nr( cntl );
if ( cntl_does_invert_diag( cntl ) ) invert_diag = BLIS_INVERT_DIAG;
else invert_diag = BLIS_NO_INVERT_DIAG;
if ( cntl_rev_iter_if_upper( cntl ) ) pack_ord_if_up = BLIS_PACK_REV_IF_UPPER;
else pack_ord_if_up = BLIS_PACK_FWD_IF_UPPER;
if ( cntl_rev_iter_if_lower( cntl ) ) pack_ord_if_lo = BLIS_PACK_REV_IF_LOWER;
else pack_ord_if_lo = BLIS_PACK_FWD_IF_LOWER;
// Initialize object p for the final packed matrix.
bli_packm_init_pack( needs_densify,
invert_diag,
pack_schema,
pack_ord_if_up,
pack_ord_if_lo,
pack_buf_type,
mr,
nr,
&c,
p );
// Now p is ready to be packed.
}
void bli_packm_init_pack( bool_t densify,
invdiag_t invert_diag,
pack_t pack_schema,
packord_t pack_ord_if_up,
packord_t pack_ord_if_lo,
packbuf_t pack_buf_type,
blksz_t* mr,
blksz_t* nr,
obj_t* c,
obj_t* p )
{
num_t datatype = bli_obj_datatype( *c );
trans_t transc = bli_obj_onlytrans_status( *c );
dim_t m_c = bli_obj_length( *c );
dim_t n_c = bli_obj_width( *c );
dim_t mr_def_dim = bli_blksz_for_type( datatype, mr );
dim_t mr_ext_dim = bli_blksz_ext_for_type( datatype, mr );
dim_t nr_def_dim = bli_blksz_for_type( datatype, nr );
dim_t nr_ext_dim = bli_blksz_ext_for_type( datatype, nr );
dim_t mr_pack_dim = mr_def_dim + mr_ext_dim;
dim_t nr_pack_dim = nr_def_dim + nr_ext_dim;
mem_t* mem_p;
dim_t m_p_pad, n_p_pad;
siz_t size_p;
siz_t elem_size_p;
inc_t rs_p, cs_p;
void* buf;
// We begin by copying the basic fields of c. We do NOT copy the
// pack_mem entry from c because the entry in p may be cached from
// a previous iteration, and thus we don't want to overwrite it.
bli_obj_alias_for_packing( *c, *p );
// Update the dimension fields to explicitly reflect a transposition,
// if needed.
// Then, clear the conjugation and transposition fields from the object
// since matrix packing in BLIS is deemed to take care of all conjugation
// and transposition necessary.
// Then, we adjust the properties of p when c needs a transposition.
// We negate the diagonal offset, and if c is upper- or lower-stored,
// we either toggle the uplo of p.
// Finally, if we are going to densify c, we mark p as dense.
bli_obj_set_dims_with_trans( transc, m_c, n_c, *p );
bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, *p );
if ( bli_does_trans( transc ) )
{
bli_obj_negate_diag_offset( *p );
if ( bli_obj_is_upper_or_lower( *c ) )
bli_obj_toggle_uplo( *p );
}
if ( densify ) bli_obj_set_uplo( BLIS_DENSE, *p );
// Reset the view offsets to (0,0).
bli_obj_set_offs( 0, 0, *p );
// Set the invert diagonal field.
bli_obj_set_invert_diag( invert_diag, *p );
// Set the pack status of p to the pack schema prescribed in the control
// tree node.
bli_obj_set_pack_schema( pack_schema, *p );
// Set the packing order bits.
bli_obj_set_pack_order_if_upper( pack_ord_if_up, *p );
bli_obj_set_pack_order_if_lower( pack_ord_if_lo, *p );
// Extract the address of the mem_t object within p that will track
// properties of the packed buffer.
mem_p = bli_obj_pack_mem( *p );
// Compute the dimensions padded by the dimension multiples. These
// dimensions will be the dimensions of the packed matrices, including
// zero-padding, and will be used by the macro- and micro-kernels.
// We compute them by starting with the effective dimensions of c (now
// in p) and aligning them to the dimension multiples (typically equal
// to register blocksizes). This does waste a little bit of space for
// level-2 operations, but that's okay with us.
m_p_pad = bli_align_dim_to_mult( bli_obj_length( *p ), mr_def_dim );
n_p_pad = bli_align_dim_to_mult( bli_obj_width( *p ), nr_def_dim );
// Save the padded dimensions into the packed object. It is important
// to save these dimensions since they represent the actual dimensions
// of the zero-padded matrix.
bli_obj_set_padded_dims( m_p_pad, n_p_pad, *p );
// Now we prepare to compute strides, align them, and compute the
// total number of bytes needed for the packed buffer. After that,
// we will acquire an appropriate block of memory from the memory
// allocator.
// Extract the element size for the packed object.
elem_size_p = bli_obj_elem_size( *p );
// Set the row and column strides of p based on the pack schema.
if ( pack_schema == BLIS_PACKED_ROWS )
{
// For regular row storage, the padded width of our matrix
// should be used for the row stride, with the column stride set
// to one. By using the WIDTH of the mem_t region, we allow for
// zero-padding (if necessary/desired) along the right edge of
// the matrix.
rs_p = n_p_pad;
cs_p = 1;
// Align the leading dimension according to the heap stride
// alignment size so that the second, third, etc rows begin at
// aligned addresses.
rs_p = bli_align_dim_to_size( rs_p, elem_size_p,
BLIS_HEAP_STRIDE_ALIGN_SIZE );
// Store the strides in p.
bli_obj_set_incs( rs_p, cs_p, *p );
// Compute the size of the packed buffer.
size_p = m_p_pad * rs_p * elem_size_p;
}
else if ( pack_schema == BLIS_PACKED_COLUMNS )
{
// For regular column storage, the padded length of our matrix
// should be used for the column stride, with the row stride set
// to one. By using the LENGTH of the mem_t region, we allow for
// zero-padding (if necessary/desired) along the bottom edge of
// the matrix.
cs_p = m_p_pad;
rs_p = 1;
// Align the leading dimension according to the heap stride
// alignment size so that the second, third, etc columns begin at
// aligned addresses.
cs_p = bli_align_dim_to_size( cs_p, elem_size_p,
BLIS_HEAP_STRIDE_ALIGN_SIZE );
// Store the strides in p.
bli_obj_set_incs( rs_p, cs_p, *p );
// Compute the size of the packed buffer.
size_p = cs_p * n_p_pad * elem_size_p;
}
else if ( pack_schema == BLIS_PACKED_ROW_PANELS ||
pack_schema == BLIS_PACKED_ROW_PANELS_4M ||
pack_schema == BLIS_PACKED_ROW_PANELS_3M )
{
dim_t m_panel;
dim_t ps_p;
// The maximum panel length (for each datatype) should be equal to
// the register blocksize in the m dimension.
m_panel = mr_def_dim;
// The "column stride" of a row panel packed object is interpreted as
// the column stride WITHIN a panel. Thus, this is equal to the panel
// dimension plus an extension (which may be zero, meaning there is
// no extension).
cs_p = mr_pack_dim;
// The "row stride" of a row panel packed object is interpreted
// as the row stride WITHIN a panel. Thus, it is unit.
rs_p = 1;
// The "panel stride" of a panel packed object is interpreted as the
// distance between the (0,0) element of panel k and the (0,0)
// element of panel k+1. We use the padded width computed above to
// allow for zero-padding (if necessary/desired) along the far end
// of each panel (ie: the right edge of the matrix). Zero-padding
// can also occur along the long edge of the last panel if the m
// dimension of the matrix is not a whole multiple of MR.
ps_p = cs_p * n_p_pad;
if ( pack_schema == BLIS_PACKED_ROW_PANELS_3M )
ps_p = ( ps_p * 3 ) / 2;
// Store the strides and panel dimension in p.
bli_obj_set_incs( rs_p, cs_p, *p );
bli_obj_set_panel_dim( m_panel, *p );
bli_obj_set_panel_stride( ps_p, *p );
// Compute the size of the packed buffer.
size_p = ps_p * (m_p_pad / m_panel) * elem_size_p;
}
else if ( pack_schema == BLIS_PACKED_COL_PANELS ||
pack_schema == BLIS_PACKED_COL_PANELS_4M ||
pack_schema == BLIS_PACKED_COL_PANELS_3M )
{
dim_t n_panel;
dim_t ps_p;
// The maximum panel width (for each datatype) should be equal to
// the register blocksize in the n dimension.
n_panel = nr_def_dim;
// The "row stride" of a column panel packed object is interpreted as
// the row stride WITHIN a panel. Thus, this is equal to the panel
// dimension plus an extension (which may be zero, meaning there is
// no extension).
rs_p = nr_pack_dim;
// The "column stride" of a column panel packed object is interpreted
// as the column stride WITHIN a panel. Thus, it is unit.
cs_p = 1;
// The "panel stride" of a panel packed object is interpreted as the
// distance between the (0,0) element of panel k and the (0,0)
// element of panel k+1. We use the padded length computed above to
// allow for zero-padding (if necessary/desired) along the far end
// of each panel (ie: the bottom edge of the matrix). Zero-padding
// can also occur along the long edge of the last panel if the n
// dimension of the matrix is not a whole multiple of NR.
ps_p = m_p_pad * rs_p;
if ( pack_schema == BLIS_PACKED_COL_PANELS_3M )
ps_p = ( ps_p * 3 ) / 2;
// Store the strides and panel dimension in p.
bli_obj_set_incs( rs_p, cs_p, *p );
bli_obj_set_panel_dim( n_panel, *p );
bli_obj_set_panel_stride( ps_p, *p );
// Compute the size of the packed buffer.
size_p = ps_p * (n_p_pad / n_panel) * elem_size_p;
}
else
{
// NOTE: When implementing block storage, we only need to implement
// the following two cases:
// - row-stored blocks in row-major order
// - column-stored blocks in column-major order
// The other two combinations coincide with that of packed row-panel
// and packed column- panel storage.
size_p = 0;
}
if ( bli_mem_is_unalloc( mem_p ) )
{
// If the mem_t object of p has not yet been allocated, then acquire
// a memory block of type pack_buf_type.
bli_mem_acquire_m( size_p,
pack_buf_type,
mem_p );
}
else
{
// If the mem_t object is currently allocated and smaller than is
// needed, then it must have been allocated for a different type
// of object (a different pack_buf_type value), so we must first
// release it and then re-acquire it using the new size and new
// pack_buf_type value.
if ( bli_mem_size( mem_p ) < size_p )
{
bli_mem_release( mem_p );
bli_mem_acquire_m( size_p,
pack_buf_type,
mem_p );
}
}
// Grab the buffer address from the mem_t object and copy it to the
// main object buffer field. (Sometimes this buffer address will be
// copied when the value is already up-to-date, because it persists
// in the main object buffer field across loop iterations.)
buf = bli_mem_buffer( mem_p );
bli_obj_set_buffer( buf, *p );
}
/*
void bli_packm_init_cast( obj_t* a,
obj_t* p,
obj_t* c )
{
// The idea here is that we want to create an object c that is identical
// to object a, except that:
// (1) the storage datatype of c is equal to the target datatype of a,
// with the element size of c adjusted accordingly,
// (2) the view offset of c is reset to (0,0),
// (3) object c's main buffer is set to a new memory region acquired
// from the memory manager, or extracted from p if a mem entry is
// already available, (After acquring a mem entry from the memory
// manager, it is cached within p for quick access later on.)
// (4) object c is marked as being stored in a standard, contiguous
// format (ie: a column-major order).
// Any transposition encoded within object a will not be handled here,
// but rather will be handled in the packm implementation. That way,
// the only thing castm needs to do is cast.
num_t dt_targ_a = bli_obj_target_datatype( *a );
dim_t m_a = bli_obj_length( *a );
siz_t elem_size_c = bli_datatype_size( dt_targ_a );
inc_t rs_c, cs_c;
// We begin by copying the basic fields of a.
bli_obj_alias_to( *a, *c );
// Update datatype and element size fields.
bli_obj_set_datatype( dt_targ_a, *c );
bli_obj_set_elem_size( elem_size_c, *c );
// Reset the view offsets to (0,0).
bli_obj_set_offs( 0, 0, *c );
// Check the mem_t entry of p associated with the cast buffer. If it is
// NULL, then acquire memory sufficient to hold the object data and cache
// it to p. (Otherwise, if it is non-NULL, then memory has already been
// acquired from the memory manager and cached.) We then set the main
// buffer of c to the cached address of the cast memory.
bli_obj_set_buffer_with_cached_cast_mem( *p, *c );
// Update the strides. We set the increments to reflect column-major order
// storage. We start the leading dimension out as m(a) and increment it if
// necessary so that the beginning of each column is aligned.
cs_c = bli_align_dim_to_size( m_a, elem_size_c,
BLIS_HEAP_STRIDE_ALIGN_SIZE );
rs_c = 1;
bli_obj_set_incs( rs_c, cs_c, *c );
}
*/