Files
blis/frame/1m/packm/bli_packm_init.c
Field G. Van Zee e9da6425e2 Allow use of 1m with mixing of row/col-pref ukrs.
Details:
- Fixed a bug that broke the use of 1m for dcomplex when the single-
  precision real and double-precision real ukernels had opposing I/O
  preferences (row-preferential sgemm ukernel + column-preferential
  dgemm ukernel, or vice versa). The fix involved adjusting the API
  to bli_cntx_set_ind_blkszs() so that the induced method context init
  function (e.g., bli_cntx_init_<subconfig>_ind()) could call that
  function for only one datatype at a time. This allowed the blocksize
  scaling (which varies depending on whether we're doing 1m_r or 1m_c)
  to happen on a per-datatype basis. This fixes issue #557. Thanks to
  Devin Matthews and RuQing Xu for helping discover and report this bug.
- The aforementioned 1m fix required moving the 1m_r/1m_c logic from
  bli_cntx_ref.c into a new function, bli_l3_set_schemas(), which is
  called from each level-3 _front() function. The pack_t schemas in the
  cntx_t were also removed entirely, along with the associated accessor
  functions. This in turn required updating the trsm1m-related virtual
  ukernels to read the pack schema for B from the auxinfo_t struct
  rather than the context. This also required slight tweaks to
  bli_gemm_md.c.
- Repositioned the logic for transposing the operation to accommodate
  the microkernel IO preference. This mostly only affects gemm. Thanks
  to Devin Matthews for his help with this.
- Updated dpackm pack ukernels in the 'armsve' kernel set to avoid
  querying pack_t schemas from the context.
- Removed the num_t dt argument from the ind_cntx_init_ft type defined
  in bli_gks.c. The context initialization functions for induced methods
  were previously passed a dt argument, but I can no longer figure out
  *why* they were passed this value. To reduce confusion, I've removed
  the dt argument (including also from the function defintion +
  prototype).
- Commented out setting of cntx_t schemas in bli_cntx_ind_stage.c. This
  breaks high-leve implementations of 3m and 4m, but this is okay since
  those implementations will be removed very soon.
- Removed some older blocks of preprocessor-disabled code.
- Comment update to test_libblis.c.
2021-10-13 14:15:38 -05:00

517 lines
20 KiB
C

/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
siz_t bli_packm_init
(
obj_t* a,
obj_t* p,
cntx_t* cntx,
cntl_t* cntl
)
{
bli_init_once();
// The purpose of packm_init() is to initialize an object P so that
// a source object A can be packed into P via one of the packm
// implementations. This initialization precedes the acquisition of a
// suitable block of memory from the memory allocator (if such a block
// of memory has not already been allocated previously).
bszid_t bmult_id_m;
bszid_t bmult_id_n;
bool does_invert_diag;
bool rev_iter_if_upper;
bool rev_iter_if_lower;
pack_t schema;
//packbuf_t pack_buf_type;
siz_t size_needed;
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_packm_init_check( a, p, cntx );
// Extract various fields from the control tree.
bmult_id_m = bli_cntl_packm_params_bmid_m( cntl );
bmult_id_n = bli_cntl_packm_params_bmid_n( cntl );
does_invert_diag = bli_cntl_packm_params_does_invert_diag( cntl );
rev_iter_if_upper = bli_cntl_packm_params_rev_iter_if_upper( cntl );
rev_iter_if_lower = bli_cntl_packm_params_rev_iter_if_lower( cntl );
schema = bli_cntl_packm_params_pack_schema( cntl );
//pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl );
#if 0
// Let us now check to see if the object has already been packed. First
// we check if it has been packed to an unspecified (row or column)
// format, in which case we can alias the object and return.
// NOTE: The reason we don't need to even look at the control tree in
// this case is as follows: an object's pack status is only set to
// BLIS_PACKED_UNSPEC for situations when the actual format used is
// not important, as long as its packed into contiguous rows or
// contiguous columns. A good example of this is packing for matrix
// operands in the level-2 operations.
if ( bli_obj_pack_schema( a ) == BLIS_PACKED_UNSPEC )
{
bli_obj_alias_to( a, p );
return 0;
}
// Now we check if the object has already been packed to the desired
// schema (as encoded in the control tree). If so, we can alias and
// return 0.
// NOTE: In most cases, an object's pack status will be BLIS_NOT_PACKED
// and thus packing will be called for (but in some cases packing has
// already taken place, or does not need to take place, and so that will
// be indicated by the pack status). Also, not all combinations of
// current pack status and desired pack schema are valid.
if ( bli_obj_pack_schema( a ) == pack_schema )
{
bli_obj_alias_to( a, p );
return 0;
}
#endif
// If the object is marked as being filled with zeros, then we can skip
// the packm operation entirely and alias.
if ( bli_obj_is_zeros( a ) )
{
bli_obj_alias_to( a, p );
return 0;
}
// Prepare a few other variables based on properties of the control
// tree.
invdiag_t invert_diag;
packord_t pack_ord_if_up;
packord_t pack_ord_if_lo;
if ( does_invert_diag ) invert_diag = BLIS_INVERT_DIAG;
else invert_diag = BLIS_NO_INVERT_DIAG;
if ( rev_iter_if_upper ) pack_ord_if_up = BLIS_PACK_REV_IF_UPPER;
else pack_ord_if_up = BLIS_PACK_FWD_IF_UPPER;
if ( rev_iter_if_lower ) pack_ord_if_lo = BLIS_PACK_REV_IF_LOWER;
else pack_ord_if_lo = BLIS_PACK_FWD_IF_LOWER;
// Initialize object p for the final packed matrix.
size_needed
=
bli_packm_init_pack
(
invert_diag,
schema,
pack_ord_if_up,
pack_ord_if_lo,
bmult_id_m,
bmult_id_n,
a,
p,
cntx
);
// Return the size needed for memory allocation of the packed buffer.
return size_needed;
}
siz_t bli_packm_init_pack
(
invdiag_t invert_diag,
pack_t schema,
packord_t pack_ord_if_up,
packord_t pack_ord_if_lo,
bszid_t bmult_id_m,
bszid_t bmult_id_n,
obj_t* a,
obj_t* p,
cntx_t* cntx
)
{
bli_init_once();
num_t dt_tar = bli_obj_target_dt( a );
num_t dt_scalar = bli_obj_scalar_dt( a );
trans_t transa = bli_obj_onlytrans_status( a );
dim_t m_a = bli_obj_length( a );
dim_t n_a = bli_obj_width( a );
dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx );
dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx );
dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx );
dim_t bmult_n_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_n, cntx );
dim_t m_p, n_p;
dim_t m_p_pad, n_p_pad;
siz_t size_p;
siz_t elem_size_p;
inc_t rs_p, cs_p;
inc_t is_p;
// We begin by copying the fields of A.
bli_obj_alias_to( a, p );
// Typecast the internal scalar value to the target datatype.
// Note that if the typecasting is needed, this must happen BEFORE we
// change the datatype of P to reflect the target_dt.
if ( dt_scalar != dt_tar )
{
bli_obj_scalar_cast_to( dt_tar, p );
}
// Update the storage datatype of P to be the target datatype of A.
bli_obj_set_dt( dt_tar, p );
// Update the dimension fields to explicitly reflect a transposition,
// if needed.
// Then, clear the conjugation and transposition fields from the object
// since matrix packing in BLIS is deemed to take care of all conjugation
// and transposition necessary.
// Then, we adjust the properties of P when A needs a transposition.
// We negate the diagonal offset, and if A is upper- or lower-stored,
// we either toggle the uplo of P.
// Finally, if we mark P as dense since we assume that all matrices,
// regardless of structure, will be densified.
bli_obj_set_dims_with_trans( transa, m_a, n_a, p );
bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, p );
if ( bli_does_trans( transa ) )
{
bli_obj_negate_diag_offset( p );
if ( bli_obj_is_upper_or_lower( a ) )
bli_obj_toggle_uplo( p );
}
// If we are packing micropanels, mark P as dense. Otherwise, we are
// probably being called in the context of a level-2 operation, in
// which case we do not want to overwrite the uplo field of P (inherited
// from A) with BLIS_DENSE because that information may be needed by
// the level-2 operation's unblocked variant to decide whether to
// execute a "lower" or "upper" branch of code.
if ( bli_is_panel_packed( schema ) )
{
bli_obj_set_uplo( BLIS_DENSE, p );
}
// Reset the view offsets to (0,0).
bli_obj_set_offs( 0, 0, p );
// Set the invert diagonal field.
bli_obj_set_invert_diag( invert_diag, p );
// Set the pack status of P to the pack schema prescribed in the control
// tree node.
bli_obj_set_pack_schema( schema, p );
// Set the packing order bits.
bli_obj_set_pack_order_if_upper( pack_ord_if_up, p );
bli_obj_set_pack_order_if_lower( pack_ord_if_lo, p );
// Compute the dimensions padded by the dimension multiples. These
// dimensions will be the dimensions of the packed matrices, including
// zero-padding, and will be used by the macro- and micro-kernels.
// We compute them by starting with the effective dimensions of A (now
// in P) and aligning them to the dimension multiples (typically equal
// to register blocksizes). This does waste a little bit of space for
// level-2 operations, but that's okay with us.
m_p = bli_obj_length( p );
n_p = bli_obj_width( p );
m_p_pad = bli_align_dim_to_mult( m_p, bmult_m_def );
n_p_pad = bli_align_dim_to_mult( n_p, bmult_n_def );
// Save the padded dimensions into the packed object. It is important
// to save these dimensions since they represent the actual dimensions
// of the zero-padded matrix.
bli_obj_set_padded_dims( m_p_pad, n_p_pad, p );
// Now we prepare to compute strides, align them, and compute the
// total number of bytes needed for the packed buffer. The caller
// will then use that value to acquire an appropriate block of memory
// from the memory allocator.
// Extract the element size for the packed object.
elem_size_p = bli_obj_elem_size( p );
// Set the row and column strides of p based on the pack schema.
if ( bli_is_row_packed( schema ) &&
!bli_is_panel_packed( schema ) )
{
// For regular row storage, the padded width of our matrix
// should be used for the row stride, with the column stride set
// to one. By using the WIDTH of the mem_t region, we allow for
// zero-padding (if necessary/desired) along the right edge of
// the matrix.
rs_p = n_p_pad;
cs_p = 1;
// Align the leading dimension according to the heap stride
// alignment size so that the second, third, etc rows begin at
// aligned addresses.
rs_p = bli_align_dim_to_size( rs_p, elem_size_p,
BLIS_HEAP_STRIDE_ALIGN_SIZE );
// Store the strides in P.
bli_obj_set_strides( rs_p, cs_p, p );
// Compute the size of the packed buffer.
size_p = m_p_pad * rs_p * elem_size_p;
}
else if ( bli_is_col_packed( schema ) &&
!bli_is_panel_packed( schema ) )
{
// For regular column storage, the padded length of our matrix
// should be used for the column stride, with the row stride set
// to one. By using the LENGTH of the mem_t region, we allow for
// zero-padding (if necessary/desired) along the bottom edge of
// the matrix.
cs_p = m_p_pad;
rs_p = 1;
// Align the leading dimension according to the heap stride
// alignment size so that the second, third, etc columns begin at
// aligned addresses.
cs_p = bli_align_dim_to_size( cs_p, elem_size_p,
BLIS_HEAP_STRIDE_ALIGN_SIZE );
// Store the strides in P.
bli_obj_set_strides( rs_p, cs_p, p );
// Compute the size of the packed buffer.
size_p = cs_p * n_p_pad * elem_size_p;
}
else if ( bli_is_row_packed( schema ) &&
bli_is_panel_packed( schema ) )
{
dim_t m_panel;
dim_t ps_p, ps_p_orig;
// The panel dimension (for each datatype) should be equal to the
// default (logical) blocksize multiple in the m dimension.
m_panel = bmult_m_def;
// The "column stride" of a row-micropanel packed object is interpreted
// as the column stride WITHIN a micropanel. Thus, this is equal to the
// packing (storage) blocksize multiple, which may be equal to the
// default (logical) blocksize multiple).
cs_p = bmult_m_pack;
// The "row stride" of a row-micropanel packed object is interpreted
// as the row stride WITHIN a micropanel. Thus, it is unit.
rs_p = 1;
// The "panel stride" of a micropanel packed object is interpreted as
// the distance between the (0,0) element of panel k and the (0,0)
// element of panel k+1. We use the padded width computed above to
// allow for zero-padding (if necessary/desired) along the far end
// of each micropanel (ie: the right edge of the matrix). Zero-padding
// can also occur along the long edge of the last micropanel if the m
// dimension of the matrix is not a whole multiple of MR.
ps_p = cs_p * n_p_pad;
// As a general rule, we don't want micropanel strides to be odd. This
// is primarily motivated by our desire to support interleaved 3m
// micropanels, in which case we have to scale the panel stride
// by 3/2. That division by 2 means the numerator (prior to being
// scaled by 3) must be even.
if ( bli_is_odd( ps_p ) ) ps_p += 1;
// Preserve this early panel stride value for use later, if needed.
ps_p_orig = ps_p;
// Here, we adjust the panel stride, if necessary. Remember: ps_p is
// always interpreted as being in units of the datatype of the object
// which is not necessarily how the micropanels will be stored. For
// interleaved 3m, we will increase ps_p by 50%, and for ro/io/rpi,
// we halve ps_p. Why? Because the macro-kernel indexes in units of
// the complex datatype. So these changes "trick" it into indexing
// the correct amount.
if ( bli_is_3mi_packed( schema ) )
{
ps_p = ( ps_p * 3 ) / 2;
}
else if ( bli_is_3ms_packed( schema ) ||
bli_is_ro_packed( schema ) ||
bli_is_io_packed( schema ) ||
bli_is_rpi_packed( schema ) )
{
// The division by 2 below assumes that ps_p is an even number.
// However, it is possible that, at this point, ps_p is an odd.
// If it is indeed odd, we nudge it higher.
if ( bli_is_odd( ps_p ) ) ps_p += 1;
// Despite the fact that the packed micropanels will contain
// real elements, the panel stride that we store in the obj_t
// (which is passed into the macro-kernel) needs to be in units
// of complex elements, since the macro-kernel will index through
// micropanels via complex pointer arithmetic for trmm/trsm.
// Since the indexing "increment" will be twice as large as each
// actual stored element, we divide the panel_stride by 2.
ps_p = ps_p / 2;
}
// Set the imaginary stride (in units of fundamental elements) for
// 3m and 4m (separated or interleaved). We use ps_p_orig since
// that variable tracks the number of real part elements contained
// within each micropanel of the source matrix. Therefore, this
// is the number of real elements that must be traversed before
// reaching the imaginary part (3mi/4mi) of the packed micropanel,
// or the real part of the next micropanel (3ms).
if ( bli_is_3mi_packed( schema ) ) is_p = ps_p_orig;
else if ( bli_is_4mi_packed( schema ) ) is_p = ps_p_orig;
else if ( bli_is_3ms_packed( schema ) ) is_p = ps_p_orig * ( m_p_pad / m_panel );
else is_p = 1;
// Store the strides and panel dimension in P.
bli_obj_set_strides( rs_p, cs_p, p );
bli_obj_set_imag_stride( is_p, p );
bli_obj_set_panel_dim( m_panel, p );
bli_obj_set_panel_stride( ps_p, p );
bli_obj_set_panel_length( m_panel, p );
bli_obj_set_panel_width( n_p, p );
// Compute the size of the packed buffer.
size_p = ps_p * ( m_p_pad / m_panel ) * elem_size_p;
}
else if ( bli_is_col_packed( schema ) &&
bli_is_panel_packed( schema ) )
{
dim_t n_panel;
dim_t ps_p, ps_p_orig;
// The panel dimension (for each datatype) should be equal to the
// default (logical) blocksize multiple in the n dimension.
n_panel = bmult_n_def;
// The "row stride" of a column-micropanel packed object is interpreted
// as the row stride WITHIN a micropanel. Thus, this is equal to the
// packing (storage) blocksize multiple (which may be equal to the
// default (logical) blocksize multiple.
rs_p = bmult_n_pack;
// The "column stride" of a column-micropanel packed object is
// interpreted as the column stride WITHIN a micropanel. Thus, it is
// unit.
cs_p = 1;
// The "panel stride" of a micropanel packed object is interpreted as
// the distance between the (0,0) element of panel k and the (0,0)
// element of panel k+1. We use the padded length computed above to
// allow for zero-padding (if necessary/desired) along the far end
// of each micropanel (ie: the bottom edge of the matrix). Zero-padding
// can also occur along the long edge of the last micropanel if the n
// dimension of the matrix is not a whole multiple of NR.
ps_p = m_p_pad * rs_p;
// As a general rule, we don't want micropanel strides to be odd. This
// is primarily motivated by our desire to support interleaved 3m
// micropanels, in which case we have to scale the panel stride
// by 3/2. That division by 2 means the numerator (prior to being
// scaled by 3) must be even.
if ( bli_is_odd( ps_p ) ) ps_p += 1;
// Preserve this early panel stride value for use later, if needed.
ps_p_orig = ps_p;
// Here, we adjust the panel stride, if necessary. Remember: ps_p is
// always interpreted as being in units of the datatype of the object
// which is not necessarily how the micropanels will be stored. For
// interleaved 3m, we will increase ps_p by 50%, and for ro/io/rpi,
// we halve ps_p. Why? Because the macro-kernel indexes in units of
// the complex datatype. So these changes "trick" it into indexing
// the correct amount.
if ( bli_is_3mi_packed( schema ) )
{
ps_p = ( ps_p * 3 ) / 2;
}
else if ( bli_is_3ms_packed( schema ) ||
bli_is_ro_packed( schema ) ||
bli_is_io_packed( schema ) ||
bli_is_rpi_packed( schema ) )
{
// The division by 2 below assumes that ps_p is an even number.
// However, it is possible that, at this point, ps_p is an odd.
// If it is indeed odd, we nudge it higher.
if ( bli_is_odd( ps_p ) ) ps_p += 1;
// Despite the fact that the packed micropanels will contain
// real elements, the panel stride that we store in the obj_t
// (which is passed into the macro-kernel) needs to be in units
// of complex elements, since the macro-kernel will index through
// micropanels via complex pointer arithmetic for trmm/trsm.
// Since the indexing "increment" will be twice as large as each
// actual stored element, we divide the panel_stride by 2.
ps_p = ps_p / 2;
}
// Set the imaginary stride (in units of fundamental elements) for
// 3m and 4m (separated or interleaved). We use ps_p_orig since
// that variable tracks the number of real part elements contained
// within each micropanel of the source matrix. Therefore, this
// is the number of real elements that must be traversed before
// reaching the imaginary part (3mi/4mi) of the packed micropanel,
// or the real part of the next micropanel (3ms).
if ( bli_is_3mi_packed( schema ) ) is_p = ps_p_orig;
else if ( bli_is_4mi_packed( schema ) ) is_p = ps_p_orig;
else if ( bli_is_3ms_packed( schema ) ) is_p = ps_p_orig * ( n_p_pad / n_panel );
else is_p = 1;
// Store the strides and panel dimension in P.
bli_obj_set_strides( rs_p, cs_p, p );
bli_obj_set_imag_stride( is_p, p );
bli_obj_set_panel_dim( n_panel, p );
bli_obj_set_panel_stride( ps_p, p );
bli_obj_set_panel_length( m_p, p );
bli_obj_set_panel_width( n_panel, p );
// Compute the size of the packed buffer.
size_p = ps_p * ( n_p_pad / n_panel ) * elem_size_p;
}
else
{
// NOTE: When implementing block storage, we only need to implement
// the following two cases:
// - row-stored blocks in row-major order
// - column-stored blocks in column-major order
// The other two combinations coincide with that of packed row-panel
// and packed column- panel storage.
size_p = 0;
}
return size_p;
}