Files
blis/addon/gemmd/bao_l3_packm_a.c
Field G. Van Zee 7a0ba4194f Added support for addons.
Details:
- Implemented a new feature called addons, which are similar to
  sandboxes except that there is no requirement to define gemm or any
  other particular operation.
- Updated configure to accept --enable-addon=<name> or -a <name> syntax
  for requesting an addon be included within a BLIS build. configure now
  outputs the list of enabled addons into config.mk. It also outputs the
  corresponding #include directives for the addons' headers to a new
  companion to the bli_config.h header file named bli_addon.h. Because
  addons may wish to make use of existing BLIS types within their own
  definitions, the addons' headers must be included sometime after that
  of bli_config.h (which currently is #included before bli_type_defs.h).
  This is why the #include directives needed to go into a new top-level
  header file rather than the existing bli_config.h file.
- Added a markdown document, docs/Addons.md, to explain addons, how to
  build with them, and what assumptions their authors should keep in
  mind as they create them.
- Added a gemmlike-like implementation of sandwich gemm called 'gemmd'
  as an addon in addon/gemmd. The code uses a 'bao_' prefix for local
  functions, including the user-level object and typed APIs.
- Updated .gitignore so that git ignores bli_addon.h files.

Change-Id: Ie7efdea366481ce25075cb2459bdbcfd52309717
2022-03-31 12:03:27 +05:30

331 lines
10 KiB
C

/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
  packm_init_mem_a (one typed instance per GENTFUNC invocation below; the
  function name is formed by PASTECH2(bao_,ch,opname), a macro defined outside
  this file that appears to paste its arguments into a single identifier).

  Purpose: make sure the caller's mem_t entry refers to a block that can hold
  sizeof( ctype ) * m_pack * k_pack bytes, where m_pack is m rounded up to a
  multiple of mr and k_pack is k.

  Parameters:
    m, k    dimensions used to size the pack buffer.
    mr      micropanel row dimension; m is rounded up to a multiple of it.
    cntx    not referenced in the function body.
    rntm    passed to bli_pba_acquire_m() and bli_pba_release().
    mem     this thread's mem_t entry; overwritten only when a block has to
            be (re-)acquired, otherwise left untouched.
    thread  thread info used for the barrier, the chief test and the
            broadcast.

  Behavior: every thread calls bli_thread_barrier() on entry. If mem is
  unallocated, or allocated but smaller than the size needed, the chief thread
  acquires a BLIS_BUFFER_FOR_A_BLOCK block (releasing the old block first in
  the latter case) and every non-chief thread copies the chief's mem_t, whose
  address is obtained from bli_thread_broadcast(), into its own mem_t.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bao_,ch,opname) \
( \
dim_t m, \
dim_t k, \
dim_t mr, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
) \
{ \
/* Set the pack buffer type so that we are obtaining memory blocks from
the pool dedicated to blocks of A. */ \
const packbuf_t pack_buf_type = BLIS_BUFFER_FOR_A_BLOCK; \
\
/* NOTE: This "rounding up" of the last upanel is absolutely necessary since
we NEED that last micropanel to have the same ldim (cs_p) as the other
micropanels. Why? Because the microkernel assumes that the register (MR,
NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \
const dim_t m_pack = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \
const dim_t k_pack = k; \
\
/* Barrier to make sure all threads are caught up and ready to begin the
packm stage. */ \
bli_thread_barrier( thread ); \
\
/* Compute the size of the memory block needed. */ \
siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \
\
/* Check the mem_t entry provided by the caller. If it is unallocated,
then we need to acquire a block from the packed block allocator. */ \
if ( bli_mem_is_unalloc( mem ) ) \
{ \
if ( bli_thread_am_ochief( thread ) ) \
{ \
/* Acquire directly to the chief thread's mem_t that was passed in.
It needs to be that mem_t struct, and not a local (temporary)
mem_t, since there is no barrier until after packing is finished,
which could allow a race condition whereby the chief thread exits
the current function before the other threads have a chance to
copy from it. (A barrier would fix that race condition, but then
again, I prefer to keep barriers to a minimum.) */ \
bli_pba_acquire_m \
( \
rntm, \
size_needed, \
pack_buf_type, \
mem \
); \
} \
\
/* Broadcast the address of the chief thread's passed-in mem_t to all
threads. */ \
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
\
/* Non-chief threads: Copy the contents of the chief thread's
passed-in mem_t to the passed-in mem_t for this thread. (The
chief thread already has the mem_t, so it does not need to
perform any copy.) */ \
if ( !bli_thread_am_ochief( thread ) ) \
{ \
*mem = *mem_p; \
} \
} \
else /* if ( bli_mem_is_alloc( mem ) ) */ \
{ \
/* If the mem_t entry provided by the caller does NOT contain a NULL
buffer, then a block has already been acquired from the packed
block allocator and cached by the caller. */ \
\
/* As a sanity check, we should make sure that the mem_t object isn't
associated with a block that is too small compared to the size of
the packed matrix buffer that is needed, according to the value
computed above. */ \
siz_t mem_size = bli_mem_size( mem ); \
\
if ( mem_size < size_needed ) \
{ \
if ( bli_thread_am_ochief( thread ) ) \
{ \
/* The chief thread releases the existing block associated
with the mem_t, and then re-acquires a new block, saving
the associated mem_t to its passed-in mem_t. (See comment
above for why the acquisition needs to be directly to
the chief thread's passed-in mem_t and not a local
(temporary) mem_t.) */ \
bli_pba_release \
( \
rntm, \
mem \
); \
bli_pba_acquire_m \
( \
rntm, \
size_needed, \
pack_buf_type, \
mem \
); \
} \
\
/* Broadcast the address of the chief thread's passed-in mem_t
to all threads. */ \
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
\
/* Non-chief threads: Copy the contents of the chief thread's
passed-in mem_t to the passed-in mem_t for this thread. (The
chief thread already has the mem_t, so it does not need to
perform any copy.) */ \
if ( !bli_thread_am_ochief( thread ) ) \
{ \
*mem = *mem_p; \
} \
} \
else \
{ \
/* If the mem_t entry is already allocated and sufficiently large,
then we use it as-is. No action is needed. */ \
} \
} \
}
/* Instantiate the template above for the four types listed (s, d, c, z). */
//INSERT_GENTFUNC_BASIC0( packm_init_mem_a )
GENTFUNC( float, s, packm_init_mem_a )
GENTFUNC( double, d, packm_init_mem_a )
GENTFUNC( scomplex, c, packm_init_mem_a )
GENTFUNC( dcomplex, z, packm_init_mem_a )
/*
  packm_finalize_mem_a (typed instances generated below for s, d, c, z).

  Hands the block referenced by the caller's mem_t back via bli_pba_release().
  The release happens only when all of the following hold:
    - thread is not NULL,
    - the calling thread is the chief thread (bli_thread_am_ochief), and
    - the mem_t entry is currently allocated (bli_mem_is_alloc).
  In every other case the function returns without doing anything.

  Parameters:
    rntm    forwarded to bli_pba_release().
    mem     the mem_t entry whose block is to be released.
    thread  thread info used for the chief test; may be NULL.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bao_,ch,opname) \
     ( \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread \
     ) \
{ \
    /* Without thread info there is nothing to do. */ \
    if ( thread == NULL ) return; \
\
    /* Releasing is the chief thread's job; all other threads leave now. */ \
    if ( !bli_thread_am_ochief( thread ) ) return; \
\
    /* The entry is expected to be allocated at this point, but the release
       is skipped if it is not. */ \
    if ( bli_mem_is_alloc( mem ) ) \
    { \
        bli_pba_release( rntm, mem ); \
    } \
}

//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_a )
GENTFUNC( float,    s, packm_finalize_mem_a )
GENTFUNC( double,   d, packm_finalize_mem_a )
GENTFUNC( scomplex, c, packm_finalize_mem_a )
GENTFUNC( dcomplex, z, packm_finalize_mem_a )
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bao_,ch,opname) \
( \
pack_t* restrict schema, \
dim_t m, \
dim_t k, \
dim_t mr, \
dim_t* restrict m_max, \
dim_t* restrict k_max, \
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
dim_t* restrict pd_p, inc_t* restrict ps_p, \
mem_t* restrict mem \
) \
{ \
/* NOTE: This "rounding up" of the last upanel is absolutely necessary since
we NEED that last micropanel to have the same ldim (cs_p) as the other
micropanels. Why? Because the microkernel assumes that the register (MR,
NR) AND storage (PACKMR, PACKNR) blocksizes do not change. */ \
*m_max = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \
*k_max = k; \
\
/* Determine the dimensions and strides for the packed matrix A. */ \
{ \
/* Pack A to column-stored row-panels. */ \
*rs_p = 1; \
*cs_p = mr; \
\
*pd_p = mr; \
*ps_p = mr * k; \
\
/* Set the schema to "packed row panels" to indicate packing to
conventional column-stored row panels. */ \
*schema = BLIS_PACKED_ROW_PANELS; \
} \
\
/* Set the buffer address provided by the caller to point to the memory
associated with the mem_t entry acquired from the memory pool. */ \
*p = bli_mem_buffer( mem ); \
}
//INSERT_GENTFUNC_BASIC0( packm_init_a )
GENTFUNC( float, s, packm_init_a )
GENTFUNC( double, d, packm_init_a )
GENTFUNC( scomplex, c, packm_init_a )
GENTFUNC( dcomplex, z, packm_init_a )
//
// Define the typed packm_a entry points (s, d, c, z). Each one performs, in
// order:
//   1. packm_init_mem_a  -- prepare the pack buffer, sized from m_alloc and
//                           k_alloc;
//   2. packm_init_a      -- compute the buffer address, strides, panel
//                           dimension/stride and schema for an m x k pack;
//   3. packm_var1        -- pack A (with kappa and d/incd) into that buffer;
//   4. bli_thread_barrier -- so packing is complete before computation.
// The buffer address and strides are returned to the caller through p, rs_p,
// cs_p and ps_p.
//
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTECH2(bao_,ch,opname) \
     ( \
       conj_t              conj, \
       dim_t               m_alloc, \
       dim_t               k_alloc, \
       dim_t               m, \
       dim_t               k, \
       dim_t               mr, \
       ctype*     restrict kappa, \
       ctype*     restrict d, inc_t incd, \
       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
       ctype**    restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
                              inc_t* restrict ps_p, \
       cntx_t*    restrict cntx, \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread \
     ) \
{ \
    /* Values produced by packm_init_a that are only needed in order to
       call packm_var1; they are not reported back to the caller. */ \
    pack_t pack_schema; \
    dim_t  m_packed; \
    dim_t  k_packed; \
    dim_t  panel_dim; \
\
    /* Get the destination buffer ready. Note that it is sized from the
       m_alloc/k_alloc arguments rather than from m/k. */ \
    PASTECH2(bao_,ch,packm_init_mem_a) \
    ( m_alloc, k_alloc, mr, cntx, rntm, mem, thread ); \
\
    /* Work out where the packed A lives and how it is laid out. */ \
    PASTECH2(bao_,ch,packm_init_a) \
    ( \
      &pack_schema, \
      m, k, mr, \
      &m_packed, &k_packed, \
      p, rs_p, cs_p, \
      &panel_dim, ps_p, \
      mem \
    ); \
\
    /* Copy A into the buffer selected above, as column-stored MR x k
       micropanels. */ \
    PASTECH2(bao_,ch,packm_var1) \
    ( \
      conj, pack_schema, \
      m, k, \
      m_packed, k_packed, \
      kappa, \
      d, incd, \
      a, rs_a, cs_a, \
      *p, *rs_p, *cs_p, \
      panel_dim, *ps_p, \
      cntx, thread \
    ); \
\
    /* All threads must finish packing before any of them starts computing. */ \
    bli_thread_barrier( thread ); \
}

//INSERT_GENTFUNC_BASIC0( packm_a )
GENTFUNC( float,    s, packm_a )
GENTFUNC( double,   d, packm_a )
GENTFUNC( scomplex, c, packm_a )
GENTFUNC( dcomplex, z, packm_a )