mirror of
https://github.com/amd/blis.git
synced 2026-05-05 06:51:11 +00:00
Details: - Removed four trailing spaces after "BLIS" that occurs in most files' commented-out license headers. - Added UT copyright lines to some files. (These files previously had only AMD copyright lines but were contributed to by both UT and AMD.) - In some files' copyright lines, expanded 'The University of Texas' to 'The University of Texas at Austin'. - Fixed various typos/misspellings in some license headers.
546 lines
16 KiB
C
546 lines
16 KiB
C
/*
|
|
|
|
BLIS
|
|
An object-based framework for developing high-performance BLAS-like
|
|
libraries.
|
|
|
|
Copyright (C) 2014, The University of Texas at Austin
|
|
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
|
|
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
|
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions are
|
|
met:
|
|
- Redistributions of source code must retain the above copyright
|
|
notice, this list of conditions and the following disclaimer.
|
|
- Redistributions in binary form must reproduce the above copyright
|
|
notice, this list of conditions and the following disclaimer in the
|
|
documentation and/or other materials provided with the distribution.
|
|
- Neither the name of The University of Texas at Austin nor the names
|
|
of its contributors may be used to endorse or promote products
|
|
derived from this software without specific prior written permission.
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
#include "blis.h"
|
|
|
|
void bli_membrk_init
|
|
(
|
|
cntx_t* cntx,
|
|
membrk_t* membrk
|
|
)
|
|
{
|
|
bli_membrk_init_mutex( membrk );
|
|
#ifdef BLIS_ENABLE_PACKBUF_POOLS
|
|
bli_membrk_init_pools( cntx, membrk );
|
|
#endif
|
|
bli_membrk_set_malloc_fp( bli_malloc_pool, membrk );
|
|
bli_membrk_set_free_fp( bli_free_pool, membrk );
|
|
}
|
|
|
|
void bli_membrk_finalize
|
|
(
|
|
membrk_t* membrk
|
|
)
|
|
{
|
|
bli_membrk_set_malloc_fp( NULL, membrk );
|
|
bli_membrk_set_free_fp( NULL, membrk );
|
|
#ifdef BLIS_ENABLE_PACKBUF_POOLS
|
|
bli_membrk_finalize_pools( membrk );
|
|
#endif
|
|
bli_membrk_finalize_mutex( membrk );
|
|
}
|
|
|
|
void bli_membrk_acquire_m
|
|
(
|
|
membrk_t* membrk,
|
|
siz_t req_size,
|
|
packbuf_t buf_type,
|
|
mem_t* mem
|
|
)
|
|
{
|
|
pool_t* pool;
|
|
pblk_t* pblk;
|
|
dim_t pi;
|
|
siz_t block_size;
|
|
|
|
// If the internal memory pools for pack buffers are disabled, we
|
|
// spoof the buffer type as BLIS_BUFFER_FOR_GEN_USE to induce the
|
|
// immediate usage of bli_membrk_malloc().
|
|
#ifndef BLIS_ENABLE_PACKBUF_POOLS
|
|
buf_type = BLIS_BUFFER_FOR_GEN_USE;
|
|
#endif
|
|
|
|
// Make sure the API is initialized.
|
|
//assert( membrk ); //??
|
|
|
|
if ( buf_type == BLIS_BUFFER_FOR_GEN_USE )
|
|
{
|
|
// For general-use buffer requests, such as those used by level-2
|
|
// operations, dynamically allocating memory is sufficient.
|
|
// Note that we use the malloc()-style memory allocation function
|
|
// that is stored in the membrk_t object.
|
|
void* buf_sys = bli_membrk_malloc( req_size, membrk );
|
|
|
|
// Initialize the mem_t object with:
|
|
// - the address of the memory block,
|
|
// - the buffer type (a packbuf_t value),
|
|
// - the size of the requested region,
|
|
// - the membrk_t from which the mem_t entry was acquired.
|
|
// NOTE: We initialize the pool field to NULL since this block did not
|
|
// come from a memory pool.
|
|
bli_mem_set_buffer( buf_sys, mem );
|
|
bli_mem_set_buf_sys( buf_sys, mem );
|
|
bli_mem_set_buf_type( buf_type, mem );
|
|
bli_mem_set_pool( NULL, mem );
|
|
bli_mem_set_size( req_size, mem );
|
|
bli_mem_set_membrk( membrk, mem );
|
|
}
|
|
else
|
|
{
|
|
// This branch handles cases where the memory block needs to come
|
|
// from an internal memory pool, in which blocks are allocated once
|
|
// and then recycled.
|
|
|
|
// Map the requested packed buffer type to a zero-based index, which
|
|
// we then use to select the corresponding memory pool.
|
|
pi = bli_packbuf_index( buf_type );
|
|
pool = bli_membrk_pool( pi, membrk );
|
|
|
|
// Extract the address of the pblk_t struct within the mem_t.
|
|
pblk = bli_mem_pblk( mem );
|
|
|
|
// Acquire the mutex associated with the membrk object.
|
|
bli_membrk_lock( membrk );
|
|
|
|
// BEGIN CRITICAL SECTION
|
|
{
|
|
|
|
// Checkout a block from the pool. If the pool's blocks are too
|
|
// small, it will be reinitialized with blocks large enough to
|
|
// accommodate the requested block size. If the pool is exhausted,
|
|
// either because it is still empty or because all blocks have
|
|
// been checked out already, additional blocks will be allocated
|
|
// automatically, as-needed. Note that the addresses are stored
|
|
// directly into the mem_t struct since pblk is the address of
|
|
// the struct's pblk_t field.
|
|
bli_pool_checkout_block( req_size, pblk, pool );
|
|
|
|
// Query the size of the blocks in the pool so we can store it in
|
|
// the mem_t object. At this point, it is guaranteed to be at
|
|
// least as large as req_size. (NOTE: We must perform the query
|
|
// within the critical section to ensure that the pool hasn't
|
|
// changed.)
|
|
block_size = bli_pool_block_size( pool );
|
|
|
|
}
|
|
// END CRITICAL SECTION
|
|
|
|
// Release the mutex associated with the membrk object.
|
|
bli_membrk_unlock( membrk );
|
|
|
|
// Initialize the mem_t object with:
|
|
// - the buffer type (a packbuf_t value),
|
|
// - the address of the memory pool to which it belongs,
|
|
// - the size of the contiguous memory block (NOT the size of the
|
|
// requested region),
|
|
// - the membrk_t from which the mem_t entry was acquired.
|
|
// The actual addresses (system and aligned) are already stored in
|
|
// the mem_t struct's pblk_t field
|
|
bli_mem_set_buf_type( buf_type, mem );
|
|
bli_mem_set_pool( pool, mem );
|
|
bli_mem_set_size( block_size, mem );
|
|
bli_mem_set_membrk( membrk, mem );
|
|
}
|
|
}
|
|
|
|
|
|
void bli_membrk_release
|
|
(
|
|
mem_t* mem
|
|
)
|
|
{
|
|
packbuf_t buf_type;
|
|
pool_t* pool;
|
|
pblk_t* pblk;
|
|
siz_t block_size_cur;
|
|
siz_t block_size_prev;
|
|
membrk_t* membrk;
|
|
|
|
// Extract the membrk_t address from the mem_t object.
|
|
membrk = bli_mem_membrk( mem );
|
|
|
|
// Extract the buffer type so we know what kind of memory was allocated.
|
|
buf_type = bli_mem_buf_type( mem );
|
|
|
|
if ( buf_type == BLIS_BUFFER_FOR_GEN_USE )
|
|
{
|
|
void* buf_sys = bli_mem_buf_sys( mem );
|
|
|
|
// For general-use buffers, we dynamically allocate memory, and so
|
|
// here we need to free.
|
|
// Note that we use the free()-style memory release function that
|
|
// is stored in the membrk_t object.
|
|
bli_membrk_free( buf_sys, membrk );
|
|
}
|
|
else
|
|
{
|
|
// Extract the address of the pool from which the memory was
|
|
// allocated.
|
|
pool = bli_mem_pool( mem );
|
|
|
|
// Extract the address of the pblk_t struct within the mem_t struct.
|
|
pblk = bli_mem_pblk( mem );
|
|
|
|
// Query the size of the blocks that were in the pool at the time
|
|
// the pblk_t was checked out. (This is used below, in the critical
|
|
// section.)
|
|
block_size_prev = bli_mem_size( mem );
|
|
|
|
// BEGIN CRITICAL SECTION
|
|
bli_membrk_lock( membrk );
|
|
{
|
|
|
|
// Query the size of the blocks currently in the pool.
|
|
block_size_cur = bli_pool_block_size( pool );
|
|
|
|
// If the block size of the pool has changed since the pblk_t
|
|
// was checked out, then we need to free the pblk_t rather
|
|
// than check it back in. Why? Because the pool's block size
|
|
// has (most likely) increased to meet changing needs (example:
|
|
// larger cache blocksizes). Thus, the current pblk_t's smaller
|
|
// allocated size is of no use anymore.
|
|
if ( block_size_cur != block_size_prev )
|
|
{
|
|
// Free the pblk_t using the appropriate function in the
|
|
// pool API.
|
|
bli_pool_free_block( pblk );
|
|
}
|
|
else
|
|
{
|
|
// Check the block back into the pool.
|
|
bli_pool_checkin_block( pblk, pool );
|
|
}
|
|
|
|
}
|
|
bli_membrk_unlock( membrk );
|
|
// END CRITICAL SECTION
|
|
}
|
|
|
|
// Clear the mem_t object so that it appears unallocated. This clears:
|
|
// - the pblk_t struct's fields (ie: the buffer addresses)
|
|
// - the pool field
|
|
// - the size field
|
|
// - the membrk field
|
|
// NOTE: We do not clear the buf_type field since there is no
|
|
// "uninitialized" value for packbuf_t.
|
|
bli_mem_clear( mem );
|
|
}
|
|
|
|
|
|
void bli_membrk_acquire_v
|
|
(
|
|
membrk_t* membrk,
|
|
siz_t req_size,
|
|
mem_t* mem
|
|
)
|
|
{
|
|
bli_membrk_acquire_m( membrk,
|
|
req_size,
|
|
BLIS_BUFFER_FOR_GEN_USE,
|
|
mem );
|
|
}
|
|
|
|
|
|
siz_t bli_membrk_pool_size
|
|
(
|
|
membrk_t* membrk,
|
|
packbuf_t buf_type
|
|
)
|
|
{
|
|
siz_t r_val;
|
|
|
|
if ( buf_type == BLIS_BUFFER_FOR_GEN_USE )
|
|
{
|
|
// We don't (yet) track the amount of general-purpose
|
|
// memory that is currently allocated.
|
|
r_val = 0;
|
|
}
|
|
else
|
|
{
|
|
dim_t pool_index;
|
|
pool_t* pool;
|
|
|
|
// Acquire the pointer to the pool corresponding to the buf_type
|
|
// provided.
|
|
pool_index = bli_packbuf_index( buf_type );
|
|
pool = bli_membrk_pool( pool_index, membrk );
|
|
|
|
// Compute the pool "size" as the product of the block size
|
|
// and the number of blocks in the pool.
|
|
r_val = bli_pool_block_size( pool ) *
|
|
bli_pool_num_blocks( pool );
|
|
}
|
|
|
|
return r_val;
|
|
}
|
|
|
|
// -----------------------------------------------------------------------------
|
|
|
|
void bli_membrk_init_pools
|
|
(
|
|
cntx_t* cntx,
|
|
membrk_t* membrk
|
|
)
|
|
{
|
|
// Map each of the packbuf_t values to an index starting at zero.
|
|
const dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK );
|
|
const dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL );
|
|
const dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL );
|
|
|
|
const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE;
|
|
|
|
// Alias the pool addresses to convenient identifiers.
|
|
pool_t* pool_a = bli_membrk_pool( index_a, membrk );
|
|
pool_t* pool_b = bli_membrk_pool( index_b, membrk );
|
|
pool_t* pool_c = bli_membrk_pool( index_c, membrk );
|
|
|
|
// Start with empty pools.
|
|
const dim_t num_blocks_a = 0;
|
|
const dim_t num_blocks_b = 0;
|
|
const dim_t num_blocks_c = 0;
|
|
|
|
siz_t block_size_a = 0;
|
|
siz_t block_size_b = 0;
|
|
siz_t block_size_c = 0;
|
|
|
|
// Determine the block size for each memory pool.
|
|
bli_membrk_compute_pool_block_sizes( &block_size_a,
|
|
&block_size_b,
|
|
&block_size_c,
|
|
cntx );
|
|
|
|
// Initialize the memory pools for A, B, and C.
|
|
bli_pool_init( num_blocks_a, block_size_a, align_size, pool_a );
|
|
bli_pool_init( num_blocks_b, block_size_b, align_size, pool_b );
|
|
bli_pool_init( num_blocks_c, block_size_c, align_size, pool_c );
|
|
}
|
|
|
|
void bli_membrk_finalize_pools
|
|
(
|
|
membrk_t* membrk
|
|
)
|
|
{
|
|
// Map each of the packbuf_t values to an index starting at zero.
|
|
dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK );
|
|
dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL );
|
|
dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL );
|
|
|
|
// Alias the pool addresses to convenient identifiers.
|
|
pool_t* pool_a = bli_membrk_pool( index_a, membrk );
|
|
pool_t* pool_b = bli_membrk_pool( index_b, membrk );
|
|
pool_t* pool_c = bli_membrk_pool( index_c, membrk );
|
|
|
|
// Finalize the memory pools for A, B, and C.
|
|
bli_pool_finalize( pool_a );
|
|
bli_pool_finalize( pool_b );
|
|
bli_pool_finalize( pool_c );
|
|
}
|
|
|
|
// -----------------------------------------------------------------------------
|
|
|
|
void bli_membrk_compute_pool_block_sizes
|
|
(
|
|
siz_t* bs_a,
|
|
siz_t* bs_b,
|
|
siz_t* bs_c,
|
|
cntx_t* cntx
|
|
)
|
|
{
|
|
const ind_t im = bli_cntx_method( cntx );
|
|
|
|
siz_t bs_cand_a = 0;
|
|
siz_t bs_cand_b = 0;
|
|
siz_t bs_cand_c = 0;
|
|
|
|
num_t dt;
|
|
|
|
// Compute pool block sizes for each datatype and find the maximum
|
|
// size for each pool. This is done so that new pools do not need
|
|
// to be allocated if the user switches datatypes.
|
|
for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt )
|
|
{
|
|
siz_t bs_dt_a;
|
|
siz_t bs_dt_b;
|
|
siz_t bs_dt_c;
|
|
|
|
// Avoid considering induced methods for real datatypes.
|
|
if ( bli_is_real( dt ) && im != BLIS_NAT ) continue;
|
|
|
|
bli_membrk_compute_pool_block_sizes_dt( dt,
|
|
&bs_dt_a,
|
|
&bs_dt_b,
|
|
&bs_dt_c,
|
|
cntx );
|
|
|
|
bs_cand_a = bli_max( bs_dt_a, bs_cand_a );
|
|
bs_cand_b = bli_max( bs_dt_b, bs_cand_b );
|
|
bs_cand_c = bli_max( bs_dt_c, bs_cand_c );
|
|
}
|
|
|
|
// Save the results.
|
|
*bs_a = bs_cand_a;
|
|
*bs_b = bs_cand_b;
|
|
*bs_c = bs_cand_c;
|
|
}
|
|
|
|
// -----------------------------------------------------------------------------
|
|
|
|
void bli_membrk_compute_pool_block_sizes_dt
|
|
(
|
|
num_t dt,
|
|
siz_t* bs_a,
|
|
siz_t* bs_b,
|
|
siz_t* bs_c,
|
|
cntx_t* cntx
|
|
)
|
|
{
|
|
siz_t size_dt = bli_dt_size( dt );
|
|
|
|
blksz_t* mr;
|
|
blksz_t* nr;
|
|
|
|
blksz_t* mc;
|
|
blksz_t* kc;
|
|
blksz_t* nc;
|
|
|
|
dim_t mr_dt;
|
|
dim_t nr_dt;
|
|
dim_t max_mnr_dt;
|
|
|
|
dim_t mc_max_dt;
|
|
dim_t kc_max_dt;
|
|
dim_t nc_max_dt;
|
|
|
|
dim_t packmr_dt;
|
|
dim_t packnr_dt;
|
|
dim_t max_packmnr_dt;
|
|
|
|
dim_t scale_num_dt;
|
|
dim_t scale_den_dt;
|
|
|
|
dim_t pool_mc_dt, left_mc_dt;
|
|
dim_t pool_nc_dt, left_nc_dt;
|
|
dim_t pool_kc_dt;
|
|
|
|
//
|
|
// Find the larger of the two register blocksizes.
|
|
//
|
|
|
|
// Query the mr and nr blksz_t objects for the given method of
|
|
// execution.
|
|
mr = bli_cntx_get_blksz( BLIS_MR, cntx );
|
|
nr = bli_cntx_get_blksz( BLIS_NR, cntx );
|
|
|
|
// Extract the mr and nr values specific to the current datatype.
|
|
mr_dt = bli_blksz_get_def( dt, mr );
|
|
nr_dt = bli_blksz_get_def( dt, nr );
|
|
|
|
// Find the maximum of mr and nr.
|
|
max_mnr_dt = bli_max( mr_dt, nr_dt );
|
|
|
|
//
|
|
// Define local maximum cache blocksizes.
|
|
//
|
|
|
|
// Query the mc, kc, and nc blksz_t objects for native execution.
|
|
mc = bli_cntx_get_blksz( BLIS_MC, cntx );
|
|
kc = bli_cntx_get_blksz( BLIS_KC, cntx );
|
|
nc = bli_cntx_get_blksz( BLIS_NC, cntx );
|
|
|
|
// Extract the maximum mc, kc, and nc values specific to the current
|
|
// datatype.
|
|
mc_max_dt = bli_blksz_get_max( dt, mc );
|
|
kc_max_dt = bli_blksz_get_max( dt, kc );
|
|
nc_max_dt = bli_blksz_get_max( dt, nc );
|
|
|
|
// Add max(mr,nr) to kc to make room for the nudging of kc at
|
|
// runtime to be a multiple of mr or nr for triangular operations
|
|
// trmm, trmm3, and trsm.
|
|
kc_max_dt += max_mnr_dt;
|
|
|
|
//
|
|
// Compute scaling factors.
|
|
//
|
|
|
|
// Compute integer scaling factors (numerator and denominator) used
|
|
// to account for situations when the packing register blocksizes are
|
|
// larger than the regular register blocksizes.
|
|
|
|
// In order to compute the scaling factors, we first have to determine
|
|
// whether ( packmr / mr ) is greater than ( packnr / nr ). This is
|
|
// needed ONLY because the amount of space allocated for a block of A
|
|
// and a panel of B needs to be such that MR and NR can be swapped (ie:
|
|
// A is packed with NR and B is packed with MR). This transformation is
|
|
// needed for right-side trsm when inducing an algorithm that (a) has
|
|
// favorable access patterns for column-stored C and (b) allows the
|
|
// macro-kernel to reuse the existing left-side fused gemmtrsm micro-
|
|
// kernels. We avoid integer division by cross-multiplying:
|
|
//
|
|
// ( packmr / mr ) >= ( packnr / nr )
|
|
// ( packmr / mr ) * nr >= packnr
|
|
// packmr * nr >= packnr * mr
|
|
//
|
|
// So, if packmr * nr >= packnr * mr, then we will use packmr and mr as
|
|
// our scaling factors. Otherwise, we'll use packnr and nr.
|
|
|
|
packmr_dt = bli_blksz_get_max( dt, mr );
|
|
packnr_dt = bli_blksz_get_max( dt, nr );
|
|
|
|
if ( packmr_dt * nr_dt >=
|
|
packnr_dt * mr_dt ) { scale_num_dt = packmr_dt;
|
|
scale_den_dt = mr_dt; }
|
|
else { scale_num_dt = packnr_dt;
|
|
scale_den_dt = nr_dt; }
|
|
|
|
//
|
|
// Compute pool block dimensions.
|
|
//
|
|
|
|
pool_mc_dt = ( mc_max_dt * scale_num_dt ) / scale_den_dt;
|
|
left_mc_dt = ( mc_max_dt * scale_num_dt ) % scale_den_dt;
|
|
|
|
pool_nc_dt = ( nc_max_dt * scale_num_dt ) / scale_den_dt;
|
|
left_nc_dt = ( nc_max_dt * scale_num_dt ) % scale_den_dt;
|
|
|
|
pool_kc_dt = ( kc_max_dt );
|
|
|
|
if ( left_mc_dt > 0 ) pool_mc_dt += 1;
|
|
if ( left_nc_dt > 0 ) pool_nc_dt += 1;
|
|
|
|
//
|
|
// Compute pool block sizes
|
|
//
|
|
|
|
// We add an extra micro-panel of space to the block sizes for A and B
|
|
// just to be sure any pre-loading performed by the micro-kernel does
|
|
// not cause a segmentation fault.
|
|
max_packmnr_dt = bli_max( packmr_dt, packnr_dt );
|
|
|
|
*bs_a = ( pool_mc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt;
|
|
*bs_b = ( pool_nc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt;
|
|
*bs_c = ( pool_mc_dt ) * pool_nc_dt * size_dt;
|
|
}
|