mirror of
https://github.com/amd/blis.git
synced 2026-05-11 17:50:00 +00:00
Details:
- Implemented a sophisticated data structure and set of APIs that track
the small blocks of memory (around 80-100 bytes each) used when
creating nodes for control and thread trees (cntl_t and thrinfo_t) as
well as thread communicators (thrcomm_t). The purpose of the small
block allocator, or sba, is to allow the library to transition into a
runtime state in which it does not perform any calls to malloc() or
free() during normal execution of level-3 operations, regardless of
the threading environment (potentially multiple application threads
as well as multiple BLIS threads). The functionality relies on a new
data structure, apool_t, which is (roughly speaking) a pool of
arrays, where each array element is a pool of small blocks. The outer
pool, which is protected by a mutex, provides separate arrays for each
application thread while the arrays each handle multiple BLIS threads
for any given application thread. The design minimizes the potential
for lock contention, as only concurrent application threads would
need to fight for the apool_t lock, and only if they happen to begin
their level-3 operations at precisely the same time. Thanks to Kiran
Varaganti and AMD for requesting this feature.
- Added a configure option to disable the sba pools, which are enabled
by default; renamed the --[dis|en]able-packbuf-pools option to
--[dis|en]able-pba-pools; and rewrote the --help text associated with
this new option and consolidated it with the --help text for the
option associated with the sba (--[dis|en]able-sba-pools).
- Moved the membrk field from the cntx_t to the rntm_t. We now pass in
a rntm_t* to the bli_membrk_acquire() and _release() APIs, just as we
do for bli_sba_acquire() and _release().
- Replaced all calls to bli_malloc_intl() and bli_free_intl() that are
used for small blocks with calls to bli_sba_acquire(), which takes a
rntm (in addition to the bytes requested), and bli_sba_release().
These latter two functions reduce to the former two when the sba pools
are disabled at configure-time.
- Added rntm_t* arguments to various cntl_t and thrinfo_t functions, as
required by the new usage of bli_sba_acquire() and _release().
- Moved the freeing of "old" blocks (those allocated prior to a change
in the block_size) from bli_membrk_acquire_m() to the implementation
of the pool_t checkout function.
- Miscellaneous improvements to the pool_t API.
- Added a block_size field to the pblk_t.
- Harmonized the way that the trsm_ukr testsuite module performs packing
relative to that of gemmtrsm_ukr, in part to avoid the need to create
a packm control tree node, which now requires a rntm_t that has been
initialized with an sba and membrk.
- Re-enabled the explicit call to bli_finalize() in the testsuite so that
users who run the testsuite with memory tracing enabled can check for
memory leaks.
- Manually imported the compact/minor changes from 61441b24 that cause
the rntm to be copied locally when it is passed in via one of the
expert APIs.
- Reordered parameters to various bli_thrcomm_*() functions so that the
thrcomm_t* to the comm being modified is last, not first.
- Added more descriptive tracing for allocating/freeing small blocks and
formalized via a new configure option: --[dis|en]able-mem-tracing.
- Moved some unused scalm code and headers into frame/1m/other.
- Whitespace changes to bli_pthread.c.
- Regenerated build/libblis-symbols.def.
418 lines
9.2 KiB
C
418 lines
9.2 KiB
C
/*
|
|
|
|
BLIS
|
|
An object-based framework for developing high-performance BLAS-like
|
|
libraries.
|
|
|
|
Copyright (C) 2018, Southern Methodist University
|
|
Copyright (C) 2018, The University of Texas at Austin
|
|
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
|
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions are
|
|
met:
|
|
- Redistributions of source code must retain the above copyright
|
|
notice, this list of conditions and the following disclaimer.
|
|
- Redistributions in binary form must reproduce the above copyright
|
|
notice, this list of conditions and the following disclaimer in the
|
|
documentation and/or other materials provided with the distribution.
|
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
|
contributors may be used to endorse or promote products derived
|
|
from this software without specific prior written permission.
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
#include "blis.h"

#include <errno.h>
#include <stdlib.h>
|
|
|
|
#if defined(_MSC_VER)
|
|
|
|
// This branch defines a pthread-like API, bli_pthread_*(), and implements it
|
|
// in terms of Windows API calls.
|
|
|
|
int bli_pthread_mutex_init
|
|
(
|
|
bli_pthread_mutex_t* mutex,
|
|
const bli_pthread_mutexattr_t* attr
|
|
)
|
|
{
|
|
if ( attr ) return EINVAL;
|
|
InitializeSRWLock( mutex );
|
|
return 0;
|
|
}
|
|
|
|
int bli_pthread_mutex_destroy
|
|
(
|
|
bli_pthread_mutex_t* mutex
|
|
)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
int bli_pthread_mutex_lock
|
|
(
|
|
bli_pthread_mutex_t* mutex
|
|
)
|
|
{
|
|
AcquireSRWLockExclusive( mutex );
|
|
return 0;
|
|
}
|
|
|
|
int bli_pthread_mutex_trylock
|
|
(
|
|
bli_pthread_mutex_t* mutex
|
|
)
|
|
{
|
|
return TryAcquireSRWLockExclusive( mutex ) ? 0 : EBUSY;
|
|
}
|
|
|
|
int bli_pthread_mutex_unlock
|
|
(
|
|
bli_pthread_mutex_t* mutex
|
|
)
|
|
{
|
|
ReleaseSRWLockExclusive( mutex );
|
|
return 0;
|
|
}
|
|
|
|
static BOOL bli_init_once_wrapper
|
|
(
|
|
bli_pthread_once_t* once,
|
|
void* param,
|
|
void** context
|
|
)
|
|
{
|
|
( void )once;
|
|
( void )context;
|
|
typedef void (*callback)( void );
|
|
((callback)param)();
|
|
return TRUE;
|
|
}
|
|
|
|
void bli_pthread_once
|
|
(
|
|
bli_pthread_once_t* once,
|
|
void (*init)(void)
|
|
)
|
|
{
|
|
InitOnceExecuteOnce( once, bli_init_once_wrapper, init, NULL );
|
|
}
|
|
|
|
int bli_pthread_cond_init
|
|
(
|
|
bli_pthread_cond_t* cond,
|
|
const bli_pthread_condattr_t* attr
|
|
)
|
|
{
|
|
if ( attr ) return EINVAL;
|
|
InitializeConditionVariable( cond );
|
|
return 0;
|
|
}
|
|
|
|
int bli_pthread_cond_destroy
|
|
(
|
|
bli_pthread_cond_t* cond
|
|
)
|
|
{
|
|
( void )cond;
|
|
return 0;
|
|
}
|
|
|
|
int bli_pthread_cond_wait
|
|
(
|
|
bli_pthread_cond_t* cond,
|
|
bli_pthread_mutex_t* mutex
|
|
)
|
|
{
|
|
if ( !SleepConditionVariableSRW( cond, mutex, INFINITE, 0 ) ) return EAGAIN;
|
|
return 0;
|
|
}
|
|
|
|
int bli_pthread_cond_broadcast
|
|
(
|
|
bli_pthread_cond_t* cond
|
|
)
|
|
{
|
|
WakeAllConditionVariable( cond );
|
|
return 0;
|
|
}
|
|
|
|
typedef struct
|
|
{
|
|
void* (*start_routine)( void* );
|
|
void* param;
|
|
void** retval;
|
|
|
|
} bli_thread_param;
|
|
|
|
static DWORD bli_thread_func
|
|
(
|
|
void* param_
|
|
)
|
|
{
|
|
bli_thread_param* param = param_;
|
|
*param->retval = param->start_routine( param->param );
|
|
return 0;
|
|
}
|
|
|
|
int bli_pthread_create
|
|
(
|
|
bli_pthread_t* thread,
|
|
const bli_pthread_attr_t* attr,
|
|
void* (*start_routine)(void*),
|
|
void* arg
|
|
)
|
|
{
|
|
if ( attr ) return EINVAL;
|
|
bli_thread_param param = { start_routine, arg, &thread->retval };
|
|
thread->handle = CreateThread( NULL, 0, bli_thread_func, ¶m, 0, NULL );
|
|
if ( !thread->handle ) return EAGAIN;
|
|
return 0;
|
|
}
|
|
|
|
int bli_pthread_join
|
|
(
|
|
bli_pthread_t thread,
|
|
void** retval
|
|
)
|
|
{
|
|
if ( !WaitForSingleObject( thread.handle, INFINITE ) ) return EAGAIN;
|
|
if ( retval ) *retval = thread.retval;
|
|
return 0;
|
|
}
|
|
|
|
#else // !defined(_MSC_VER)
|
|
|
|
// This branch defines a pthread-like API, bli_pthread_*(), and implements it
// in terms of the corresponding pthread_*() types, macros, and function calls.
// This branch is compiled for Linux and other non-Windows environments where
// we assume that *some* implementation of pthreads is provided (although it
// may lack barriers--see below).
|
|
|
|
// -- pthread_create(), pthread_join() --
|
|
|
|
int bli_pthread_create
|
|
(
|
|
bli_pthread_t* thread,
|
|
const bli_pthread_attr_t* attr,
|
|
void* (*start_routine)(void*),
|
|
void* arg
|
|
)
|
|
{
|
|
return pthread_create( thread, attr, start_routine, arg );
|
|
}
|
|
|
|
int bli_pthread_join
|
|
(
|
|
bli_pthread_t thread,
|
|
void** retval
|
|
)
|
|
{
|
|
return pthread_join( thread, retval );
|
|
}
|
|
|
|
// -- pthread_mutex_*() --
|
|
|
|
int bli_pthread_mutex_init
|
|
(
|
|
bli_pthread_mutex_t* mutex,
|
|
const bli_pthread_mutexattr_t* attr
|
|
)
|
|
{
|
|
return pthread_mutex_init( mutex, attr );
|
|
}
|
|
|
|
int bli_pthread_mutex_destroy
|
|
(
|
|
bli_pthread_mutex_t* mutex
|
|
)
|
|
{
|
|
return pthread_mutex_destroy( mutex );
|
|
}
|
|
|
|
int bli_pthread_mutex_lock
|
|
(
|
|
bli_pthread_mutex_t* mutex
|
|
)
|
|
{
|
|
return pthread_mutex_lock( mutex );
|
|
}
|
|
|
|
int bli_pthread_mutex_trylock
|
|
(
|
|
bli_pthread_mutex_t* mutex
|
|
)
|
|
{
|
|
return pthread_mutex_trylock( mutex );
|
|
}
|
|
|
|
int bli_pthread_mutex_unlock
|
|
(
|
|
bli_pthread_mutex_t* mutex
|
|
)
|
|
{
|
|
return pthread_mutex_unlock( mutex );
|
|
}
|
|
|
|
// -- pthread_cond_*() --
|
|
|
|
int bli_pthread_cond_init
|
|
(
|
|
bli_pthread_cond_t* cond,
|
|
const bli_pthread_condattr_t* attr
|
|
)
|
|
{
|
|
return pthread_cond_init( cond, attr );
|
|
}
|
|
|
|
int bli_pthread_cond_destroy
|
|
(
|
|
bli_pthread_cond_t* cond
|
|
)
|
|
{
|
|
return pthread_cond_destroy( cond );
|
|
}
|
|
|
|
int bli_pthread_cond_wait
|
|
(
|
|
bli_pthread_cond_t* cond,
|
|
bli_pthread_mutex_t* mutex
|
|
)
|
|
{
|
|
return pthread_cond_wait( cond, mutex );
|
|
}
|
|
|
|
int bli_pthread_cond_broadcast
|
|
(
|
|
bli_pthread_cond_t* cond
|
|
)
|
|
{
|
|
return pthread_cond_broadcast( cond );
|
|
}
|
|
|
|
// -- pthread_once() --
|
|
|
|
void bli_pthread_once
|
|
(
|
|
bli_pthread_once_t* once,
|
|
void (*init)(void)
|
|
)
|
|
{
|
|
pthread_once( once, init );
|
|
}
|
|
|
|
#endif // _MSC_VER
|
|
|
|
|
|
// -- pthread_barrier_*() --
|
|
|
|
#if defined(__APPLE__) || defined(_MSC_VER)
|
|
|
|
// For OS X and Windows, we define barriers ourselves in terms of the rest
|
|
// of the API, though for slightly different reasons: For Windows, we must
|
|
// define barriers because we are defining *everything* from scratch. For
|
|
// OS X, we must define barriers because Apple chose to omit barriers from
|
|
// their implementation of POSIX threads (since barriers are actually
|
|
// optional to the POSIX standard).
|
|
|
|
int bli_pthread_barrier_init
|
|
(
|
|
bli_pthread_barrier_t* barrier,
|
|
const bli_pthread_barrierattr_t* attr,
|
|
unsigned int count )
|
|
{
|
|
if ( attr ) return EINVAL;
|
|
if ( count == 0 ) return EINVAL;
|
|
|
|
int err;
|
|
if ( (err = bli_pthread_mutex_init( &barrier->mutex, 0 )) != 0 ) return err;
|
|
if ( (err = bli_pthread_cond_init( &barrier->cond, 0 )) != 0 )
|
|
{
|
|
bli_pthread_mutex_destroy( &barrier->mutex );
|
|
return err;
|
|
}
|
|
barrier->tripCount = count;
|
|
barrier->count = 0;
|
|
|
|
return 0;
|
|
}
|
|
|
|
int bli_pthread_barrier_destroy
|
|
(
|
|
bli_pthread_barrier_t *barrier
|
|
)
|
|
{
|
|
bli_pthread_cond_destroy( &barrier->cond );
|
|
bli_pthread_mutex_destroy( &barrier->mutex );
|
|
return 0;
|
|
}
|
|
|
|
int bli_pthread_barrier_wait
|
|
(
|
|
bli_pthread_barrier_t *barrier
|
|
)
|
|
{
|
|
bli_pthread_mutex_lock( &barrier->mutex );
|
|
++(barrier->count);
|
|
if ( barrier->count >= barrier->tripCount )
|
|
{
|
|
barrier->count = 0;
|
|
bli_pthread_cond_broadcast( &barrier->cond );
|
|
bli_pthread_mutex_unlock( &barrier->mutex );
|
|
return 1;
|
|
}
|
|
else
|
|
{
|
|
bli_pthread_cond_wait( &barrier->cond, &(barrier->mutex) );
|
|
bli_pthread_mutex_unlock( &barrier->mutex );
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
#else // !( defined(__APPLE__) || defined(_MSC_VER) )
|
|
|
|
// Linux environments implement the pthread_barrier* sub-API. So, if we're
|
|
// on Linux, we can simply call those functions, just as we did before for
|
|
// the other functions.
|
|
|
|
int bli_pthread_barrier_init
|
|
(
|
|
bli_pthread_barrier_t* barrier,
|
|
const bli_pthread_barrierattr_t* attr,
|
|
unsigned int count
|
|
)
|
|
{
|
|
return pthread_barrier_init( barrier, attr, count );
|
|
}
|
|
|
|
int bli_pthread_barrier_destroy
|
|
(
|
|
bli_pthread_barrier_t* barrier
|
|
)
|
|
{
|
|
return pthread_barrier_destroy( barrier );
|
|
}
|
|
|
|
int bli_pthread_barrier_wait
|
|
(
|
|
bli_pthread_barrier_t* barrier
|
|
)
|
|
{
|
|
return pthread_barrier_wait( barrier );
|
|
}
|
|
|
|
#endif // defined(__APPLE__) || defined(_MSC_VER)
|