mirror of
https://github.com/amd/blis.git
synced 2026-05-11 17:50:00 +00:00
Merge remote-tracking branch 'publicrepo/master'
This commit is contained in:
@@ -170,8 +170,8 @@ ifeq ($(THREADING_MODEL),auto)
|
||||
THREADING_MODEL := omp
|
||||
endif
|
||||
ifeq ($(THREADING_MODEL),omp)
|
||||
CTHREADFLAGS := -openmp
|
||||
LDFLAGS += -openmp
|
||||
CTHREADFLAGS := -fopenmp
|
||||
LDFLAGS += -fopenmp
|
||||
endif
|
||||
ifeq ($(THREADING_MODEL),pthreads)
|
||||
CTHREADFLAGS := -pthread
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -43,6 +44,7 @@ void bli_packv_init
|
||||
)
|
||||
{
|
||||
// The purpose of packv_init() is to initialize an object P so that
|
||||
|
||||
// a source object A can be packed into P via one of the packv
|
||||
// implementations. This initialization includes acquiring a suitable
|
||||
// block of memory from the memory allocator, if such a block of memory
|
||||
@@ -132,15 +134,17 @@ void bli_packv_init_pack
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
num_t dt = bli_obj_datatype( *c );
|
||||
dim_t dim_c = bli_obj_vector_dim( *c );
|
||||
dim_t bmult = bli_cntx_get_blksz_def_dt( dt, bmult_id, cntx );
|
||||
num_t dt = bli_obj_datatype( *c );
|
||||
dim_t dim_c = bli_obj_vector_dim( *c );
|
||||
dim_t bmult = bli_cntx_get_blksz_def_dt( dt, bmult_id, cntx );
|
||||
|
||||
mem_t* mem_p;
|
||||
dim_t m_p_pad;
|
||||
siz_t size_p;
|
||||
inc_t rs_p, cs_p;
|
||||
void* buf;
|
||||
membrk_t* membrk = bli_cntx_membrk( cntx );
|
||||
|
||||
mem_t* mem_p;
|
||||
dim_t m_p_pad;
|
||||
siz_t size_p;
|
||||
inc_t rs_p, cs_p;
|
||||
void* buf;
|
||||
|
||||
|
||||
// We begin by copying the basic fields of c.
|
||||
@@ -170,8 +174,9 @@ void bli_packv_init_pack
|
||||
{
|
||||
// If the mem_t object of p has not yet been allocated, then acquire
|
||||
// a memory block suitable for a vector.
|
||||
bli_mem_acquire_v( size_p,
|
||||
mem_p );
|
||||
bli_membrk_acquire_v( membrk,
|
||||
size_p,
|
||||
mem_p );
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -179,10 +184,11 @@ void bli_packv_init_pack
|
||||
// re-acquire the memory so there is sufficient space.
|
||||
if ( bli_mem_size( mem_p ) < size_p )
|
||||
{
|
||||
bli_mem_release( mem_p );
|
||||
bli_membrk_release( mem_p );
|
||||
|
||||
bli_mem_acquire_v( size_p,
|
||||
mem_p );
|
||||
bli_membrk_acquire_v( membrk,
|
||||
size_p,
|
||||
mem_p );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -218,7 +224,7 @@ void bli_packv_release
|
||||
)
|
||||
{
|
||||
if ( !bli_cntl_is_noop( cntl ) )
|
||||
bli_obj_release_pack( p );
|
||||
bli_obj_release_pack( p );
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -49,6 +50,9 @@ void bli_packm_cntx_init( cntx_t* cntx )
|
||||
bli_gks_cntx_set_l1v_ker( BLIS_SCALV_KER, cntx );
|
||||
bli_gks_cntx_set_l1v_ker( BLIS_SCAL2V_KER, cntx );
|
||||
bli_gks_cntx_set_l1v_ker( BLIS_SETV_KER, cntx );
|
||||
|
||||
// Initialize the context with the global membrk object.
|
||||
bli_cntx_set_membrk( bli_mem_global_membrk(), cntx );
|
||||
}
|
||||
|
||||
void bli_packm_cntx_finalize( cntx_t* cntx )
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -202,23 +203,25 @@ void bli_packm_init_pack( invdiag_t invert_diag,
|
||||
obj_t* p,
|
||||
cntx_t* cntx )
|
||||
{
|
||||
num_t dt = bli_obj_datatype( *c );
|
||||
trans_t transc = bli_obj_onlytrans_status( *c );
|
||||
dim_t m_c = bli_obj_length( *c );
|
||||
dim_t n_c = bli_obj_width( *c );
|
||||
dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_m, cntx );
|
||||
dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_m, cntx );
|
||||
dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_n, cntx );
|
||||
dim_t bmult_n_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_n, cntx );
|
||||
num_t dt = bli_obj_datatype( *c );
|
||||
trans_t transc = bli_obj_onlytrans_status( *c );
|
||||
dim_t m_c = bli_obj_length( *c );
|
||||
dim_t n_c = bli_obj_width( *c );
|
||||
dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_m, cntx );
|
||||
dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_m, cntx );
|
||||
dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_n, cntx );
|
||||
dim_t bmult_n_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_n, cntx );
|
||||
|
||||
mem_t* mem_p;
|
||||
dim_t m_p, n_p;
|
||||
dim_t m_p_pad, n_p_pad;
|
||||
siz_t size_p;
|
||||
siz_t elem_size_p;
|
||||
inc_t rs_p, cs_p;
|
||||
inc_t is_p;
|
||||
void* buf;
|
||||
membrk_t* membrk = bli_cntx_get_membrk( cntx );
|
||||
|
||||
mem_t* mem_p;
|
||||
dim_t m_p, n_p;
|
||||
dim_t m_p_pad, n_p_pad;
|
||||
siz_t size_p;
|
||||
siz_t elem_size_p;
|
||||
inc_t rs_p, cs_p;
|
||||
inc_t is_p;
|
||||
void* buf;
|
||||
|
||||
|
||||
// We begin by copying the basic fields of c. We do NOT copy the
|
||||
@@ -549,9 +552,10 @@ void bli_packm_init_pack( invdiag_t invert_diag,
|
||||
{
|
||||
// If the mem_t object of p has not yet been allocated, then acquire
|
||||
// a memory block of type pack_buf_type.
|
||||
bli_mem_acquire_m( size_p,
|
||||
pack_buf_type,
|
||||
mem_p );
|
||||
bli_membrk_acquire_m( membrk,
|
||||
size_p,
|
||||
pack_buf_type,
|
||||
mem_p );
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -562,10 +566,11 @@ void bli_packm_init_pack( invdiag_t invert_diag,
|
||||
// pack_buf_type value.
|
||||
if ( bli_mem_size( mem_p ) < size_p )
|
||||
{
|
||||
bli_mem_release( mem_p );
|
||||
bli_mem_acquire_m( size_p,
|
||||
pack_buf_type,
|
||||
mem_p );
|
||||
bli_membrk_release( mem_p );
|
||||
bli_membrk_acquire_m( membrk,
|
||||
size_p,
|
||||
pack_buf_type,
|
||||
mem_p );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -582,7 +587,7 @@ void bli_packm_release( obj_t* p,
|
||||
packm_t* cntl )
|
||||
{
|
||||
if ( !bli_cntl_is_noop( cntl ) )
|
||||
bli_obj_release_pack( p );
|
||||
bli_obj_release_pack( p );
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -57,6 +58,7 @@ typedef struct cntx_s
|
||||
pack_t schema_b;
|
||||
pack_t schema_c;
|
||||
|
||||
membrk_t* membrk;
|
||||
} cntx_t;
|
||||
*/
|
||||
|
||||
@@ -116,66 +118,75 @@ typedef struct cntx_s
|
||||
\
|
||||
( (cntx)->schema_c )
|
||||
|
||||
// Query the membrk field of the context addressed by cntx.
#define bli_cntx_membrk( cntx ) \
\
	( (cntx)->membrk )
|
||||
|
||||
// cntx_t modification (fields only)
|
||||
|
||||
// Store _blkszs in the blkszs field of the context addressed by cntx_p.
// The value argument is evaluated exactly once.
#define bli_cntx_set_blkszs_buf( _blkszs, cntx_p ) \
{ \
	(cntx_p)->blkszs = _blkszs; \
}
|
||||
|
||||
// Store _bmults in the bmults field of the context addressed by cntx_p.
// The value argument is evaluated exactly once.
#define bli_cntx_set_bmults_buf( _bmults, cntx_p ) \
{ \
	(cntx_p)->bmults = _bmults; \
}
|
||||
|
||||
// Store _l3_vir_ukrs in the l3_vir_ukrs field of the context addressed by
// cntx_p. The value argument is evaluated exactly once.
#define bli_cntx_set_l3_vir_ukrs_buf( _l3_vir_ukrs, cntx_p ) \
{ \
	(cntx_p)->l3_vir_ukrs = _l3_vir_ukrs; \
}
|
||||
|
||||
// Store _l3_nat_ukrs in the l3_nat_ukrs field of the context addressed by
// cntx_p. The value argument is evaluated exactly once.
#define bli_cntx_set_l3_nat_ukrs_buf( _l3_nat_ukrs, cntx_p ) \
{ \
	(cntx_p)->l3_nat_ukrs = _l3_nat_ukrs; \
}
|
||||
|
||||
// Store _l3_nat_ukrs_prefs in the l3_nat_ukrs_prefs field of the context
// addressed by cntx_p. The value argument is evaluated exactly once.
#define bli_cntx_set_l3_nat_ukrs_prefs_buf( _l3_nat_ukrs_prefs, cntx_p ) \
{ \
	(cntx_p)->l3_nat_ukrs_prefs = _l3_nat_ukrs_prefs; \
}
|
||||
|
||||
// Store _l1f_kers in the l1f_kers field of the context addressed by cntx_p.
// The value argument is evaluated exactly once.
#define bli_cntx_set_l1f_kers_buf( _l1f_kers, cntx_p ) \
{ \
	(cntx_p)->l1f_kers = _l1f_kers; \
}
|
||||
|
||||
// Store _l1v_kers in the l1v_kers field of the context addressed by cntx_p.
// The value argument is evaluated exactly once.
#define bli_cntx_set_l1v_kers_buf( _l1v_kers, cntx_p ) \
{ \
	(cntx_p)->l1v_kers = _l1v_kers; \
}
|
||||
|
||||
// Store _packm_ukrs in the packm_ukrs field of the context addressed by
// cntx_p. The value argument is evaluated exactly once.
#define bli_cntx_set_packm_ukrs( _packm_ukrs, cntx_p ) \
{ \
	(cntx_p)->packm_ukrs = _packm_ukrs; \
}
|
||||
|
||||
// Store _method in the method field of the context addressed by cntx_p.
// The value argument is evaluated exactly once.
#define bli_cntx_set_method( _method, cntx_p ) \
{ \
	(cntx_p)->method = _method; \
}
|
||||
|
||||
// Store _schema_a in the schema_a field of the context addressed by cntx_p.
// The value argument is evaluated exactly once.
#define bli_cntx_set_schema_a( _schema_a, cntx_p ) \
{ \
	(cntx_p)->schema_a = _schema_a; \
}
|
||||
|
||||
// Store _schema_b in the schema_b field of the context addressed by cntx_p.
// The value argument is evaluated exactly once.
#define bli_cntx_set_schema_b( _schema_b, cntx_p ) \
{ \
	(cntx_p)->schema_b = _schema_b; \
}
|
||||
|
||||
// Store _schema_c in the schema_c field of the context addressed by cntx_p.
// The value argument is evaluated exactly once.
#define bli_cntx_set_schema_c( _schema_c, cntx_p ) \
{ \
	(cntx_p)->schema_c = _schema_c; \
}
|
||||
|
||||
// Store _membrk in the membrk field of the context addressed by cntx_p.
#define bli_cntx_set_membrk( _membrk, cntx_p ) \
{ \
	(cntx_p)->membrk = _membrk; \
}
|
||||
|
||||
// cntx_t query (complex)
|
||||
@@ -264,6 +275,11 @@ typedef struct cntx_s
|
||||
\
|
||||
bli_cntx_schema_b( cntx )
|
||||
|
||||
// "Complex" query wrapper: expands to the field-level query
// bli_cntx_membrk() applied to the same context.
#define bli_cntx_get_membrk( cntx ) \
\
	bli_cntx_membrk( cntx )
|
||||
|
||||
|
||||
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -38,207 +39,15 @@
|
||||
pthread_mutex_t mem_manager_mutex = PTHREAD_MUTEX_INITIALIZER;
|
||||
#endif
|
||||
|
||||
// Declare one memory pool structure for each block size/shape we want to
|
||||
// be able to allocate.
|
||||
static pool_t pools[3];
|
||||
|
||||
|
||||
static membrk_t global_membrk;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_mem_acquire_m( siz_t req_size,
|
||||
packbuf_t buf_type,
|
||||
mem_t* mem )
|
||||
membrk_t* bli_mem_global_membrk( void )
|
||||
{
|
||||
pool_t* pool;
|
||||
pblk_t* pblk;
|
||||
dim_t pi;
|
||||
siz_t block_size;
|
||||
|
||||
// Make sure the API is initialized.
|
||||
bli_mem_init();
|
||||
|
||||
if ( buf_type == BLIS_BUFFER_FOR_GEN_USE )
|
||||
{
|
||||
// For general-use buffer requests, such as those used by level-2
|
||||
// operations, dynamically allocating memory is sufficient.
|
||||
void* buf_sys = bli_malloc_pool( req_size );
|
||||
|
||||
// Initialize the mem_t object with:
|
||||
// - the address of the memory block,
|
||||
// - the buffer type (a packbuf_t value), and
|
||||
// - the size of the requested region.
|
||||
// NOTE: We do not initialize the pool field since this block did not
|
||||
// come from a memory pool.
|
||||
bli_mem_set_buffer( buf_sys, mem );
|
||||
bli_mem_set_buf_sys( buf_sys, mem );
|
||||
bli_mem_set_buf_type( buf_type, mem );
|
||||
bli_mem_set_size( req_size, mem );
|
||||
}
|
||||
else
|
||||
{
|
||||
// This branch handles cases where the memory block needs to come
|
||||
// from an internal memory pool, in which blocks are allocated once
|
||||
// and then recycled.
|
||||
|
||||
// Map the requested packed buffer type to a zero-based index, which
|
||||
// we then use to select the corresponding memory pool.
|
||||
pi = bli_packbuf_index( buf_type );
|
||||
pool = &pools[ pi ];
|
||||
|
||||
// Unconditionally perform error checking on the memory pool.
|
||||
{
|
||||
err_t e_val;
|
||||
|
||||
// Make sure that the requested matrix size fits inside of a block
|
||||
// of the corresponding pool. If it does not, the pool was somehow
|
||||
// initialized improperly.
|
||||
e_val = bli_check_requested_block_size_for_pool( req_size, pool );
|
||||
bli_check_error_code( e_val );
|
||||
}
|
||||
|
||||
// Extract the address of the pblk_t struct within the mem_t.
|
||||
pblk = bli_mem_pblk( mem );
|
||||
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
_Pragma( "omp critical (mem)" )
|
||||
#endif
|
||||
#ifdef BLIS_ENABLE_PTHREADS
|
||||
pthread_mutex_lock( &mem_manager_mutex );
|
||||
#endif
|
||||
// BEGIN CRITICAL SECTION
|
||||
{
|
||||
|
||||
// Checkout a block from the pool. If the pool is exhausted,
|
||||
// either because it is still empty or because all blocks have
|
||||
// been checked out already, additional blocks will be allocated
|
||||
// automatically, as-needed. Note that the addresses are stored
|
||||
// directly into the mem_t struct since pblk is the address of
|
||||
// the struct's pblk_t field.
|
||||
bli_pool_checkout_block( pblk, pool );
|
||||
|
||||
// Query the size of the blocks in the pool so we can store it in
|
||||
// the mem_t object. At this point, it is guaranteed to be at
|
||||
// least as large as req_size. (NOTE: We must perform the query
|
||||
// within the critical section to ensure that the pool hasn't
|
||||
// changed, as unlikely as that would be.)
|
||||
block_size = bli_pool_block_size( pool );
|
||||
|
||||
}
|
||||
// END CRITICAL SECTION
|
||||
|
||||
#ifdef BLIS_ENABLE_PTHREADS
|
||||
pthread_mutex_unlock( &mem_manager_mutex );
|
||||
#endif
|
||||
|
||||
// Initialize the mem_t object with:
|
||||
// - the buffer type (a packbuf_t value),
|
||||
// - the address of the memory pool to which it belongs, and
|
||||
// - the size of the contiguous memory block (NOT the size of the
|
||||
// requested region).
|
||||
// The actual addresses (system and aligned) are already stored in
|
||||
// the mem_t struct's pblk_t field
|
||||
bli_mem_set_buf_type( buf_type, mem );
|
||||
bli_mem_set_pool( pool, mem );
|
||||
bli_mem_set_size( block_size, mem );
|
||||
}
|
||||
return &global_membrk;
|
||||
}
|
||||
|
||||
|
||||
// Release the memory tracked by a mem_t and then clear the mem_t so that it
// appears unallocated.
// - If the buffer type is BLIS_BUFFER_FOR_GEN_USE, the system buffer is
//   passed to bli_free_pool().
// - Otherwise the pblk_t is either checked back into its pool or, if the
//   pool's current block size differs from the size recorded in the mem_t,
//   freed via bli_pool_free_block().
void bli_mem_release( mem_t* mem )
{
	packbuf_t buf_type;
	pool_t*   pool;
	pblk_t*   pblk;
	siz_t     block_size_cur;
	siz_t     block_size_prev;

	// Make sure the API is initialized.
	bli_mem_init();

	// Extract the buffer type so we know what kind of memory was allocated.
	buf_type = bli_mem_buf_type( mem );

	if ( buf_type == BLIS_BUFFER_FOR_GEN_USE )
	{
		void* buf_sys = bli_mem_buf_sys( mem );

		// For general-use buffers, we dynamically allocate memory, and so
		// here we need to free.
		bli_free_pool( buf_sys );
	}
	else
	{
		// Extract the address of the pool from which the memory was
		// allocated.
		pool = bli_mem_pool( mem );

		// Extract the address of the pblk_t struct within the mem_t struct.
		pblk = bli_mem_pblk( mem );

		// Query the size of the blocks that were in the pool at the time
		// the pblk_t was checked out. (This is used below, in the critical
		// section.)
		block_size_prev = bli_mem_size( mem );

#ifdef BLIS_ENABLE_OPENMP
		_Pragma( "omp critical (mem)" )
#endif
#ifdef BLIS_ENABLE_PTHREADS
		pthread_mutex_lock( &mem_manager_mutex );
#endif

		// BEGIN CRITICAL SECTION
		{

			// Query the size of the blocks currently in the pool.
			block_size_cur = bli_pool_block_size( pool );

			// If the block size of the pool has changed since the pblk_t
			// was checked out, then we need to free the pblk_t rather
			// than check it back in. Why? Because the pool's block size
			// has (most likely) increased to meet changing needs (example:
			// larger cache blocksizes). Thus, the current pblk_t's smaller
			// allocated size is of no use anymore.
			if ( block_size_cur != block_size_prev )
			{
				// Free the pblk_t using the appropriate function in the
				// pool API.
				bli_pool_free_block( pblk );
			}
			else
			{
				// Check the block back into the pool.
				bli_pool_checkin_block( pblk, pool );
			}

		}
		// END CRITICAL SECTION

#ifdef BLIS_ENABLE_PTHREADS
		pthread_mutex_unlock( &mem_manager_mutex );
#endif
	}

	// Clear the mem_t object so that it appears unallocated. This clears:
	// - the pblk_t struct's fields (ie: the buffer addresses)
	// - the pool field
	// - the size field
	// NOTE: We do not clear the buf_type field since there is no
	// "uninitialized" value for packbuf_t.
	bli_mem_clear( mem );
}
|
||||
|
||||
|
||||
void bli_mem_acquire_v( siz_t req_size,
|
||||
mem_t* mem )
|
||||
{
|
||||
bli_mem_acquire_m( req_size,
|
||||
BLIS_BUFFER_FOR_GEN_USE,
|
||||
mem );
|
||||
}
|
||||
|
||||
|
||||
siz_t bli_mem_pool_size( packbuf_t buf_type )
|
||||
{
|
||||
siz_t r_val;
|
||||
@@ -251,15 +60,15 @@ siz_t bli_mem_pool_size( packbuf_t buf_type )
|
||||
}
|
||||
else
|
||||
{
|
||||
dim_t index;
|
||||
dim_t pool_index;
|
||||
pool_t* pool;
|
||||
|
||||
// Acquire the pointer to the pool corresponding to the buf_type
|
||||
// provided.
|
||||
index = bli_packbuf_index( buf_type );
|
||||
pool = &(pools[index]);
|
||||
pool_index = bli_packbuf_index( buf_type );
|
||||
pool = bli_membrk_pool( pool_index, &global_membrk );
|
||||
|
||||
// Compute the pool "size" as the product of the block size
|
||||
// Compute the pool "size" as the product of the block size
|
||||
// and the number of blocks in the pool.
|
||||
r_val = bli_pool_block_size( pool ) *
|
||||
bli_pool_num_blocks( pool );
|
||||
@@ -300,8 +109,8 @@ void bli_mem_init( void )
|
||||
// critical section.
|
||||
if ( bli_mem_is_init == FALSE )
|
||||
{
|
||||
// Initialize the memory pools.
|
||||
bli_mem_init_pools( &cntx );
|
||||
// Initialize the global membrk_t object and its memory pools.
|
||||
bli_membrk_init( &cntx, &global_membrk );
|
||||
|
||||
// After initialization, mark the API as initialized.
|
||||
bli_mem_is_init = TRUE;
|
||||
@@ -332,16 +141,16 @@ void bli_mem_reinit( cntx_t* cntx )
|
||||
// initialized (unlikely), we emulate the body of bli_mem_init().
|
||||
if ( bli_mem_is_init == FALSE )
|
||||
{
|
||||
// Initialize the memory pools.
|
||||
bli_mem_init_pools( cntx );
|
||||
// Initialize the global membrk_t object and its memory pools.
|
||||
bli_membrk_init( cntx, &global_membrk );
|
||||
|
||||
// After initialization, mark the API as initialized.
|
||||
bli_mem_is_init = TRUE;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Reinitialize the memory pools.
|
||||
bli_mem_reinit_pools( cntx );
|
||||
// Reinitialize the global membrk_t object's memory pools.
|
||||
bli_membrk_reinit_pools( cntx, &global_membrk );
|
||||
}
|
||||
}
|
||||
// END CRITICAL SECTION
|
||||
@@ -373,8 +182,8 @@ void bli_mem_finalize( void )
|
||||
// critical section.
|
||||
if ( bli_mem_is_init == TRUE )
|
||||
{
|
||||
// Finalize the memory pools.
|
||||
bli_mem_finalize_pools();
|
||||
// Finalize the global membrk_t object and its memory pools.
|
||||
bli_membrk_finalize( &global_membrk );
|
||||
|
||||
// After finalization, mark the API as uninitialized.
|
||||
bli_mem_is_init = FALSE;
|
||||
@@ -392,275 +201,3 @@ bool_t bli_mem_is_initialized( void )
|
||||
return bli_mem_is_init;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
// Initialize the three file-scope memory pools (A blocks, B panels, C panels)
// as empty pools whose block sizes are derived from the given context.
void bli_mem_init_pools( cntx_t* cntx )
{
	// Map each of the packbuf_t values to an index starting at zero.
	const dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK );
	const dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL );
	const dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL );

	const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE;

	// Alias the pool addresses to convenient identifiers.
	pool_t* pool_a = &pools[ index_a ];
	pool_t* pool_b = &pools[ index_b ];
	pool_t* pool_c = &pools[ index_c ];

	// Start with empty pools.
	const dim_t num_blocks_a = 0;
	const dim_t num_blocks_b = 0;
	const dim_t num_blocks_c = 0;

	siz_t block_size_a = 0;
	siz_t block_size_b = 0;
	siz_t block_size_c = 0;

	// Determine the block size for each memory pool.
	bli_mem_compute_pool_block_sizes( &block_size_a,
	                                  &block_size_b,
	                                  &block_size_c,
	                                  cntx );

	// Initialize the memory pools for A, B, and C.
	bli_pool_init( num_blocks_a, block_size_a, align_size, pool_a );
	bli_pool_init( num_blocks_b, block_size_b, align_size, pool_b );
	bli_pool_init( num_blocks_c, block_size_c, align_size, pool_c );
}
|
||||
|
||||
// Recompute the context-implied block sizes for the A, B, and C pools and
// hand them, together with each pool's current block count, to
// bli_pool_reinit_if(), which decides whether a pool must be rebuilt.
void bli_mem_reinit_pools( cntx_t* cntx )
{
	// Map each of the packbuf_t values to an index starting at zero.
	const dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK );
	const dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL );
	const dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL );

	const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE;

	// Alias the pool addresses to convenient identifiers.
	pool_t* pool_a = &pools[ index_a ];
	pool_t* pool_b = &pools[ index_b ];
	pool_t* pool_c = &pools[ index_c ];

	// Query the number of blocks currently allocated in each pool.
	const dim_t num_blocks_a = bli_pool_num_blocks( pool_a );
	const dim_t num_blocks_b = bli_pool_num_blocks( pool_b );
	const dim_t num_blocks_c = bli_pool_num_blocks( pool_c );

	siz_t block_size_a_new = 0;
	siz_t block_size_b_new = 0;
	siz_t block_size_c_new = 0;

	// Determine the context-implied block size needed for each pool.
	bli_mem_compute_pool_block_sizes( &block_size_a_new,
	                                  &block_size_b_new,
	                                  &block_size_c_new,
	                                  cntx );

	// Reinitialize the pool, but only if one of the parameters has
	// changed in such a way that reinitialization would be required.
	// In this case, the align_size is constant, as is num_blocks, so
	// what this actually boils down to is that reinitialization of a
	// pool occurs only if the block size for that pool has increased.
	bli_pool_reinit_if( num_blocks_a, block_size_a_new, align_size, pool_a );
	bli_pool_reinit_if( num_blocks_b, block_size_b_new, align_size, pool_b );
	bli_pool_reinit_if( num_blocks_c, block_size_c_new, align_size, pool_c );
}
|
||||
|
||||
void bli_mem_finalize_pools( void )
|
||||
{
|
||||
// Map each of the packbuf_t values to an index starting at zero.
|
||||
dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK );
|
||||
dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL );
|
||||
dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL );
|
||||
|
||||
// Alias the pool addresses to convenient identifiers.
|
||||
pool_t* pool_a = &pools[ index_a ];
|
||||
pool_t* pool_b = &pools[ index_b ];
|
||||
pool_t* pool_c = &pools[ index_c ];
|
||||
|
||||
// Finalize the memory pools for A, B, and C.
|
||||
bli_pool_finalize( pool_a );
|
||||
bli_pool_finalize( pool_b );
|
||||
bli_pool_finalize( pool_c );
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_mem_compute_pool_block_sizes( siz_t* bs_a,
|
||||
siz_t* bs_b,
|
||||
siz_t* bs_c,
|
||||
cntx_t* cntx )
|
||||
{
|
||||
const ind_t im = bli_cntx_get_ind_method( cntx );
|
||||
|
||||
siz_t bs_cand_a = 0;
|
||||
siz_t bs_cand_b = 0;
|
||||
siz_t bs_cand_c = 0;
|
||||
|
||||
num_t dt;
|
||||
|
||||
// Compute pool block sizes for each datatype and find the maximum
|
||||
// size for each pool. This is done so that new pools do not need
|
||||
// to be allocated if the user switches datatypes.
|
||||
for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt )
|
||||
{
|
||||
siz_t bs_dt_a;
|
||||
siz_t bs_dt_b;
|
||||
siz_t bs_dt_c;
|
||||
|
||||
// Avoid considering induced methods for real datatypes.
|
||||
if ( bli_is_real( dt ) && im != BLIS_NAT ) continue;
|
||||
|
||||
bli_mem_compute_pool_block_sizes_dt( dt,
|
||||
&bs_dt_a,
|
||||
&bs_dt_b,
|
||||
&bs_dt_c,
|
||||
cntx );
|
||||
|
||||
bs_cand_a = bli_max( bs_dt_a, bs_cand_a );
|
||||
bs_cand_b = bli_max( bs_dt_b, bs_cand_b );
|
||||
bs_cand_c = bli_max( bs_dt_c, bs_cand_c );
|
||||
}
|
||||
|
||||
// Save the results.
|
||||
*bs_a = bs_cand_a;
|
||||
*bs_b = bs_cand_b;
|
||||
*bs_c = bs_cand_c;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_mem_compute_pool_block_sizes_dt( num_t dt,
|
||||
siz_t* bs_a,
|
||||
siz_t* bs_b,
|
||||
siz_t* bs_c,
|
||||
cntx_t* cntx )
|
||||
{
|
||||
siz_t size_dt = bli_datatype_size( dt );
|
||||
|
||||
blksz_t* mr;
|
||||
blksz_t* nr;
|
||||
|
||||
blksz_t* mc;
|
||||
blksz_t* kc;
|
||||
blksz_t* nc;
|
||||
|
||||
dim_t mr_dt;
|
||||
dim_t nr_dt;
|
||||
dim_t max_mnr_dt;
|
||||
|
||||
dim_t mc_max_dt;
|
||||
dim_t kc_max_dt;
|
||||
dim_t nc_max_dt;
|
||||
|
||||
dim_t packmr_dt;
|
||||
dim_t packnr_dt;
|
||||
dim_t max_packmnr_dt;
|
||||
|
||||
dim_t scale_num_dt;
|
||||
dim_t scale_den_dt;
|
||||
|
||||
dim_t pool_mc_dt, left_mc_dt;
|
||||
dim_t pool_nc_dt, left_nc_dt;
|
||||
dim_t pool_kc_dt;
|
||||
|
||||
//
|
||||
// Find the larger of the two register blocksizes.
|
||||
//
|
||||
|
||||
// Query the mr and nr blksz_t objects for the given method of
|
||||
// execution.
|
||||
mr = bli_cntx_get_blksz( BLIS_MR, cntx );
|
||||
nr = bli_cntx_get_blksz( BLIS_NR, cntx );
|
||||
|
||||
// Extract the mr and nr values specific to the current datatype.
|
||||
mr_dt = bli_blksz_get_def( dt, mr );
|
||||
nr_dt = bli_blksz_get_def( dt, nr );
|
||||
|
||||
// Find the maximum of mr and nr.
|
||||
max_mnr_dt = bli_max( mr_dt, nr_dt );
|
||||
|
||||
//
|
||||
// Define local maximum cache blocksizes.
|
||||
//
|
||||
|
||||
// Query the mc, kc, and nc blksz_t objects for native execution.
|
||||
mc = bli_cntx_get_blksz( BLIS_MC, cntx );
|
||||
kc = bli_cntx_get_blksz( BLIS_KC, cntx );
|
||||
nc = bli_cntx_get_blksz( BLIS_NC, cntx );
|
||||
|
||||
// Extract the maximum mc, kc, and nc values specific to the current
|
||||
// datatype.
|
||||
mc_max_dt = bli_blksz_get_max( dt, mc );
|
||||
kc_max_dt = bli_blksz_get_max( dt, kc );
|
||||
nc_max_dt = bli_blksz_get_max( dt, nc );
|
||||
|
||||
// Add max(mr,nr) to kc to make room for the nudging of kc at
|
||||
// runtime to be a multiple of mr or nr for triangular operations
|
||||
// trmm, trmm3, and trsm.
|
||||
kc_max_dt += max_mnr_dt;
|
||||
|
||||
//
|
||||
// Compute scaling factors.
|
||||
//
|
||||
|
||||
// Compute integer scaling factors (numerator and denominator) used
|
||||
// to account for situations when the packing register blocksizes are
|
||||
// larger than the regular register blocksizes.
|
||||
|
||||
// In order to compute the scaling factors, we first have to determine
|
||||
// whether ( packmr / mr ) is greater than ( packnr / nr ). This is
|
||||
// needed ONLY because the amount of space allocated for a block of A
|
||||
// and a panel of B needs to be such that MR and NR can be swapped (ie:
|
||||
// A is packed with NR and B is packed with MR). This transformation is
|
||||
// needed for right-side trsm when inducing an algorithm that (a) has
|
||||
// favorable access patterns for column-stored C and (b) allows the
|
||||
// macro-kernel to reuse the existing left-side fused gemmtrsm micro-
|
||||
// kernels. We avoid integer division by cross-multiplying:
|
||||
//
|
||||
// ( packmr / mr ) >= ( packnr / nr )
|
||||
// ( packmr / mr ) * nr >= packnr
|
||||
// packmr * nr >= packnr * mr
|
||||
//
|
||||
// So, if packmr * nr >= packnr * mr, then we will use packmr and mr as
|
||||
// our scaling factors. Otherwise, we'll use packnr and nr.
|
||||
|
||||
packmr_dt = bli_blksz_get_max( dt, mr );
|
||||
packnr_dt = bli_blksz_get_max( dt, nr );
|
||||
|
||||
if ( packmr_dt * nr_dt >=
|
||||
packnr_dt * mr_dt ) { scale_num_dt = packmr_dt;
|
||||
scale_den_dt = mr_dt; }
|
||||
else { scale_num_dt = packnr_dt;
|
||||
scale_den_dt = nr_dt; }
|
||||
|
||||
//
|
||||
// Compute pool block dimensions.
|
||||
//
|
||||
|
||||
pool_mc_dt = ( mc_max_dt * scale_num_dt ) / scale_den_dt;
|
||||
left_mc_dt = ( mc_max_dt * scale_num_dt ) % scale_den_dt;
|
||||
|
||||
pool_nc_dt = ( nc_max_dt * scale_num_dt ) / scale_den_dt;
|
||||
left_nc_dt = ( nc_max_dt * scale_num_dt ) % scale_den_dt;
|
||||
|
||||
pool_kc_dt = ( kc_max_dt );
|
||||
|
||||
if ( left_mc_dt > 0 ) pool_mc_dt += 1;
|
||||
if ( left_nc_dt > 0 ) pool_nc_dt += 1;
|
||||
|
||||
//
|
||||
// Compute pool block sizes
|
||||
//
|
||||
|
||||
// We add an extra micro-panel of space to the block sizes for A and B
|
||||
// just to be sure any pre-loading performed by the micro-kernel does
|
||||
// not cause a segmentation fault.
|
||||
max_packmnr_dt = bli_max( packmr_dt, packnr_dt );
|
||||
|
||||
*bs_a = ( pool_mc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt;
|
||||
*bs_b = ( pool_nc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt;
|
||||
*bs_c = ( pool_mc_dt ) * pool_nc_dt * size_dt;
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -32,37 +33,21 @@
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_MEM_H
|
||||
#define BLIS_MEM_H
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
membrk_t* bli_mem_global_membrk( void );
|
||||
siz_t bli_mem_pool_size( packbuf_t buf_type );
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_mem_init( void );
|
||||
void bli_mem_reinit( cntx_t* cntx );
|
||||
void bli_mem_finalize( void );
|
||||
bool_t bli_mem_is_initialized( void );
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_mem_acquire_m( siz_t req_size,
|
||||
packbuf_t buf_type,
|
||||
mem_t* mem );
|
||||
|
||||
void bli_mem_acquire_v( siz_t req_size,
|
||||
mem_t* mem );
|
||||
|
||||
void bli_mem_release( mem_t* mem );
|
||||
|
||||
siz_t bli_mem_pool_size( packbuf_t buf_type );
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_mem_init_pools( cntx_t* cntx );
|
||||
void bli_mem_reinit_pools( cntx_t* cntx );
|
||||
void bli_mem_finalize_pools( void );
|
||||
|
||||
void bli_mem_compute_pool_block_sizes( siz_t* bs_a,
|
||||
siz_t* bs_b,
|
||||
siz_t* bs_c,
|
||||
cntx_t* cntx );
|
||||
void bli_mem_compute_pool_block_sizes_dt( num_t dt,
|
||||
siz_t* bs_a,
|
||||
siz_t* bs_b,
|
||||
siz_t* bs_c,
|
||||
cntx_t* cntx );
|
||||
#endif
|
||||
|
||||
|
||||
578
frame/base/bli_membrk.c
Normal file
578
frame/base/bli_membrk.c
Normal file
@@ -0,0 +1,578 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
void bli_membrk_init
|
||||
(
|
||||
cntx_t* cntx,
|
||||
membrk_t* membrk
|
||||
)
|
||||
{
|
||||
bli_mutex_init( bli_membrk_mutex( membrk ) );
|
||||
bli_membrk_init_pools( cntx, membrk );
|
||||
bli_membrk_set_malloc_fp( bli_malloc_pool, membrk );
|
||||
}
|
||||
|
||||
void bli_membrk_finalize
|
||||
(
|
||||
membrk_t* membrk
|
||||
)
|
||||
{
|
||||
bli_membrk_set_malloc_fp( NULL, membrk );
|
||||
bli_membrk_finalize_pools( membrk );
|
||||
bli_mutex_finalize( bli_membrk_mutex( membrk ) );
|
||||
}
|
||||
|
||||
void bli_membrk_acquire_m
|
||||
(
|
||||
membrk_t* membrk,
|
||||
siz_t req_size,
|
||||
packbuf_t buf_type,
|
||||
mem_t* mem
|
||||
)
|
||||
{
|
||||
pool_t* pool;
|
||||
pblk_t* pblk;
|
||||
dim_t pi;
|
||||
siz_t block_size;
|
||||
|
||||
// Make sure the API is initialized.
|
||||
//assert( membrk ); //??
|
||||
|
||||
if ( buf_type == BLIS_BUFFER_FOR_GEN_USE )
|
||||
{
|
||||
// For general-use buffer requests, such as those used by level-2
|
||||
// operations, dynamically allocating memory is sufficient.
|
||||
// Note that we use the malloc()-style memory allocation function
|
||||
// that is stored in the membrk_t object.
|
||||
void* buf_sys = bli_membrk_malloc( req_size, membrk );
|
||||
|
||||
// Initialize the mem_t object with:
|
||||
// - the address of the memory block,
|
||||
// - the buffer type (a packbuf_t value),
|
||||
// - the size of the requested region,
|
||||
// - the membrk_t from which the mem_t entry was acquired.
|
||||
// NOTE: We do not initialize the pool field since this block did not
|
||||
// come from a memory pool.
|
||||
bli_mem_set_buffer( buf_sys, mem );
|
||||
bli_mem_set_buf_sys( buf_sys, mem );
|
||||
bli_mem_set_buf_type( buf_type, mem );
|
||||
bli_mem_set_size( req_size, mem );
|
||||
bli_mem_set_membrk( membrk, mem );
|
||||
}
|
||||
else
|
||||
{
|
||||
// This branch handles cases where the memory block needs to come
|
||||
// from an internal memory pool, in which blocks are allocated once
|
||||
// and then recycled.
|
||||
|
||||
// Map the requested packed buffer type to a zero-based index, which
|
||||
// we then use to select the corresponding memory pool.
|
||||
pi = bli_packbuf_index( buf_type );
|
||||
pool = bli_membrk_pool( pi, membrk );
|
||||
|
||||
// Unconditionally perform error checking on the memory pool.
|
||||
{
|
||||
err_t e_val;
|
||||
|
||||
// Make sure that the requested matrix size fits inside of a block
|
||||
// of the corresponding pool. If it does not, the pool was somehow
|
||||
// initialized improperly.
|
||||
e_val = bli_check_requested_block_size_for_pool( req_size, pool );
|
||||
bli_check_error_code( e_val );
|
||||
}
|
||||
|
||||
// Extract the address of the pblk_t struct within the mem_t.
|
||||
pblk = bli_mem_pblk( mem );
|
||||
|
||||
// BEGIN CRITICAL SECTION
|
||||
bli_membrk_lock( membrk );
|
||||
{
|
||||
|
||||
// Checkout a block from the pool. If the pool is exhausted,
|
||||
// either because it is still empty or because all blocks have
|
||||
// been checked out already, additional blocks will be allocated
|
||||
// automatically, as-needed. Note that the addresses are stored
|
||||
// directly into the mem_t struct since pblk is the address of
|
||||
// the struct's pblk_t field.
|
||||
bli_pool_checkout_block( pblk, pool );
|
||||
|
||||
// Query the size of the blocks in the pool so we can store it in
|
||||
// the mem_t object. At this point, it is guaranteed to be at
|
||||
// least as large as req_size. (NOTE: We must perform the query
|
||||
// within the critical section to ensure that the pool hasn't
|
||||
// changed, as unlikely as that would be.)
|
||||
block_size = bli_pool_block_size( pool );
|
||||
|
||||
}
|
||||
bli_membrk_unlock( membrk );
|
||||
// END CRITICAL SECTION
|
||||
|
||||
// Initialize the mem_t object with:
|
||||
// - the buffer type (a packbuf_t value),
|
||||
// - the address of the memory pool to which it belongs,
|
||||
// - the size of the contiguous memory block (NOT the size of the
|
||||
// requested region),
|
||||
// - the membrk_t from which the mem_t entry was acquired.
|
||||
// The actual addresses (system and aligned) are already stored in
|
||||
// the mem_t struct's pblk_t field
|
||||
bli_mem_set_buf_type( buf_type, mem );
|
||||
bli_mem_set_pool( pool, mem );
|
||||
bli_mem_set_size( block_size, mem );
|
||||
bli_mem_set_membrk( membrk, mem );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void bli_membrk_release
|
||||
(
|
||||
mem_t* mem
|
||||
)
|
||||
{
|
||||
packbuf_t buf_type;
|
||||
pool_t* pool;
|
||||
pblk_t* pblk;
|
||||
siz_t block_size_cur;
|
||||
siz_t block_size_prev;
|
||||
membrk_t* membrk;
|
||||
|
||||
// Extract the membrk_t address from the mem_t object.
|
||||
membrk = bli_mem_membrk( mem );
|
||||
|
||||
// Extract the buffer type so we know what kind of memory was allocated.
|
||||
buf_type = bli_mem_buf_type( mem );
|
||||
|
||||
if ( buf_type == BLIS_BUFFER_FOR_GEN_USE )
|
||||
{
|
||||
void* buf_sys = bli_mem_buf_sys( mem );
|
||||
|
||||
// For general-use buffers, we dynamically allocate memory, and so
|
||||
// here we need to free.
|
||||
// Note that we use the free()-style memory release function that
|
||||
// is stored in the membrk_t object.
|
||||
bli_membrk_free( buf_sys, membrk );
|
||||
}
|
||||
else
|
||||
{
|
||||
// Extract the address of the pool from which the memory was
|
||||
// allocated.
|
||||
pool = bli_mem_pool( mem );
|
||||
|
||||
// Extract the address of the pblk_t struct within the mem_t struct.
|
||||
pblk = bli_mem_pblk( mem );
|
||||
|
||||
// Query the size of the blocks that were in the pool at the time
|
||||
// the pblk_t was checked out. (This is used below, in the critical
|
||||
// section.)
|
||||
block_size_prev = bli_mem_size( mem );
|
||||
|
||||
// BEGIN CRITICAL SECTION
|
||||
bli_membrk_lock( membrk );
|
||||
{
|
||||
|
||||
// Query the size of the blocks currently in the pool.
|
||||
block_size_cur = bli_pool_block_size( pool );
|
||||
|
||||
// If the block size of the pool has changed since the pblk_t
|
||||
// was checked out, then we need to free the pblk_t rather
|
||||
// than check it back in. Why? Because the pool's block size
|
||||
// has (most likely) increased to meet changing needs (example:
|
||||
// larger cache blocksizes). Thus, the current pblk_t's smaller
|
||||
// allocated size is of no use anymore.
|
||||
if ( block_size_cur != block_size_prev )
|
||||
{
|
||||
// Free the pblk_t using the appropriate function in the
|
||||
// pool API.
|
||||
bli_pool_free_block( pblk );
|
||||
}
|
||||
else
|
||||
{
|
||||
// Check the block back into the pool.
|
||||
bli_pool_checkin_block( pblk, pool );
|
||||
}
|
||||
|
||||
}
|
||||
bli_membrk_unlock( membrk );
|
||||
// END CRITICAL SECTION
|
||||
}
|
||||
|
||||
// Clear the mem_t object so that it appears unallocated. This clears:
|
||||
// - the pblk_t struct's fields (ie: the buffer addresses)
|
||||
// - the pool field
|
||||
// - the size field
|
||||
// - the membrk field
|
||||
// NOTE: We do not clear the buf_type field since there is no
|
||||
// "uninitialized" value for packbuf_t.
|
||||
bli_mem_clear( mem );
|
||||
}
|
||||
|
||||
|
||||
void bli_membrk_acquire_v
|
||||
(
|
||||
membrk_t* membrk,
|
||||
siz_t req_size,
|
||||
mem_t* mem
|
||||
)
|
||||
{
|
||||
bli_membrk_acquire_m( membrk,
|
||||
req_size,
|
||||
BLIS_BUFFER_FOR_GEN_USE,
|
||||
mem );
|
||||
}
|
||||
|
||||
|
||||
siz_t bli_membrk_pool_size
|
||||
(
|
||||
membrk_t* membrk,
|
||||
packbuf_t buf_type
|
||||
)
|
||||
{
|
||||
siz_t r_val;
|
||||
|
||||
if ( buf_type == BLIS_BUFFER_FOR_GEN_USE )
|
||||
{
|
||||
// We don't (yet) track the amount of general-purpose
|
||||
// memory that is currently allocated.
|
||||
r_val = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
dim_t pool_index;
|
||||
pool_t* pool;
|
||||
|
||||
// Acquire the pointer to the pool corresponding to the buf_type
|
||||
// provided.
|
||||
pool_index = bli_packbuf_index( buf_type );
|
||||
pool = bli_membrk_pool( pool_index, membrk );
|
||||
|
||||
// Compute the pool "size" as the product of the block size
|
||||
// and the number of blocks in the pool.
|
||||
r_val = bli_pool_block_size( pool ) *
|
||||
bli_pool_num_blocks( pool );
|
||||
}
|
||||
|
||||
return r_val;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_membrk_init_pools
|
||||
(
|
||||
cntx_t* cntx,
|
||||
membrk_t* membrk
|
||||
)
|
||||
{
|
||||
// Map each of the packbuf_t values to an index starting at zero.
|
||||
const dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK );
|
||||
const dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL );
|
||||
const dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL );
|
||||
|
||||
const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE;
|
||||
|
||||
// Alias the pool addresses to convenient identifiers.
|
||||
pool_t* pool_a = bli_membrk_pool( index_a, membrk );
|
||||
pool_t* pool_b = bli_membrk_pool( index_b, membrk );
|
||||
pool_t* pool_c = bli_membrk_pool( index_c, membrk );
|
||||
|
||||
// Start with empty pools.
|
||||
const dim_t num_blocks_a = 0;
|
||||
const dim_t num_blocks_b = 0;
|
||||
const dim_t num_blocks_c = 0;
|
||||
|
||||
siz_t block_size_a = 0;
|
||||
siz_t block_size_b = 0;
|
||||
siz_t block_size_c = 0;
|
||||
|
||||
// Determine the block size for each memory pool.
|
||||
bli_membrk_compute_pool_block_sizes( &block_size_a,
|
||||
&block_size_b,
|
||||
&block_size_c,
|
||||
cntx );
|
||||
|
||||
// Initialize the memory pools for A, B, and C.
|
||||
bli_pool_init( num_blocks_a, block_size_a, align_size, pool_a );
|
||||
bli_pool_init( num_blocks_b, block_size_b, align_size, pool_b );
|
||||
bli_pool_init( num_blocks_c, block_size_c, align_size, pool_c );
|
||||
}
|
||||
|
||||
void bli_membrk_reinit_pools
|
||||
(
|
||||
cntx_t* cntx,
|
||||
membrk_t* membrk
|
||||
)
|
||||
{
|
||||
// Map each of the packbuf_t values to an index starting at zero.
|
||||
const dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK );
|
||||
const dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL );
|
||||
const dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL );
|
||||
|
||||
const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE;
|
||||
|
||||
// Alias the pool addresses to convenient identifiers.
|
||||
pool_t* pool_a = bli_membrk_pool( index_a, membrk );
|
||||
pool_t* pool_b = bli_membrk_pool( index_b, membrk );
|
||||
pool_t* pool_c = bli_membrk_pool( index_c, membrk );
|
||||
|
||||
// Query the number of blocks currently allocated in each pool.
|
||||
const dim_t num_blocks_a = bli_pool_num_blocks( pool_a );
|
||||
const dim_t num_blocks_b = bli_pool_num_blocks( pool_b );
|
||||
const dim_t num_blocks_c = bli_pool_num_blocks( pool_c );
|
||||
|
||||
siz_t block_size_a_new = 0;
|
||||
siz_t block_size_b_new = 0;
|
||||
siz_t block_size_c_new = 0;
|
||||
|
||||
// Determine the context-implied block size needed for each pool.
|
||||
bli_membrk_compute_pool_block_sizes( &block_size_a_new,
|
||||
&block_size_b_new,
|
||||
&block_size_c_new,
|
||||
cntx );
|
||||
|
||||
// Reinitialize the pool, but only if one of the parameters has
|
||||
// changed in such a way that reinitialization would be required.
|
||||
// In this case, the align_size is constant, as is num_blocks, so
|
||||
// what this actually boils down to is that reinitialization of a
|
||||
// pool occurs only if the block size for that pool has increased.
|
||||
bli_pool_reinit_if( num_blocks_a, block_size_a_new, align_size, pool_a );
|
||||
bli_pool_reinit_if( num_blocks_b, block_size_b_new, align_size, pool_b );
|
||||
bli_pool_reinit_if( num_blocks_c, block_size_c_new, align_size, pool_c );
|
||||
}
|
||||
|
||||
void bli_membrk_finalize_pools
|
||||
(
|
||||
membrk_t* membrk
|
||||
)
|
||||
{
|
||||
// Map each of the packbuf_t values to an index starting at zero.
|
||||
dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK );
|
||||
dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL );
|
||||
dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL );
|
||||
|
||||
// Alias the pool addresses to convenient identifiers.
|
||||
pool_t* pool_a = bli_membrk_pool( index_a, membrk );
|
||||
pool_t* pool_b = bli_membrk_pool( index_b, membrk );
|
||||
pool_t* pool_c = bli_membrk_pool( index_c, membrk );
|
||||
|
||||
// Finalize the memory pools for A, B, and C.
|
||||
bli_pool_finalize( pool_a );
|
||||
bli_pool_finalize( pool_b );
|
||||
bli_pool_finalize( pool_c );
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_membrk_compute_pool_block_sizes
|
||||
(
|
||||
siz_t* bs_a,
|
||||
siz_t* bs_b,
|
||||
siz_t* bs_c,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
const ind_t im = bli_cntx_get_ind_method( cntx );
|
||||
|
||||
siz_t bs_cand_a = 0;
|
||||
siz_t bs_cand_b = 0;
|
||||
siz_t bs_cand_c = 0;
|
||||
|
||||
num_t dt;
|
||||
|
||||
// Compute pool block sizes for each datatype and find the maximum
|
||||
// size for each pool. This is done so that new pools do not need
|
||||
// to be allocated if the user switches datatypes.
|
||||
for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt )
|
||||
{
|
||||
siz_t bs_dt_a;
|
||||
siz_t bs_dt_b;
|
||||
siz_t bs_dt_c;
|
||||
|
||||
// Avoid considering induced methods for real datatypes.
|
||||
if ( bli_is_real( dt ) && im != BLIS_NAT ) continue;
|
||||
|
||||
bli_membrk_compute_pool_block_sizes_dt( dt,
|
||||
&bs_dt_a,
|
||||
&bs_dt_b,
|
||||
&bs_dt_c,
|
||||
cntx );
|
||||
|
||||
bs_cand_a = bli_max( bs_dt_a, bs_cand_a );
|
||||
bs_cand_b = bli_max( bs_dt_b, bs_cand_b );
|
||||
bs_cand_c = bli_max( bs_dt_c, bs_cand_c );
|
||||
}
|
||||
|
||||
// Save the results.
|
||||
*bs_a = bs_cand_a;
|
||||
*bs_b = bs_cand_b;
|
||||
*bs_c = bs_cand_c;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_membrk_compute_pool_block_sizes_dt
|
||||
(
|
||||
num_t dt,
|
||||
siz_t* bs_a,
|
||||
siz_t* bs_b,
|
||||
siz_t* bs_c,
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
siz_t size_dt = bli_datatype_size( dt );
|
||||
|
||||
blksz_t* mr;
|
||||
blksz_t* nr;
|
||||
|
||||
blksz_t* mc;
|
||||
blksz_t* kc;
|
||||
blksz_t* nc;
|
||||
|
||||
dim_t mr_dt;
|
||||
dim_t nr_dt;
|
||||
dim_t max_mnr_dt;
|
||||
|
||||
dim_t mc_max_dt;
|
||||
dim_t kc_max_dt;
|
||||
dim_t nc_max_dt;
|
||||
|
||||
dim_t packmr_dt;
|
||||
dim_t packnr_dt;
|
||||
dim_t max_packmnr_dt;
|
||||
|
||||
dim_t scale_num_dt;
|
||||
dim_t scale_den_dt;
|
||||
|
||||
dim_t pool_mc_dt, left_mc_dt;
|
||||
dim_t pool_nc_dt, left_nc_dt;
|
||||
dim_t pool_kc_dt;
|
||||
|
||||
//
|
||||
// Find the larger of the two register blocksizes.
|
||||
//
|
||||
|
||||
// Query the mr and nr blksz_t objects for the given method of
|
||||
// execution.
|
||||
mr = bli_cntx_get_blksz( BLIS_MR, cntx );
|
||||
nr = bli_cntx_get_blksz( BLIS_NR, cntx );
|
||||
|
||||
// Extract the mr and nr values specific to the current datatype.
|
||||
mr_dt = bli_blksz_get_def( dt, mr );
|
||||
nr_dt = bli_blksz_get_def( dt, nr );
|
||||
|
||||
// Find the maximum of mr and nr.
|
||||
max_mnr_dt = bli_max( mr_dt, nr_dt );
|
||||
|
||||
//
|
||||
// Define local maximum cache blocksizes.
|
||||
//
|
||||
|
||||
// Query the mc, kc, and nc blksz_t objects for native execution.
|
||||
mc = bli_cntx_get_blksz( BLIS_MC, cntx );
|
||||
kc = bli_cntx_get_blksz( BLIS_KC, cntx );
|
||||
nc = bli_cntx_get_blksz( BLIS_NC, cntx );
|
||||
|
||||
// Extract the maximum mc, kc, and nc values specific to the current
|
||||
// datatype.
|
||||
mc_max_dt = bli_blksz_get_max( dt, mc );
|
||||
kc_max_dt = bli_blksz_get_max( dt, kc );
|
||||
nc_max_dt = bli_blksz_get_max( dt, nc );
|
||||
|
||||
// Add max(mr,nr) to kc to make room for the nudging of kc at
|
||||
// runtime to be a multiple of mr or nr for triangular operations
|
||||
// trmm, trmm3, and trsm.
|
||||
kc_max_dt += max_mnr_dt;
|
||||
|
||||
//
|
||||
// Compute scaling factors.
|
||||
//
|
||||
|
||||
// Compute integer scaling factors (numerator and denominator) used
|
||||
// to account for situations when the packing register blocksizes are
|
||||
// larger than the regular register blocksizes.
|
||||
|
||||
// In order to compute the scaling factors, we first have to determine
|
||||
// whether ( packmr / mr ) is greater than ( packnr / nr ). This is
|
||||
// needed ONLY because the amount of space allocated for a block of A
|
||||
// and a panel of B needs to be such that MR and NR can be swapped (ie:
|
||||
// A is packed with NR and B is packed with MR). This transformation is
|
||||
// needed for right-side trsm when inducing an algorithm that (a) has
|
||||
// favorable access patterns for column-stored C and (b) allows the
|
||||
// macro-kernel to reuse the existing left-side fused gemmtrsm micro-
|
||||
// kernels. We avoid integer division by cross-multiplying:
|
||||
//
|
||||
// ( packmr / mr ) >= ( packnr / nr )
|
||||
// ( packmr / mr ) * nr >= packnr
|
||||
// packmr * nr >= packnr * mr
|
||||
//
|
||||
// So, if packmr * nr >= packnr * mr, then we will use packmr and mr as
|
||||
// our scaling factors. Otherwise, we'll use packnr and nr.
|
||||
|
||||
packmr_dt = bli_blksz_get_max( dt, mr );
|
||||
packnr_dt = bli_blksz_get_max( dt, nr );
|
||||
|
||||
if ( packmr_dt * nr_dt >=
|
||||
packnr_dt * mr_dt ) { scale_num_dt = packmr_dt;
|
||||
scale_den_dt = mr_dt; }
|
||||
else { scale_num_dt = packnr_dt;
|
||||
scale_den_dt = nr_dt; }
|
||||
|
||||
//
|
||||
// Compute pool block dimensions.
|
||||
//
|
||||
|
||||
pool_mc_dt = ( mc_max_dt * scale_num_dt ) / scale_den_dt;
|
||||
left_mc_dt = ( mc_max_dt * scale_num_dt ) % scale_den_dt;
|
||||
|
||||
pool_nc_dt = ( nc_max_dt * scale_num_dt ) / scale_den_dt;
|
||||
left_nc_dt = ( nc_max_dt * scale_num_dt ) % scale_den_dt;
|
||||
|
||||
pool_kc_dt = ( kc_max_dt );
|
||||
|
||||
if ( left_mc_dt > 0 ) pool_mc_dt += 1;
|
||||
if ( left_nc_dt > 0 ) pool_nc_dt += 1;
|
||||
|
||||
//
|
||||
// Compute pool block sizes
|
||||
//
|
||||
|
||||
// We add an extra micro-panel of space to the block sizes for A and B
|
||||
// just to be sure any pre-loading performed by the micro-kernel does
|
||||
// not cause a segmentation fault.
|
||||
max_packmnr_dt = bli_max( packmr_dt, packnr_dt );
|
||||
|
||||
*bs_a = ( pool_mc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt;
|
||||
*bs_b = ( pool_nc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt;
|
||||
*bs_c = ( pool_mc_dt ) * pool_nc_dt * size_dt;
|
||||
}
|
||||
169
frame/base/bli_membrk.h
Normal file
169
frame/base/bli_membrk.h
Normal file
@@ -0,0 +1,169 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_MEMBRK_H
|
||||
#define BLIS_MEMBRK_H
|
||||
|
||||
// -- Memory broker object type --
|
||||
|
||||
// Memory broker: bundles the block pools, the mutex that guards them, and
// the allocator function pointers used for general-use buffers.
typedef struct membrk_s
|
||||
{
|
||||
// Block pools, indexed by bli_packbuf_index(): one each for
// BLIS_BUFFER_FOR_A_BLOCK, BLIS_BUFFER_FOR_B_PANEL, and
// BLIS_BUFFER_FOR_C_PANEL.
pool_t pools[3];
|
||||
// Mutex taken by bli_membrk_lock()/bli_membrk_unlock() around pool
// checkout and checkin.
mtx_t mutex;
|
||||
|
||||
// malloc()-style function invoked by bli_membrk_malloc().
malloc_ft malloc_fp;
|
||||
// free()-style function invoked by bli_membrk_free().
free_ft free_fp;
|
||||
} membrk_t;
|
||||
|
||||
// Address of the pool_t at position pool_index within the broker's pools
// array.
#define bli_membrk_pool( pool_index, membrk_p ) \
|
||||
\
|
||||
( (membrk_p)->pools + (pool_index) )
|
||||
|
||||
// Address of the broker's mutex.
#define bli_membrk_mutex( membrk_p ) \
|
||||
\
|
||||
( &( (membrk_p)->mutex ) )
|
||||
|
||||
// The broker's malloc()-style function pointer.
#define bli_membrk_malloc_fp( membrk_p ) \
|
||||
\
|
||||
( (membrk_p)->malloc_fp )
|
||||
|
||||
// The broker's free()-style function pointer.
#define bli_membrk_free_fp( membrk_p ) \
|
||||
\
|
||||
( (membrk_p)->free_fp )
|
||||
|
||||
// Store _malloc_fp as the broker's malloc()-style function pointer.
// Expands to a braced block, so use it as a standalone statement.
#define bli_membrk_set_malloc_fp( _malloc_fp, membrk_p ) \
|
||||
{\
|
||||
(membrk_p)->malloc_fp = _malloc_fp; \
|
||||
}
|
||||
|
||||
// Store _free_fp as the broker's free()-style function pointer.
// Expands to a braced block, so use it as a standalone statement.
#define bli_membrk_set_free_fp( _free_fp, membrk_p ) \
|
||||
{\
|
||||
(membrk_p)->free_fp = _free_fp; \
|
||||
}
|
||||
|
||||
// Lock the broker's mutex via bli_mutex_lock().
// Expands to a braced block, so use it as a standalone statement.
#define bli_membrk_lock( membrk_p ) \
|
||||
{\
|
||||
bli_mutex_lock( &((membrk_p)->mutex) ); \
|
||||
}
|
||||
|
||||
// Unlock the broker's mutex via bli_mutex_unlock().
// Expands to a braced block, so use it as a standalone statement.
#define bli_membrk_unlock( membrk_p ) \
|
||||
{\
|
||||
bli_mutex_unlock( &((membrk_p)->mutex) ); \
|
||||
}
|
||||
|
||||
// Allocate size bytes by calling through the broker's malloc_fp; the
// expansion is the call expression itself, so its value is whatever
// malloc_fp returns.
#define bli_membrk_malloc( size, membrk ) \
|
||||
\
|
||||
/* Call the malloc()-style function in membrk. */ \
|
||||
((membrk)->malloc_fp)( size )
|
||||
|
||||
// Release buf_p by calling through the broker's free_fp, which must have
// been set beforehand.
#define bli_membrk_free( buf_p, membrk ) \
|
||||
\
|
||||
/* Call the free()-style function in membrk. */ \
|
||||
((membrk)->free_fp)( buf_p )
|
||||
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_membrk_init
|
||||
(
|
||||
cntx_t* cntx,
|
||||
membrk_t* membrk
|
||||
);
|
||||
void bli_membrk_finalize
|
||||
(
|
||||
membrk_t* membrk
|
||||
);
|
||||
|
||||
void bli_membrk_acquire_m
|
||||
(
|
||||
membrk_t* membrk,
|
||||
siz_t req_size,
|
||||
packbuf_t buf_type,
|
||||
mem_t* mem
|
||||
);
|
||||
|
||||
void bli_membrk_acquire_v
|
||||
(
|
||||
membrk_t* membrk,
|
||||
siz_t req_size,
|
||||
mem_t* mem
|
||||
);
|
||||
|
||||
void bli_membrk_release
|
||||
(
|
||||
mem_t* mem
|
||||
);
|
||||
|
||||
siz_t bli_membrk_pool_size
|
||||
(
|
||||
membrk_t* membrk,
|
||||
packbuf_t buf_type
|
||||
);
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
void bli_membrk_init_pools
|
||||
(
|
||||
cntx_t* cntx,
|
||||
membrk_t* membrk
|
||||
);
|
||||
void bli_membrk_reinit_pools
|
||||
(
|
||||
cntx_t* cntx,
|
||||
membrk_t* membrk
|
||||
);
|
||||
void bli_membrk_finalize_pools
|
||||
(
|
||||
membrk_t* membrk
|
||||
);
|
||||
|
||||
void bli_membrk_compute_pool_block_sizes
|
||||
(
|
||||
siz_t* bs_a,
|
||||
siz_t* bs_b,
|
||||
siz_t* bs_c,
|
||||
cntx_t* cntx
|
||||
);
|
||||
void bli_membrk_compute_pool_block_sizes_dt
|
||||
(
|
||||
num_t dt,
|
||||
siz_t* bs_a,
|
||||
siz_t* bs_b,
|
||||
siz_t* bs_c,
|
||||
cntx_t* cntx
|
||||
);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -58,6 +59,10 @@
|
||||
\
|
||||
( (mem_p)->pool )
|
||||
|
||||
#define bli_mem_membrk( mem_p ) \
|
||||
\
|
||||
( (mem_p)->membrk )
|
||||
|
||||
#define bli_mem_size( mem_p ) \
|
||||
\
|
||||
( (mem_p)->size )
|
||||
@@ -90,12 +95,17 @@
|
||||
|
||||
#define bli_mem_set_buf_type( buf_type0, mem_p ) \
|
||||
{ \
|
||||
mem_p->buf_type = buf_type0; \
|
||||
(mem_p)->buf_type = buf_type0; \
|
||||
}
|
||||
|
||||
#define bli_mem_set_pool( pool0, mem_p ) \
|
||||
{ \
|
||||
mem_p->pool = pool0; \
|
||||
(mem_p)->pool = pool0; \
|
||||
}
|
||||
|
||||
#define bli_mem_set_membrk( membrk0, mem_p ) \
|
||||
{ \
|
||||
(mem_p)->membrk = membrk0; \
|
||||
}
|
||||
|
||||
#define bli_mem_set_size( size0, mem_p ) \
|
||||
@@ -109,6 +119,7 @@
|
||||
bli_mem_set_buf_sys( NULL, mem_p ); \
|
||||
bli_mem_set_pool( NULL, mem_p ); \
|
||||
bli_mem_set_size( 0, mem_p ); \
|
||||
bli_mem_set_membrk( NULL, mem_p ); \
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -957,14 +958,14 @@ bli_obj_width_stored( obj )
|
||||
}
|
||||
|
||||
|
||||
// Release object's pack (and cast) memory entries back to memory manager
|
||||
// Release object's pack mem_t entries back to memory manager
|
||||
|
||||
#define bli_obj_release_pack( obj_p ) \
|
||||
{ \
|
||||
mem_t* pack_mem_ = bli_obj_pack_mem( *(obj_p) ); \
|
||||
\
|
||||
if ( bli_mem_is_alloc( pack_mem_ ) ) \
|
||||
bli_mem_release( pack_mem_ ); \
|
||||
bli_membrk_release( pack_mem_ ); \
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -173,7 +174,6 @@ typedef scomplex f77_scomplex;
|
||||
typedef dcomplex f77_dcomplex;
|
||||
|
||||
|
||||
|
||||
//
|
||||
// -- BLIS info bit field offsets ----------------------------------------------
|
||||
//
|
||||
@@ -505,6 +505,10 @@ typedef enum
|
||||
// -- BLIS misc. structure types -----------------------------------------------
|
||||
//
|
||||
|
||||
// -- Mutex type --
|
||||
|
||||
typedef struct mtx_s mtx_t;
|
||||
|
||||
// -- Pool block type --
|
||||
|
||||
typedef struct
|
||||
@@ -527,6 +531,19 @@ typedef struct
|
||||
siz_t align_size;
|
||||
} pool_t;
|
||||
|
||||
// -- Memory broker object type --
|
||||
|
||||
typedef struct membrk_s membrk_t;
|
||||
/*
|
||||
{
|
||||
pool_t pools[3];
|
||||
mtx_t mutex;
|
||||
|
||||
malloc_ft malloc_fp;
|
||||
free_ft free_fp;
|
||||
} membrk_t;
|
||||
*/
|
||||
|
||||
// -- Memory object type --
|
||||
|
||||
typedef struct mem_s
|
||||
@@ -534,6 +551,7 @@ typedef struct mem_s
|
||||
pblk_t pblk;
|
||||
packbuf_t buf_type;
|
||||
pool_t* pool;
|
||||
membrk_t* membrk;
|
||||
siz_t size;
|
||||
} mem_t;
|
||||
|
||||
@@ -910,6 +928,7 @@ typedef struct cntx_s
|
||||
pack_t schema_b;
|
||||
pack_t schema_c;
|
||||
|
||||
membrk_t* membrk;
|
||||
} cntx_t;
|
||||
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -103,6 +104,7 @@ extern "C" {
|
||||
#include "bli_cntx.h"
|
||||
#include "bli_gks.h"
|
||||
#include "bli_ind.h"
|
||||
#include "bli_membrk.h"
|
||||
#include "bli_pool.h"
|
||||
#include "bli_mem.h"
|
||||
#include "bli_part.h"
|
||||
|
||||
49
frame/thread/bli_mutex.h
Normal file
49
frame/thread/bli_mutex.h
Normal file
@@ -0,0 +1,49 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_MUTEX_H
|
||||
#define BLIS_MUTEX_H
|
||||
|
||||
// Include definitions (mostly mtx_t) specific to the method of
|
||||
// multithreading.
|
||||
#include "bli_mutex_single.h"
|
||||
#include "bli_mutex_openmp.h"
|
||||
#include "bli_mutex_pthreads.h"
|
||||
|
||||
// Thread mutex prototypes.
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
72
frame/thread/bli_mutex_openmp.h
Normal file
72
frame/thread/bli_mutex_openmp.h
Normal file
@@ -0,0 +1,72 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_MUTEX_OPENMP_H
|
||||
#define BLIS_MUTEX_OPENMP_H
|
||||
|
||||
// Define mtx_t for situations when OpenMP multithreading is enabled.
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
|
||||
#include <omp.h>
|
||||
|
||||
// Define mtx_t.
|
||||
// mtx_t wraps the OpenMP lock type so that callers can use one mutex type
// name regardless of which threading model is compiled in.
typedef struct mtx_s
{
	omp_lock_t mutex;
} mtx_t;

// Define macros to operate on OpenMP-based mtx_t.
//
// Each macro takes a pointer to an mtx_t (mtx_p) and forwards the address of
// the embedded omp_lock_t member to the matching OpenMP lock routine. Passing
// the member (rather than the wrapper pointer itself) keeps the call
// type-correct. The do { ... } while ( 0 ) wrapper makes each macro expand to
// a single statement, so it is safe inside an unbraced if/else.

// Initialize the OpenMP lock embedded in *mtx_p.
#define bli_mutex_init( mtx_p ) \
do \
{ \
	omp_init_lock( &( (mtx_p)->mutex ) ); \
} while ( 0 )

// Destroy the OpenMP lock embedded in *mtx_p.
#define bli_mutex_finalize( mtx_p ) \
do \
{ \
	omp_destroy_lock( &( (mtx_p)->mutex ) ); \
} while ( 0 )

// Acquire the OpenMP lock embedded in *mtx_p.
#define bli_mutex_lock( mtx_p ) \
do \
{ \
	omp_set_lock( &( (mtx_p)->mutex ) ); \
} while ( 0 )

// Release the OpenMP lock embedded in *mtx_p.
#define bli_mutex_unlock( mtx_p ) \
do \
{ \
	omp_unset_lock( &( (mtx_p)->mutex ) ); \
} while ( 0 )
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
72
frame/thread/bli_mutex_pthreads.h
Normal file
72
frame/thread/bli_mutex_pthreads.h
Normal file
@@ -0,0 +1,72 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_MUTEX_PTHREADS_H
|
||||
#define BLIS_MUTEX_PTHREADS_H
|
||||
|
||||
// Define mtx_t for situations when POSIX multithreading is enabled.
|
||||
#ifdef BLIS_ENABLE_PTHREADS
|
||||
|
||||
#include <pthread.h>
|
||||
|
||||
// Define mtx_t.
|
||||
// mtx_t wraps the POSIX mutex type so that callers can use one mutex type
// name regardless of which threading model is compiled in.
typedef struct mtx_s
{
	pthread_mutex_t mutex;
} mtx_t;

// Define macros to operate on pthread-based mtx_t.
//
// Each macro takes a pointer to an mtx_t (mtx_p) and forwards the address of
// the embedded pthread_mutex_t member to the matching pthread routine. The
// do { ... } while ( 0 ) wrapper makes each macro expand to a single
// statement, so it is safe inside an unbraced if/else.

// Initialize the mutex embedded in *mtx_p. pthread_mutex_init() requires an
// attributes argument; NULL requests the default mutex attributes.
#define bli_mutex_init( mtx_p ) \
do \
{ \
	pthread_mutex_init( &( (mtx_p)->mutex ), NULL ); \
} while ( 0 )

// Destroy the mutex embedded in *mtx_p.
#define bli_mutex_finalize( mtx_p ) \
do \
{ \
	pthread_mutex_destroy( &( (mtx_p)->mutex ) ); \
} while ( 0 )

// Acquire the mutex embedded in *mtx_p.
#define bli_mutex_lock( mtx_p ) \
do \
{ \
	pthread_mutex_lock( &( (mtx_p)->mutex ) ); \
} while ( 0 )

// Release the mutex embedded in *mtx_p.
#define bli_mutex_unlock( mtx_p ) \
do \
{ \
	pthread_mutex_unlock( &( (mtx_p)->mutex ) ); \
} while ( 0 )
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
65
frame/thread/bli_mutex_single.h
Normal file
65
frame/thread/bli_mutex_single.h
Normal file
@@ -0,0 +1,65 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_MUTEX_SINGLE_H
|
||||
#define BLIS_MUTEX_SINGLE_H
|
||||
|
||||
// Define mtx_t for situations when multithreading is disabled.
|
||||
#ifndef BLIS_ENABLE_MULTITHREADING
|
||||
|
||||
// Define mtx_t.
|
||||
// mtx_t carries no lock state when multithreading is disabled. A placeholder
// member is kept because ISO C does not allow a struct with no members
// (an empty struct is only accepted as a compiler extension).
typedef struct mtx_s
{
	char unused;
} mtx_t;

// Define macros to operate on the single-threaded mtx_t.
//
// With multithreading disabled there is nothing to protect, so every
// operation is a no-op and its argument is not evaluated. The
// do { } while ( 0 ) form makes each macro expand to a single statement, so
// it is safe inside an unbraced if/else.

// No-op counterpart of mutex initialization.
#define bli_mutex_init( mtx_p ) \
do \
{ \
} while ( 0 )

// No-op counterpart of mutex destruction.
#define bli_mutex_finalize( mtx_p ) \
do \
{ \
} while ( 0 )

// No-op counterpart of mutex acquisition.
#define bli_mutex_lock( mtx_p ) \
do \
{ \
} while ( 0 )

// No-op counterpart of mutex release.
#define bli_mutex_unlock( mtx_p ) \
do \
{ \
} while ( 0 )
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -51,6 +52,9 @@
|
||||
#define BLIS_ENABLE_MULTITHREADING
|
||||
#endif
|
||||
|
||||
// Include thread mutex (mtx_t) object definitions and prototypes.
|
||||
#include "bli_mutex.h"
|
||||
|
||||
// Include thread communicator (thrcomm_t) object definitions and prototypes.
|
||||
#include "bli_thrcomm.h"
|
||||
|
||||
|
||||
@@ -63,8 +63,8 @@ void bli_sgemm_opt_8x12(
|
||||
void* a_next = bli_auxinfo_next_a( data );
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
dim_t k_iter = k / 4;
|
||||
dim_t k_left = k % 4;
|
||||
uint64_t k_iter = k / 4;
|
||||
uint64_t k_left = k % 4;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
@@ -1112,8 +1112,8 @@ void bli_dgemm_opt_6x8(
|
||||
void* a_next = bli_auxinfo_next_a( data );
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
dim_t k_iter = k / 4;
|
||||
dim_t k_left = k % 4;
|
||||
uint64_t k_iter = k / 4;
|
||||
uint64_t k_left = k % 4;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
|
||||
@@ -47,8 +47,8 @@ void bli_dgemm_opt_4x4
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
dim_t k_iter = k / 4;
|
||||
dim_t k_left = k % 4;
|
||||
uint64_t k_iter = k / 4;
|
||||
uint64_t k_left = k % 4;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
|
||||
@@ -271,6 +271,8 @@ void bli_dgemm_asm_30x8
|
||||
|
||||
int * offsetPtr = &offsets[0];
|
||||
|
||||
uint64_t k64 = k;
|
||||
|
||||
#ifdef MONITORS
|
||||
int toph, topl, both, botl, midl, midh, mid2l, mid2h;
|
||||
#endif
|
||||
@@ -288,7 +290,7 @@ void bli_dgemm_asm_30x8
|
||||
vpxord zmm0, zmm0, zmm0
|
||||
vmovaps zmm1, zmm0 //clear out registers
|
||||
vmovaps zmm2, zmm0
|
||||
mov rsi, k //loop index
|
||||
mov rsi, k64 //loop index
|
||||
vmovaps zmm3, zmm0
|
||||
|
||||
mov r11, rs_c //load row stride
|
||||
@@ -312,7 +314,7 @@ void bli_dgemm_asm_30x8
|
||||
mov rcx, c //load address of c for prefetching
|
||||
vmovaps zmm13, zmm0
|
||||
vmovaps zmm14, zmm0
|
||||
mov r8, k
|
||||
mov r8, k64
|
||||
vmovaps zmm15, zmm0
|
||||
|
||||
vmovaps zmm16, zmm0
|
||||
@@ -381,7 +383,7 @@ void bli_dgemm_asm_30x8
|
||||
//Alternate main loop, with no prefetching of C
|
||||
//Used when <= 40 iterations
|
||||
CONSIDER_UNDER_40:
|
||||
mov rsi, k
|
||||
mov rsi, k64
|
||||
test rsi, rsi
|
||||
je POSTACCUM
|
||||
LOOP_UNDER_40:
|
||||
|
||||
@@ -271,6 +271,8 @@ void bli_sgemm_asm_30x16
|
||||
|
||||
int * offsetPtr = &offsets[0];
|
||||
|
||||
uint64_t k64 = k;
|
||||
|
||||
#ifdef MONITORS
|
||||
int toph, topl, both, botl, midl, midh, mid2l, mid2h;
|
||||
#endif
|
||||
@@ -288,7 +290,7 @@ void bli_sgemm_asm_30x16
|
||||
vpxord zmm0, zmm0, zmm0
|
||||
vmovaps zmm1, zmm0 //clear out registers
|
||||
vmovaps zmm2, zmm0
|
||||
mov rsi, k //loop index
|
||||
mov rsi, k64 //loop index
|
||||
vmovaps zmm3, zmm0
|
||||
|
||||
mov r11, rs_c //load row stride
|
||||
@@ -312,7 +314,7 @@ void bli_sgemm_asm_30x16
|
||||
mov rcx, c //load address of c for prefetching
|
||||
vmovaps zmm13, zmm0
|
||||
vmovaps zmm14, zmm0
|
||||
mov r8, k
|
||||
mov r8, k64
|
||||
vmovaps zmm15, zmm0
|
||||
|
||||
vmovaps zmm16, zmm0
|
||||
@@ -381,7 +383,7 @@ void bli_sgemm_asm_30x16
|
||||
//Alternate main loop, with no prefetching of C
|
||||
//Used when <= 40 iterations
|
||||
CONSIDER_UNDER_40:
|
||||
mov rsi, k
|
||||
mov rsi, k64
|
||||
test rsi, rsi
|
||||
je POSTACCUM
|
||||
LOOP_UNDER_40:
|
||||
|
||||
@@ -97,8 +97,8 @@ void bli_sgemm_asm_8x8_fma4
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
dim_t k_iter = k / 4;
|
||||
dim_t k_left = k % 4;
|
||||
uint64_t k_iter = k / 4;
|
||||
uint64_t k_left = k % 4;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
|
||||
@@ -80,8 +80,8 @@ void bli_sgemm_asm_6x16
|
||||
//void* a_next = bli_auxinfo_next_a( data );
|
||||
//void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
dim_t k_iter = k / 4;
|
||||
dim_t k_left = k % 4;
|
||||
uint64_t k_iter = k / 4;
|
||||
uint64_t k_left = k % 4;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
@@ -322,23 +322,6 @@ void bli_sgemm_asm_6x16
|
||||
"leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // determine if
|
||||
" \n\t" // c % 32 == 0, AND
|
||||
" \n\t" // 4*rs_c % 32 == 0, AND
|
||||
" \n\t" // cs_c == 1
|
||||
" \n\t" // ie: aligned, ldim aligned, and
|
||||
" \n\t" // row-stored
|
||||
" \n\t"
|
||||
"cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4.
|
||||
"sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 );
|
||||
"testq $31, %%rcx \n\t" // set ZF if c & 32 is zero.
|
||||
"setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 );
|
||||
"testq $31, %%rdi \n\t" // set ZF if (4*rs_c) & 32 is zero.
|
||||
"setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 );
|
||||
" \n\t" // and(bl,bh) followed by
|
||||
" \n\t" // and(bh,al) will reveal result
|
||||
" \n\t"
|
||||
" \n\t" // now avoid loading C if beta == 0
|
||||
" \n\t"
|
||||
"vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero.
|
||||
@@ -346,10 +329,8 @@ void bli_sgemm_asm_6x16
|
||||
"je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // check if aligned/row-stored
|
||||
"andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1.
|
||||
"andb %%bh, %%al \n\t" // set ZF if bh & al == 1.
|
||||
"jne .SROWSTORED \n\t" // jump to row storage case
|
||||
"cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4.
|
||||
"jz .SROWSTORED \n\t" // jump to row storage case
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
@@ -439,63 +420,51 @@ void bli_sgemm_asm_6x16
|
||||
".SROWSTORED: \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps (%%rcx), %%ymm0 \n\t"
|
||||
"vfmadd213ps %%ymm4, %%ymm3, %%ymm0 \n\t"
|
||||
"vmovaps %%ymm0, (%%rcx) \n\t"
|
||||
"vfmadd231ps (%%rcx), %%ymm3, %%ymm4 \n\t"
|
||||
"vmovups %%ymm4, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovaps (%%rdx), %%ymm1 \n\t"
|
||||
"vfmadd213ps %%ymm5, %%ymm3, %%ymm1 \n\t"
|
||||
"vmovaps %%ymm1, (%%rdx) \n\t"
|
||||
"vfmadd231ps (%%rdx), %%ymm3, %%ymm5 \n\t"
|
||||
"vmovups %%ymm5, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps (%%rcx), %%ymm0 \n\t"
|
||||
"vfmadd213ps %%ymm6, %%ymm3, %%ymm0 \n\t"
|
||||
"vmovaps %%ymm0, (%%rcx) \n\t"
|
||||
"vfmadd231ps (%%rcx), %%ymm3, %%ymm6 \n\t"
|
||||
"vmovups %%ymm6, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovaps (%%rdx), %%ymm1 \n\t"
|
||||
"vfmadd213ps %%ymm7, %%ymm3, %%ymm1 \n\t"
|
||||
"vmovaps %%ymm1, (%%rdx) \n\t"
|
||||
"vfmadd231ps (%%rdx), %%ymm3, %%ymm7 \n\t"
|
||||
"vmovups %%ymm7, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps (%%rcx), %%ymm0 \n\t"
|
||||
"vfmadd213ps %%ymm8, %%ymm3, %%ymm0 \n\t"
|
||||
"vmovaps %%ymm0, (%%rcx) \n\t"
|
||||
"vfmadd231ps (%%rcx), %%ymm3, %%ymm8 \n\t"
|
||||
"vmovups %%ymm8, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovaps (%%rdx), %%ymm1 \n\t"
|
||||
"vfmadd213ps %%ymm9, %%ymm3, %%ymm1 \n\t"
|
||||
"vmovaps %%ymm1, (%%rdx) \n\t"
|
||||
"vfmadd231ps (%%rdx), %%ymm3, %%ymm9 \n\t"
|
||||
"vmovups %%ymm9, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps (%%rcx), %%ymm0 \n\t"
|
||||
"vfmadd213ps %%ymm10, %%ymm3, %%ymm0 \n\t"
|
||||
"vmovaps %%ymm0, (%%rcx) \n\t"
|
||||
"vfmadd231ps (%%rcx), %%ymm3, %%ymm10 \n\t"
|
||||
"vmovups %%ymm10, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovaps (%%rdx), %%ymm1 \n\t"
|
||||
"vfmadd213ps %%ymm11, %%ymm3, %%ymm1 \n\t"
|
||||
"vmovaps %%ymm1, (%%rdx) \n\t"
|
||||
"vfmadd231ps (%%rdx), %%ymm3, %%ymm11 \n\t"
|
||||
"vmovups %%ymm11, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps (%%rcx), %%ymm0 \n\t"
|
||||
"vfmadd213ps %%ymm12, %%ymm3, %%ymm0 \n\t"
|
||||
"vmovaps %%ymm0, (%%rcx) \n\t"
|
||||
"vfmadd231ps (%%rcx), %%ymm3, %%ymm12 \n\t"
|
||||
"vmovups %%ymm12, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovaps (%%rdx), %%ymm1 \n\t"
|
||||
"vfmadd213ps %%ymm13, %%ymm3, %%ymm1 \n\t"
|
||||
"vmovaps %%ymm1, (%%rdx) \n\t"
|
||||
"vfmadd231ps (%%rdx), %%ymm3, %%ymm13 \n\t"
|
||||
"vmovups %%ymm13, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps (%%rcx), %%ymm0 \n\t"
|
||||
"vfmadd213ps %%ymm14, %%ymm3, %%ymm0 \n\t"
|
||||
"vmovaps %%ymm0, (%%rcx) \n\t"
|
||||
"vfmadd231ps (%%rcx), %%ymm3, %%ymm14 \n\t"
|
||||
"vmovups %%ymm14, (%%rcx) \n\t"
|
||||
//"addq %%rdi, %%rcx \n\t"
|
||||
"vmovaps (%%rdx), %%ymm1 \n\t"
|
||||
"vfmadd213ps %%ymm15, %%ymm3, %%ymm1 \n\t"
|
||||
"vmovaps %%ymm1, (%%rdx) \n\t"
|
||||
"vfmadd231ps (%%rdx), %%ymm3, %%ymm15 \n\t"
|
||||
"vmovups %%ymm15, (%%rdx) \n\t"
|
||||
//"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
@@ -505,10 +474,9 @@ void bli_sgemm_asm_6x16
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".SBETAZERO: \n\t"
|
||||
" \n\t" // check if aligned/row-stored
|
||||
"andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1.
|
||||
"andb %%bh, %%al \n\t" // set ZF if bh & al == 1.
|
||||
"jne .SROWSTORBZ \n\t" // jump to row storage case
|
||||
" \n\t"
|
||||
"cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4.
|
||||
"jz .SROWSTORBZ \n\t" // jump to row storage case
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
@@ -586,38 +554,38 @@ void bli_sgemm_asm_6x16
|
||||
".SROWSTORBZ: \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm4, (%%rcx) \n\t"
|
||||
"vmovups %%ymm4, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovaps %%ymm5, (%%rdx) \n\t"
|
||||
"vmovups %%ymm5, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm6, (%%rcx) \n\t"
|
||||
"vmovups %%ymm6, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovaps %%ymm7, (%%rdx) \n\t"
|
||||
"vmovups %%ymm7, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm8, (%%rcx) \n\t"
|
||||
"vmovups %%ymm8, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovaps %%ymm9, (%%rdx) \n\t"
|
||||
"vmovups %%ymm9, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm10, (%%rcx) \n\t"
|
||||
"vmovups %%ymm10, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovaps %%ymm11, (%%rdx) \n\t"
|
||||
"vmovups %%ymm11, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm12, (%%rcx) \n\t"
|
||||
"vmovups %%ymm12, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovaps %%ymm13, (%%rdx) \n\t"
|
||||
"vmovups %%ymm13, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm14, (%%rcx) \n\t"
|
||||
"vmovups %%ymm14, (%%rcx) \n\t"
|
||||
//"addq %%rdi, %%rcx \n\t"
|
||||
"vmovaps %%ymm15, (%%rdx) \n\t"
|
||||
"vmovups %%ymm15, (%%rdx) \n\t"
|
||||
//"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
@@ -693,8 +661,8 @@ void bli_dgemm_asm_6x8
|
||||
//void* a_next = bli_auxinfo_next_a( data );
|
||||
//void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
dim_t k_iter = k / 4;
|
||||
dim_t k_left = k % 4;
|
||||
uint64_t k_iter = k / 4;
|
||||
uint64_t k_left = k % 4;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
@@ -935,23 +903,6 @@ void bli_dgemm_asm_6x8
|
||||
//"leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // determine if
|
||||
" \n\t" // c % 32 == 0, AND
|
||||
" \n\t" // 8*rs_c % 32 == 0, AND
|
||||
" \n\t" // cs_c == 1
|
||||
" \n\t" // ie: aligned, ldim aligned, and
|
||||
" \n\t" // row-stored
|
||||
" \n\t"
|
||||
"cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8.
|
||||
"sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 );
|
||||
"testq $31, %%rcx \n\t" // set ZF if c & 32 is zero.
|
||||
"setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 );
|
||||
"testq $31, %%rdi \n\t" // set ZF if (8*rs_c) & 32 is zero.
|
||||
"setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 );
|
||||
" \n\t" // and(bl,bh) followed by
|
||||
" \n\t" // and(bh,al) will reveal result
|
||||
" \n\t"
|
||||
" \n\t" // now avoid loading C if beta == 0
|
||||
" \n\t"
|
||||
"vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero.
|
||||
@@ -959,10 +910,8 @@ void bli_dgemm_asm_6x8
|
||||
"je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // check if aligned/row-stored
|
||||
"andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1.
|
||||
"andb %%bh, %%al \n\t" // set ZF if bh & al == 1.
|
||||
"jne .DROWSTORED \n\t" // jump to row storage case
|
||||
"cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8.
|
||||
"jz .DROWSTORED \n\t" // jump to row storage case
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
@@ -1050,63 +999,51 @@ void bli_dgemm_asm_6x8
|
||||
".DROWSTORED: \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps (%%rcx), %%ymm0 \n\t"
|
||||
"vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t"
|
||||
"vmovaps %%ymm0, (%%rcx) \n\t"
|
||||
"vfmadd231pd (%%rcx), %%ymm3, %%ymm4 \n\t"
|
||||
"vmovups %%ymm4, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovaps (%%rdx), %%ymm1 \n\t"
|
||||
"vfmadd213pd %%ymm5, %%ymm3, %%ymm1 \n\t"
|
||||
"vmovaps %%ymm1, (%%rdx) \n\t"
|
||||
"vfmadd231pd (%%rdx), %%ymm3, %%ymm5 \n\t"
|
||||
"vmovups %%ymm5, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps (%%rcx), %%ymm0 \n\t"
|
||||
"vfmadd213pd %%ymm6, %%ymm3, %%ymm0 \n\t"
|
||||
"vmovaps %%ymm0, (%%rcx) \n\t"
|
||||
"vfmadd231pd (%%rcx), %%ymm3, %%ymm6 \n\t"
|
||||
"vmovups %%ymm6, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovaps (%%rdx), %%ymm1 \n\t"
|
||||
"vfmadd213pd %%ymm7, %%ymm3, %%ymm1 \n\t"
|
||||
"vmovaps %%ymm1, (%%rdx) \n\t"
|
||||
"vfmadd231pd (%%rdx), %%ymm3, %%ymm7 \n\t"
|
||||
"vmovups %%ymm7, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps (%%rcx), %%ymm0 \n\t"
|
||||
"vfmadd213pd %%ymm8, %%ymm3, %%ymm0 \n\t"
|
||||
"vmovaps %%ymm0, (%%rcx) \n\t"
|
||||
"vfmadd231pd (%%rcx), %%ymm3, %%ymm8 \n\t"
|
||||
"vmovups %%ymm8, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovaps (%%rdx), %%ymm1 \n\t"
|
||||
"vfmadd213pd %%ymm9, %%ymm3, %%ymm1 \n\t"
|
||||
"vmovaps %%ymm1, (%%rdx) \n\t"
|
||||
"vfmadd231pd (%%rdx), %%ymm3, %%ymm9 \n\t"
|
||||
"vmovups %%ymm9, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps (%%rcx), %%ymm0 \n\t"
|
||||
"vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t"
|
||||
"vmovaps %%ymm0, (%%rcx) \n\t"
|
||||
"vfmadd231pd (%%rcx), %%ymm3, %%ymm10 \n\t"
|
||||
"vmovups %%ymm10, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovaps (%%rdx), %%ymm1 \n\t"
|
||||
"vfmadd213pd %%ymm11, %%ymm3, %%ymm1 \n\t"
|
||||
"vmovaps %%ymm1, (%%rdx) \n\t"
|
||||
"vfmadd231pd (%%rdx), %%ymm3, %%ymm11 \n\t"
|
||||
"vmovups %%ymm11, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps (%%rcx), %%ymm0 \n\t"
|
||||
"vfmadd213pd %%ymm12, %%ymm3, %%ymm0 \n\t"
|
||||
"vmovaps %%ymm0, (%%rcx) \n\t"
|
||||
"vfmadd231pd (%%rcx), %%ymm3, %%ymm12 \n\t"
|
||||
"vmovups %%ymm12, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovaps (%%rdx), %%ymm1 \n\t"
|
||||
"vfmadd213pd %%ymm13, %%ymm3, %%ymm1 \n\t"
|
||||
"vmovaps %%ymm1, (%%rdx) \n\t"
|
||||
"vfmadd231pd (%%rdx), %%ymm3, %%ymm13 \n\t"
|
||||
"vmovups %%ymm13, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps (%%rcx), %%ymm0 \n\t"
|
||||
"vfmadd213pd %%ymm14, %%ymm3, %%ymm0 \n\t"
|
||||
"vmovaps %%ymm0, (%%rcx) \n\t"
|
||||
"vfmadd231pd (%%rcx), %%ymm3, %%ymm14 \n\t"
|
||||
"vmovups %%ymm14, (%%rcx) \n\t"
|
||||
//"addq %%rdi, %%rcx \n\t"
|
||||
"vmovaps (%%rdx), %%ymm1 \n\t"
|
||||
"vfmadd213pd %%ymm15, %%ymm3, %%ymm1 \n\t"
|
||||
"vmovaps %%ymm1, (%%rdx) \n\t"
|
||||
"vfmadd231pd (%%rdx), %%ymm3, %%ymm15 \n\t"
|
||||
"vmovups %%ymm15, (%%rdx) \n\t"
|
||||
//"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
@@ -1116,10 +1053,9 @@ void bli_dgemm_asm_6x8
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".DBETAZERO: \n\t"
|
||||
" \n\t" // check if aligned/row-stored
|
||||
"andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1.
|
||||
"andb %%bh, %%al \n\t" // set ZF if bh & al == 1.
|
||||
"jne .DROWSTORBZ \n\t" // jump to row storage case
|
||||
" \n\t"
|
||||
"cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8.
|
||||
"jz .DROWSTORBZ \n\t" // jump to row storage case
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
@@ -1195,38 +1131,38 @@ void bli_dgemm_asm_6x8
|
||||
".DROWSTORBZ: \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm4, (%%rcx) \n\t"
|
||||
"vmovups %%ymm4, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovaps %%ymm5, (%%rdx) \n\t"
|
||||
"vmovups %%ymm5, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm6, (%%rcx) \n\t"
|
||||
"vmovups %%ymm6, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovaps %%ymm7, (%%rdx) \n\t"
|
||||
"vmovups %%ymm7, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm8, (%%rcx) \n\t"
|
||||
"vmovups %%ymm8, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovaps %%ymm9, (%%rdx) \n\t"
|
||||
"vmovups %%ymm9, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm10, (%%rcx) \n\t"
|
||||
"vmovups %%ymm10, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovaps %%ymm11, (%%rdx) \n\t"
|
||||
"vmovups %%ymm11, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm12, (%%rcx) \n\t"
|
||||
"vmovups %%ymm12, (%%rcx) \n\t"
|
||||
"addq %%rdi, %%rcx \n\t"
|
||||
"vmovaps %%ymm13, (%%rdx) \n\t"
|
||||
"vmovups %%ymm13, (%%rdx) \n\t"
|
||||
"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm14, (%%rcx) \n\t"
|
||||
"vmovups %%ymm14, (%%rcx) \n\t"
|
||||
//"addq %%rdi, %%rcx \n\t"
|
||||
"vmovaps %%ymm15, (%%rdx) \n\t"
|
||||
"vmovups %%ymm15, (%%rdx) \n\t"
|
||||
//"addq %%rdi, %%rdx \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
|
||||
@@ -49,8 +49,8 @@ void bli_sgemm_asm_8x4
|
||||
//void* a_next = bli_auxinfo_next_a( data );
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
dim_t k_iter = k / 4;
|
||||
dim_t k_left = k % 4;
|
||||
uint64_t k_iter = k / 4;
|
||||
uint64_t k_left = k % 4;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
@@ -851,8 +851,8 @@ void bli_dgemm_asm_4x4
|
||||
void* a_next = bli_auxinfo_next_a( data );
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
dim_t k_iter = k / 4;
|
||||
dim_t k_left = k % 4;
|
||||
uint64_t k_iter = k / 4;
|
||||
uint64_t k_left = k % 4;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
|
||||
@@ -66,8 +66,8 @@ void bli_dgemmtrsm_l_asm_4x4
|
||||
{
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
dim_t k_iter = k / 4;
|
||||
dim_t k_left = k % 4;
|
||||
uint64_t k_iter = k / 4;
|
||||
uint64_t k_left = k % 4;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
|
||||
@@ -66,8 +66,8 @@ void bli_dgemmtrsm_u_asm_4x4
|
||||
{
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
dim_t k_iter = k / 4;
|
||||
dim_t k_left = k % 4;
|
||||
uint64_t k_iter = k / 4;
|
||||
uint64_t k_left = k % 4;
|
||||
|
||||
|
||||
__asm__ volatile
|
||||
|
||||
@@ -52,8 +52,8 @@ void bli_sgemm_asm_16x3
|
||||
void* a_next = bli_auxinfo_next_a( data );
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
dim_t k_iter = k / 8;
|
||||
dim_t k_left = k % 8;
|
||||
uint64_t k_iter = k / 8;
|
||||
uint64_t k_left = k % 8;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
|
||||
@@ -52,8 +52,8 @@ void bli_sgemm_asm_8x8
|
||||
//void* a_next = bli_auxinfo_next_a( data );
|
||||
//void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
dim_t k_iter = k / 4;
|
||||
dim_t k_left = k % 4;
|
||||
uint64_t k_iter = k / 4;
|
||||
uint64_t k_left = k % 4;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
@@ -1052,8 +1052,8 @@ void bli_dgemm_asm_8x4
|
||||
//void* a_next = bli_auxinfo_next_a( data );
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
dim_t k_iter = k / 4;
|
||||
dim_t k_left = k % 4;
|
||||
uint64_t k_iter = k / 4;
|
||||
uint64_t k_left = k % 4;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
@@ -1739,8 +1739,8 @@ void bli_cgemm_asm_8x4
|
||||
//void* a_next = bli_auxinfo_next_a( data );
|
||||
void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
dim_t k_iter = k / 4;
|
||||
dim_t k_left = k % 4;
|
||||
uint64_t k_iter = k / 4;
|
||||
uint64_t k_left = k % 4;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
@@ -2715,8 +2715,8 @@ void bli_zgemm_asm_4x4
|
||||
//void* a_next = bli_auxinfo_next_a( data );
|
||||
//void* b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
dim_t k_iter = k / 4;
|
||||
dim_t k_left = k % 4;
|
||||
uint64_t k_iter = k / 4;
|
||||
uint64_t k_left = k % 4;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
|
||||
Reference in New Issue
Block a user