Merge master code as of 2016-07-25 into the amd-staging branch (by praveeng)

Change-Id: I84886ae241db2aac0bef6b7ef399f04aa8bca16d
This commit is contained in:
praveeng
2016-07-25 17:01:20 +05:30
30 changed files with 1281 additions and 746 deletions

View File

@@ -2,7 +2,7 @@
[![Build Status](https://travis-ci.org/flame/blis.svg?branch=master)](https://travis-ci.org/flame/blis)
Introduction....
Introduction
------------
BLIS is a portable software framework for instantiating high-performance

View File

@@ -170,8 +170,8 @@ ifeq ($(THREADING_MODEL),auto)
THREADING_MODEL := omp
endif
ifeq ($(THREADING_MODEL),omp)
CTHREADFLAGS := -openmp
LDFLAGS += -openmp
CTHREADFLAGS := -fopenmp
LDFLAGS += -fopenmp
endif
ifeq ($(THREADING_MODEL),pthreads)
CTHREADFLAGS := -pthread

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -43,6 +44,7 @@ void bli_packv_init
)
{
// The purpose of packm_init() is to initialize an object P so that
// a source object A can be packed into P via one of the packv
// implementations. This initialization includes acquiring a suitable
// block of memory from the memory allocator, if such a block of memory
@@ -132,15 +134,17 @@ void bli_packv_init_pack
cntx_t* cntx
)
{
num_t dt = bli_obj_datatype( *c );
dim_t dim_c = bli_obj_vector_dim( *c );
dim_t bmult = bli_cntx_get_blksz_def_dt( dt, bmult_id, cntx );
num_t dt = bli_obj_datatype( *c );
dim_t dim_c = bli_obj_vector_dim( *c );
dim_t bmult = bli_cntx_get_blksz_def_dt( dt, bmult_id, cntx );
mem_t* mem_p;
dim_t m_p_pad;
siz_t size_p;
inc_t rs_p, cs_p;
void* buf;
membrk_t* membrk = bli_cntx_membrk( cntx );
mem_t* mem_p;
dim_t m_p_pad;
siz_t size_p;
inc_t rs_p, cs_p;
void* buf;
// We begin by copying the basic fields of c.
@@ -170,8 +174,9 @@ void bli_packv_init_pack
{
// If the mem_t object of p has not yet been allocated, then acquire
// a memory block suitable for a vector.
bli_mem_acquire_v( size_p,
mem_p );
bli_membrk_acquire_v( membrk,
size_p,
mem_p );
}
else
{
@@ -179,10 +184,11 @@ void bli_packv_init_pack
// re-acquire the memory so there is sufficient space.
if ( bli_mem_size( mem_p ) < size_p )
{
bli_mem_release( mem_p );
bli_membrk_release( mem_p );
bli_mem_acquire_v( size_p,
mem_p );
bli_membrk_acquire_v( membrk,
size_p,
mem_p );
}
}
@@ -218,7 +224,7 @@ void bli_packv_release
)
{
if ( !bli_cntl_is_noop( cntl ) )
bli_obj_release_pack( p );
bli_obj_release_pack( p );
}

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -49,6 +50,9 @@ void bli_packm_cntx_init( cntx_t* cntx )
bli_gks_cntx_set_l1v_ker( BLIS_SCALV_KER, cntx );
bli_gks_cntx_set_l1v_ker( BLIS_SCAL2V_KER, cntx );
bli_gks_cntx_set_l1v_ker( BLIS_SETV_KER, cntx );
// Initialize the context with the global membrk object.
bli_cntx_set_membrk( bli_mem_global_membrk(), cntx );
}
void bli_packm_cntx_finalize( cntx_t* cntx )

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -202,23 +203,25 @@ void bli_packm_init_pack( invdiag_t invert_diag,
obj_t* p,
cntx_t* cntx )
{
num_t dt = bli_obj_datatype( *c );
trans_t transc = bli_obj_onlytrans_status( *c );
dim_t m_c = bli_obj_length( *c );
dim_t n_c = bli_obj_width( *c );
dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_m, cntx );
dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_m, cntx );
dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_n, cntx );
dim_t bmult_n_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_n, cntx );
num_t dt = bli_obj_datatype( *c );
trans_t transc = bli_obj_onlytrans_status( *c );
dim_t m_c = bli_obj_length( *c );
dim_t n_c = bli_obj_width( *c );
dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_m, cntx );
dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_m, cntx );
dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt, bmult_id_n, cntx );
dim_t bmult_n_pack = bli_cntx_get_blksz_max_dt( dt, bmult_id_n, cntx );
mem_t* mem_p;
dim_t m_p, n_p;
dim_t m_p_pad, n_p_pad;
siz_t size_p;
siz_t elem_size_p;
inc_t rs_p, cs_p;
inc_t is_p;
void* buf;
membrk_t* membrk = bli_cntx_get_membrk( cntx );
mem_t* mem_p;
dim_t m_p, n_p;
dim_t m_p_pad, n_p_pad;
siz_t size_p;
siz_t elem_size_p;
inc_t rs_p, cs_p;
inc_t is_p;
void* buf;
// We begin by copying the basic fields of c. We do NOT copy the
@@ -549,9 +552,10 @@ void bli_packm_init_pack( invdiag_t invert_diag,
{
// If the mem_t object of p has not yet been allocated, then acquire
// a memory block of type pack_buf_type.
bli_mem_acquire_m( size_p,
pack_buf_type,
mem_p );
bli_membrk_acquire_m( membrk,
size_p,
pack_buf_type,
mem_p );
}
else
{
@@ -562,10 +566,11 @@ void bli_packm_init_pack( invdiag_t invert_diag,
// pack_buf_type value.
if ( bli_mem_size( mem_p ) < size_p )
{
bli_mem_release( mem_p );
bli_mem_acquire_m( size_p,
pack_buf_type,
mem_p );
bli_membrk_release( mem_p );
bli_membrk_acquire_m( membrk,
size_p,
pack_buf_type,
mem_p );
}
}
@@ -582,7 +587,7 @@ void bli_packm_release( obj_t* p,
packm_t* cntl )
{
if ( !bli_cntl_is_noop( cntl ) )
bli_obj_release_pack( p );
bli_obj_release_pack( p );
}

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -57,6 +58,7 @@ typedef struct cntx_s
pack_t schema_b;
pack_t schema_c;
membrk_t* membrk;
} cntx_t;
*/
@@ -116,66 +118,75 @@ typedef struct cntx_s
\
( (cntx)->schema_c )
#define bli_cntx_membrk( cntx ) \
\
( (cntx)->membrk )
// cntx_t modification (fields only)
#define bli_cntx_set_blkszs_buf( _blkszs, cntx_p ) \
{ \
(cntx_p)->blkszs = _blkszs; \
(cntx_p)->blkszs = _blkszs; \
}
#define bli_cntx_set_bmults_buf( _bmults, cntx_p ) \
{ \
(cntx_p)->bmults = _bmults; \
(cntx_p)->bmults = _bmults; \
}
#define bli_cntx_set_l3_vir_ukrs_buf( _l3_vir_ukrs, cntx_p ) \
{ \
(cntx_p)->l3_vir_ukrs = _l3_vir_ukrs; \
(cntx_p)->l3_vir_ukrs = _l3_vir_ukrs; \
}
#define bli_cntx_set_l3_nat_ukrs_buf( _l3_nat_ukrs, cntx_p ) \
{ \
(cntx_p)->l3_nat_ukrs = _l3_nat_ukrs; \
(cntx_p)->l3_nat_ukrs = _l3_nat_ukrs; \
}
#define bli_cntx_set_l3_nat_ukrs_prefs_buf( _l3_nat_ukrs_prefs, cntx_p ) \
{ \
(cntx_p)->l3_nat_ukrs_prefs = _l3_nat_ukrs_prefs; \
(cntx_p)->l3_nat_ukrs_prefs = _l3_nat_ukrs_prefs; \
}
#define bli_cntx_set_l1f_kers_buf( _l1f_kers, cntx_p ) \
{ \
(cntx_p)->l1f_kers = _l1f_kers; \
(cntx_p)->l1f_kers = _l1f_kers; \
}
#define bli_cntx_set_l1v_kers_buf( _l1v_kers, cntx_p ) \
{ \
(cntx_p)->l1v_kers = _l1v_kers; \
(cntx_p)->l1v_kers = _l1v_kers; \
}
#define bli_cntx_set_packm_ukrs( _packm_ukrs, cntx_p ) \
{ \
(cntx_p)->packm_ukrs = _packm_ukrs; \
(cntx_p)->packm_ukrs = _packm_ukrs; \
}
#define bli_cntx_set_method( _method, cntx_p ) \
{ \
(cntx_p)->method = _method; \
(cntx_p)->method = _method; \
}
#define bli_cntx_set_schema_a( _schema_a, cntx_p ) \
{ \
(cntx_p)->schema_a = _schema_a; \
(cntx_p)->schema_a = _schema_a; \
}
#define bli_cntx_set_schema_b( _schema_b, cntx_p ) \
{ \
(cntx_p)->schema_b = _schema_b; \
(cntx_p)->schema_b = _schema_b; \
}
#define bli_cntx_set_schema_c( _schema_c, cntx_p ) \
{ \
(cntx_p)->schema_c = _schema_c; \
(cntx_p)->schema_c = _schema_c; \
}
#define bli_cntx_set_membrk( _membrk, cntx_p ) \
{ \
(cntx_p)->membrk = _membrk; \
}
// cntx_t query (complex)
@@ -264,6 +275,11 @@ typedef struct cntx_s
\
bli_cntx_schema_b( cntx )
#define bli_cntx_get_membrk( cntx ) \
\
bli_cntx_membrk( cntx )
// -----------------------------------------------------------------------------

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -38,207 +39,15 @@
pthread_mutex_t mem_manager_mutex = PTHREAD_MUTEX_INITIALIZER;
#endif
// Declare one memory pool structure for each block size/shape we want to
// be able to allocate.
static pool_t pools[3];
static membrk_t global_membrk;
// -----------------------------------------------------------------------------
void bli_mem_acquire_m( siz_t req_size,
packbuf_t buf_type,
mem_t* mem )
membrk_t* bli_mem_global_membrk( void )
{
pool_t* pool;
pblk_t* pblk;
dim_t pi;
siz_t block_size;
// Make sure the API is initialized.
bli_mem_init();
if ( buf_type == BLIS_BUFFER_FOR_GEN_USE )
{
// For general-use buffer requests, such as those used by level-2
// operations, dynamically allocating memory is sufficient.
void* buf_sys = bli_malloc_pool( req_size );
// Initialize the mem_t object with:
// - the address of the memory block,
// - the buffer type (a packbuf_t value), and
// - the size of the requested region.
// NOTE: We do not initialize the pool field since this block did not
// come from a memory pool.
bli_mem_set_buffer( buf_sys, mem );
bli_mem_set_buf_sys( buf_sys, mem );
bli_mem_set_buf_type( buf_type, mem );
bli_mem_set_size( req_size, mem );
}
else
{
// This branch handles cases where the memory block needs to come
// from an internal memory pool, in which blocks are allocated once
// and then recycled.
// Map the requested packed buffer type to a zero-based index, which
// we then use to select the corresponding memory pool.
pi = bli_packbuf_index( buf_type );
pool = &pools[ pi ];
// Unconditionally perform error checking on the memory pool.
{
err_t e_val;
// Make sure that the requested matrix size fits inside of a block
// of the corresponding pool. If it does not, the pool was somehow
// initialized improperly.
e_val = bli_check_requested_block_size_for_pool( req_size, pool );
bli_check_error_code( e_val );
}
// Extract the address of the pblk_t struct within the mem_t.
pblk = bli_mem_pblk( mem );
#ifdef BLIS_ENABLE_OPENMP
_Pragma( "omp critical (mem)" )
#endif
#ifdef BLIS_ENABLE_PTHREADS
pthread_mutex_lock( &mem_manager_mutex );
#endif
// BEGIN CRITICAL SECTION
{
// Checkout a block from the pool. If the pool is exhausted,
// either because it is still empty or because all blocks have
// been checked out already, additional blocks will be allocated
// automatically, as-needed. Note that the addresses are stored
// directly into the mem_t struct since pblk is the address of
// the struct's pblk_t field.
bli_pool_checkout_block( pblk, pool );
// Query the size of the blocks in the pool so we can store it in
// the mem_t object. At this point, it is guaranteed to be at
// least as large as req_size. (NOTE: We must perform the query
// within the critical section to ensure that the pool hasn't
// changed, as unlikely as that would be.)
block_size = bli_pool_block_size( pool );
}
// END CRITICAL SECTION
#ifdef BLIS_ENABLE_PTHREADS
pthread_mutex_unlock( &mem_manager_mutex );
#endif
// Initialize the mem_t object with:
// - the buffer type (a packbuf_t value),
// - the address of the memory pool to which it belongs, and
// - the size of the contiguous memory block (NOT the size of the
// requested region).
// The actual addresses (system and aligned) are already stored in
// the mem_t struct's pblk_t field
bli_mem_set_buf_type( buf_type, mem );
bli_mem_set_pool( pool, mem );
bli_mem_set_size( block_size, mem );
}
return &global_membrk;
}
void bli_mem_release( mem_t* mem )
{
packbuf_t buf_type;
pool_t* pool;
pblk_t* pblk;
siz_t block_size_cur;
siz_t block_size_prev;
// Make sure the API is initialized.
bli_mem_init();
// Extract the buffer type so we know what kind of memory was allocated.
buf_type = bli_mem_buf_type( mem );
if ( buf_type == BLIS_BUFFER_FOR_GEN_USE )
{
void* buf_sys = bli_mem_buf_sys( mem );
// For general-use buffers, we dynamically allocate memory, and so
// here we need to free.
bli_free_pool( buf_sys );
}
else
{
// Extract the address of the pool from which the memory was
// allocated.
pool = bli_mem_pool( mem );
// Extract the address of the pblk_t struct within the mem_t struct.
pblk = bli_mem_pblk( mem );
// Query the size of the blocks that were in the pool at the time
// the pblk_t was checked out. (This is used below, in the critical
// section.)
block_size_prev = bli_mem_size( mem );
#ifdef BLIS_ENABLE_OPENMP
_Pragma( "omp critical (mem)" )
#endif
#ifdef BLIS_ENABLE_PTHREADS
pthread_mutex_lock( &mem_manager_mutex );
#endif
// BEGIN CRITICAL SECTION
{
// Query the size of the blocks currently in the pool.
block_size_cur = bli_pool_block_size( pool );
// If the block size of the pool has changed since the pblk_t
// was checked out, then we need to free the pblk_t rather
// than check it back in. Why? Because the pool's block size
// has (most likely) increased to meet changing needs (example:
// larger cache blocksizes). Thus, the current pblk_t's smaller
// allocated size is of no use anymore.
if ( block_size_cur != block_size_prev )
{
// Free the pblk_t using the appropriate function in the
// pool API.
bli_pool_free_block( pblk );
}
else
{
// Check the block back into the pool.
bli_pool_checkin_block( pblk, pool );
}
}
// END CRITICAL SECTION
#ifdef BLIS_ENABLE_PTHREADS
pthread_mutex_unlock( &mem_manager_mutex );
#endif
}
// Clear the mem_t object so that it appears unallocated. This clears:
// - the pblk_t struct's fields (ie: the buffer addresses)
// - the pool field
// - the size field
// NOTE: We do not clear the buf_type field since there is no
// "uninitialized" value for packbuf_t.
bli_mem_clear( mem );
}
void bli_mem_acquire_v( siz_t req_size,
mem_t* mem )
{
bli_mem_acquire_m( req_size,
BLIS_BUFFER_FOR_GEN_USE,
mem );
}
siz_t bli_mem_pool_size( packbuf_t buf_type )
{
siz_t r_val;
@@ -251,15 +60,15 @@ siz_t bli_mem_pool_size( packbuf_t buf_type )
}
else
{
dim_t index;
dim_t pool_index;
pool_t* pool;
// Acquire the pointer to the pool corresponding to the buf_type
// provided.
index = bli_packbuf_index( buf_type );
pool = &(pools[index]);
pool_index = bli_packbuf_index( buf_type );
pool = bli_membrk_pool( pool_index, &global_membrk );
// Compute the pool "size" as the product of the block size
// and the number of blocks in the pool.
r_val = bli_pool_block_size( pool ) *
bli_pool_num_blocks( pool );
@@ -300,8 +109,8 @@ void bli_mem_init( void )
// critical section.
if ( bli_mem_is_init == FALSE )
{
// Initialize the memory pools.
bli_mem_init_pools( &cntx );
// Initialize the global membrk_t object and its memory pools.
bli_membrk_init( &cntx, &global_membrk );
// After initialization, mark the API as initialized.
bli_mem_is_init = TRUE;
@@ -332,16 +141,16 @@ void bli_mem_reinit( cntx_t* cntx )
// initialized (unlikely), we emulate the body of bli_mem_init().
if ( bli_mem_is_init == FALSE )
{
// Initialize the memory pools.
bli_mem_init_pools( cntx );
// Initialize the global membrk_t object and its memory pools.
bli_membrk_init( cntx, &global_membrk );
// After initialization, mark the API as initialized.
bli_mem_is_init = TRUE;
}
else
{
// Reinitialize the memory pools.
bli_mem_reinit_pools( cntx );
// Reinitialize the global membrk_t object's memory pools.
bli_membrk_reinit_pools( cntx, &global_membrk );
}
}
// END CRITICAL SECTION
@@ -373,8 +182,8 @@ void bli_mem_finalize( void )
// critical section.
if ( bli_mem_is_init == TRUE )
{
// Finalize the memory pools.
bli_mem_finalize_pools();
// Finalize the global membrk_t object and its memory pools.
bli_membrk_finalize( &global_membrk );
// After finalization, mark the API as uninitialized.
bli_mem_is_init = FALSE;
@@ -392,275 +201,3 @@ bool_t bli_mem_is_initialized( void )
return bli_mem_is_init;
}
// -----------------------------------------------------------------------------
void bli_mem_init_pools( cntx_t* cntx )
{
// Map each of the packbuf_t values to an index starting at zero.
const dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK );
const dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL );
const dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL );
const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE;
// Alias the pool addresses to convenient identifiers.
pool_t* pool_a = &pools[ index_a ];
pool_t* pool_b = &pools[ index_b ];
pool_t* pool_c = &pools[ index_c ];
// Start with empty pools.
const dim_t num_blocks_a = 0;
const dim_t num_blocks_b = 0;
const dim_t num_blocks_c = 0;
siz_t block_size_a = 0;
siz_t block_size_b = 0;
siz_t block_size_c = 0;
// Determine the block size for each memory pool.
bli_mem_compute_pool_block_sizes( &block_size_a,
&block_size_b,
&block_size_c,
cntx );
// Initialize the memory pools for A, B, and C.
bli_pool_init( num_blocks_a, block_size_a, align_size, pool_a );
bli_pool_init( num_blocks_b, block_size_b, align_size, pool_b );
bli_pool_init( num_blocks_c, block_size_c, align_size, pool_c );
}
void bli_mem_reinit_pools( cntx_t* cntx )
{
// Map each of the packbuf_t values to an index starting at zero.
const dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK );
const dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL );
const dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL );
const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE;
// Alias the pool addresses to convenient identifiers.
pool_t* pool_a = &pools[ index_a ];
pool_t* pool_b = &pools[ index_b ];
pool_t* pool_c = &pools[ index_c ];
// Query the number of blocks currently allocated in each pool.
const dim_t num_blocks_a = bli_pool_num_blocks( pool_a );
const dim_t num_blocks_b = bli_pool_num_blocks( pool_b );
const dim_t num_blocks_c = bli_pool_num_blocks( pool_c );
siz_t block_size_a_new = 0;
siz_t block_size_b_new = 0;
siz_t block_size_c_new = 0;
// Determine the context-implied block size needed for each pool.
bli_mem_compute_pool_block_sizes( &block_size_a_new,
&block_size_b_new,
&block_size_c_new,
cntx );
// Reinitialize the pool, but only if one of the parameters has
// changed in such a way that reinitialization would be required.
// In this case, the align_size is constant, as is num_blocks, so
// what this actually boils down to is that reinitialization of a
// pool occurs only if the block size for that pool has increased.
bli_pool_reinit_if( num_blocks_a, block_size_a_new, align_size, pool_a );
bli_pool_reinit_if( num_blocks_b, block_size_b_new, align_size, pool_b );
bli_pool_reinit_if( num_blocks_c, block_size_c_new, align_size, pool_c );
}
void bli_mem_finalize_pools( void )
{
// Map each of the packbuf_t values to an index starting at zero.
dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK );
dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL );
dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL );
// Alias the pool addresses to convenient identifiers.
pool_t* pool_a = &pools[ index_a ];
pool_t* pool_b = &pools[ index_b ];
pool_t* pool_c = &pools[ index_c ];
// Finalize the memory pools for A, B, and C.
bli_pool_finalize( pool_a );
bli_pool_finalize( pool_b );
bli_pool_finalize( pool_c );
}
// -----------------------------------------------------------------------------
void bli_mem_compute_pool_block_sizes( siz_t* bs_a,
siz_t* bs_b,
siz_t* bs_c,
cntx_t* cntx )
{
const ind_t im = bli_cntx_get_ind_method( cntx );
siz_t bs_cand_a = 0;
siz_t bs_cand_b = 0;
siz_t bs_cand_c = 0;
num_t dt;
// Compute pool block sizes for each datatype and find the maximum
// size for each pool. This is done so that new pools do not need
// to be allocated if the user switches datatypes.
for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt )
{
siz_t bs_dt_a;
siz_t bs_dt_b;
siz_t bs_dt_c;
// Avoid considering induced methods for real datatypes.
if ( bli_is_real( dt ) && im != BLIS_NAT ) continue;
bli_mem_compute_pool_block_sizes_dt( dt,
&bs_dt_a,
&bs_dt_b,
&bs_dt_c,
cntx );
bs_cand_a = bli_max( bs_dt_a, bs_cand_a );
bs_cand_b = bli_max( bs_dt_b, bs_cand_b );
bs_cand_c = bli_max( bs_dt_c, bs_cand_c );
}
// Save the results.
*bs_a = bs_cand_a;
*bs_b = bs_cand_b;
*bs_c = bs_cand_c;
}
// -----------------------------------------------------------------------------
void bli_mem_compute_pool_block_sizes_dt( num_t dt,
siz_t* bs_a,
siz_t* bs_b,
siz_t* bs_c,
cntx_t* cntx )
{
siz_t size_dt = bli_datatype_size( dt );
blksz_t* mr;
blksz_t* nr;
blksz_t* mc;
blksz_t* kc;
blksz_t* nc;
dim_t mr_dt;
dim_t nr_dt;
dim_t max_mnr_dt;
dim_t mc_max_dt;
dim_t kc_max_dt;
dim_t nc_max_dt;
dim_t packmr_dt;
dim_t packnr_dt;
dim_t max_packmnr_dt;
dim_t scale_num_dt;
dim_t scale_den_dt;
dim_t pool_mc_dt, left_mc_dt;
dim_t pool_nc_dt, left_nc_dt;
dim_t pool_kc_dt;
//
// Find the larger of the two register blocksizes.
//
// Query the mr and nr blksz_t objects for the given method of
// execution.
mr = bli_cntx_get_blksz( BLIS_MR, cntx );
nr = bli_cntx_get_blksz( BLIS_NR, cntx );
// Extract the mr and nr values specific to the current datatype.
mr_dt = bli_blksz_get_def( dt, mr );
nr_dt = bli_blksz_get_def( dt, nr );
// Find the maximum of mr and nr.
max_mnr_dt = bli_max( mr_dt, nr_dt );
//
// Define local maximum cache blocksizes.
//
// Query the mc, kc, and nc blksz_t objects for native execution.
mc = bli_cntx_get_blksz( BLIS_MC, cntx );
kc = bli_cntx_get_blksz( BLIS_KC, cntx );
nc = bli_cntx_get_blksz( BLIS_NC, cntx );
// Extract the maximum mc, kc, and nc values specific to the current
// datatype.
mc_max_dt = bli_blksz_get_max( dt, mc );
kc_max_dt = bli_blksz_get_max( dt, kc );
nc_max_dt = bli_blksz_get_max( dt, nc );
// Add max(mr,nr) to kc to make room for the nudging of kc at
// runtime to be a multiple of mr or nr for triangular operations
// trmm, trmm3, and trsm.
kc_max_dt += max_mnr_dt;
//
// Compute scaling factors.
//
// Compute integer scaling factors (numerator and denominator) used
// to account for situations when the packing register blocksizes are
// larger than the regular register blocksizes.
// In order to compute the scaling factors, we first have to determine
// whether ( packmr / mr ) is greater than ( packnr / nr ). This is
// needed ONLY because the amount of space allocated for a block of A
// and a panel of B needs to be such that MR and NR can be swapped (ie:
// A is packed with NR and B is packed with MR). This transformation is
// needed for right-side trsm when inducing an algorithm that (a) has
// favorable access patterns for column-stored C and (b) allows the
// macro-kernel to reuse the existing left-side fused gemmtrsm micro-
// kernels. We avoid integer division by cross-multiplying:
//
// ( packmr / mr ) >= ( packnr / nr )
// ( packmr / mr ) * nr >= packnr
// packmr * nr >= packnr * mr
//
// So, if packmr * nr >= packnr * mr, then we will use packmr and mr as
// our scaling factors. Otherwise, we'll use packnr and nr.
packmr_dt = bli_blksz_get_max( dt, mr );
packnr_dt = bli_blksz_get_max( dt, nr );
if ( packmr_dt * nr_dt >=
packnr_dt * mr_dt ) { scale_num_dt = packmr_dt;
scale_den_dt = mr_dt; }
else { scale_num_dt = packnr_dt;
scale_den_dt = nr_dt; }
//
// Compute pool block dimensions.
//
pool_mc_dt = ( mc_max_dt * scale_num_dt ) / scale_den_dt;
left_mc_dt = ( mc_max_dt * scale_num_dt ) % scale_den_dt;
pool_nc_dt = ( nc_max_dt * scale_num_dt ) / scale_den_dt;
left_nc_dt = ( nc_max_dt * scale_num_dt ) % scale_den_dt;
pool_kc_dt = ( kc_max_dt );
if ( left_mc_dt > 0 ) pool_mc_dt += 1;
if ( left_nc_dt > 0 ) pool_nc_dt += 1;
//
// Compute pool block sizes
//
// We add an extra micro-panel of space to the block sizes for A and B
// just to be sure any pre-loading performed by the micro-kernel does
// not cause a segmentation fault.
max_packmnr_dt = bli_max( packmr_dt, packnr_dt );
*bs_a = ( pool_mc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt;
*bs_b = ( pool_nc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt;
*bs_c = ( pool_mc_dt ) * pool_nc_dt * size_dt;
}

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -32,37 +33,21 @@
*/
#ifndef BLIS_MEM_H
#define BLIS_MEM_H
// -----------------------------------------------------------------------------
membrk_t* bli_mem_global_membrk( void );
siz_t bli_mem_pool_size( packbuf_t buf_type );
// -----------------------------------------------------------------------------
void bli_mem_init( void );
void bli_mem_reinit( cntx_t* cntx );
void bli_mem_finalize( void );
bool_t bli_mem_is_initialized( void );
// -----------------------------------------------------------------------------
void bli_mem_acquire_m( siz_t req_size,
packbuf_t buf_type,
mem_t* mem );
void bli_mem_acquire_v( siz_t req_size,
mem_t* mem );
void bli_mem_release( mem_t* mem );
siz_t bli_mem_pool_size( packbuf_t buf_type );
// -----------------------------------------------------------------------------
void bli_mem_init_pools( cntx_t* cntx );
void bli_mem_reinit_pools( cntx_t* cntx );
void bli_mem_finalize_pools( void );
void bli_mem_compute_pool_block_sizes( siz_t* bs_a,
siz_t* bs_b,
siz_t* bs_c,
cntx_t* cntx );
void bli_mem_compute_pool_block_sizes_dt( num_t dt,
siz_t* bs_a,
siz_t* bs_b,
siz_t* bs_c,
cntx_t* cntx );
#endif

578
frame/base/bli_membrk.c Normal file
View File

@@ -0,0 +1,578 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Initialize a membrk_t (memory broker) object: set up its internal
// mutex, initialize its memory pools using blocksizes from the given
// context, and install the malloc()-style function pointer used to
// service general-purpose (non-pool) buffer requests.
//
// NOTE: The mutex is initialized first because the acquire/release
// paths lock it around all pool accesses; the broker must not be
// published to other threads before this function returns.
void bli_membrk_init
     (
       cntx_t*   cntx,
       membrk_t* membrk
     )
{
	bli_mutex_init( bli_membrk_mutex( membrk ) );
	bli_membrk_init_pools( cntx, membrk );
	bli_membrk_set_malloc_fp( bli_malloc_pool, membrk );
}
// Finalize a membrk_t object, releasing the resources acquired by
// bli_membrk_init(): clear the malloc() function pointer, tear down
// the memory pools, and destroy the mutex. The mutex is finalized
// last since pool finalization may still rely on a consistent broker.
// After this call the broker must not be used until re-initialized.
void bli_membrk_finalize
     (
       membrk_t* membrk
     )
{
	bli_membrk_set_malloc_fp( NULL, membrk );
	bli_membrk_finalize_pools( membrk );
	bli_mutex_finalize( bli_membrk_mutex( membrk ) );
}
// Acquire a memory block of at least req_size bytes from the given
// memory broker and record it in the mem_t object. Buffers of type
// BLIS_BUFFER_FOR_GEN_USE are allocated directly via the broker's
// malloc()-style function; all other buffer types are checked out of
// the broker's internal memory pools under the broker's mutex.
void bli_membrk_acquire_m
     (
       membrk_t* membrk,
       siz_t     req_size,
       packbuf_t buf_type,
       mem_t*    mem
     )
{
	pool_t* pool;
	pblk_t* pblk;
	dim_t   pi;
	siz_t   block_size;

	// NOTE(review): the broker is assumed to be initialized here; the
	// original code carried a commented-out assert( membrk ) — confirm
	// whether callers guarantee initialization before re-enabling it.
	//assert( membrk ); //??

	if ( buf_type == BLIS_BUFFER_FOR_GEN_USE )
	{
		// For general-use buffer requests, such as those used by level-2
		// operations, dynamically allocating memory is sufficient.
		// Note that we use the malloc()-style memory allocation function
		// that is stored in the membrk_t object.
		void* buf_sys = bli_membrk_malloc( req_size, membrk );

		// Initialize the mem_t object with:
		// - the address of the memory block,
		// - the buffer type (a packbuf_t value),
		// - the size of the requested region,
		// - the membrk_t from which the mem_t entry was acquired.
		// NOTE: We do not initialize the pool field since this block did not
		// come from a memory pool.
		bli_mem_set_buffer( buf_sys, mem );
		bli_mem_set_buf_sys( buf_sys, mem );
		bli_mem_set_buf_type( buf_type, mem );
		bli_mem_set_size( req_size, mem );
		bli_mem_set_membrk( membrk, mem );
	}
	else
	{
		// This branch handles cases where the memory block needs to come
		// from an internal memory pool, in which blocks are allocated once
		// and then recycled.

		// Map the requested packed buffer type to a zero-based index, which
		// we then use to select the corresponding memory pool.
		pi   = bli_packbuf_index( buf_type );
		pool = bli_membrk_pool( pi, membrk );

		// Unconditionally perform error checking on the memory pool.
		{
			err_t e_val;

			// Make sure that the requested matrix size fits inside of a block
			// of the corresponding pool. If it does not, the pool was somehow
			// initialized improperly.
			e_val = bli_check_requested_block_size_for_pool( req_size, pool );
			bli_check_error_code( e_val );
		}

		// Extract the address of the pblk_t struct within the mem_t.
		pblk = bli_mem_pblk( mem );

		// BEGIN CRITICAL SECTION
		// The broker's mutex serializes all pool checkout/checkin traffic.
		bli_membrk_lock( membrk );
		{
			// Checkout a block from the pool. If the pool is exhausted,
			// either because it is still empty or because all blocks have
			// been checked out already, additional blocks will be allocated
			// automatically, as-needed. Note that the addresses are stored
			// directly into the mem_t struct since pblk is the address of
			// the struct's pblk_t field.
			bli_pool_checkout_block( pblk, pool );

			// Query the size of the blocks in the pool so we can store it in
			// the mem_t object. At this point, it is guaranteed to be at
			// least as large as req_size. (NOTE: We must perform the query
			// within the critical section to ensure that the pool hasn't
			// changed, as unlikely as that would be.)
			block_size = bli_pool_block_size( pool );
		}
		bli_membrk_unlock( membrk );
		// END CRITICAL SECTION

		// Initialize the mem_t object with:
		// - the buffer type (a packbuf_t value),
		// - the address of the memory pool to which it belongs,
		// - the size of the contiguous memory block (NOT the size of the
		//   requested region),
		// - the membrk_t from which the mem_t entry was acquired.
		// The actual addresses (system and aligned) are already stored in
		// the mem_t struct's pblk_t field
		bli_mem_set_buf_type( buf_type, mem );
		bli_mem_set_pool( pool, mem );
		bli_mem_set_size( block_size, mem );
		bli_mem_set_membrk( membrk, mem );
	}
}
// Release a memory block previously acquired via bli_membrk_acquire_m()
// (or _v()). General-use buffers are freed via the broker's free()-style
// function; pooled buffers are checked back into their pool — or freed
// outright if the pool's block size has changed since checkout — under
// the broker's mutex. The mem_t object is cleared on return.
void bli_membrk_release
     (
       mem_t* mem
     )
{
	packbuf_t buf_type;
	pool_t*   pool;
	pblk_t*   pblk;
	siz_t     block_size_cur;
	siz_t     block_size_prev;
	membrk_t* membrk;

	// Extract the membrk_t address from the mem_t object.
	membrk = bli_mem_membrk( mem );

	// Extract the buffer type so we know what kind of memory was allocated.
	buf_type = bli_mem_buf_type( mem );

	if ( buf_type == BLIS_BUFFER_FOR_GEN_USE )
	{
		void* buf_sys = bli_mem_buf_sys( mem );

		// For general-use buffers, we dynamically allocate memory, and so
		// here we need to free.
		// Note that we use the free()-style memory release function that
		// is stored in the membrk_t object.
		bli_membrk_free( buf_sys, membrk );
	}
	else
	{
		// Extract the address of the pool from which the memory was
		// allocated.
		pool = bli_mem_pool( mem );

		// Extract the address of the pblk_t struct within the mem_t struct.
		pblk = bli_mem_pblk( mem );

		// Query the size of the blocks that were in the pool at the time
		// the pblk_t was checked out. (This is used below, in the critical
		// section.)
		block_size_prev = bli_mem_size( mem );

		// BEGIN CRITICAL SECTION
		// The broker's mutex serializes all pool checkout/checkin traffic.
		bli_membrk_lock( membrk );
		{
			// Query the size of the blocks currently in the pool.
			block_size_cur = bli_pool_block_size( pool );

			// If the block size of the pool has changed since the pblk_t
			// was checked out, then we need to free the pblk_t rather
			// than check it back in. Why? Because the pool's block size
			// has (most likely) increased to meet changing needs (example:
			// larger cache blocksizes). Thus, the current pblk_t's smaller
			// allocated size is of no use anymore.
			if ( block_size_cur != block_size_prev )
			{
				// Free the pblk_t using the appropriate function in the
				// pool API.
				bli_pool_free_block( pblk );
			}
			else
			{
				// Check the block back into the pool.
				bli_pool_checkin_block( pblk, pool );
			}
		}
		bli_membrk_unlock( membrk );
		// END CRITICAL SECTION
	}

	// Clear the mem_t object so that it appears unallocated. This clears:
	// - the pblk_t struct's fields (ie: the buffer addresses)
	// - the pool field
	// - the size field
	// - the membrk field
	// NOTE: We do not clear the buf_type field since there is no
	// "uninitialized" value for packbuf_t.
	bli_mem_clear( mem );
}
void bli_membrk_acquire_v
     (
       membrk_t* membrk,
       siz_t     req_size,
       mem_t*    mem
     )
{
	// Vector buffers are not pooled; satisfy the request as a
	// general-use allocation via the matrix acquisition routine.
	bli_membrk_acquire_m
	(
	  membrk,
	  req_size,
	  BLIS_BUFFER_FOR_GEN_USE,
	  mem
	);
}
siz_t bli_membrk_pool_size
     (
       membrk_t* membrk,
       packbuf_t buf_type
     )
{
	// Report the total footprint of the pool associated with buf_type.
	// We don't (yet) track the amount of general-purpose memory that is
	// currently allocated, so report zero for that buffer type.
	if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) return 0;

	// Map the buf_type to its pool within the memory broker.
	dim_t   pi   = bli_packbuf_index( buf_type );
	pool_t* pool = bli_membrk_pool( pi, membrk );

	// The pool "size" is the product of its block size and the number of
	// blocks it holds.
	return bli_pool_block_size( pool ) * bli_pool_num_blocks( pool );
}
// -----------------------------------------------------------------------------
void bli_membrk_init_pools
     (
       cntx_t*   cntx,
       membrk_t* membrk
     )
{
	// Initialize the broker's three packing-buffer pools (blocks of A,
	// panels of B, panels of C). Each pool starts with zero blocks, at
	// the block size implied by the given context.
	const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE;

	// Locate each pool via the index derived from its packbuf_t value.
	pool_t* pool_a = bli_membrk_pool( bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK ), membrk );
	pool_t* pool_b = bli_membrk_pool( bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL ), membrk );
	pool_t* pool_c = bli_membrk_pool( bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL ), membrk );

	// Ask the context for the block size each pool should use.
	siz_t bs_a = 0;
	siz_t bs_b = 0;
	siz_t bs_c = 0;
	bli_membrk_compute_pool_block_sizes( &bs_a, &bs_b, &bs_c, cntx );

	// Initialize each pool with zero blocks preallocated.
	bli_pool_init( 0, bs_a, align_size, pool_a );
	bli_pool_init( 0, bs_b, align_size, pool_b );
	bli_pool_init( 0, bs_c, align_size, pool_c );
}
void bli_membrk_reinit_pools
     (
       cntx_t*   cntx,
       membrk_t* membrk
     )
{
	// Re-derive the context-implied block sizes and reinitialize any pool
	// whose parameters would require it. Since align_size is constant and
	// each pool keeps its current block count, reinitialization effectively
	// occurs only when a pool's block size has increased.
	const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE;

	// Locate each pool via the index derived from its packbuf_t value.
	pool_t* pool_a = bli_membrk_pool( bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK ), membrk );
	pool_t* pool_b = bli_membrk_pool( bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL ), membrk );
	pool_t* pool_c = bli_membrk_pool( bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL ), membrk );

	// Compute the block size now required for each pool.
	siz_t bs_a = 0;
	siz_t bs_b = 0;
	siz_t bs_c = 0;
	bli_membrk_compute_pool_block_sizes( &bs_a, &bs_b, &bs_c, cntx );

	// Conditionally reinitialize each pool, preserving its current number
	// of allocated blocks.
	bli_pool_reinit_if( bli_pool_num_blocks( pool_a ), bs_a, align_size, pool_a );
	bli_pool_reinit_if( bli_pool_num_blocks( pool_b ), bs_b, align_size, pool_b );
	bli_pool_reinit_if( bli_pool_num_blocks( pool_c ), bs_c, align_size, pool_c );
}
void bli_membrk_finalize_pools
     (
       membrk_t* membrk
     )
{
	// Finalize the memory pools for A, B, and C, in that order.
	const packbuf_t buf_types[3] = { BLIS_BUFFER_FOR_A_BLOCK,
	                                 BLIS_BUFFER_FOR_B_PANEL,
	                                 BLIS_BUFFER_FOR_C_PANEL };
	dim_t i;

	for ( i = 0; i < 3; ++i )
	{
		// Map the packbuf_t value to its pool and finalize that pool.
		pool_t* pool = bli_membrk_pool( bli_packbuf_index( buf_types[i] ), membrk );

		bli_pool_finalize( pool );
	}
}
// -----------------------------------------------------------------------------
void bli_membrk_compute_pool_block_sizes
     (
       siz_t*  bs_a,
       siz_t*  bs_b,
       siz_t*  bs_c,
       cntx_t* cntx
     )
{
	// Compute pool block sizes large enough for every datatype, so that
	// new pools do not need to be allocated if the user switches
	// datatypes.
	const ind_t im = bli_cntx_get_ind_method( cntx );

	siz_t max_a = 0;
	siz_t max_b = 0;
	siz_t max_c = 0;
	num_t dt;

	for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt )
	{
		siz_t dt_a;
		siz_t dt_b;
		siz_t dt_c;

		// Avoid considering induced methods for real datatypes.
		if ( im != BLIS_NAT && bli_is_real( dt ) ) continue;

		// Query the per-datatype block sizes...
		bli_membrk_compute_pool_block_sizes_dt( dt,
		                                        &dt_a,
		                                        &dt_b,
		                                        &dt_c,
		                                        cntx );

		// ...and fold them into the running maxima.
		if ( dt_a > max_a ) max_a = dt_a;
		if ( dt_b > max_b ) max_b = dt_b;
		if ( dt_c > max_c ) max_c = dt_c;
	}

	// Save the results.
	*bs_a = max_a;
	*bs_b = max_b;
	*bs_c = max_c;
}
// -----------------------------------------------------------------------------
// Compute the pool block sizes, in bytes, needed for blocks of A, panels
// of B, and panels of C when operating on datatype dt, returning them via
// bs_a, bs_b, and bs_c. The sizes are padded to tolerate kc nudging,
// MR/NR swapping (right-side trsm), and micro-kernel pre-loading.
void bli_membrk_compute_pool_block_sizes_dt
     (
       num_t   dt,
       siz_t*  bs_a,
       siz_t*  bs_b,
       siz_t*  bs_c,
       cntx_t* cntx
     )
{
	// Size, in bytes, of one element of datatype dt.
	siz_t    size_dt = bli_datatype_size( dt );

	blksz_t* mr;
	blksz_t* nr;
	blksz_t* mc;
	blksz_t* kc;
	blksz_t* nc;

	dim_t    mr_dt;
	dim_t    nr_dt;
	dim_t    max_mnr_dt;
	dim_t    mc_max_dt;
	dim_t    kc_max_dt;
	dim_t    nc_max_dt;
	dim_t    packmr_dt;
	dim_t    packnr_dt;
	dim_t    max_packmnr_dt;
	dim_t    scale_num_dt;
	dim_t    scale_den_dt;
	dim_t    pool_mc_dt, left_mc_dt;
	dim_t    pool_nc_dt, left_nc_dt;
	dim_t    pool_kc_dt;

	//
	// Find the larger of the two register blocksizes.
	//

	// Query the mr and nr blksz_t objects for the given method of
	// execution.
	mr = bli_cntx_get_blksz( BLIS_MR, cntx );
	nr = bli_cntx_get_blksz( BLIS_NR, cntx );

	// Extract the mr and nr values specific to the current datatype.
	mr_dt = bli_blksz_get_def( dt, mr );
	nr_dt = bli_blksz_get_def( dt, nr );

	// Find the maximum of mr and nr.
	max_mnr_dt = bli_max( mr_dt, nr_dt );

	//
	// Define local maximum cache blocksizes.
	//

	// Query the mc, kc, and nc blksz_t objects for native execution.
	mc = bli_cntx_get_blksz( BLIS_MC, cntx );
	kc = bli_cntx_get_blksz( BLIS_KC, cntx );
	nc = bli_cntx_get_blksz( BLIS_NC, cntx );

	// Extract the maximum mc, kc, and nc values specific to the current
	// datatype.
	mc_max_dt = bli_blksz_get_max( dt, mc );
	kc_max_dt = bli_blksz_get_max( dt, kc );
	nc_max_dt = bli_blksz_get_max( dt, nc );

	// Add max(mr,nr) to kc to make room for the nudging of kc at
	// runtime to be a multiple of mr or nr for triangular operations
	// trmm, trmm3, and trsm.
	kc_max_dt += max_mnr_dt;

	//
	// Compute scaling factors.
	//

	// Compute integer scaling factors (numerator and denominator) used
	// to account for situations when the packing register blocksizes are
	// larger than the regular register blocksizes.

	// In order to compute the scaling factors, we first have to determine
	// whether ( packmr / mr ) is greater than ( packnr / nr ). This is
	// needed ONLY because the amount of space allocated for a block of A
	// and a panel of B needs to be such that MR and NR can be swapped (ie:
	// A is packed with NR and B is packed with MR). This transformation is
	// needed for right-side trsm when inducing an algorithm that (a) has
	// favorable access patterns for column-stored C and (b) allows the
	// macro-kernel to reuse the existing left-side fused gemmtrsm micro-
	// kernels. We avoid integer division by cross-multiplying:
	//
	//   ( packmr / mr )      >= ( packnr / nr )
	//   ( packmr / mr ) * nr >=   packnr
	//     packmr * nr        >=   packnr * mr
	//
	// So, if packmr * nr >= packnr * mr, then we will use packmr and mr as
	// our scaling factors. Otherwise, we'll use packnr and nr.
	packmr_dt = bli_blksz_get_max( dt, mr );
	packnr_dt = bli_blksz_get_max( dt, nr );

	if ( packmr_dt * nr_dt >=
	     packnr_dt * mr_dt ) { scale_num_dt = packmr_dt;
	                           scale_den_dt = mr_dt; }
	else                     { scale_num_dt = packnr_dt;
	                           scale_den_dt = nr_dt; }

	//
	// Compute pool block dimensions.
	//

	// Scale the m and n cache blocksizes; kc needs no scaling.
	pool_mc_dt = ( mc_max_dt * scale_num_dt ) / scale_den_dt;
	left_mc_dt = ( mc_max_dt * scale_num_dt ) % scale_den_dt;
	pool_nc_dt = ( nc_max_dt * scale_num_dt ) / scale_den_dt;
	left_nc_dt = ( nc_max_dt * scale_num_dt ) % scale_den_dt;
	pool_kc_dt = ( kc_max_dt );

	// Round up whenever the scaled division above left a remainder.
	if ( left_mc_dt > 0 ) pool_mc_dt += 1;
	if ( left_nc_dt > 0 ) pool_nc_dt += 1;

	//
	// Compute pool block sizes
	//

	// We add an extra micro-panel of space to the block sizes for A and B
	// just to be sure any pre-loading performed by the micro-kernel does
	// not cause a segmentation fault.
	max_packmnr_dt = bli_max( packmr_dt, packnr_dt );

	*bs_a = ( pool_mc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt;
	*bs_b = ( pool_nc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt;
	*bs_c = ( pool_mc_dt                  ) * pool_nc_dt * size_dt;
}

169
frame/base/bli_membrk.h Normal file
View File

@@ -0,0 +1,169 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_MEMBRK_H
#define BLIS_MEMBRK_H
// -- Memory broker object type --

typedef struct membrk_s
{
	// One pool per packbuf_t kind (blocks of A, panels of B, panels of C);
	// indexed via bli_packbuf_index().
	pool_t    pools[3];

	// Mutex serializing checkout/checkin of pool blocks (see
	// bli_membrk_lock()/bli_membrk_unlock()).
	mtx_t     mutex;

	// malloc()/free()-style functions used for general-purpose (non-pool)
	// allocation.
	malloc_ft malloc_fp;
	free_ft   free_fp;
} membrk_t;
// Return the address of the pool at index pool_index within the broker.
#define bli_membrk_pool( pool_index, membrk_p ) \
\
	( (membrk_p)->pools + (pool_index) )

// Return the address of the broker's mutex.
#define bli_membrk_mutex( membrk_p ) \
\
	( &( (membrk_p)->mutex ) )

// Return the broker's malloc()-style function pointer.
#define bli_membrk_malloc_fp( membrk_p ) \
\
	( (membrk_p)->malloc_fp )

// Return the broker's free()-style function pointer.
#define bli_membrk_free_fp( membrk_p ) \
\
	( (membrk_p)->free_fp )

// Install a malloc()-style function pointer into the broker.
#define bli_membrk_set_malloc_fp( _malloc_fp, membrk_p ) \
{\
	(membrk_p)->malloc_fp = _malloc_fp; \
}

// Install a free()-style function pointer into the broker.
#define bli_membrk_set_free_fp( _free_fp, membrk_p ) \
{\
	(membrk_p)->free_fp = _free_fp; \
}

// Acquire the broker's mutex (used to guard pool checkout/checkin).
#define bli_membrk_lock( membrk_p ) \
{\
	bli_mutex_lock( &((membrk_p)->mutex) ); \
}

// Release the broker's mutex.
#define bli_membrk_unlock( membrk_p ) \
{\
	bli_mutex_unlock( &((membrk_p)->mutex) ); \
}

// Allocate size bytes through the broker's malloc()-style function.
#define bli_membrk_malloc( size, membrk ) \
\
	/* Call the malloc()-style function in membrk. */ \
	((membrk)->malloc_fp)( size )

// Free a buffer through the broker's free()-style function.
#define bli_membrk_free( buf_p, membrk ) \
\
	/* Call the free()-style function in membrk. */ \
	((membrk)->free_fp)( buf_p )
// -----------------------------------------------------------------------------
void bli_membrk_init
(
cntx_t* cntx,
membrk_t* membrk
);
void bli_membrk_finalize
(
membrk_t* membrk
);
void bli_membrk_acquire_m
(
membrk_t* membrk,
siz_t req_size,
packbuf_t buf_type,
mem_t* mem
);
void bli_membrk_acquire_v
(
membrk_t* membrk,
siz_t req_size,
mem_t* mem
);
void bli_membrk_release
(
mem_t* mem
);
siz_t bli_membrk_pool_size
(
membrk_t* membrk,
packbuf_t buf_type
);
// ----------------------------------------------------------------------------
void bli_membrk_init_pools
(
cntx_t* cntx,
membrk_t* membrk
);
void bli_membrk_reinit_pools
(
cntx_t* cntx,
membrk_t* membrk
);
void bli_membrk_finalize_pools
(
membrk_t* membrk
);
void bli_membrk_compute_pool_block_sizes
(
siz_t* bs_a,
siz_t* bs_b,
siz_t* bs_c,
cntx_t* cntx
);
void bli_membrk_compute_pool_block_sizes_dt
(
num_t dt,
siz_t* bs_a,
siz_t* bs_b,
siz_t* bs_c,
cntx_t* cntx
);
#endif

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -58,6 +59,10 @@
\
( (mem_p)->pool )
#define bli_mem_membrk( mem_p ) \
\
( (mem_p)->membrk )
#define bli_mem_size( mem_p ) \
\
( (mem_p)->size )
@@ -90,12 +95,17 @@
#define bli_mem_set_buf_type( buf_type0, mem_p ) \
{ \
mem_p->buf_type = buf_type0; \
(mem_p)->buf_type = buf_type0; \
}
#define bli_mem_set_pool( pool0, mem_p ) \
{ \
mem_p->pool = pool0; \
(mem_p)->pool = pool0; \
}
#define bli_mem_set_membrk( membrk0, mem_p ) \
{ \
(mem_p)->membrk = membrk0; \
}
#define bli_mem_set_size( size0, mem_p ) \
@@ -109,6 +119,7 @@
bli_mem_set_buf_sys( NULL, mem_p ); \
bli_mem_set_pool( NULL, mem_p ); \
bli_mem_set_size( 0, mem_p ); \
bli_mem_set_membrk( NULL, mem_p ); \
}

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -957,14 +958,14 @@ bli_obj_width_stored( obj )
}
// Release object's pack (and cast) memory entries back to memory manager
// Release object's pack mem_t entries back to memory manager
#define bli_obj_release_pack( obj_p ) \
{ \
mem_t* pack_mem_ = bli_obj_pack_mem( *(obj_p) ); \
\
if ( bli_mem_is_alloc( pack_mem_ ) ) \
bli_mem_release( pack_mem_ ); \
bli_membrk_release( pack_mem_ ); \
}

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -173,7 +174,6 @@ typedef scomplex f77_scomplex;
typedef dcomplex f77_dcomplex;
//
// -- BLIS info bit field offsets ----------------------------------------------
//
@@ -505,6 +505,10 @@ typedef enum
// -- BLIS misc. structure types -----------------------------------------------
//
// -- Mutex type --
typedef struct mtx_s mtx_t;
// -- Pool block type --
typedef struct
@@ -527,6 +531,19 @@ typedef struct
siz_t align_size;
} pool_t;
// -- Memory broker object type --
typedef struct membrk_s membrk_t;
/*
{
pool_t pools[3];
mtx_t mutex;
malloc_ft malloc_fp;
free_ft free_fp;
} membrk_t;
*/
// -- Memory object type --
typedef struct mem_s
@@ -534,6 +551,7 @@ typedef struct mem_s
pblk_t pblk;
packbuf_t buf_type;
pool_t* pool;
membrk_t* membrk;
siz_t size;
} mem_t;
@@ -910,6 +928,7 @@ typedef struct cntx_s
pack_t schema_b;
pack_t schema_c;
membrk_t* membrk;
} cntx_t;

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -103,6 +104,7 @@ extern "C" {
#include "bli_cntx.h"
#include "bli_gks.h"
#include "bli_ind.h"
#include "bli_membrk.h"
#include "bli_pool.h"
#include "bli_mem.h"
#include "bli_part.h"

49
frame/thread/bli_mutex.h Normal file
View File

@@ -0,0 +1,49 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_MUTEX_H
#define BLIS_MUTEX_H
// Include definitions (mostly mtx_t) specific to the method of
// multithreading.
#include "bli_mutex_single.h"
#include "bli_mutex_openmp.h"
#include "bli_mutex_pthreads.h"
// Thread mutex prototypes.
#endif

View File

@@ -0,0 +1,72 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_MUTEX_OPENMP_H
#define BLIS_MUTEX_OPENMP_H
// Define mutex_t for situations when OpenMP multithreading is enabled.
#ifdef BLIS_ENABLE_OPENMP
#include <omp.h>
// Define mtx_t.
typedef struct mtx_s
{
	omp_lock_t mutex;
} mtx_t;

// Define macros to operate on OpenMP-based mtx_t.
//
// NOTE: mtx_t wraps the raw omp_lock_t, so each macro must pass the
// address of the ->mutex field -- not the mtx_t* itself -- since the
// OpenMP lock API expects an omp_lock_t*.

// Initialize the lock; must be called before any set/unset.
#define bli_mutex_init( mtx_p ) \
{ \
	omp_init_lock( &((mtx_p)->mutex) ); \
}

// Destroy the lock; the lock must be unset when this is called.
#define bli_mutex_finalize( mtx_p ) \
{ \
	omp_destroy_lock( &((mtx_p)->mutex) ); \
}

// Block until the lock is acquired.
#define bli_mutex_lock( mtx_p ) \
{ \
	omp_set_lock( &((mtx_p)->mutex) ); \
}

// Release the lock.
#define bli_mutex_unlock( mtx_p ) \
{ \
	omp_unset_lock( &((mtx_p)->mutex) ); \
}
#endif
#endif

View File

@@ -0,0 +1,72 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_MUTEX_PTHREADS_H
#define BLIS_MUTEX_PTHREADS_H
// Define mutex_t for situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS
#include <pthread.h>
// Define mtx_t.
typedef struct mtx_s
{
	pthread_mutex_t mutex;
} mtx_t;

// Define macros to operate on pthread-based mtx_t.
//
// NOTE: mtx_t wraps the raw pthread_mutex_t, so each macro must pass the
// address of the ->mutex field -- not the mtx_t* itself -- since the
// pthreads API expects a pthread_mutex_t*.

// Initialize the mutex with default attributes (NULL attr argument;
// pthread_mutex_init() requires two arguments).
#define bli_mutex_init( mtx_p ) \
{ \
	pthread_mutex_init( &((mtx_p)->mutex), NULL ); \
}

// Destroy the mutex; it must be unlocked when this is called.
#define bli_mutex_finalize( mtx_p ) \
{ \
	pthread_mutex_destroy( &((mtx_p)->mutex) ); \
}

// Block until the mutex is acquired.
#define bli_mutex_lock( mtx_p ) \
{ \
	pthread_mutex_lock( &((mtx_p)->mutex) ); \
}

// Release the mutex.
#define bli_mutex_unlock( mtx_p ) \
{ \
	pthread_mutex_unlock( &((mtx_p)->mutex) ); \
}
#endif
#endif

View File

@@ -0,0 +1,65 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_MUTEX_SINGLE_H
#define BLIS_MUTEX_SINGLE_H
// Define mtx_t for situations when multithreading is disabled.
#ifndef BLIS_ENABLE_MULTITHREADING
// Define mtx_t.
typedef struct mtx_s
{
	// ISO C requires a struct definition to contain at least one named
	// member (an empty struct is a GNU extension); this placeholder
	// carries no meaning in single-threaded builds.
	char unused;
} mtx_t;

// Define no-op macros to operate on mtx_t when multithreading is
// disabled: with a single thread there is nothing to synchronize.
#define bli_mutex_init( mtx_p ) \
{ \
}
#define bli_mutex_finalize( mtx_p ) \
{ \
}
#define bli_mutex_lock( mtx_p ) \
{ \
}
#define bli_mutex_unlock( mtx_p ) \
{ \
}
#endif
#endif

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016 Hewlett Packard Enterprise Development LP
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -51,6 +52,9 @@
#define BLIS_ENABLE_MULTITHREADING
#endif
// Include thread mutex (mtx_t) object definitions and prototypes.
#include "bli_mutex.h"
// Include thread communicator (thrcomm_t) object definitions and prototypes.
#include "bli_thrcomm.h"

View File

@@ -63,8 +63,8 @@ void bli_sgemm_opt_8x12(
void* a_next = bli_auxinfo_next_a( data );
void* b_next = bli_auxinfo_next_b( data );
dim_t k_iter = k / 4;
dim_t k_left = k % 4;
uint64_t k_iter = k / 4;
uint64_t k_left = k % 4;
__asm__ volatile
(
@@ -1112,8 +1112,8 @@ void bli_dgemm_opt_6x8(
void* a_next = bli_auxinfo_next_a( data );
void* b_next = bli_auxinfo_next_b( data );
dim_t k_iter = k / 4;
dim_t k_left = k % 4;
uint64_t k_iter = k / 4;
uint64_t k_left = k % 4;
__asm__ volatile
(

View File

@@ -47,8 +47,8 @@ void bli_dgemm_opt_4x4
cntx_t* restrict cntx
)
{
dim_t k_iter = k / 4;
dim_t k_left = k % 4;
uint64_t k_iter = k / 4;
uint64_t k_left = k % 4;
__asm__ volatile
(

View File

@@ -271,6 +271,8 @@ void bli_dgemm_asm_30x8
int * offsetPtr = &offsets[0];
uint64_t k64 = k;
#ifdef MONITORS
int toph, topl, both, botl, midl, midh, mid2l, mid2h;
#endif
@@ -288,7 +290,7 @@ void bli_dgemm_asm_30x8
vpxord zmm0, zmm0, zmm0
vmovaps zmm1, zmm0 //clear out registers
vmovaps zmm2, zmm0
mov rsi, k //loop index
mov rsi, k64 //loop index
vmovaps zmm3, zmm0
mov r11, rs_c //load row stride
@@ -312,7 +314,7 @@ void bli_dgemm_asm_30x8
mov rcx, c //load address of c for prefetching
vmovaps zmm13, zmm0
vmovaps zmm14, zmm0
mov r8, k
mov r8, k64
vmovaps zmm15, zmm0
vmovaps zmm16, zmm0
@@ -381,7 +383,7 @@ void bli_dgemm_asm_30x8
//Alternate main loop, with no prefetching of C
//Used when <= 40 iterations
CONSIDER_UNDER_40:
mov rsi, k
mov rsi, k64
test rsi, rsi
je POSTACCUM
LOOP_UNDER_40:

View File

@@ -271,6 +271,8 @@ void bli_sgemm_asm_30x16
int * offsetPtr = &offsets[0];
uint64_t k64 = k;
#ifdef MONITORS
int toph, topl, both, botl, midl, midh, mid2l, mid2h;
#endif
@@ -288,7 +290,7 @@ void bli_sgemm_asm_30x16
vpxord zmm0, zmm0, zmm0
vmovaps zmm1, zmm0 //clear out registers
vmovaps zmm2, zmm0
mov rsi, k //loop index
mov rsi, k64 //loop index
vmovaps zmm3, zmm0
mov r11, rs_c //load row stride
@@ -312,7 +314,7 @@ void bli_sgemm_asm_30x16
mov rcx, c //load address of c for prefetching
vmovaps zmm13, zmm0
vmovaps zmm14, zmm0
mov r8, k
mov r8, k64
vmovaps zmm15, zmm0
vmovaps zmm16, zmm0
@@ -381,7 +383,7 @@ void bli_sgemm_asm_30x16
//Alternate main loop, with no prefetching of C
//Used when <= 40 iterations
CONSIDER_UNDER_40:
mov rsi, k
mov rsi, k64
test rsi, rsi
je POSTACCUM
LOOP_UNDER_40:

View File

@@ -97,8 +97,8 @@ void bli_sgemm_asm_8x8_fma4
cntx_t* restrict cntx
)
{
dim_t k_iter = k / 4;
dim_t k_left = k % 4;
uint64_t k_iter = k / 4;
uint64_t k_left = k % 4;
__asm__ volatile
(

View File

@@ -80,8 +80,8 @@ void bli_sgemm_asm_6x16
//void* a_next = bli_auxinfo_next_a( data );
//void* b_next = bli_auxinfo_next_b( data );
dim_t k_iter = k / 4;
dim_t k_left = k % 4;
uint64_t k_iter = k / 4;
uint64_t k_left = k % 4;
__asm__ volatile
(
@@ -322,23 +322,6 @@ void bli_sgemm_asm_6x16
"leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*cs_c;
" \n\t"
" \n\t"
" \n\t"
" \n\t" // determine if
" \n\t" // c % 32 == 0, AND
" \n\t" // 4*rs_c % 32 == 0, AND
" \n\t" // cs_c == 1
" \n\t" // ie: aligned, ldim aligned, and
" \n\t" // row-stored
" \n\t"
"cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4.
"sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 );
"testq $31, %%rcx \n\t" // set ZF if c & 32 is zero.
"setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 );
"testq $31, %%rdi \n\t" // set ZF if (4*rs_c) & 32 is zero.
"setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 );
" \n\t" // and(bl,bh) followed by
" \n\t" // and(bh,al) will reveal result
" \n\t"
" \n\t" // now avoid loading C if beta == 0
" \n\t"
"vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero.
@@ -346,10 +329,8 @@ void bli_sgemm_asm_6x16
"je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case
" \n\t"
" \n\t"
" \n\t" // check if aligned/row-stored
"andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1.
"andb %%bh, %%al \n\t" // set ZF if bh & al == 1.
"jne .SROWSTORED \n\t" // jump to row storage case
"cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4.
"jz .SROWSTORED \n\t" // jump to row storage case
" \n\t"
" \n\t"
" \n\t"
@@ -439,63 +420,51 @@ void bli_sgemm_asm_6x16
".SROWSTORED: \n\t"
" \n\t"
" \n\t"
"vmovaps (%%rcx), %%ymm0 \n\t"
"vfmadd213ps %%ymm4, %%ymm3, %%ymm0 \n\t"
"vmovaps %%ymm0, (%%rcx) \n\t"
"vfmadd231ps (%%rcx), %%ymm3, %%ymm4 \n\t"
"vmovups %%ymm4, (%%rcx) \n\t"
"addq %%rdi, %%rcx \n\t"
"vmovaps (%%rdx), %%ymm1 \n\t"
"vfmadd213ps %%ymm5, %%ymm3, %%ymm1 \n\t"
"vmovaps %%ymm1, (%%rdx) \n\t"
"vfmadd231ps (%%rdx), %%ymm3, %%ymm5 \n\t"
"vmovups %%ymm5, (%%rdx) \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
"vmovaps (%%rcx), %%ymm0 \n\t"
"vfmadd213ps %%ymm6, %%ymm3, %%ymm0 \n\t"
"vmovaps %%ymm0, (%%rcx) \n\t"
"vfmadd231ps (%%rcx), %%ymm3, %%ymm6 \n\t"
"vmovups %%ymm6, (%%rcx) \n\t"
"addq %%rdi, %%rcx \n\t"
"vmovaps (%%rdx), %%ymm1 \n\t"
"vfmadd213ps %%ymm7, %%ymm3, %%ymm1 \n\t"
"vmovaps %%ymm1, (%%rdx) \n\t"
"vfmadd231ps (%%rdx), %%ymm3, %%ymm7 \n\t"
"vmovups %%ymm7, (%%rdx) \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
"vmovaps (%%rcx), %%ymm0 \n\t"
"vfmadd213ps %%ymm8, %%ymm3, %%ymm0 \n\t"
"vmovaps %%ymm0, (%%rcx) \n\t"
"vfmadd231ps (%%rcx), %%ymm3, %%ymm8 \n\t"
"vmovups %%ymm8, (%%rcx) \n\t"
"addq %%rdi, %%rcx \n\t"
"vmovaps (%%rdx), %%ymm1 \n\t"
"vfmadd213ps %%ymm9, %%ymm3, %%ymm1 \n\t"
"vmovaps %%ymm1, (%%rdx) \n\t"
"vfmadd231ps (%%rdx), %%ymm3, %%ymm9 \n\t"
"vmovups %%ymm9, (%%rdx) \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
"vmovaps (%%rcx), %%ymm0 \n\t"
"vfmadd213ps %%ymm10, %%ymm3, %%ymm0 \n\t"
"vmovaps %%ymm0, (%%rcx) \n\t"
"vfmadd231ps (%%rcx), %%ymm3, %%ymm10 \n\t"
"vmovups %%ymm10, (%%rcx) \n\t"
"addq %%rdi, %%rcx \n\t"
"vmovaps (%%rdx), %%ymm1 \n\t"
"vfmadd213ps %%ymm11, %%ymm3, %%ymm1 \n\t"
"vmovaps %%ymm1, (%%rdx) \n\t"
"vfmadd231ps (%%rdx), %%ymm3, %%ymm11 \n\t"
"vmovups %%ymm11, (%%rdx) \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
"vmovaps (%%rcx), %%ymm0 \n\t"
"vfmadd213ps %%ymm12, %%ymm3, %%ymm0 \n\t"
"vmovaps %%ymm0, (%%rcx) \n\t"
"vfmadd231ps (%%rcx), %%ymm3, %%ymm12 \n\t"
"vmovups %%ymm12, (%%rcx) \n\t"
"addq %%rdi, %%rcx \n\t"
"vmovaps (%%rdx), %%ymm1 \n\t"
"vfmadd213ps %%ymm13, %%ymm3, %%ymm1 \n\t"
"vmovaps %%ymm1, (%%rdx) \n\t"
"vfmadd231ps (%%rdx), %%ymm3, %%ymm13 \n\t"
"vmovups %%ymm13, (%%rdx) \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
"vmovaps (%%rcx), %%ymm0 \n\t"
"vfmadd213ps %%ymm14, %%ymm3, %%ymm0 \n\t"
"vmovaps %%ymm0, (%%rcx) \n\t"
"vfmadd231ps (%%rcx), %%ymm3, %%ymm14 \n\t"
"vmovups %%ymm14, (%%rcx) \n\t"
//"addq %%rdi, %%rcx \n\t"
"vmovaps (%%rdx), %%ymm1 \n\t"
"vfmadd213ps %%ymm15, %%ymm3, %%ymm1 \n\t"
"vmovaps %%ymm1, (%%rdx) \n\t"
"vfmadd231ps (%%rdx), %%ymm3, %%ymm15 \n\t"
"vmovups %%ymm15, (%%rdx) \n\t"
//"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
@@ -505,10 +474,9 @@ void bli_sgemm_asm_6x16
" \n\t"
" \n\t"
".SBETAZERO: \n\t"
" \n\t" // check if aligned/row-stored
"andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1.
"andb %%bh, %%al \n\t" // set ZF if bh & al == 1.
"jne .SROWSTORBZ \n\t" // jump to row storage case
" \n\t"
"cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4.
"jz .SROWSTORBZ \n\t" // jump to row storage case
" \n\t"
" \n\t"
" \n\t"
@@ -586,38 +554,38 @@ void bli_sgemm_asm_6x16
".SROWSTORBZ: \n\t"
" \n\t"
" \n\t"
"vmovaps %%ymm4, (%%rcx) \n\t"
"vmovups %%ymm4, (%%rcx) \n\t"
"addq %%rdi, %%rcx \n\t"
"vmovaps %%ymm5, (%%rdx) \n\t"
"vmovups %%ymm5, (%%rdx) \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
"vmovaps %%ymm6, (%%rcx) \n\t"
"vmovups %%ymm6, (%%rcx) \n\t"
"addq %%rdi, %%rcx \n\t"
"vmovaps %%ymm7, (%%rdx) \n\t"
"vmovups %%ymm7, (%%rdx) \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
"vmovaps %%ymm8, (%%rcx) \n\t"
"vmovups %%ymm8, (%%rcx) \n\t"
"addq %%rdi, %%rcx \n\t"
"vmovaps %%ymm9, (%%rdx) \n\t"
"vmovups %%ymm9, (%%rdx) \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
"vmovaps %%ymm10, (%%rcx) \n\t"
"vmovups %%ymm10, (%%rcx) \n\t"
"addq %%rdi, %%rcx \n\t"
"vmovaps %%ymm11, (%%rdx) \n\t"
"vmovups %%ymm11, (%%rdx) \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
"vmovaps %%ymm12, (%%rcx) \n\t"
"vmovups %%ymm12, (%%rcx) \n\t"
"addq %%rdi, %%rcx \n\t"
"vmovaps %%ymm13, (%%rdx) \n\t"
"vmovups %%ymm13, (%%rdx) \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
"vmovaps %%ymm14, (%%rcx) \n\t"
"vmovups %%ymm14, (%%rcx) \n\t"
//"addq %%rdi, %%rcx \n\t"
"vmovaps %%ymm15, (%%rdx) \n\t"
"vmovups %%ymm15, (%%rdx) \n\t"
//"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
@@ -693,8 +661,8 @@ void bli_dgemm_asm_6x8
//void* a_next = bli_auxinfo_next_a( data );
//void* b_next = bli_auxinfo_next_b( data );
dim_t k_iter = k / 4;
dim_t k_left = k % 4;
uint64_t k_iter = k / 4;
uint64_t k_left = k % 4;
__asm__ volatile
(
@@ -935,23 +903,6 @@ void bli_dgemm_asm_6x8
//"leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*cs_c;
" \n\t"
" \n\t"
" \n\t"
" \n\t" // determine if
" \n\t" // c % 32 == 0, AND
" \n\t" // 8*rs_c % 32 == 0, AND
" \n\t" // cs_c == 1
" \n\t" // ie: aligned, ldim aligned, and
" \n\t" // row-stored
" \n\t"
"cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8.
"sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 );
"testq $31, %%rcx \n\t" // set ZF if c & 32 is zero.
"setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 );
"testq $31, %%rdi \n\t" // set ZF if (8*rs_c) & 32 is zero.
"setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 );
" \n\t" // and(bl,bh) followed by
" \n\t" // and(bh,al) will reveal result
" \n\t"
" \n\t" // now avoid loading C if beta == 0
" \n\t"
"vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero.
@@ -959,10 +910,8 @@ void bli_dgemm_asm_6x8
"je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case
" \n\t"
" \n\t"
" \n\t" // check if aligned/row-stored
"andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1.
"andb %%bh, %%al \n\t" // set ZF if bh & al == 1.
"jne .DROWSTORED \n\t" // jump to row storage case
"cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8.
"jz .DROWSTORED \n\t" // jump to row storage case
" \n\t"
" \n\t"
" \n\t"
@@ -1050,63 +999,51 @@ void bli_dgemm_asm_6x8
".DROWSTORED: \n\t"
" \n\t"
" \n\t"
"vmovaps (%%rcx), %%ymm0 \n\t"
"vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t"
"vmovaps %%ymm0, (%%rcx) \n\t"
"vfmadd231pd (%%rcx), %%ymm3, %%ymm4 \n\t"
"vmovups %%ymm4, (%%rcx) \n\t"
"addq %%rdi, %%rcx \n\t"
"vmovaps (%%rdx), %%ymm1 \n\t"
"vfmadd213pd %%ymm5, %%ymm3, %%ymm1 \n\t"
"vmovaps %%ymm1, (%%rdx) \n\t"
"vfmadd231pd (%%rdx), %%ymm3, %%ymm5 \n\t"
"vmovups %%ymm5, (%%rdx) \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
"vmovaps (%%rcx), %%ymm0 \n\t"
"vfmadd213pd %%ymm6, %%ymm3, %%ymm0 \n\t"
"vmovaps %%ymm0, (%%rcx) \n\t"
"vfmadd231pd (%%rcx), %%ymm3, %%ymm6 \n\t"
"vmovups %%ymm6, (%%rcx) \n\t"
"addq %%rdi, %%rcx \n\t"
"vmovaps (%%rdx), %%ymm1 \n\t"
"vfmadd213pd %%ymm7, %%ymm3, %%ymm1 \n\t"
"vmovaps %%ymm1, (%%rdx) \n\t"
"vfmadd231pd (%%rdx), %%ymm3, %%ymm7 \n\t"
"vmovups %%ymm7, (%%rdx) \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
"vmovaps (%%rcx), %%ymm0 \n\t"
"vfmadd213pd %%ymm8, %%ymm3, %%ymm0 \n\t"
"vmovaps %%ymm0, (%%rcx) \n\t"
"vfmadd231pd (%%rcx), %%ymm3, %%ymm8 \n\t"
"vmovups %%ymm8, (%%rcx) \n\t"
"addq %%rdi, %%rcx \n\t"
"vmovaps (%%rdx), %%ymm1 \n\t"
"vfmadd213pd %%ymm9, %%ymm3, %%ymm1 \n\t"
"vmovaps %%ymm1, (%%rdx) \n\t"
"vfmadd231pd (%%rdx), %%ymm3, %%ymm9 \n\t"
"vmovups %%ymm9, (%%rdx) \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
"vmovaps (%%rcx), %%ymm0 \n\t"
"vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t"
"vmovaps %%ymm0, (%%rcx) \n\t"
"vfmadd231pd (%%rcx), %%ymm3, %%ymm10 \n\t"
"vmovups %%ymm10, (%%rcx) \n\t"
"addq %%rdi, %%rcx \n\t"
"vmovaps (%%rdx), %%ymm1 \n\t"
"vfmadd213pd %%ymm11, %%ymm3, %%ymm1 \n\t"
"vmovaps %%ymm1, (%%rdx) \n\t"
"vfmadd231pd (%%rdx), %%ymm3, %%ymm11 \n\t"
"vmovups %%ymm11, (%%rdx) \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
"vmovaps (%%rcx), %%ymm0 \n\t"
"vfmadd213pd %%ymm12, %%ymm3, %%ymm0 \n\t"
"vmovaps %%ymm0, (%%rcx) \n\t"
"vfmadd231pd (%%rcx), %%ymm3, %%ymm12 \n\t"
"vmovups %%ymm12, (%%rcx) \n\t"
"addq %%rdi, %%rcx \n\t"
"vmovaps (%%rdx), %%ymm1 \n\t"
"vfmadd213pd %%ymm13, %%ymm3, %%ymm1 \n\t"
"vmovaps %%ymm1, (%%rdx) \n\t"
"vfmadd231pd (%%rdx), %%ymm3, %%ymm13 \n\t"
"vmovups %%ymm13, (%%rdx) \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
"vmovaps (%%rcx), %%ymm0 \n\t"
"vfmadd213pd %%ymm14, %%ymm3, %%ymm0 \n\t"
"vmovaps %%ymm0, (%%rcx) \n\t"
"vfmadd231pd (%%rcx), %%ymm3, %%ymm14 \n\t"
"vmovups %%ymm14, (%%rcx) \n\t"
//"addq %%rdi, %%rcx \n\t"
"vmovaps (%%rdx), %%ymm1 \n\t"
"vfmadd213pd %%ymm15, %%ymm3, %%ymm1 \n\t"
"vmovaps %%ymm1, (%%rdx) \n\t"
"vfmadd231pd (%%rdx), %%ymm3, %%ymm15 \n\t"
"vmovups %%ymm15, (%%rdx) \n\t"
//"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
@@ -1116,10 +1053,9 @@ void bli_dgemm_asm_6x8
" \n\t"
" \n\t"
".DBETAZERO: \n\t"
" \n\t" // check if aligned/row-stored
"andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1.
"andb %%bh, %%al \n\t" // set ZF if bh & al == 1.
"jne .DROWSTORBZ \n\t" // jump to row storage case
" \n\t"
"cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8.
"jz .DROWSTORBZ \n\t" // jump to row storage case
" \n\t"
" \n\t"
" \n\t"
@@ -1195,38 +1131,38 @@ void bli_dgemm_asm_6x8
".DROWSTORBZ: \n\t"
" \n\t"
" \n\t"
"vmovaps %%ymm4, (%%rcx) \n\t"
"vmovups %%ymm4, (%%rcx) \n\t"
"addq %%rdi, %%rcx \n\t"
"vmovaps %%ymm5, (%%rdx) \n\t"
"vmovups %%ymm5, (%%rdx) \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
"vmovaps %%ymm6, (%%rcx) \n\t"
"vmovups %%ymm6, (%%rcx) \n\t"
"addq %%rdi, %%rcx \n\t"
"vmovaps %%ymm7, (%%rdx) \n\t"
"vmovups %%ymm7, (%%rdx) \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
"vmovaps %%ymm8, (%%rcx) \n\t"
"vmovups %%ymm8, (%%rcx) \n\t"
"addq %%rdi, %%rcx \n\t"
"vmovaps %%ymm9, (%%rdx) \n\t"
"vmovups %%ymm9, (%%rdx) \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
"vmovaps %%ymm10, (%%rcx) \n\t"
"vmovups %%ymm10, (%%rcx) \n\t"
"addq %%rdi, %%rcx \n\t"
"vmovaps %%ymm11, (%%rdx) \n\t"
"vmovups %%ymm11, (%%rdx) \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
"vmovaps %%ymm12, (%%rcx) \n\t"
"vmovups %%ymm12, (%%rcx) \n\t"
"addq %%rdi, %%rcx \n\t"
"vmovaps %%ymm13, (%%rdx) \n\t"
"vmovups %%ymm13, (%%rdx) \n\t"
"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
"vmovaps %%ymm14, (%%rcx) \n\t"
"vmovups %%ymm14, (%%rcx) \n\t"
//"addq %%rdi, %%rcx \n\t"
"vmovaps %%ymm15, (%%rdx) \n\t"
"vmovups %%ymm15, (%%rdx) \n\t"
//"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"

View File

@@ -49,8 +49,8 @@ void bli_sgemm_asm_8x4
//void* a_next = bli_auxinfo_next_a( data );
void* b_next = bli_auxinfo_next_b( data );
dim_t k_iter = k / 4;
dim_t k_left = k % 4;
uint64_t k_iter = k / 4;
uint64_t k_left = k % 4;
__asm__ volatile
(
@@ -851,8 +851,8 @@ void bli_dgemm_asm_4x4
void* a_next = bli_auxinfo_next_a( data );
void* b_next = bli_auxinfo_next_b( data );
dim_t k_iter = k / 4;
dim_t k_left = k % 4;
uint64_t k_iter = k / 4;
uint64_t k_left = k % 4;
__asm__ volatile
(

View File

@@ -66,8 +66,8 @@ void bli_dgemmtrsm_l_asm_4x4
{
void* b_next = bli_auxinfo_next_b( data );
dim_t k_iter = k / 4;
dim_t k_left = k % 4;
uint64_t k_iter = k / 4;
uint64_t k_left = k % 4;
__asm__ volatile
(

View File

@@ -66,8 +66,8 @@ void bli_dgemmtrsm_u_asm_4x4
{
void* b_next = bli_auxinfo_next_b( data );
dim_t k_iter = k / 4;
dim_t k_left = k % 4;
uint64_t k_iter = k / 4;
uint64_t k_left = k % 4;
__asm__ volatile

View File

@@ -52,8 +52,8 @@ void bli_sgemm_asm_16x3
void* a_next = bli_auxinfo_next_a( data );
void* b_next = bli_auxinfo_next_b( data );
dim_t k_iter = k / 8;
dim_t k_left = k % 8;
uint64_t k_iter = k / 8;
uint64_t k_left = k % 8;
__asm__ volatile
(

View File

@@ -52,8 +52,8 @@ void bli_sgemm_asm_8x8
//void* a_next = bli_auxinfo_next_a( data );
//void* b_next = bli_auxinfo_next_b( data );
dim_t k_iter = k / 4;
dim_t k_left = k % 4;
uint64_t k_iter = k / 4;
uint64_t k_left = k % 4;
__asm__ volatile
(
@@ -1052,8 +1052,8 @@ void bli_dgemm_asm_8x4
//void* a_next = bli_auxinfo_next_a( data );
void* b_next = bli_auxinfo_next_b( data );
dim_t k_iter = k / 4;
dim_t k_left = k % 4;
uint64_t k_iter = k / 4;
uint64_t k_left = k % 4;
__asm__ volatile
(
@@ -1739,8 +1739,8 @@ void bli_cgemm_asm_8x4
//void* a_next = bli_auxinfo_next_a( data );
void* b_next = bli_auxinfo_next_b( data );
dim_t k_iter = k / 4;
dim_t k_left = k % 4;
uint64_t k_iter = k / 4;
uint64_t k_left = k % 4;
__asm__ volatile
(
@@ -2715,8 +2715,8 @@ void bli_zgemm_asm_4x4
//void* a_next = bli_auxinfo_next_a( data );
//void* b_next = bli_auxinfo_next_b( data );
dim_t k_iter = k / 4;
dim_t k_left = k % 4;
uint64_t k_iter = k / 4;
uint64_t k_left = k % 4;
__asm__ volatile
(