"Merge Selective Packing code from amd branch flame/blis"

Change-Id: Ifbdf49735f56a66fbbc96dab6d3ca6069302daed
This commit is contained in:
Devrajegowda, Kiran
2019-12-16 14:46:19 +05:30
committed by dzambare
parent 307ddc3110
commit 6b5c68b9ed
52 changed files with 4202 additions and 762 deletions

View File

@@ -176,6 +176,16 @@ void bli_cntx_init_haswell( cntx_t* cntx )
cntx
);
#if 0
// Initialize the context with the sup handlers.
bli_cntx_set_l3_sup_handlers
(
1,
BLIS_GEMM, bli_gemmsup_ref,
cntx
);
#endif
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(

View File

@@ -186,6 +186,14 @@ void bli_cntx_init_zen( cntx_t* cntx )
cntx
);
// Initialize the context with the sup handlers.
bli_cntx_set_l3_sup_handlers
(
1,
BLIS_GEMM, bli_gemmsup_ref,
cntx
);
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(

View File

@@ -73,7 +73,11 @@
// Prototype reference implementation of small/unpacked matrix handler.
#include "bli_l3_sup_ref.h"
#include "bli_l3_sup_int.h"
#include "bli_l3_sup_vars.h"
#include "bli_l3_sup_packm_a.h"
#include "bli_l3_sup_packm_b.h"
#include "bli_l3_sup_packm_var.h"
// Prototype microkernel wrapper APIs.
#include "bli_l3_ukr_oapi.h"

View File

@@ -104,14 +104,6 @@ err_t bli_gemmsup
// that function assumes the context pointer is valid.
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
#if 0
// Initialize a local runtime with global settings if necessary. Note
// that in the case that a runtime is passed in, we make a local copy.
rntm_t rntm_l;
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; }
else { rntm_l = *rntm; rntm = &rntm_l; }
#endif
// Return early if a microkernel preference-induced transposition would
// have been performed and shifted the dimensions outside of the space
// of sup-handled problems.
@@ -138,6 +130,12 @@ err_t bli_gemmsup
}
}
// Initialize a local runtime with global settings if necessary. Note
// that in the case that a runtime is passed in, we make a local copy.
rntm_t rntm_l;
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
else { rntm_l = *rntm; rntm = &rntm_l; }
#if 0
const num_t dt = bli_obj_dt( c );
const dim_t m = bli_obj_length( c );

173
frame/3/bli_l3_sup_int.c Normal file
View File

@@ -0,0 +1,173 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// bli_gemmsup_int() selects which reference gemmsup macrokernel variant to
// execute, and whether to execute it with an induced transposition of the
// operation.
//
// The decision is made in two steps:
//  1. The storage combination of c, a, and b (stor_id) is compared against
//     the row/column preference that the context reports for the sup kernel
//     registered for that combination. If they agree, the "primary" branch
//     is taken and the variants are called with BLIS_NO_TRANSPOSE; otherwise
//     the variants are called with BLIS_TRANSPOSE.
//  2. Within each branch, the number of whole MR-sized tiles along the
//     (effective) m dimension is compared with the number of whole NR-sized
//     tiles along the (effective) n dimension; var2m is used when the former
//     is at least as large as the latter, and var1n otherwise.
//
// Returns BLIS_FAILURE, without calling any variant, when the storage
// combination is BLIS_XXX (one of the matrices uses general stride).
// Otherwise returns BLIS_SUCCESS after the chosen variant has been called.
//
err_t bli_gemmsup_int
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );

    // Don't use the small/unpacked implementation if one of the matrices
    // uses general stride.
    if ( stor_id == BLIS_XXX ) return BLIS_FAILURE;

    const bool_t is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
                                        stor_id == BLIS_RRC ||
                                        stor_id == BLIS_RCR ||
                                        stor_id == BLIS_CRR );
    const bool_t is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;

    const num_t  dt       = bli_obj_dt( c );
    const bool_t row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );

    // The storage combination is "primary" when it falls in the group that
    // matches the kernel's preference.
    const bool_t is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
                                         : is_rcc_crc_ccr_ccc );

    if ( is_primary )
    {
        // This branch handles:
        //  - rrr rrc rcr crr for row-preferential kernels
        //  - rcc crc ccr ccc for column-preferential kernels

        const dim_t m  = bli_obj_length( c );
        const dim_t n  = bli_obj_width( c );
        const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
        const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );

        // Number of whole register-block tiles along each dimension.
        const dim_t mu = m / MR;
        const dim_t nu = n / NR;

        if ( mu >= nu )
        {
            #ifdef TRACEVAR
            printf( "bli_l3_sup_int(): var2m primary\n" );
            #endif
            // block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2()
            bli_gemmsup_ref_var2m( BLIS_NO_TRANSPOSE,
                                   alpha, a, b, beta, c,
                                   stor_id, cntx, rntm, cntl, thread );
        }
        else // if ( mu < nu )
        {
            #ifdef TRACEVAR
            printf( "bli_l3_sup_int(): var1n primary\n" );
            #endif
            // panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1()
            bli_gemmsup_ref_var1n( BLIS_NO_TRANSPOSE,
                                   alpha, a, b, beta, c,
                                   stor_id, cntx, rntm, cntl, thread );
        }
    }
    else
    {
        // This branch handles:
        //  - rrr rrc rcr crr for column-preferential kernels
        //  - rcc crc ccr ccc for row-preferential kernels

        // Because the operation will be transposed, the roles of the
        // dimensions of c are exchanged before counting tiles.
        const dim_t mt = bli_obj_width( c );
        const dim_t nt = bli_obj_length( c );
        const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
        const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );

        const dim_t mu = mt / MR;
        const dim_t nu = nt / NR;

        if ( mu >= nu )
        {
            #ifdef TRACEVAR
            printf( "bli_l3_sup_int(): var2m non-primary\n" );
            #endif
            // panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans
            bli_gemmsup_ref_var2m( BLIS_TRANSPOSE,
                                   alpha, a, b, beta, c,
                                   stor_id, cntx, rntm, cntl, thread );
        }
        else // if ( mu < nu )
        {
            #ifdef TRACEVAR
            printf( "bli_l3_sup_int(): var1n non-primary\n" );
            #endif
            // block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans
            bli_gemmsup_ref_var1n( BLIS_TRANSPOSE,
                                   alpha, a, b, beta, c,
                                   stor_id, cntx, rntm, cntl, thread );
        }
        // *requires nudging of mc,nc up to be a multiple of nr,mr.
    }

    // Return success so that the caller knows that we computed the solution.
    return BLIS_SUCCESS;
}

46
frame/3/bli_l3_sup_int.h Normal file
View File

@@ -0,0 +1,46 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototype for the gemmsup variant dispatcher.
//
// The definition computes the storage combination of c, a, and b and, based
// on the sup kernel's row/column preference and the relative number of
// MR- and NR-sized tiles in c, forwards all arguments to either
// bli_gemmsup_ref_var2m() or bli_gemmsup_ref_var1n(), with or without an
// induced transposition.
//
// Returns BLIS_FAILURE (without computing anything) when the storage
// combination is BLIS_XXX, and BLIS_SUCCESS otherwise.
err_t bli_gemmsup_int
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
);

View File

@@ -0,0 +1,115 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
bool_t will_pack, \
packbuf_t pack_buf_type, \
stor3_t stor_id, \
dim_t m, \
dim_t k, \
dim_t mr, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
bool_t did_pack, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
bool_t will_pack, \
stor3_t stor_id, \
pack_t* restrict schema, \
dim_t m, \
dim_t k, \
dim_t mr, \
dim_t* restrict m_max, \
dim_t* restrict k_max, \
ctype* x, inc_t rs_x, inc_t cs_x, \
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
dim_t* restrict pd_p, inc_t* restrict ps_p, \
cntx_t* restrict cntx, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
INSERT_GENTPROT_BASIC0( packm_sup_init_a )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
bool_t will_pack, \
stor3_t stor_id, \
trans_t transc, \
dim_t m, \
dim_t k, \
dim_t mr, \
ctype* restrict kappa, \
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
inc_t* restrict ps_p, \
cntx_t* restrict cntx, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
INSERT_GENTPROT_BASIC0( packm_sup_a )

View File

@@ -0,0 +1,115 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
bool_t will_pack, \
packbuf_t pack_buf_type, \
stor3_t stor_id, \
dim_t k, \
dim_t n, \
dim_t nr, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
bool_t did_pack, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
bool_t will_pack, \
stor3_t stor_id, \
pack_t* restrict schema, \
dim_t k, \
dim_t n, \
dim_t nr, \
dim_t* restrict k_max, \
dim_t* restrict n_max, \
ctype* x, inc_t rs_x, inc_t cs_x, \
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
dim_t* restrict pd_p, inc_t* restrict ps_p, \
cntx_t* restrict cntx, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
INSERT_GENTPROT_BASIC0( packm_sup_init_b )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
bool_t will_pack, \
stor3_t stor_id, \
trans_t transc, \
dim_t k, \
dim_t n, \
dim_t nr, \
ctype* restrict kappa, \
ctype* restrict x, inc_t rs_x, inc_t cs_x, \
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
inc_t* restrict ps_p, \
cntx_t* restrict cntx, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
INSERT_GENTPROT_BASIC0( packm_sup_b )

View File

@@ -0,0 +1,329 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-like interfaces to the variants.
//
/*
   Type-generic definition of packm_sup_var1.

   Packs the m x n matrix c (with strides rs_c, cs_c) into a sequence of
   micropanels beginning at p. Each micropanel has a maximum panel dimension
   of pd_p, and consecutive micropanels are ps_p elements apart. The
   micropanels are assigned to threads via bli_thread_range_jrir() and
   bli_packm_my_iter(), and each assigned micropanel is packed by the
   type-specific packm_cxk kernel wrapper.
*/
#undef  GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       trans_t             transc, \
       pack_t              schema, \
       dim_t               m, \
       dim_t               n, \
       dim_t               m_max, \
       dim_t               n_max, \
       ctype*     restrict kappa, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       ctype*     restrict p, inc_t rs_p, inc_t cs_p, \
                              dim_t pd_p, inc_t ps_p, \
       cntx_t*    restrict cntx, \
       thrinfo_t* restrict thread \
     ) \
{ \
    /* Only the conjugation component of transc is passed to the kernel. */ \
    const conj_t conj_c = bli_extract_conj( transc ); \
\
    /* A requested transposition of c is absorbed by exchanging its row and \
       column strides, so the rest of the function can ignore it. */ \
    if ( bli_does_trans( transc ) ) \
    { \
        bli_swap_incs( &rs_c, &cs_c ); \
        bli_toggle_trans( &transc ); \
    } \
\
    /* The schema bit tested here describes the form of the micropanel \
       (row panel vs. column panel), not the storage within the micropanel. \
       Column panels are therefore the row-stored case. */ \
    const bool_t col_panels = bli_is_col_packed( schema ); \
\
    /* Column panels iterate over n with panels of length m; row panels \
       iterate over m with panels of length n. Strides follow suit. */ \
    const dim_t total_dim = ( col_panels ? n     : m     ); \
    const dim_t len_full  = ( col_panels ? m     : n     ); \
    const dim_t len_max   = ( col_panels ? m_max : n_max ); \
    const inc_t c_vs      = ( col_panels ? cs_c  : rs_c  ); \
    const inc_t c_ld      = ( col_panels ? rs_c  : cs_c  ); \
    const inc_t p_ld      = ( col_panels ? rs_p  : cs_p  ); \
    const dim_t dim_max   = pd_p; \
\
    /* Number of micropanels, counting a final partial one if present. */ \
    const dim_t n_panels = total_dim / dim_max + \
                           ( total_dim % dim_max ? 1 : 0 ); \
\
    /* Thread count and id from this thread's thrinfo_t node. The void casts \
       silence warnings when bli_packm_my_iter() does not use them. */ \
    const dim_t nt  = bli_thread_n_way( thread ); \
    const dim_t tid = bli_thread_work_id( thread ); \
    ( void )nt; \
    ( void )tid; \
\
    /* Range of micropanel indices for this thread. The definition of \
       bli_thread_range_jrir() depends on whether slab or round-robin \
       partitioning was selected at configure-time. */ \
    dim_t it_start, it_end, it_inc; \
    bli_thread_range_jrir( thread, n_panels, 1, FALSE, \
                           &it_start, &it_end, &it_inc ); \
\
    /* Walk every logical micropanel. Every thread advances the source \
       offset and destination pointer for every micropanel, but only packs \
       the ones assigned to it. */ \
    ctype* restrict p_panel = p; \
    dim_t           ic      = 0; \
\
    for ( dim_t it = 0; it < n_panels; it += 1 ) \
    { \
        if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
        { \
            /* The final micropanel may be narrower than dim_max. */ \
            const dim_t     dim_cur = bli_min( dim_max, total_dim - ic ); \
            ctype* restrict c_panel = c + ic * c_vs; \
\
            PASTEMAC(ch,packm_cxk) \
            ( \
              conj_c, \
              schema, \
              dim_cur, \
              dim_max, \
              len_full, \
              len_max, \
              kappa, \
              c_panel, c_vs, c_ld, \
              p_panel, p_ld, \
              cntx \
            ); \
        } \
\
        /* Consecutive micropanels are ps_p elements apart in p. */ \
        p_panel += ps_p; \
        ic      += dim_max; \
    } \
}

INSERT_GENTFUNCR_BASIC( packm, packm_sup_var1 )
/*
if ( row_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
if ( col_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
*/
/*
if ( row_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b packed", *m_panel_max, *n_panel_max, \
p_use, rs_p, cs_p, "%5.2f", "" ); \
else \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a packed", *m_panel_max, *n_panel_max, \
p_use, rs_p, cs_p, "%5.2f", "" ); \
*/ \
\
/*
if ( col_stored ) { \
if ( bli_thread_work_id( thread ) == 0 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
} \
else { \
if ( bli_thread_work_id( thread ) == 0 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_obarrier( thread ); \
} \
*/
/*
if ( bli_is_4mi_packed( schema ) ) { \
printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \
if ( col_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
if ( row_stored ) { \
if ( 0 ) \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
} \
} \
*/
/*
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
*/
/*
if ( row_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
inc_t is_b = rs_p * *m_panel_max; \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
} \
*/
/*
if ( col_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
} \
*/

View File

@@ -0,0 +1,60 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-like interfaces to the variants.
//
/*
   Prototype generator for the type-specific packm_sup_var1 functions.

   The definition packs the m x n matrix c (strides rs_c, cs_c; transc
   supplies conjugation and an optional transposition) into micropanels
   starting at p. Each micropanel has maximum panel dimension pd_p, and
   consecutive micropanels are ps_p elements apart. m_max and n_max are
   passed to the packm_cxk kernel as the maximum panel length. The
   micropanels are partitioned among threads using the thrinfo_t node.
*/
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
trans_t transc, \
pack_t schema, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
dim_t pd_p, inc_t ps_p, \
cntx_t* restrict cntx, \
thrinfo_t* restrict thread \
);
INSERT_GENTPROT_BASIC0( packm_sup_var1 )

View File

@@ -45,6 +45,11 @@ err_t bli_gemmsup_ref
rntm_t* rntm
)
{
// This function implements the default gemmsup handler. If you are a
// BLIS developer and wish to use a different gemmsup handler, please
// register a different function pointer in the context in your
// sub-configuration's bli_cntx_init_*() function.
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_gemm_check( alpha, a, b, beta, c, cntx );
@@ -85,6 +90,14 @@ err_t bli_gemmsup_ref
//bli_rntm_set_pack_a( 0, rntm );
//bli_rntm_set_pack_b( 0, rntm );
#endif
//bli_rntm_set_pack_a( 0, rntm );
//bli_rntm_set_pack_b( 0, rntm );
// May not need these here since packm_sup infers the schemas based
// on the stor3_t id. (This would also mean that they don't need to
// be passed into the thread decorator below.)
//pack_t schema_a = BLIS_PACKED_ROW_PANELS;
//pack_t schema_b = BLIS_PACKED_COL_PANELS;
return
bli_l3_sup_thread_decorator

View File

@@ -119,6 +119,9 @@ void bli_gemmsup_ref_var1n
const bool packa = bli_rntm_pack_a( rntm );
const bool packb = bli_rntm_pack_b( rntm );
const bool_t packa = bli_rntm_pack_a( rntm );
const bool_t packb = bli_rntm_pack_b( rntm );
const conj_t conja = bli_obj_conj_status( a );
const conj_t conjb = bli_obj_conj_status( b );
@@ -186,6 +189,8 @@ void bli_gemmsup_ref_var1n
// Invoke the function.
f
(
packa,
packb,
conja,
conjb,
m,
@@ -207,6 +212,8 @@ void bli_gemmsup_ref_var1n
// Invoke the function (transposing the operation).
f
(
packb,
packa,
conjb, // swap the conj values.
conja,
n, // swap the m and n dimensions.
@@ -249,6 +256,8 @@ void PASTEMAC(ch,varname) \
thrinfo_t* restrict thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* If m or n is zero, return immediately. */ \
if ( bli_zero_dim2( m, n ) ) return; \
\
@@ -270,16 +279,16 @@ void PASTEMAC(ch,varname) \
} \
return; \
} \
\
const num_t dt = PASTEMAC(ch,type); \
\
/* This transposition of the stor3_t id value is inherent to variant 1.
The reason: we assume that variant 2 is the "main" variant. The
consequence of this is that we assume that the millikernels that
iterate over m are registered to the kernel group associated with
the kernel preference. So, regardless of whether the mkernels are
row- or column-preferential, millikernels that iterate over n are
always placed in the slots for the opposite kernel group. */ \
iterate over m are registered to the "primary" kernel group associated
with the kernel IO preference; similarly, mkernels that iterate over
n are assumed to be registered to the "non-primary" group associated
with the ("non-primary") anti-preference. Note that this pattern holds
regardless of whether the mkernel set has a row or column preference.)
See bli_l3_sup_int.c for a higher-level view of how this choice is made. */ \
stor_id = bli_stor3_trans( stor_id ); \
\
/* Query the context for various blocksizes. */ \
@@ -326,7 +335,9 @@ void PASTEMAC(ch,varname) \
else KC = (( KC0 / 5 ) / 4 ) * 4; \
} \
\
/* Nudge NC up to a multiple of MR and MC up to a multiple of NR. */ \
/* Nudge NC up to a multiple of MR and MC up to a multiple of NR.
NOTE: This is unique to variant 1 (ie: not performed in variant 2)
because MC % MR == 0 and NC % NR == 0 is already enforced at runtime. */ \
const dim_t NC = bli_align_dim_to_mult( NC0, MR ); \
const dim_t MC = bli_align_dim_to_mult( MC0, NR ); \
\
@@ -346,7 +357,11 @@ void PASTEMAC(ch,varname) \
const inc_t icstep_b = cs_b; \
\
const inc_t jrstep_c = rs_c * MR; \
\
/*
const inc_t jrstep_a = rs_a * MR; \
( void )jrstep_a; \
*/ \
\
const inc_t irstep_c = cs_c * NR; \
const inc_t irstep_b = cs_b * NR; \
@@ -435,6 +450,45 @@ void PASTEMAC(ch,varname) \
/* Compute number of primary and leftover components of the JC loop. */ \
/*const dim_t jc_iter = ( m_local + NC - 1 ) / NC;*/ \
const dim_t jc_left = m_local % NC; \
\
/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
needed for the matrix we will be packing (if any), but we do it
unconditionally to be safe. An alternative way of initializing the
mem_t entries is:
bli_mem_clear( &mem_a ); \
bli_mem_clear( &mem_b ); \
*/ \
mem_t mem_a = BLIS_MEM_INITIALIZER; \
mem_t mem_b = BLIS_MEM_INITIALIZER; \
\
/* Prepare the packing destination buffer. If packing is not requested for
matrix A, this function will reduce to a no-op. */ \
PASTEMAC(ch,packm_sup_init_mem_a) \
( \
packa, \
BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix A to a "panel of B". */ \
stor_id, \
NC, KC, MR, /* Note this "panel of B" is NC x KC. */ \
cntx, \
rntm, \
&mem_a, \
thread \
); \
\
/* Prepare the packing destination buffer. If packing is not requested for
matrix B, this function will reduce to a no-op. */ \
PASTEMAC(ch,packm_sup_init_mem_b) \
( \
packb, \
BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix B to a "block of A". */ \
stor_id, \
KC, MC, NR, /* Note this "block of A" is KC x MC. */ \
cntx, \
rntm, \
&mem_b, \
thread \
); \
\
/* Loop over the m dimension (NC rows/columns at a time). */ \
/*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \
@@ -537,6 +591,39 @@ void PASTEMAC(ch,varname) \
/* Compute number of primary and leftover components of the IC loop. */ \
/*const dim_t ic_iter = ( n_local + MC - 1 ) / MC;*/ \
const dim_t ic_left = n_local % MC; \
\
ctype* a_use; \
inc_t rs_a_use, cs_a_use, ps_a_use; \
\
/* Determine the packing buffer and related parameters for matrix
A. (If A will not be packed, then a_use will be set to point to
a and the _a_use strides will be set accordingly.) Then call
the packm sup variant chooser, which will call the appropriate
implementation based on the schema deduced from the stor_id. */ \
PASTEMAC(ch,packm_sup_a) \
( \
packa, \
stor_id, \
BLIS_NO_TRANSPOSE, \
nc_cur, kc_cur, MR, \
one, \
a_pc, rs_a, cs_a, \
&a_use, &rs_a_use, &cs_a_use, \
&ps_a_use, \
cntx, \
&mem_a, \
thread \
); \
\
/* Alias a_use so that it's clear this is our current block of
matrix A. */ \
ctype* restrict a_pc_use = a_use; \
\
/* We don't need to embed the panel stride of A within the auxinfo_t
object because this variant iterates through A in the jr loop,
which occurs here, within the macrokernel, not within the
millikernel. */ \
/*bli_auxinfo_set_ps_a( ps_a_use, &aux );*/ \
\
/* Loop over the n dimension (MC rows at a time). */ \
/*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \
@@ -622,6 +709,41 @@ void PASTEMAC(ch,varname) \
/* Compute the JR loop thread range for the current thread. */ \
dim_t jr_start, jr_end; \
bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
\
ctype* b_use; \
inc_t rs_b_use, cs_b_use, ps_b_use; \
\
/* Determine the packing buffer and related parameters for matrix
B. (If B will not be packed, then b_use will be set to point to
b and the _b_use strides will be set accordingly.) Then call
the packm sup variant chooser, which will call the appropriate
implementation based on the schema deduced from the stor_id.
NOTE: packing matrix B in this panel-block algorithm corresponds
to packing matrix A in the block-panel algorithm. */ \
PASTEMAC(ch,packm_sup_b) \
( \
packb, \
stor_id, \
BLIS_NO_TRANSPOSE, \
kc_cur, mc_cur, NR, \
one, \
b_ic, rs_b, cs_b, \
&b_use, &rs_b_use, &cs_b_use, \
&ps_b_use, \
cntx, \
&mem_b, \
thread \
); \
\
/* Alias b_use so that it's clear this is our current block of
matrix B. */ \
ctype* restrict b_ic_use = b_use; \
\
/* Embed the panel stride of B within the auxinfo_t object. The
millikernel will query and use this to iterate through
micropanels of B. */ \
bli_auxinfo_set_ps_b( ps_b_use, &aux ); \
\
\
/* Loop over the m dimension (NR columns at a time). */ \
/*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \
@@ -651,10 +773,10 @@ void PASTEMAC(ch,varname) \
mc_cur, /* Recall: mc_cur partitions the n dimension! */ \
kc_cur, \
alpha_cast, \
a_jr, rs_a, cs_a, \
b_ic, rs_b, cs_b, \
a_jr, rs_a_use, cs_a_use, \
b_ic_use, rs_b_use, cs_b_use, \
beta_use, \
c_jr, rs_c, cs_c, \
c_jr, rs_c, cs_c, \
&aux, \
cntx \
); \
@@ -757,6 +879,9 @@ void bli_gemmsup_ref_var2m
const bool packa = bli_rntm_pack_a( rntm );
const bool packb = bli_rntm_pack_b( rntm );
const bool_t packa = bli_rntm_pack_a( rntm );
const bool_t packb = bli_rntm_pack_b( rntm );
const conj_t conja = bli_obj_conj_status( a );
const conj_t conjb = bli_obj_conj_status( b );
@@ -824,6 +949,8 @@ void bli_gemmsup_ref_var2m
// Invoke the function.
f
(
packa,
packb,
conja,
conjb,
m,
@@ -845,6 +972,8 @@ void bli_gemmsup_ref_var2m
// Invoke the function (transposing the operation).
f
(
packb, // swap the pack values.
packa,
conjb, // swap the conj values.
conja,
n, // swap the m and n dimensions.
@@ -887,6 +1016,8 @@ void PASTEMAC(ch,varname) \
thrinfo_t* restrict thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* If m or n is zero, return immediately. */ \
if ( bli_zero_dim2( m, n ) ) return; \
\
@@ -908,8 +1039,6 @@ void PASTEMAC(ch,varname) \
} \
return; \
} \
\
const num_t dt = PASTEMAC(ch,type); \
\
/* Query the context for various blocksizes. */ \
const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
@@ -972,6 +1101,8 @@ void PASTEMAC(ch,varname) \
const inc_t icstep_a = rs_a; \
\
const inc_t jrstep_c = cs_c * NR; \
\
/*
const inc_t jrstep_b = cs_b * NR; \
( void )jrstep_b; \
\
@@ -1051,6 +1182,45 @@ void PASTEMAC(ch,varname) \
/* Compute number of primary and leftover components of the JC loop. */ \
/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
const dim_t jc_left = n_local % NC; \
\
/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
needed for the matrix we will be packing (if any), but we do it
unconditionally to be safe. An alternative way of initializing the
mem_t entries is:
bli_mem_clear( &mem_a ); \
bli_mem_clear( &mem_b ); \
*/ \
mem_t mem_a = BLIS_MEM_INITIALIZER; \
mem_t mem_b = BLIS_MEM_INITIALIZER; \
\
/* Prepare the packing destination buffer. If packing is not requested for
matrix A, this function will reduce to a no-op. */ \
PASTEMAC(ch,packm_sup_init_mem_a) \
( \
packa, \
BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to a "block of A". */ \
stor_id, \
MC, KC, MR, /* Note this "block of A" is MC x KC. */ \
cntx, \
rntm, \
&mem_a, \
thread \
); \
\
/* Prepare the packing destination buffer. If packing is not requested for
matrix B, this function will reduce to a no-op. */ \
PASTEMAC(ch,packm_sup_init_mem_b) \
( \
packb, \
BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix B to a "panel of B". */ \
stor_id, \
KC, NC, NR, /* Note this "panel of B" is KC x NC. */ \
cntx, \
rntm, \
&mem_b, \
thread \
); \
\
/* Loop over the n dimension (NC rows/columns at a time). */ \
/*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \
@@ -1151,6 +1321,39 @@ void PASTEMAC(ch,varname) \
/* Compute number of primary and leftover components of the IC loop. */ \
/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
const dim_t ic_left = m_local % MC; \
\
ctype* b_use; \
inc_t rs_b_use, cs_b_use, ps_b_use; \
\
/* Determine the packing buffer and related parameters for matrix
B. (If B will not be packed, then b_use will be set to point to
b and the _b_use strides will be set accordingly.) Then call
the packm sup variant chooser, which will call the appropriate
implementation based on the schema deduced from the stor_id. */ \
PASTEMAC(ch,packm_sup_b) \
( \
packb, \
stor_id, \
BLIS_NO_TRANSPOSE, \
kc_cur, nc_cur, NR, \
one, \
b_pc, rs_b, cs_b, \
&b_use, &rs_b_use, &cs_b_use, \
&ps_b_use, \
cntx, \
&mem_b, \
thread \
); \
\
/* Alias b_use so that it's clear this is our current block of
matrix B. */ \
ctype* restrict b_pc_use = b_use; \
\
/* We don't need to embed the panel stride of B within the auxinfo_t
object because this variant iterates through B in the jr loop,
which occurs here, within the macrokernel, not within the
millikernel. */ \
/*bli_auxinfo_set_ps_b( ps_b_use, &aux );*/ \
\
/* Loop over the m dimension (MC rows at a time). */ \
/*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \
@@ -1234,6 +1437,38 @@ void PASTEMAC(ch,varname) \
/* Compute the JR loop thread range for the current thread. */ \
dim_t jr_start, jr_end; \
bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
\
ctype* a_use; \
inc_t rs_a_use, cs_a_use, ps_a_use; \
\
/* Determine the packing buffer and related parameters for matrix
A. (If A will not be packed, then a_use will be set to point to
a and the _a_use strides will be set accordingly.) Then call
the packm sup variant chooser, which will call the appropriate
implementation based on the schema deduced from the stor_id. */ \
PASTEMAC(ch,packm_sup_a) \
( \
packa, \
stor_id, \
BLIS_NO_TRANSPOSE, \
mc_cur, kc_cur, MR, \
one, \
a_ic, rs_a, cs_a, \
&a_use, &rs_a_use, &cs_a_use, \
&ps_a_use, \
cntx, \
&mem_a, \
thread \
); \
\
/* Alias a_use so that it's clear this is our current block of
matrix A. */ \
ctype* restrict a_ic_use = a_use; \
\
/* Embed the panel stride of A within the auxinfo_t object. The
millikernel will query and use this to iterate through
micropanels of A (if needed). */ \
bli_auxinfo_set_ps_a( ps_a_use, &aux ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
/*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \
@@ -1263,10 +1498,10 @@ void PASTEMAC(ch,varname) \
nr_cur, \
kc_cur, \
alpha_cast, \
a_ic, rs_a, cs_a, \
b_jr, rs_b, cs_b, \
a_ic_use, rs_a_use, cs_a_use, \
b_jr, rs_b_use, cs_b_use, \
beta_use, \
c_jr, rs_c, cs_c, \
c_jr, rs_c, cs_c, \
&aux, \
cntx \
); \

View File

@@ -0,0 +1,821 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function pointer type shared by the datatype-specific sup variant
// implementations in this file. The GENARRAY-built dispatch tables below
// (ftypes_var1n, ftypes_var2m) hold pointers of this type and are indexed by
// the execution datatype.
#define FUNCPTR_T gemmsup_fp
typedef void (*FUNCPTR_T)
(
conj_t conja, // conjugation status applied to A
conj_t conjb, // conjugation status applied to B
dim_t m, // m dimension of the operation
dim_t n, // n dimension of the operation
dim_t k, // k (inner) dimension of the operation
void* restrict alpha, // type-erased pointer to the alpha scalar
void* restrict a, inc_t rs_a, inc_t cs_a, // A buffer, row and column strides
void* restrict b, inc_t rs_b, inc_t cs_b, // B buffer, row and column strides
void* restrict beta, // type-erased pointer to the beta scalar
void* restrict c, inc_t rs_c, inc_t cs_c, // C buffer, row and column strides
stor3_t eff_id, // storage-combination id used to look up the sup kernel
cntx_t* restrict cntx, // context queried for blocksizes and kernels
rntm_t* restrict rntm,
cntl_t* restrict cntl,
thrinfo_t* restrict thread
);
//
// -- var1n --------------------------------------------------------------------
//
static FUNCPTR_T GENARRAY(ftypes_var1n,gemmsup_ref_var1n);

// Object-level entry point for sup variant 1n.
//
// Unpacks the operand objects into raw buffers, dimensions, and strides, then
// forwards them to the typed implementation selected by the datatype of C.
// Transposition flags on A and B are honored implicitly by exchanging their
// row and column strides. When 'trans' requests it, the whole operation is
// run in transposed form.
void bli_gemmsup_ref_var1n
     (
       trans_t    trans,
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       stor3_t    eff_id,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// The datatype of C is the execution datatype: it selects the typed
	// implementation and the representation of the alpha and beta scalars.
	const num_t dt_exec = bli_obj_dt( c );

	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );

	const conj_t conja = bli_obj_conj_status( a );
	const conj_t conjb = bli_obj_conj_status( b );

	// C supplies m and n, along with its own buffer and strides.
	const dim_t m = bli_obj_length( c );
	const dim_t n = bli_obj_width( c );

	void* restrict buf_c = bli_obj_buffer_at_off( c );
	const inc_t    rs_c  = bli_obj_row_stride( c );
	const inc_t    cs_c  = bli_obj_col_stride( c );

	// Describe A as stored, then override with an implicit transposition
	// if the object is marked for one.
	void* restrict buf_a = bli_obj_buffer_at_off( a );
	dim_t          k     = bli_obj_width( a );
	inc_t          rs_a  = bli_obj_row_stride( a );
	inc_t          cs_a  = bli_obj_col_stride( a );

	if ( !bli_obj_has_notrans( a ) )
	{
		k    = bli_obj_length( a );
		rs_a = bli_obj_col_stride( a );
		cs_a = bli_obj_row_stride( a );
	}

	// Same treatment for B (k was already taken from A).
	void* restrict buf_b = bli_obj_buffer_at_off( b );
	inc_t          rs_b  = bli_obj_row_stride( b );
	inc_t          cs_b  = bli_obj_col_stride( b );

	if ( !bli_obj_has_notrans( b ) )
	{
		rs_b = bli_obj_col_stride( b );
		cs_b = bli_obj_row_stride( b );
	}

	// Look up the typed implementation for the execution datatype.
	FUNCPTR_T f = ftypes_var1n[dt_exec];

	if ( !bli_is_notrans( trans ) )
	{
		// Transposed operation: A and B trade places (with their conj
		// values), m and n are exchanged, every row/column stride pair is
		// swapped, and the stor3_t id is transposed to match.
		f
		(
		  conjb,
		  conja,
		  n,
		  m,
		  k,
		  buf_alpha,
		  buf_b, cs_b, rs_b,
		  buf_a, cs_a, rs_a,
		  buf_beta,
		  buf_c, cs_c, rs_c,
		  bli_stor3_trans( eff_id ),
		  cntx,
		  rntm,
		  cntl,
		  thread
		);
		return;
	}

	// Non-transposed operation: pass everything through as unpacked above.
	f
	(
	  conja,
	  conjb,
	  m,
	  n,
	  k,
	  buf_alpha,
	  buf_a, rs_a, cs_a,
	  buf_b, rs_b, cs_b,
	  buf_beta,
	  buf_c, rs_c, cs_c,
	  eff_id,
	  cntx,
	  rntm,
	  cntl,
	  thread
	);
}
// Typed implementation of sup variant 1n, instantiated once per datatype by
// INSERT_GENTFUNC_BASIC0 below.
//
// Loop structure, outermost to innermost:
//   jc: partitions m in steps of NC
//   pc: partitions k in steps of KC
//   ic: partitions n in steps of MC
//   jr: partitions the current NC-sized piece of m in steps of MR
// The millikernel is called once per jr iteration with an nr_cur x mc_cur
// block of C. The rntm, cntl, and thread parameters are accepted but not
// referenced in this body.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
void* restrict alpha, \
void* restrict a, inc_t rs_a, inc_t cs_a, \
void* restrict b, inc_t rs_b, inc_t cs_b, \
void* restrict beta, \
void* restrict c, inc_t rs_c, inc_t cs_c, \
stor3_t stor_id, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
cntl_t* restrict cntl, \
thrinfo_t* restrict thread \
) \
{ \
/* If m or n is zero, return immediately. */ \
if ( bli_zero_dim2( m, n ) ) return; \
\
/* If k < 1 or alpha is zero, scale by beta and return. */ \
if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
{ \
PASTEMAC(ch,scalm) \
( \
BLIS_NO_CONJUGATE, \
0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m, n, \
beta, \
c, rs_c, cs_c \
); \
return; \
} \
\
const num_t dt = PASTEMAC(ch,type); \
\
/* This transposition of the stor3_t id value is inherent to variant 1.
The reason: we assume that variant 2 is the "main" variant. The
consequence of this is that we assume that the millikernels that
iterate over m are registered to the kernel group associated with
the kernel preference. So, regardless of whether the mkernels are
row- or column-preferential, millikernels that iterate over n are
always placed in the slots for the opposite kernel group. */ \
stor_id = bli_stor3_trans( stor_id ); \
\
/* Query the context for various blocksizes. */ \
const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \
const dim_t NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \
const dim_t MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
\
/* Choose the KC actually used. The RRC and CRC storage cases keep the
full KC0; otherwise KC0 is kept only when both m and n fit in a single
MR x NR tile and is divided by 2, 3, 4, or 5 as m and n grow. The
divisions by 3 and 5 are additionally rounded down to a multiple of 4.
The leading FALSE branch is never taken. */ \
dim_t KC; \
if ( FALSE ) KC = KC0; \
else if ( stor_id == BLIS_RRC || \
stor_id == BLIS_CRC ) KC = KC0; \
else if ( m <= MR && n <= NR ) KC = KC0; \
else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \
else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \
else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \
else KC = (( KC0 / 5 ) / 4 ) * 4; \
\
/* Nudge NC up to a multiple of MR and MC up to a multiple of NR. In this
variant NC partitions m (which the jr loop below subdivides by MR) and
MC partitions n. */ \
const dim_t NC = bli_align_dim_to_mult( NC0, MR ); \
const dim_t MC = bli_align_dim_to_mult( MC0, NR ); \
\
/* Query the maximum blocksize for MR, which implies a maximum blocksize
extension (MRE = MRM - MR) for the final jr iteration. */ \
const dim_t MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx ); \
const dim_t MRE = MRM - MR; \
\
/* Compute partitioning step values for each matrix of each loop. Note
that the jc and jr loops step along rows (m) of A and C, while the ic
loop steps along columns (n) of B and C. */ \
const inc_t jcstep_c = rs_c * NC; \
const inc_t jcstep_a = rs_a * NC; \
\
const inc_t pcstep_a = cs_a * KC; \
const inc_t pcstep_b = rs_b * KC; \
\
const inc_t icstep_c = cs_c * MC; \
const inc_t icstep_b = cs_b * MC; \
\
const inc_t jrstep_c = rs_c * MR; \
const inc_t jrstep_a = rs_a * MR; \
\
/*
const inc_t irstep_c = cs_c * NR; \
const inc_t irstep_b = cs_b * NR; \
*/ \
\
/* Query the context for the sup microkernel address and cast it to its
function pointer type. The lookup uses the transposed stor_id computed
above. */ \
PASTECH(ch,gemmsup_ker_ft) \
gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
\
ctype* restrict a_00 = a; \
ctype* restrict b_00 = b; \
ctype* restrict c_00 = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
\
ctype* restrict one = PASTEMAC(ch,1); \
\
/* NOTE(review): no field of aux is set anywhere in this function before
its address is handed to the millikernel; confirm the registered
kernels do not read from it. */ \
auxinfo_t aux; \
\
/* Compute number of primary and leftover components of the outer
dimensions.
NOTE: Functionally speaking, we compute jc_iter as:
jc_iter = m / NC; if ( jc_left ) ++jc_iter;
However, this is implemented as:
jc_iter = ( m + NC - 1 ) / NC;
This avoids a branch at the cost of two additional integer instructions.
The pc_iter, ic_iter, and jr_iter variables are computed in similar
manner. */ \
const dim_t jc_iter = ( m + NC - 1 ) / NC; \
const dim_t jc_left = m % NC; \
\
const dim_t pc_iter = ( k + KC - 1 ) / KC; \
const dim_t pc_left = k % KC; \
\
const dim_t ic_iter = ( n + MC - 1 ) / MC; \
const dim_t ic_left = n % MC; \
\
const dim_t jc_inc = 1; \
const dim_t pc_inc = 1; \
const dim_t ic_inc = 1; \
const dim_t jr_inc = 1; \
/*
const dim_t ir_inc = 1; \
*/ \
\
/* Loop over the m dimension (NC rows at a time). */ \
for ( dim_t jj = 0; jj < jc_iter; jj += jc_inc ) \
{ \
const dim_t nc_cur = ( bli_is_not_edge_f( jj, jc_iter, jc_left ) ? NC : jc_left ); \
\
ctype* restrict a_jc = a_00 + jj * jcstep_a; \
ctype* restrict c_jc = c_00 + jj * jcstep_c; \
\
dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \
dim_t jr_left = nc_cur % MR; \
\
/* An optimization: if the leftover is small enough (at most MRE rows),
fold it into the preceding jr iteration so that the final iteration
handles up to MR + MRE = MRM rows of C and A. (A nonzero MRE means the
context reports a maximum MR larger than the default; presumably the
mkernel has agreed to handle these cases.) Note that this prevents us
from declaring jr_iter and jr_left as const. The constant-true outer
condition just guards the real test on the following line. */ \
if ( 1 ) \
if ( MRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= MRE ) \
{ \
jr_iter--; jr_left += MR; \
} \
\
/* Loop over the k dimension (KC rows/columns at a time). */ \
for ( dim_t pp = 0; pp < pc_iter; pp += pc_inc ) \
{ \
const dim_t kc_cur = ( bli_is_not_edge_f( pp, pc_iter, pc_left ) ? KC : pc_left ); \
\
ctype* restrict a_pc = a_jc + pp * pcstep_a; \
ctype* restrict b_pc = b_00 + pp * pcstep_b; \
\
/* Only apply beta to the first iteration of the pc loop; later
iterations pass one instead. */ \
ctype* restrict beta_use = ( pp == 0 ? beta_cast : one ); \
\
/* Loop over the n dimension (MC columns at a time). */ \
for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \
{ \
const dim_t mc_cur = ( bli_is_not_edge_f( ii, ic_iter, ic_left ) ? MC : ic_left ); \
\
ctype* restrict b_ic = b_pc + ii * icstep_b; \
ctype* restrict c_ic = c_jc + ii * icstep_c; \
\
/*
const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; \
const dim_t ir_left = mc_cur % NR; \
*/ \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( dim_t j = 0; j < jr_iter; j += jr_inc ) \
{ \
const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? MR : jr_left ); \
\
ctype* restrict a_jr = a_pc + j * jrstep_a; \
ctype* restrict c_jr = c_ic + j * jrstep_c; \
\
/* The loop over the n dimension is not written out here; the
millikernel receives the whole mc_cur extent in one call. */ \
{ \
/* Invoke the gemmsup millikernel. */ \
gemmsup_ker \
( \
conja, \
conjb, \
nr_cur, /* Notice: nr_cur <= MR, or <= MRM on an extended final iteration. */ \
mc_cur, /* Recall: mc_cur partitions the n dimension! */ \
kc_cur, \
alpha_cast, \
a_jr, rs_a, cs_a, \
b_ic, rs_b, cs_b, \
beta_use, \
c_jr, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
} \
} \
} \
} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( gemmsup_ref_var1n )
//
// -- var2m --------------------------------------------------------------------
//
static FUNCPTR_T GENARRAY(ftypes_var2m,gemmsup_ref_var2m);

// Object-level entry point for sup variant 2m.
//
// Unpacks the operand objects into raw buffers, dimensions, and strides, then
// forwards them to the typed implementation selected by the datatype of C.
// Transposition flags on A and B are honored implicitly by exchanging their
// row and column strides. When 'trans' requests it, the whole operation is
// run in transposed form.
void bli_gemmsup_ref_var2m
     (
       trans_t    trans,
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       stor3_t    eff_id,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// The datatype of C is the execution datatype: it selects the typed
	// implementation and the representation of the alpha and beta scalars.
	const num_t dt_exec = bli_obj_dt( c );

	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );

	const conj_t conja = bli_obj_conj_status( a );
	const conj_t conjb = bli_obj_conj_status( b );

	// C supplies m and n, along with its own buffer and strides.
	const dim_t m = bli_obj_length( c );
	const dim_t n = bli_obj_width( c );

	void* restrict buf_c = bli_obj_buffer_at_off( c );
	const inc_t    rs_c  = bli_obj_row_stride( c );
	const inc_t    cs_c  = bli_obj_col_stride( c );

	// Describe A as stored, then override with an implicit transposition
	// if the object is marked for one.
	void* restrict buf_a = bli_obj_buffer_at_off( a );
	dim_t          k     = bli_obj_width( a );
	inc_t          rs_a  = bli_obj_row_stride( a );
	inc_t          cs_a  = bli_obj_col_stride( a );

	if ( !bli_obj_has_notrans( a ) )
	{
		k    = bli_obj_length( a );
		rs_a = bli_obj_col_stride( a );
		cs_a = bli_obj_row_stride( a );
	}

	// Same treatment for B (k was already taken from A).
	void* restrict buf_b = bli_obj_buffer_at_off( b );
	inc_t          rs_b  = bli_obj_row_stride( b );
	inc_t          cs_b  = bli_obj_col_stride( b );

	if ( !bli_obj_has_notrans( b ) )
	{
		rs_b = bli_obj_col_stride( b );
		cs_b = bli_obj_row_stride( b );
	}

	// Look up the typed implementation for the execution datatype.
	FUNCPTR_T f = ftypes_var2m[dt_exec];

	if ( !bli_is_notrans( trans ) )
	{
		// Transposed operation: A and B trade places (with their conj
		// values), m and n are exchanged, every row/column stride pair is
		// swapped, and the stor3_t id is transposed to match.
		f
		(
		  conjb,
		  conja,
		  n,
		  m,
		  k,
		  buf_alpha,
		  buf_b, cs_b, rs_b,
		  buf_a, cs_a, rs_a,
		  buf_beta,
		  buf_c, cs_c, rs_c,
		  bli_stor3_trans( eff_id ),
		  cntx,
		  rntm,
		  cntl,
		  thread
		);
		return;
	}

	// Non-transposed operation: pass everything through as unpacked above.
	f
	(
	  conja,
	  conjb,
	  m,
	  n,
	  k,
	  buf_alpha,
	  buf_a, rs_a, cs_a,
	  buf_b, rs_b, cs_b,
	  buf_beta,
	  buf_c, rs_c, cs_c,
	  eff_id,
	  cntx,
	  rntm,
	  cntl,
	  thread
	);
}
// Typed implementation of sup variant 2m, instantiated once per datatype by
// INSERT_GENTFUNC_BASIC0 below.
//
// Loop structure, outermost to innermost:
//   jc: partitions n in steps of NC
//   pc: partitions k in steps of KC
//   ic: partitions m in steps of MC
//   jr: partitions the current NC-sized piece of n in steps of NR
// The millikernel is called once per jr iteration with an mc_cur x nr_cur
// block of C. The rntm, cntl, and thread parameters are accepted but not
// referenced in this body.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
void* restrict alpha, \
void* restrict a, inc_t rs_a, inc_t cs_a, \
void* restrict b, inc_t rs_b, inc_t cs_b, \
void* restrict beta, \
void* restrict c, inc_t rs_c, inc_t cs_c, \
stor3_t stor_id, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
cntl_t* restrict cntl, \
thrinfo_t* restrict thread \
) \
{ \
/* If m or n is zero, return immediately. */ \
if ( bli_zero_dim2( m, n ) ) return; \
\
/* If k < 1 or alpha is zero, scale by beta and return. */ \
if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
{ \
PASTEMAC(ch,scalm) \
( \
BLIS_NO_CONJUGATE, \
0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m, n, \
beta, \
c, rs_c, cs_c \
); \
return; \
} \
\
const num_t dt = PASTEMAC(ch,type); \
\
/* Query the context for various blocksizes. Unlike variant 1n, NC and MC
are used exactly as returned by the context. */ \
const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \
const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \
const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
\
/* Choose the KC actually used. The RRR, CCC, RRC, and CRC storage cases
keep the full KC0; otherwise KC0 is kept only when both m and n fit in
a single MR x NR tile and is divided by 2, 3, 4, or 5 as m and n grow.
The divisions by 3 and 5 are additionally rounded down to a multiple
of 4. */ \
dim_t KC; \
if ( stor_id == BLIS_RRR || \
stor_id == BLIS_CCC ) KC = KC0; \
else if ( stor_id == BLIS_RRC || \
stor_id == BLIS_CRC ) KC = KC0; \
else if ( m <= MR && n <= NR ) KC = KC0; \
else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \
else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \
else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \
else KC = (( KC0 / 5 ) / 4 ) * 4; \
\
/* Query the maximum blocksize for NR, which implies a maximum blocksize
extension (NRE = NRM - NR) for the final jr iteration. */ \
const dim_t NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \
const dim_t NRE = NRM - NR; \
\
/* Compute partitioning step values for each matrix of each loop. The jc
and jr loops step along columns (n) of B and C, while the ic loop
steps along rows (m) of A and C. */ \
const inc_t jcstep_c = cs_c * NC; \
const inc_t jcstep_b = cs_b * NC; \
\
const inc_t pcstep_a = cs_a * KC; \
const inc_t pcstep_b = rs_b * KC; \
\
const inc_t icstep_c = rs_c * MC; \
const inc_t icstep_a = rs_a * MC; \
\
const inc_t jrstep_c = cs_c * NR; \
const inc_t jrstep_b = cs_b * NR; \
\
/*
const inc_t irstep_c = rs_c * MR; \
const inc_t irstep_a = rs_a * MR; \
*/ \
\
/* Query the context for the sup microkernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemmsup_ker_ft) \
gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
\
ctype* restrict a_00 = a; \
ctype* restrict b_00 = b; \
ctype* restrict c_00 = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
\
ctype* restrict one = PASTEMAC(ch,1); \
\
/* NOTE(review): no field of aux is set anywhere in this function before
its address is handed to the millikernel; confirm the registered
kernels do not read from it. */ \
auxinfo_t aux; \
\
/* Compute number of primary and leftover components of the outer
dimensions.
NOTE: Functionally speaking, we compute jc_iter as:
jc_iter = n / NC; if ( jc_left ) ++jc_iter;
However, this is implemented as:
jc_iter = ( n + NC - 1 ) / NC;
This avoids a branch at the cost of two additional integer instructions.
The pc_iter, ic_iter, and jr_iter variables are computed in similar
manner. */ \
const dim_t jc_iter = ( n + NC - 1 ) / NC; \
const dim_t jc_left = n % NC; \
\
const dim_t pc_iter = ( k + KC - 1 ) / KC; \
const dim_t pc_left = k % KC; \
\
const dim_t ic_iter = ( m + MC - 1 ) / MC; \
const dim_t ic_left = m % MC; \
\
const dim_t jc_inc = 1; \
const dim_t pc_inc = 1; \
const dim_t ic_inc = 1; \
const dim_t jr_inc = 1; \
/*
const dim_t ir_inc = 1; \
*/ \
\
/* Loop over the n dimension (NC columns at a time). */ \
for ( dim_t jj = 0; jj < jc_iter; jj += jc_inc ) \
{ \
const dim_t nc_cur = ( bli_is_not_edge_f( jj, jc_iter, jc_left ) ? NC : jc_left ); \
\
ctype* restrict b_jc = b_00 + jj * jcstep_b; \
ctype* restrict c_jc = c_00 + jj * jcstep_c; \
\
dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
dim_t jr_left = nc_cur % NR; \
\
/* An optimization: if the leftover is small enough (at most NRE
columns), fold it into the preceding jr iteration so that the final
iteration handles up to NR + NRE = NRM columns of C and B. (A nonzero
NRE means the context reports a maximum NR larger than the default;
presumably the mkernel has agreed to handle these cases.) Note that
this prevents us from declaring jr_iter and jr_left as const. The
constant-true outer condition just guards the real test on the
following line. */ \
if ( 1 ) \
if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) \
{ \
jr_iter--; jr_left += NR; \
} \
\
/* Loop over the k dimension (KC rows/columns at a time). */ \
for ( dim_t pp = 0; pp < pc_iter; pp += pc_inc ) \
{ \
const dim_t kc_cur = ( bli_is_not_edge_f( pp, pc_iter, pc_left ) ? KC : pc_left ); \
\
ctype* restrict a_pc = a_00 + pp * pcstep_a; \
ctype* restrict b_pc = b_jc + pp * pcstep_b; \
\
/* Only apply beta to the first iteration of the pc loop; later
iterations pass one instead. */ \
ctype* restrict beta_use = ( pp == 0 ? beta_cast : one ); \
\
/* Loop over the m dimension (MC rows at a time). */ \
for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \
{ \
const dim_t mc_cur = ( bli_is_not_edge_f( ii, ic_iter, ic_left ) ? MC : ic_left ); \
\
ctype* restrict a_ic = a_pc + ii * icstep_a; \
ctype* restrict c_ic = c_jc + ii * icstep_c; \
\
/*
const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
const dim_t ir_left = mc_cur % MR; \
*/ \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( dim_t j = 0; j < jr_iter; j += jr_inc ) \
{ \
const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
\
ctype* restrict b_jr = b_pc + j * jrstep_b; \
ctype* restrict c_jr = c_ic + j * jrstep_c; \
\
/* The loop over the m dimension is not written out here; the
millikernel receives the whole mc_cur extent in one call. */ \
{ \
/* Invoke the gemmsup millikernel. */ \
gemmsup_ker \
( \
conja, \
conjb, \
mc_cur, \
nr_cur, \
kc_cur, \
alpha_cast, \
a_ic, rs_a, cs_a, \
b_jr, rs_b, cs_b, \
beta_use, \
c_jr, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
} \
} \
} \
} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( gemmsup_ref_var2m )

View File

@@ -959,8 +959,7 @@ void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... )
// Process each operation id tuple provided.
for ( i = 0; i < n_ops; ++i )
{
// Read the current ukernel id, ukernel datatype, and ukernel function
// pointer.
// Read the current operation id and handler function pointer.
const opid_t op_id = op_ids[ i ];
void* op_fp = op_fps[ i ];

95
frame/base/bli_env.c Normal file
View File

@@ -0,0 +1,95 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// -----------------------------------------------------------------------------
// Return the integer value of the environment variable named by env, or
// fallback when that variable is not set.
dim_t bli_env_get_var( const char* env, dim_t fallback )
{
	// getenv() hands back NULL when the variable does not exist.
	const char* value = getenv( env );

	// An unset variable resolves to the caller-supplied fallback.
	if ( value == NULL ) return fallback;

	// The variable exists: interpret its text as a base-10 integer. No
	// validation is performed on the string, so whatever strtol() yields
	// is what gets returned.
	return strtol( value, NULL, 10 );
}
// NOTE: bli_env_set_var() is compiled out. It is retained only as a sketch of
// how an environment variable could be set from an integer value.
#if 0
void bli_env_set_var( const char* env, dim_t value )
{
dim_t r_val;
char value_str[32];
const char* fs_32 = "%u";
const char* fs_64 = "%lu";
// Convert the integer to a string, but vary the format specifier
// depending on the integer type size.
// NOTE(review): both specifiers are unsigned; confirm they match dim_t
// before re-enabling this code.
if ( bli_info_get_int_type_size() == 32 ) sprintf( value_str, fs_32, value );
else sprintf( value_str, fs_64, value );
// Set the environment variable using the string we just wrote to via
// sprintf(). (The 'TRUE' argument means we want to overwrite the current
// value if the environment variable already exists.)
r_val = bli_setenv( env, value_str, TRUE );
// Check the return value in case something went horribly wrong.
if ( r_val == -1 )
{
char err_str[128];
// Query the human-readable error string corresponding to errno.
strerror_r( errno, err_str, 128 );
// Print the error message.
bli_print_msg( err_str, __FILE__, __LINE__ );
}
}
#endif

44
frame/base/bli_env.h Normal file
View File

@@ -0,0 +1,44 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_ENV_H
#define BLIS_ENV_H
// Query the environment variable named by env and return its value converted
// to an integer; if the variable is not set, return fallback instead.
dim_t bli_env_get_var( const char* env, dim_t fallback );
// The setter counterpart is currently disabled (its definition is compiled
// out), so its prototype is left commented out as well.
//void bli_env_set_var( const char* env, dim_t value );
#endif

View File

@@ -34,11 +34,32 @@
*/
#ifndef BLIS_MEM_H
#define BLIS_MEM_H
// Mem entry query
// mem_t object type (defined in bli_type_defs.h)
/*
typedef struct mem_s
{
pblk_t pblk;
packbuf_t buf_type;
pool_t* pool;
siz_t size;
} mem_t;
typedef struct
{
void* buf;
siz_t block_size;
} pblk_t;
*/
//
// -- mem_t query --------------------------------------------------------------
//
BLIS_INLINE pblk_t* bli_mem_pblk( mem_t* mem )
{
@@ -78,7 +99,9 @@ BLIS_INLINE bool bli_mem_is_unalloc( mem_t* mem )
}
// Mem entry modification
//
// -- mem_t modification -------------------------------------------------------
//
BLIS_INLINE void bli_mem_set_pblk( pblk_t* pblk, mem_t* mem )
{

157
frame/base/bli_pack.c Normal file
View File

@@ -0,0 +1,157 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// The global rntm_t structure. (The definition resides in bli_rntm.c.)
extern rntm_t global_rntm;
// A mutex to allow synchronous access to global_rntm. (The definition
// resides in bli_rntm.c.)
extern bli_pthread_mutex_t global_rntm_mutex;
// -----------------------------------------------------------------------------
void bli_pack_init( void )
{
// Read the environment variables and use them to initialize the
// global runtime object.
bli_pack_init_rntm_from_env( &global_rntm );
}
// Counterpart to bli_pack_init().
void bli_pack_finalize( void )
{
	// Intentionally empty: there is currently nothing to tear down.
}
// -----------------------------------------------------------------------------
// Return the pack_a setting currently stored in the global runtime object.
dim_t bli_pack_get_pack_a( void )
{
	// global_rntm is only meaningful once the library has been initialized.
	bli_init_once();

	const dim_t pack_a = bli_rntm_pack_a( &global_rntm );

	return pack_a;
}
// -----------------------------------------------------------------------------
// Return the pack_b setting currently stored in the global runtime object.
dim_t bli_pack_get_pack_b( void )
{
	// global_rntm is only meaningful once the library has been initialized.
	bli_init_once();

	const dim_t pack_b = bli_rntm_pack_b( &global_rntm );

	return pack_b;
}
// ----------------------------------------------------------------------------
// Store pack_a into the global runtime object.
void bli_pack_set_pack_a( bool_t pack_a )
{
	// Make sure global_rntm exists before touching it.
	bli_init_once();

	// Writes to global_rntm are serialized through its mutex.
	bli_pthread_mutex_lock( &global_rntm_mutex );
	{
		bli_rntm_set_pack_a( pack_a, &global_rntm );
	}
	bli_pthread_mutex_unlock( &global_rntm_mutex );
}
// ----------------------------------------------------------------------------
// Store pack_b into the pack_b field of the global runtime object.
void bli_pack_set_pack_b( bool_t pack_b )
{
	// We must ensure that global_rntm has been initialized.
	bli_init_once();

	// Acquire the mutex protecting global_rntm.
	bli_pthread_mutex_lock( &global_rntm_mutex );

	// Update the B field. (This previously called bli_rntm_set_pack_a(),
	// which overwrote the pack_a setting and left pack_b unchanged.)
	bli_rntm_set_pack_b( pack_b, &global_rntm );

	// Release the mutex protecting global_rntm.
	bli_pthread_mutex_unlock( &global_rntm_mutex );
}
// ----------------------------------------------------------------------------
// Initialize the pack_a and pack_b fields of rntm from the environment
// variables BLIS_PACK_A and BLIS_PACK_B.
void bli_pack_init_rntm_from_env
     (
       rntm_t* rntm
     )
{
	// NOTE: We don't need to acquire the global_rntm_mutex here because this
	// function is only called from bli_pack_init(), which is only called
	// by bli_init_once().

#if 1 //def BLIS_ENABLE_SELECTIVE_PACKING

	// Try to read BLIS_PACK_A and BLIS_PACK_B. For each variable, default to
	// -1 if it is unset. The raw values are held in dim_t (the type returned
	// by bli_env_get_var()) rather than bool_t so that the -1 sentinel is
	// preserved regardless of how bool_t is defined.
	const dim_t pack_a_env = bli_env_get_var( "BLIS_PACK_A", -1 );
	const dim_t pack_b_env = bli_env_get_var( "BLIS_PACK_B", -1 );

	// Unset (-1) selects the default behavior, which is FALSE; an explicit
	// zero is also FALSE; anything else is TRUE.
	const bool_t pack_a = ( pack_a_env == -1 || pack_a_env == 0 ) ? FALSE : TRUE;
	const bool_t pack_b = ( pack_b_env == -1 || pack_b_env == 0 ) ? FALSE : TRUE;

#else

	const bool_t pack_a = TRUE;
	const bool_t pack_b = TRUE;

#endif

	// Save the results back in the runtime object.
	bli_rntm_set_pack_a( pack_a, rntm );
	bli_rntm_set_pack_b( pack_b, rntm );

#if 0
	printf( "bli_pack_init_rntm_from_env()\n" );
	bli_rntm_print( rntm );
#endif
}

49
frame/base/bli_pack.h Normal file
View File

@@ -0,0 +1,49 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_PACK_H
#define BLIS_PACK_H
// Initialization/finalization of the packing settings. bli_pack_init()
// populates the global rntm_t from the environment; bli_pack_finalize() is
// currently a no-op.
void bli_pack_init( void );
void bli_pack_finalize( void );
// Query the packing settings stored in the global rntm_t.
BLIS_EXPORT_BLIS dim_t bli_pack_get_pack_a( void );
BLIS_EXPORT_BLIS dim_t bli_pack_get_pack_b( void );
// Update the packing settings stored in the global rntm_t. The update is
// performed while holding the mutex that protects the global rntm_t.
BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool_t pack_a );
BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool_t pack_b );
// Read BLIS_PACK_A and BLIS_PACK_B from the environment and store the
// resulting settings in the given rntm_t.
void bli_pack_init_rntm_from_env( rntm_t* rntm );
#endif

View File

@@ -34,6 +34,29 @@
#include "blis.h"
// The global rntm_t structure, which holds the global thread settings
// along with a few other key parameters.
rntm_t global_rntm;
// A mutex to allow synchronous access to global_rntm.
bli_pthread_mutex_t global_rntm_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;
// ----------------------------------------------------------------------------
// Overwrite *rntm with a snapshot of the global runtime object.
void bli_rntm_init_from_global( rntm_t* rntm )
{
	// global_rntm is only valid after library initialization has run.
	bli_init_once();

	// Take the copy while holding the mutex that guards global_rntm.
	bli_pthread_mutex_lock( &global_rntm_mutex );
	{
		*rntm = global_rntm;
	}
	bli_pthread_mutex_unlock( &global_rntm_mutex );
}
// -----------------------------------------------------------------------------
void bli_rntm_set_ways_for_op

View File

@@ -52,11 +52,8 @@ typedef struct rntm_s
bool l3_sup;
pool_t* sba_pool;
membrk_t* membrk;
bool_t l3_sup;
} rntm_t;
*/
@@ -229,10 +226,6 @@ BLIS_INLINE void bli_rntm_clear_membrk( rntm_t* rntm )
{
bli_rntm_set_membrk( NULL, rntm );
}
static void bli_rntm_clear_l3_sup( rntm_t* rntm )
{
bli_rntm_set_l3_sup( 1, rntm );
}
//
// -- rntm_t modification (public API) -----------------------------------------
@@ -321,7 +314,6 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm )
.l3_sup = TRUE, \
.sba_pool = NULL, \
.membrk = NULL, \
.l3_sup = 1 \
} \
BLIS_INLINE void bli_rntm_init( rntm_t* rntm )
@@ -330,11 +322,12 @@ BLIS_INLINE void bli_rntm_init( rntm_t* rntm )
bli_rntm_clear_num_threads_only( rntm );
bli_rntm_clear_ways_only( rntm );
bli_rntm_clear_pack_a( rntm );
bli_rntm_clear_pack_b( rntm );
bli_rntm_clear_l3_sup( rntm );
bli_rntm_clear_sba_pool( rntm );
bli_rntm_clear_membrk( rntm );
bli_rntm_clear_l3_sup( rntm );
}
// -- rntm_t total thread calculation ------------------------------------------
@@ -359,6 +352,8 @@ BLIS_INLINE dim_t bli_rntm_calc_num_threads
// Function prototypes
BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm );
BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op
(
opid_t l3_op,

View File

@@ -1185,6 +1185,13 @@ typedef struct
inc_t is_a;
inc_t is_b;
// The panel strides of A and B.
// NOTE: These are only used in situations where iteration over the
// micropanels takes place in part within the kernel code (e.g. sup
// millikernels).
inc_t ps_a;
inc_t ps_b;
// The type to convert to on output.
//num_t dt_on_output;
@@ -1441,6 +1448,9 @@ typedef struct cntx_s
// -- Runtime type --
// NOTE: The order of these fields must be kept consistent with the definition
// of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h.
typedef struct rntm_s
{
// "External" fields: these may be queried by the end-user.
@@ -1460,9 +1470,6 @@ typedef struct rntm_s
// The packing block allocator, which is attached in the l3 thread decorator.
membrk_t* membrk;
// A switch to enable/disable small/unpacked matrix handling in level-3 ops.
bool_t l3_sup;
} rntm_t;

View File

@@ -130,6 +130,8 @@ extern "C" {
#include "bli_getopt.h"
#include "bli_opid.h"
#include "bli_cntl.h"
#include "bli_env.h"
#include "bli_pack.h"
#include "bli_info.h"
#include "bli_arch.h"
#include "bli_cpuid.h"

View File

@@ -98,8 +98,8 @@ void PASTEMAC(opname,imeth) \
/* Initialize a local runtime with global settings if necessary. Note
that in the case that a runtime is passed in, we make a local copy. */ \
rntm_t rntm_l; \
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
\
/* Some induced methods execute in multiple "stages". */ \
for ( i = 0; i < nstage; ++i ) \
@@ -191,8 +191,8 @@ void PASTEMAC(opname,imeth) \
/* Initialize a local runtime with global settings if necessary. Note
that in the case that a runtime is passed in, we make a local copy. */ \
rntm_t rntm_l; \
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
\
/* Some induced methods execute in multiple "stages". */ \
for ( i = 0; i < nstage; ++i ) \
@@ -282,8 +282,8 @@ void PASTEMAC(opname,imeth) \
/* Initialize a local runtime with global settings if necessary. Note
that in the case that a runtime is passed in, we make a local copy. */ \
rntm_t rntm_l; \
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
\
/* Some induced methods execute in multiple "stages". */ \
for ( i = 0; i < nstage; ++i ) \
@@ -358,8 +358,8 @@ void PASTEMAC(opname,imeth) \
/* Initialize a local runtime with global settings if necessary. Note
that in the case that a runtime is passed in, we make a local copy. */ \
rntm_t rntm_l; \
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
\
/* Some induced methods execute in multiple "stages". */ \
for ( i = 0; i < nstage; ++i ) \
@@ -420,8 +420,8 @@ void PASTEMAC(opname,imeth) \
/* Initialize a local runtime with global settings if necessary. Note
that in the case that a runtime is passed in, we make a local copy. */ \
rntm_t rntm_l; \
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
\
{ \
/* NOTE: trsm cannot be implemented via any induced method that

View File

@@ -60,8 +60,8 @@ void PASTEMAC(opname,imeth) \
/* Initialize a local runtime with global settings if necessary. Note
that in the case that a runtime is passed in, we make a local copy. */ \
rntm_t rntm_l; \
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
\
func( alpha, a, b, beta, c, cntx, rntm ); \
}
@@ -97,8 +97,8 @@ void PASTEMAC(opname,imeth) \
/* Initialize a local runtime with global settings if necessary. Note
that in the case that a runtime is passed in, we make a local copy. */ \
rntm_t rntm_l; \
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
\
func( side, alpha, a, b, beta, c, cntx, rntm ); \
}
@@ -131,8 +131,8 @@ void PASTEMAC(opname,imeth) \
/* Initialize a local runtime with global settings if necessary. Note
that in the case that a runtime is passed in, we make a local copy. */ \
rntm_t rntm_l; \
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
\
func( alpha, a, beta, c, cntx, rntm ); \
}
@@ -164,8 +164,8 @@ void PASTEMAC(opname,imeth) \
/* Initialize a local runtime with global settings if necessary. Note
that in the case that a runtime is passed in, we make a local copy. */ \
rntm_t rntm_l; \
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
\
func( side, alpha, a, b, cntx, rntm ); \
}

View File

@@ -66,8 +66,8 @@ void PASTEMAC(opname,imeth) \
/* Initialize a local runtime with global settings if necessary. Note
that in the case that a runtime is passed in, we make a local copy. */ \
rntm_t rntm_l; \
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
\
/* Invoke the operation's front end. */ \
PASTEMAC(opname,_front) \
@@ -112,8 +112,8 @@ void PASTEMAC(opname,imeth) \
/* Initialize a local runtime with global settings if necessary. Note
that in the case that a runtime is passed in, we make a local copy. */ \
rntm_t rntm_l; \
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
\
/* Invoke the operation's front end. */ \
PASTEMAC(opname,_front) \
@@ -150,8 +150,8 @@ void PASTEMAC(opname,imeth) \
/* Initialize a local runtime with global settings if necessary. Note
that in the case that a runtime is passed in, we make a local copy. */ \
rntm_t rntm_l; \
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
\
/* Invoke the operation's front end. */ \
PASTEMAC(opname,_front) \
@@ -187,8 +187,8 @@ void PASTEMAC(opname,imeth) \
/* Initialize a local runtime with global settings if necessary. Note
that in the case that a runtime is passed in, we make a local copy. */ \
rntm_t rntm_l; \
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
\
/* Invoke the operation's front end. */ \
PASTEMAC(opname,_front) \
@@ -223,8 +223,8 @@ void PASTEMAC(opname,imeth) \
/* Initialize a local runtime with global settings if necessary. Note
that in the case that a runtime is passed in, we make a local copy. */ \
rntm_t rntm_l; \
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
else { rntm_l = *rntm; rntm = &rntm_l; } \
\
/* Invoke the operation's front end. */ \
PASTEMAC(opname,_front) \

View File

@@ -0,0 +1,77 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_L3_DECOR_H
#define BLIS_L3_DECOR_H
// -- conventional definitions -------------------------------------------------
// Level-3 internal function type.
// This is the type of the 'func' argument accepted by
// bli_l3_thread_decorator(). In the OpenMP decorator each thread invokes it
// with alpha and beta as given, thread-local aliases of a, b, and c, the
// context, that thread's local rntm_t copy, the control tree in use, and the
// thread's thrinfo_t.
typedef void (*l3int_t)
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
);
// Level-3 thread decorator prototype.
// The definition is supplied by whichever multithreading implementation is
// enabled (see the implementation-specific headers included below). The
// decorator invokes func on the operands on behalf of the operation family
// given by 'family'.
void bli_l3_thread_decorator
(
l3int_t func,
opid_t family,
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl
);
// Include definitions specific to the method of multithreading for the
// conventional code path.
#include "bli_l3_decor_single.h"
#include "bli_l3_decor_openmp.h"
#include "bli_l3_decor_pthreads.h"
#endif

View File

@@ -0,0 +1,248 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifdef BLIS_ENABLE_OPENMP
// Stub definition of bli_l3_thread_entry(). Only the pthreads version needs
// a real thread entry point; this do-nothing definition exists so that
// Windows DLL builds (with OpenMP enabled or no multithreading) don't risk
// having an unresolved symbol.
void* bli_l3_thread_entry( void* data_void )
{
	// The argument is deliberately ignored.
	( void )data_void;

	return NULL;
}
//#define PRINT_THRINFO
// OpenMP implementation of the level-3 thread decorator. It queries the
// number of threads from rntm, sets up the shared resources (sba array,
// packing block allocator, global communicator), and then opens an OpenMP
// parallel region in which every thread calls func on thread-local aliases
// of a, b, and c with its own copy of the rntm_t, control tree, and
// thrinfo_t. Per-thread resources are released before the region ends.
void bli_l3_thread_decorator
(
l3int_t func,
opid_t family,
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl
)
{
// This is part of a hack to support mixed domain in bli_gemm_front().
// Sometimes we need to specify a non-standard schema for A and B, and
// we decided to transmit them via the schema field in the obj_t's
// rather than pass them in as function parameters. Once the values
// have been read, we immediately reset them back to their expected
// values for unpacked objects.
pack_t schema_a = bli_obj_pack_schema( a );
pack_t schema_b = bli_obj_pack_schema( b );
bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
// Query the total number of threads from the rntm_t object.
const dim_t n_threads = bli_rntm_num_threads( rntm );
#ifdef PRINT_THRINFO
// Debug only: keep every thread's thrinfo_t so the paths can be printed
// after the parallel region.
thrinfo_t** threads = bli_malloc_intl( n_threads * sizeof( thrinfo_t* ) );
#endif
// NOTE: The sba was initialized in bli_init().
// Check out an array_t from the small block allocator. This is done
// with an internal lock to ensure only one application thread accesses
// the sba at a time. bli_sba_checkout_array() will also automatically
// resize the array_t, if necessary.
array_t* restrict array = bli_sba_checkout_array( n_threads );
// Access the pool_t* for thread 0 and embed it into the rntm. We do
// this up-front only so that we have the rntm_t.sba_pool field
// initialized and ready for the global communicator creation below.
bli_sba_rntm_set_pool( 0, array, rntm );
// Set the packing block allocator field of the rntm. This will be
// inherited by all of the child threads when they make local copies of
// the rntm below.
bli_membrk_rntm_set_membrk( rntm );
// Allocate a global communicator for the root thrinfo_t structures.
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
_Pragma( "omp parallel num_threads(n_threads)" )
{
// Create a thread-local copy of the master thread's rntm_t. This is
// necessary since we want each thread to be able to track its own
// small block pool_t as it executes down the function stack.
rntm_t rntm_l = *rntm;
rntm_t* restrict rntm_p = &rntm_l;
// Query the thread's id from OpenMP.
const dim_t tid = omp_get_thread_num();
// Check for a somewhat obscure OpenMP thread-mismatch issue.
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
// Use the thread id to access the appropriate pool_t* within the
// array_t, and use it to set the sba_pool field within the rntm_t.
// If the pool_t* element within the array_t is NULL, it will first
// be allocated/initialized.
bli_sba_rntm_set_pool( tid, array, rntm_p );
obj_t a_t, b_t, c_t;
cntl_t* cntl_use;
thrinfo_t* thread;
// Alias thread-local copies of A, B, and C. These will be the objects
// we pass down the algorithmic function stack. Making thread-local
// aliases is highly recommended in case a thread needs to change any
// of the properties of an object without affecting other threads'
// objects.
bli_obj_alias_to( a, &a_t );
bli_obj_alias_to( b, &b_t );
bli_obj_alias_to( c, &c_t );
// Create a default control tree for the operation, if needed.
bli_l3_cntl_create_if( family, schema_a, schema_b,
&a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use );
// Create the root node of the current thread's thrinfo_t structure.
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
// The first branch is the one compiled in: it runs the operation. The
// disabled alternative only grows the thrinfo_t tree.
#if 1
func
(
alpha,
&a_t,
&b_t,
beta,
&c_t,
cntx,
rntm_p,
cntl_use,
thread
);
#else
bli_thrinfo_grow_tree
(
rntm_p,
cntl_use,
thread
);
#endif
// Free the thread's local control tree.
bli_l3_cntl_free( rntm_p, cntl_use, thread );
#ifdef PRINT_THRINFO
threads[tid] = thread;
#else
// Free the current thread's thrinfo_t structure.
bli_l3_thrinfo_free( rntm_p, thread );
#endif
}
// We shouldn't free the global communicator since it was already freed
// by the global communicator's chief thread in bli_l3_thrinfo_free()
// (called above).
#ifdef PRINT_THRINFO
// Debug only: print the thrinfo_t paths collected above, then terminate
// the process.
if ( family != BLIS_TRSM ) bli_l3_thrinfo_print_gemm_paths( threads );
else bli_l3_thrinfo_print_trsm_paths( threads );
exit(1);
#endif
// Check the array_t back into the small block allocator. Similar to the
// check-out, this is done using a lock embedded within the sba to ensure
// mutual exclusion.
bli_sba_checkin_array( array );
}
// -----------------------------------------------------------------------------
// Verify that the OpenMP parallel region was given the number of threads
// that BLIS requested. If it was instead given exactly one thread, fall back
// to single-threaded execution by reinitializing gl_comm and rntm; any other
// mismatch aborts. (tid is accepted but not currently used.)
void bli_l3_thread_decorator_thread_check
     (
       dim_t      n_threads,
       dim_t      tid,
       thrcomm_t* gl_comm,
       rntm_t*    rntm
     )
{
	// The number of threads actually active in the current parallel region.
	const dim_t n_threads_actual = omp_get_num_threads();

	// The common case: OpenMP created as many threads as were requested of
	// BLIS, so there is nothing to do.
	//
	// The two counts may differ when, for example, the following conditions
	// are satisfied:
	// - an application is executing an OpenMP parallel region in which
	//   BLIS is invoked,
	// - BLIS is configured for multithreading via OpenMP,
	// - OMP_NUM_THREADS = t > 1,
	// - the number of threads requested of BLIS (regardless of method)
	//   is p <= t,
	// - OpenMP nesting is disabled.
	// In that situation the application spawns t threads and each of them
	// calls gemm (for example). Each gemm attempts to spawn p threads via
	// OpenMP, but since nesting is disabled and t >= p threads already
	// exist, no additional threads are spawned for each gemm.
	if ( n_threads_actual == n_threads ) return;

	// A mismatch is tolerated only when the region ended up with a single
	// thread. If, for example, BLIS requested 4 threads but got 3, abort.
	if ( n_threads_actual != 1 )
	{
		bli_print_msg( "A different number of threads was "
		               "created than was requested.",
		               __FILE__, __LINE__ );
		bli_abort();
	}

	// Reconfigure the communicator and the runtime for one thread.
	bli_thrcomm_init( 1, gl_comm );
	bli_rntm_set_num_threads_only( 1, rntm );
	bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm );

	// Synchronize all threads and continue.
	_Pragma( "omp barrier" )
}
#endif

View File

@@ -0,0 +1,53 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_L3_DECOR_OPENMP_H
#define BLIS_L3_DECOR_OPENMP_H
// Definitions specific to situations when OpenMP multithreading is enabled.
#ifdef BLIS_ENABLE_OPENMP
// Compares the number of threads requested of BLIS (n_threads) against the
// number of threads active in the current OpenMP parallel region. On a
// mismatch, the definition either downgrades gl_comm and rntm to
// single-threaded execution (when the region has exactly one thread) or
// prints a message and calls bli_abort(). tid is the calling thread's id.
void bli_l3_thread_decorator_thread_check
(
dim_t n_threads,
dim_t tid,
thrcomm_t* gl_comm,
rntm_t* rntm
);
#endif
#endif

View File

@@ -0,0 +1,252 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifdef BLIS_ENABLE_PTHREADS
// A data structure to assist in passing operands to additional threads.
// One instance is filled in per thread by bli_l3_thread_decorator() and
// handed to bli_l3_thread_entry() as its void* argument.
typedef struct thread_data
{
// The level-3 function that each thread invokes.
l3int_t func;
// Operation family and the pack schemas of A and B; forwarded to
// bli_l3_cntl_create_if() when the thread builds its control tree.
opid_t family;
pack_t schema_a;
pack_t schema_b;
// Operands of the operation. alpha and beta are passed to func as-is;
// a, b, and c are aliased into thread-local objects before being passed.
obj_t* alpha;
obj_t* a;
obj_t* b;
obj_t* beta;
obj_t* c;
// Context passed through to func.
cntx_t* cntx;
// The caller's runtime object; each thread makes its own local copy.
rntm_t* rntm;
// Caller-supplied control tree, forwarded to bli_l3_cntl_create_if().
cntl_t* cntl;
// Id of the thread this record belongs to (0 for the calling thread).
dim_t tid;
// Global communicator used to create each thread's root thrinfo_t.
thrcomm_t* gl_comm;
// array_t checked out of the small block allocator; indexed by tid to
// select the thread's pool.
array_t* array;
} thread_data_t;
// Per-thread entry point for the pthreads level-3 decorator.
//
// data_void points to a thread_data_t describing the operation and the
// identity of the calling thread. The function builds thread-local state (a
// private rntm_t copy, aliases of A, B, and C, a control tree, and a root
// thrinfo_t), invokes the level-3 function stored in the record, and then
// frees the control tree and thrinfo_t again. It is called directly for
// thread 0 and handed to bli_pthread_create() for every other thread.
//
// Always returns NULL.
void* bli_l3_thread_entry( void* data_void )
{
	thread_data_t* const td = data_void;

	// Work on a private copy of the master thread's rntm_t so that this
	// thread can track its own small block pool_t as it executes down the
	// function stack.
	rntm_t           rntm_l = *( td->rntm );
	rntm_t* restrict rntm_p = &rntm_l;

	// Select this thread's pool_t* within the array_t (by thread id) and
	// store it in the sba_pool field of the local rntm_t. If that pool_t*
	// element is still NULL, it is allocated/initialized first.
	bli_sba_rntm_set_pool( td->tid, td->array, rntm_p );

	// Thread-local aliases of A, B, and C. These are what gets passed down
	// the algorithmic function stack, so that a thread may change object
	// properties without affecting the objects seen by other threads.
	obj_t a_t, b_t, c_t;

	bli_obj_alias_to( td->a, &a_t );
	bli_obj_alias_to( td->b, &b_t );
	bli_obj_alias_to( td->c, &c_t );

	// Create a default control tree for the operation, if needed.
	cntl_t* cntl_use;

	bli_l3_cntl_create_if( td->family, td->schema_a, td->schema_b,
	                       &a_t, &b_t, &c_t, rntm_p, td->cntl, &cntl_use );

	// Create the root node of this thread's thrinfo_t structure.
	thrinfo_t* thread;

	bli_l3_thrinfo_create_root( td->tid, td->gl_comm, rntm_p, cntl_use, &thread );

	// Run the level-3 operation on the thread-local objects.
	td->func
	(
	  td->alpha,
	  &a_t,
	  &b_t,
	  td->beta,
	  &c_t,
	  td->cntx,
	  rntm_p,
	  cntl_use,
	  thread
	);

	// Release the thread-local control tree, then the thrinfo_t structure.
	bli_l3_cntl_free( rntm_p, cntl_use, thread );
	bli_l3_thrinfo_free( rntm_p, thread );

	return NULL;
}
// POSIX-threads implementation of the level-3 thread decorator.
//
// Runs func once per thread, using the thread count stored in rntm. Threads
// with nonzero ids are created via bli_pthread_create(); thread 0's share of
// the work is executed on the calling thread, which afterwards joins all of
// the threads it created.
//
// Parameters:
// - func:   level-3 function executed by every thread.
// - family: operation family id, forwarded to each thread.
// - alpha, a, b, beta, c: operands of the operation.
// - cntx:   context forwarded to func.
// - rntm:   runtime; its sba pool and packing block allocator fields are set
//           here, and each thread later makes its own local copy.
// - cntl:   control tree forwarded to each thread.
//
// Side effects on the arguments: the pack schemas of a and b are read and
// then reset to BLIS_NOT_PACKED.
//
// NOTE(review): the return values of bli_pthread_create() and
// bli_pthread_join() are not inspected, and the buffers returned by
// bli_malloc_intl() are used without a visible NULL check -- confirm that
// failures are handled inside those functions.
void bli_l3_thread_decorator
     (
       l3int_t func,
       opid_t  family,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     )
{
	// This is part of a hack to support mixed domain in bli_gemm_front().
	// Non-standard schemas for A and B are transmitted via the schema field
	// of the obj_t's rather than as function parameters. Read both values
	// first, then immediately restore the values expected of unpacked
	// objects.
	pack_t schema_a = bli_obj_pack_schema( a );
	pack_t schema_b = bli_obj_pack_schema( b );

	bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
	bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );

	// Query the total number of threads from the runtime.
	const dim_t n_threads = bli_rntm_num_threads( rntm );

	// NOTE: The sba was initialized in bli_init().

	// Check out an array_t from the small block allocator. This is done
	// with an internal lock to ensure only one application thread accesses
	// the sba at a time. bli_sba_checkout_array() will also automatically
	// resize the array_t, if necessary.
	array_t* restrict array = bli_sba_checkout_array( n_threads );

	// Embed thread 0's pool_t* into the rntm up-front so that the
	// rntm_t.sba_pool field is initialized and ready for the creation of
	// the global communicator below.
	bli_sba_rntm_set_pool( 0, array, rntm );

	// Set the packing block allocator field of the rntm. Child threads
	// inherit it when they make their local copies of the rntm.
	bli_membrk_rntm_set_membrk( rntm );

	// Allocate a global communicator for the root thrinfo_t structures.
	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );

	// Allocate one pthread handle and one argument record per thread.
#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_l3_thread_decorator().pth: " );
#endif
	bli_pthread_t* pthreads = bli_malloc_intl( sizeof( *pthreads ) * n_threads );

#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_l3_thread_decorator().pth: " );
#endif
	thread_data_t* datas = bli_malloc_intl( sizeof( *datas ) * n_threads );

	// NOTE: We must count down from the highest thread id so that the chief
	// thread (thread id 0) has launched every other thread before it starts
	// its own share of the computation.
	dim_t tid = n_threads;

	while ( 0 < tid )
	{
		tid -= 1;

		// Fill in the argument record for this thread id (including id 0).
		thread_data_t* td = &datas[tid];

		*td = ( thread_data_t )
		{
		  .func     = func,
		  .family   = family,
		  .schema_a = schema_a,
		  .schema_b = schema_b,
		  .alpha    = alpha,
		  .a        = a,
		  .b        = b,
		  .beta     = beta,
		  .c        = c,
		  .cntx     = cntx,
		  .rntm     = rntm,
		  .cntl     = cntl,
		  .tid      = tid,
		  .gl_comm  = gl_comm,
		  .array    = array
		};

		// Thread 0 runs the entry function on the calling thread; every
		// other id gets a newly created pthread.
		if ( tid == 0 )
			bli_l3_thread_entry( td );
		else
			bli_pthread_create( &pthreads[tid], NULL, bli_l3_thread_entry, td );
	}

	// We shouldn't free the global communicator since it was already freed
	// by the global communicator's chief thread in bli_l3_thrinfo_free()
	// (called from the thread entry function).

	// Thread 0 waits for the additional threads to finish.
	for ( dim_t t = 1; t < n_threads; t++ )
		bli_pthread_join( pthreads[t], NULL );

	// Check the array_t back into the small block allocator. Similar to the
	// check-out, this is done using a lock embedded within the sba to ensure
	// mutual exclusion.
	bli_sba_checkin_array( array );

	// Release the per-thread bookkeeping arrays.
#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_l3_thread_decorator().pth: " );
#endif
	bli_free_intl( pthreads );

#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_l3_thread_decorator().pth: " );
#endif
	bli_free_intl( datas );
}
#endif

View File

@@ -0,0 +1,47 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_L3_DECOR_PTHREADS_H
#define BLIS_L3_DECOR_PTHREADS_H
// Definitions specific to situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS
// Thread entry point prototype. The signature (void* in, void* out) matches
// what a pthread start routine requires. data_void is the opaque per-thread
// argument supplied by the level-3 thread decorator.
void* bli_l3_thread_entry( void* data_void );
#endif
#endif

View File

@@ -0,0 +1,150 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifndef BLIS_ENABLE_MULTITHREADING
// Sequential implementation of the level-3 thread decorator, compiled only
// when multithreading is disabled.
//
// Sets up the small block allocator pool, packing block allocator, global
// communicator, control tree, and root thrinfo_t for a single thread (id 0),
// invokes func once on the calling thread, and then tears that state down.
//
// Parameters:
// - func:   level-3 function to execute.
// - family: operation family id, used when creating the control tree.
// - alpha, a, b, beta, c: operands, passed to func without local aliasing.
// - cntx:   context forwarded to func.
// - rntm:   runtime; used directly (no local copy is made here).
// - cntl:   control tree forwarded to bli_l3_cntl_create_if().
//
// Side effects on the arguments: the pack schemas of a and b are read and
// then reset to BLIS_NOT_PACKED; the sba pool and packing block allocator
// fields of rntm are set.
void bli_l3_thread_decorator
     (
       l3int_t func,
       opid_t  family,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     )
{
	// This is part of a hack to support mixed domain in bli_gemm_front().
	// Non-standard schemas for A and B are transmitted via the schema field
	// of the obj_t's rather than as function parameters. Read both values
	// first, then immediately restore the values expected of unpacked
	// objects.
	pack_t schema_a = bli_obj_pack_schema( a );
	pack_t schema_b = bli_obj_pack_schema( b );

	bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
	bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );

	// Sequential execution: exactly one thread, whose id is 0.
	const dim_t n_threads = 1;
	const dim_t tid       = 0;

	// NOTE: The sba was initialized in bli_init().

	// Check out an array_t from the small block allocator. This is done
	// with an internal lock to ensure only one application thread accesses
	// the sba at a time. bli_sba_checkout_array() will also automatically
	// resize the array_t, if necessary.
	array_t* restrict array = bli_sba_checkout_array( n_threads );

	// Embed thread 0's pool_t* into the rntm so that the global
	// communicator can be created below. With only one thread there is no
	// need to repeat this step later on.
	bli_sba_rntm_set_pool( tid, array, rntm );

	// Set the packing block allocator field of the rntm.
	bli_membrk_rntm_set_membrk( rntm );

	// Allocate a global communicator for the root thrinfo_t structure.
	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );

	// NOTE: No additional copy of the rntm_t is made here since it was
	// already copied in one of the high-level oapi functions. Likewise,
	// unlike the _openmp.c and _pthreads.c variants, A, B, and C are not
	// aliased here because they were already aliased in bli_*_front().
	// Aliasing may be added in the future so that all three implementations
	// consistently provide local aliases, at which point aliasing could be
	// eliminated elsewhere.

	// Create a default control tree for the operation, if needed.
	cntl_t* cntl_use;

	bli_l3_cntl_create_if( family, schema_a, schema_b,
	                       a, b, c, rntm, cntl, &cntl_use );

	// Create the root node of the thread's thrinfo_t structure.
	thrinfo_t* thread;

	bli_l3_thrinfo_create_root( tid, gl_comm, rntm, cntl_use, &thread );

	// Execute the operation.
	func( alpha, a, b, beta, c, cntx, rntm, cntl_use, thread );

	// Free the control tree, then the thrinfo_t structure.
	bli_l3_cntl_free( rntm, cntl_use, thread );
	bli_l3_thrinfo_free( rntm, thread );

	// We shouldn't free the global communicator since it was already freed
	// by the global communicator's chief thread in bli_l3_thrinfo_free()
	// (called above).

	// Check the array_t back into the small block allocator. Similar to the
	// check-out, this is done using a lock embedded within the sba to ensure
	// mutual exclusion.
	bli_sba_checkin_array( array );
}
#endif

View File

@@ -0,0 +1,44 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_L3_DECOR_SINGLE_H
#define BLIS_L3_DECOR_SINGLE_H
// Definitions specific to situations when multithreading is disabled.
#ifndef BLIS_ENABLE_MULTITHREADING
#endif
#endif

View File

@@ -0,0 +1,78 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_L3_SUP_DECOR_H
#define BLIS_L3_SUP_DECOR_H
// -- sup definitions ----------------------------------------------------------
// Level-3 sup internal function type.
// Pointer to a function that a sup thread decorator invokes to perform the
// operation. It receives the scalar and matrix operands, the context, the
// runtime, a control tree pointer, and the calling thread's thrinfo_t, and
// reports its outcome as an err_t.
typedef err_t (*l3supint_t)
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
);
// Level-3 sup thread decorator prototype.
// Wraps the invocation of func (an l3supint_t) for the sup code path. One
// definition is provided per threading configuration (single, OpenMP,
// pthreads). Unlike the conventional bli_l3_thread_decorator(), it takes no
// cntl_t* argument and returns an err_t. The pack schema parameters below
// are commented out and are not part of the interface.
err_t bli_l3_sup_thread_decorator
(
l3supint_t func,
opid_t family,
//pack_t schema_a,
//pack_t schema_b,
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm
);
// Include definitions specific to the method of multithreading for the
// sup code path.
#include "bli_l3_sup_decor_single.h"
#include "bli_l3_sup_decor_openmp.h"
#include "bli_l3_sup_decor_pthreads.h"
#endif

View File

@@ -0,0 +1,190 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifdef BLIS_ENABLE_OPENMP
// Define a dummy function bli_l3_sup_thread_entry(), which is needed in the
// pthreads version, so that when building Windows DLLs (with OpenMP enabled
// or no multithreading) we don't risk having an unresolved symbol.
//void* bli_l3_sup_thread_entry( void* data_void ) { return NULL; }
// OpenMP-build definition of the level-3 sup thread decorator.
//
// In its current form the compiled code path does not open an OpenMP
// parallel region: it checks out a one-thread array_t from the small block
// allocator, sets the sba pool and packing block allocator fields of rntm,
// and calls func once on the calling thread with a NULL control tree and
// the address of BLIS_PACKM_SINGLE_THREADED as the thrinfo_t.
//
// Parameters:
// - func:   sup function to execute.
// - family: operation family id; only referenced by disabled code below.
// - alpha, a, b, beta, c: operands, passed to func unmodified.
// - cntx:   context forwarded to func.
// - rntm:   runtime; modified in place (no local copy is made here).
//
// Returns BLIS_SUCCESS unconditionally.
// NOTE(review): the err_t returned by func is discarded -- confirm that this
// is intended.
err_t bli_l3_sup_thread_decorator
(
l3supint_t func,
opid_t family,
//pack_t schema_a,
//pack_t schema_b,
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm
)
{
#if 0
return
bli_gemmsup_int
(
alpha,
a,
b,
beta,
c,
cntx,
rntm,
0
);
#else
// NOTE: The conventional level-3 decorators read the pack schemas of A and
// B at this point and reset them to BLIS_NOT_PACKED (part of a hack to
// support mixed domain in bli_gemm_front()). That logic is disabled in the
// sup decorator; the original statements are kept below for reference.
//pack_t schema_a = bli_obj_pack_schema( a );
//pack_t schema_b = bli_obj_pack_schema( b );
//bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
//bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
// This decorator executes func on the calling thread only, so the array_t
// is sized for a single thread.
const dim_t n_threads = 1;
// NOTE: The sba was initialized in bli_init().
// Check out an array_t from the small block allocator. This is done
// with an internal lock to ensure only one application thread accesses
// the sba at a time. bli_sba_checkout_array() will also automatically
// resize the array_t, if necessary.
array_t* restrict array = bli_sba_checkout_array( n_threads );
// Access the pool_t* for thread 0 and embed it into the rntm. (The global
// communicator that would consume it is only created in the disabled code
// below.)
bli_sba_rntm_set_pool( 0, array, rntm );
// Set the packing block allocator field of the rntm.
bli_membrk_rntm_set_membrk( rntm );
#if 0
// Allocate a global communicator for the root thrinfo_t structures.
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
#endif
{
// NOTE: We don't need to create another copy of the rntm_t since
// it was already copied in one of the high-level oapi functions.
rntm_t* restrict rntm_p = rntm;
// No control tree is created in the compiled path; func receives NULL.
cntl_t* cntl_use = NULL;
//thrinfo_t* thread = NULL;
// Rather than building a thrinfo_t tree, pass the address of the
// BLIS_PACKM_SINGLE_THREADED object (defined elsewhere).
thrinfo_t* thread = &BLIS_PACKM_SINGLE_THREADED;
const dim_t tid = 0;
// Use the thread id to access the appropriate pool_t* within the
// array_t, and use it to set the sba_pool field within the rntm_t.
// If the pool_t* element within the array_t is NULL, it will first
// be allocated/initialized.
// NOTE: This is commented out because, in the single-threaded case,
// this is redundant since it's already been done above.
//bli_sba_rntm_set_pool( tid, array, rntm_p );
// NOTE: A, B, and C are not aliased here; the same note in the
// conventional single-threaded decorator states that they were already
// aliased in bli_*_front(). Aliasing may be added here in the future so
// that all decorator implementations consistently provide local aliases,
// at which point aliasing could be eliminated elsewhere.
// Creation of a default control tree is disabled:
//bli_l3_cntl_create_if( family, schema_a, schema_b,
// a, b, c, rntm_p, cntl, &cntl_use );
#if 0
cntl_use = bli_gemm_cntl_create( rntm_p, family, schema_a, schema_b );
// Create the root node of the thread's thrinfo_t structure.
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
#endif
// tid is only referenced by the disabled code above; the cast to void
// marks it as intentionally unused.
( void )tid;
// Execute the sup operation. The returned err_t is not captured.
func
(
alpha,
a,
b,
beta,
c,
cntx,
rntm_p,
cntl_use,
thread
);
#if 0
// Free the thread's local control tree.
//bli_l3_cntl_free( rntm_p, cntl_use, thread );
bli_gemm_cntl_free( rntm_p, cntl_use, thread );
// Free the current thread's thrinfo_t structure.
bli_l3_thrinfo_free( rntm_p, thread );
#endif
}
// NOTE: No global communicator, control tree, or thrinfo_t was created in
// the compiled path above, so none of them is freed here.
// Check the array_t back into the small block allocator. Similar to the
// check-out, this is done using a lock embedded within the sba to ensure
// mutual exclusion.
bli_sba_checkin_array( array );
return BLIS_SUCCESS;
#endif
}
#endif

View File

@@ -0,0 +1,44 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_L3_SUP_DECOR_OPENMP_H
#define BLIS_L3_SUP_DECOR_OPENMP_H
// Definitions specific to situations when OpenMP multithreading is enabled.
#ifdef BLIS_ENABLE_OPENMP
#endif
#endif

View File

@@ -0,0 +1,183 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifdef BLIS_ENABLE_PTHREADS

// Thread decorator for the level-3 sup (small/unpacked) code path, compiled
// when BLIS_ENABLE_PTHREADS is defined.
//
// NOTE: Although this is the pthreads variant, it does not spawn any
// threads. func is invoked exactly once, by the calling thread, with a NULL
// control tree and the global BLIS_PACKM_SINGLE_THREADED thrinfo_t.
//
// Parameters:
//   func                 - the sup implementation to execute.
//   family               - operation family id; accepted but not consulted
//                          by this implementation.
//   alpha, a, b, beta, c - operands; forwarded to func as-is (no local
//                          aliases are created here).
//   cntx                 - context; forwarded to func.
//   rntm                 - runtime; its sba pool and packing block allocator
//                          (membrk) fields are set here before it is
//                          forwarded to func.
//
// Returns BLIS_SUCCESS unconditionally; the result of func (if any) is not
// inspected.
err_t bli_l3_sup_thread_decorator
     (
       l3supint_t func,
       opid_t     family,
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm
     )
{
	// Only one thread of execution is used by this implementation.
	const dim_t n_threads = 1;

	// Check out an array_t from the small block allocator (sba), which was
	// initialized in bli_init(). The check-out is performed under a lock
	// internal to the sba so that only one application thread accesses it
	// at a time, and the array_t is resized automatically if necessary.
	array_t* restrict array = bli_sba_checkout_array( n_threads );

	// Embed the pool_t* for thread 0 into the rntm, and then set the rntm's
	// packing block allocator field.
	bli_sba_rntm_set_pool( 0, array, rntm );
	bli_membrk_rntm_set_membrk( rntm );

	// No control tree or thrinfo_t tree is built for the sup path: func
	// receives a NULL cntl_t and the statically defined single-threaded
	// thrinfo_t. Neither needs to be freed afterwards.
	cntl_t*    cntl   = NULL;
	thrinfo_t* thread = &BLIS_PACKM_SINGLE_THREADED;

	func( alpha, a, b, beta, c, cntx, rntm, cntl, thread );

	// Return the array_t to the sba. As with the check-out, this uses the
	// lock embedded within the sba to ensure mutual exclusion.
	bli_sba_checkin_array( array );

	return BLIS_SUCCESS;
}

#endif

View File

@@ -0,0 +1,47 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_L3_SUP_DECOR_PTHREADS_H
#define BLIS_L3_SUP_DECOR_PTHREADS_H
// Definitions specific to situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS
// Thread entry point prototype.
// NOTE(review): presumably intended to be used as a pthread start routine
// for the sup code path (it has the void* -> void* shape); confirm that a
// matching definition exists before relying on this symbol.
void* bli_l3_sup_thread_entry( void* data_void );
#endif // BLIS_ENABLE_PTHREADS
#endif // BLIS_L3_SUP_DECOR_PTHREADS_H

View File

@@ -0,0 +1,183 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifndef BLIS_ENABLE_MULTITHREADING

// Thread decorator for the level-3 sup (small/unpacked) code path, compiled
// when multithreading is disabled.
//
// func is invoked exactly once, by the calling thread, with a NULL control
// tree and the global BLIS_PACKM_SINGLE_THREADED thrinfo_t.
//
// Parameters:
//   func                 - the sup implementation to execute.
//   family               - operation family id; accepted but not consulted
//                          by this implementation.
//   alpha, a, b, beta, c - operands; forwarded to func as-is (no local
//                          aliases are created here).
//   cntx                 - context; forwarded to func.
//   rntm                 - runtime; its sba pool and packing block allocator
//                          (membrk) fields are set here before it is
//                          forwarded to func.
//
// Returns BLIS_SUCCESS unconditionally; the result of func (if any) is not
// inspected.
err_t bli_l3_sup_thread_decorator
     (
       l3supint_t func,
       opid_t     family,
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm
     )
{
	// Sequential execution: exactly one thread.
	const dim_t n_threads = 1;

	// Check out an array_t from the small block allocator (sba), which was
	// initialized in bli_init(). The check-out is performed under a lock
	// internal to the sba so that only one application thread accesses it
	// at a time, and the array_t is resized automatically if necessary.
	array_t* restrict array = bli_sba_checkout_array( n_threads );

	// Embed the pool_t* for thread 0 into the rntm, and then set the rntm's
	// packing block allocator field.
	bli_sba_rntm_set_pool( 0, array, rntm );
	bli_membrk_rntm_set_membrk( rntm );

	// No control tree or thrinfo_t tree is built for the sup path: func
	// receives a NULL cntl_t and the statically defined single-threaded
	// thrinfo_t. Neither needs to be freed afterwards.
	cntl_t*    cntl   = NULL;
	thrinfo_t* thread = &BLIS_PACKM_SINGLE_THREADED;

	func( alpha, a, b, beta, c, cntx, rntm, cntl, thread );

	// Return the array_t to the sba. As with the check-out, this uses the
	// lock embedded within the sba to ensure mutual exclusion.
	bli_sba_checkin_array( array );

	return BLIS_SUCCESS;
}

#endif

View File

@@ -0,0 +1,44 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_L3_SUP_DECOR_SINGLE_H
#define BLIS_L3_SUP_DECOR_SINGLE_H
// Definitions specific to situations when multithreading is disabled.
#ifndef BLIS_ENABLE_MULTITHREADING
#endif
#endif

View File

@@ -43,10 +43,6 @@
#include "bli_thrcomm_pthreads.h"
// thread entry point prototype.
void* bli_l3_thread_entry( void* data_void );
// thrcomm_t query (field only)
BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm )

View File

@@ -214,212 +214,5 @@ void bli_thrcomm_tree_barrier( barrier_t* barack )
#endif
// Dummy definition of bli_l3_thread_entry(). The real entry point is only
// needed by the pthreads implementation; providing a stub here avoids an
// unresolved symbol when building Windows DLLs with OpenMP enabled or with
// multithreading disabled. The argument is ignored and NULL is returned.
void* bli_l3_thread_entry( void* data_void )
{
	( void )data_void;

	return NULL;
}
//#define PRINT_THRINFO
// OpenMP implementation of the level-3 thread decorator.
//
// Opens an OpenMP parallel region requesting n_threads threads (as queried
// from rntm). Inside the region, each thread makes its own copy of the
// rntm_t, its own aliases of a, b, and c, its own control tree, and its own
// root thrinfo_t, and then calls func with those thread-local objects.
//
// Side effects visible to the caller: the pack schema fields of a and b are
// read and then reset to BLIS_NOT_PACKED, and the sba pool and membrk
// fields of rntm are set.
//
// If PRINT_THRINFO is defined, the thrinfo_t structures are collected and
// printed instead of being freed, and the process then calls exit(1).
void bli_l3_thread_decorator
(
l3int_t func,
opid_t family,
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl
)
{
// This is part of a hack to support mixed domain in bli_gemm_front().
// Sometimes we need to specify a non-standard schema for A and B, and
// we decided to transmit them via the schema field in the obj_t's
// rather than pass them in as function parameters. Once the values
// have been read, we immediately reset them back to their expected
// values for unpacked objects.
pack_t schema_a = bli_obj_pack_schema( a );
pack_t schema_b = bli_obj_pack_schema( b );
bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
// Query the total number of threads from the rntm_t object.
const dim_t n_threads = bli_rntm_num_threads( rntm );
#ifdef PRINT_THRINFO
// Debug only: one slot per thread so that each thread's thrinfo_t can be
// printed after the parallel region. This array is not freed; the
// PRINT_THRINFO path below terminates the process via exit(1).
thrinfo_t** threads = bli_malloc_intl( n_threads * sizeof( thrinfo_t* ) );
#endif
// NOTE: The sba was initialized in bli_init().
// Check out an array_t from the small block allocator. This is done
// with an internal lock to ensure only one application thread accesses
// the sba at a time. bli_sba_checkout_array() will also automatically
// resize the array_t, if necessary.
array_t* restrict array = bli_sba_checkout_array( n_threads );
// Access the pool_t* for thread 0 and embed it into the rntm. We do
// this up-front only so that we have the rntm_t.sba_pool field
// initialized and ready for the global communicator creation below.
bli_sba_rntm_set_pool( 0, array, rntm );
// Set the packing block allocator field of the rntm. This will be
// inherited by all of the child threads when they make local copies of
// the rntm below.
bli_membrk_rntm_set_membrk( rntm );
// Allocate a global communicator for the root thrinfo_t structures.
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
// Everything declared inside the following region is local to each
// thread; everything declared above it is shared among the threads.
_Pragma( "omp parallel num_threads(n_threads)" )
{
// Create a thread-local copy of the master thread's rntm_t. This is
// necessary since we want each thread to be able to track its own
// small block pool_t as it executes down the function stack.
rntm_t rntm_l = *rntm;
rntm_t* restrict rntm_p = &rntm_l;
// Query the thread's id from OpenMP.
const dim_t tid = omp_get_thread_num();
// Check for a somewhat obscure OpenMP thread-mismatch issue.
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
// Use the thread id to access the appropriate pool_t* within the
// array_t, and use it to set the sba_pool field within the rntm_t.
// If the pool_t* element within the array_t is NULL, it will first
// be allocated/initialized.
bli_sba_rntm_set_pool( tid, array, rntm_p );
obj_t a_t, b_t, c_t;
cntl_t* cntl_use;
thrinfo_t* thread;
// Alias thread-local copies of A, B, and C. These will be the objects
// we pass down the algorithmic function stack. Making thread-local
// aliases is highly recommended in case a thread needs to change any
// of the properties of an object without affecting other threads'
// objects.
bli_obj_alias_to( a, &a_t );
bli_obj_alias_to( b, &b_t );
bli_obj_alias_to( c, &c_t );
// Create a default control tree for the operation, if needed.
bli_l3_cntl_create_if( family, schema_a, schema_b,
&a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use );
// Create the root node of the current thread's thrinfo_t structure.
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
// The #if 1 branch (invoke func) is the one compiled; the #else branch
// (only grow the thrinfo_t tree) is disabled.
#if 1
func
(
alpha,
&a_t,
&b_t,
beta,
&c_t,
cntx,
rntm_p,
cntl_use,
thread
);
#else
bli_thrinfo_grow_tree
(
rntm_p,
cntl_use,
thread
);
#endif
// Free the thread's local control tree.
bli_l3_cntl_free( rntm_p, cntl_use, thread );
#ifdef PRINT_THRINFO
// Debug only: keep the thrinfo_t alive so it can be printed below.
threads[tid] = thread;
#else
// Free the current thread's thrinfo_t structure.
bli_l3_thrinfo_free( rntm_p, thread );
#endif
}
// We shouldn't free the global communicator since it was already freed
// by the global communicator's chief thread in bli_l3_thrinfo_free()
// (called above).
#ifdef PRINT_THRINFO
// Debug only: print the collected thrinfo_t paths and terminate. Nothing
// after this point (including the sba check-in) runs in this mode.
if ( family != BLIS_TRSM ) bli_l3_thrinfo_print_gemm_paths( threads );
else bli_l3_thrinfo_print_trsm_paths( threads );
exit(1);
#endif
// Check the array_t back into the small block allocator. Similar to the
// check-out, this is done using a lock embedded within the sba to ensure
// mutual exclusion.
bli_sba_checkin_array( array );
}
// -----------------------------------------------------------------------------
// Verifies, from within an OpenMP parallel region, that the number of
// threads actually present in the region matches the number requested of
// BLIS (n_threads).
//
// A mismatch can arise when, for example:
//   - the application itself invokes BLIS from inside an OpenMP parallel
//     region,
//   - BLIS is configured for multithreading via OpenMP,
//   - OMP_NUM_THREADS = t > 1,
//   - the number of threads requested of BLIS (by any method) is p <= t,
//   - OpenMP nesting is disabled.
// The application then spawns t threads, each of which calls (say) gemm.
// Each gemm asks OpenMP for p threads, but with nesting disabled and
// t >= p threads already active, OpenMP creates no additional threads.
//
// Behavior:
//   - counts match:                   nothing is done.
//   - counts differ, actual count 1:  gl_comm is re-initialized for one
//                                     thread and rntm is reset to one
//                                     thread with 1x1x1x1x1 ways, followed
//                                     by an OpenMP barrier.
//   - counts differ, actual count >1: a message is printed and bli_abort()
//                                     is called (e.g. 4 requested, 3
//                                     obtained).
//
// tid is accepted for interface compatibility but is not used.
void bli_l3_thread_decorator_thread_check
     (
       dim_t      n_threads,
       dim_t      tid,
       thrcomm_t* gl_comm,
       rntm_t*    rntm
     )
{
	const dim_t n_threads_real = omp_get_num_threads();

	// The common case: OpenMP gave us exactly what was requested.
	if ( n_threads_real == n_threads ) return;

	// A mismatch is tolerated only when the region ended up with exactly
	// one thread; anything else cannot be recovered from.
	if ( n_threads_real != 1 )
	{
		bli_print_msg( "A different number of threads was "
		               "created than was requested.",
		               __FILE__, __LINE__ );
		bli_abort();
	}

	// Fall back to single-threaded execution.
	bli_thrcomm_init( 1, gl_comm );
	bli_rntm_set_num_threads_only( 1, rntm );
	bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm );

	// Synchronize all threads and continue.
	_Pragma( "omp barrier" )
}
#endif

View File

@@ -85,14 +85,6 @@ void bli_thrcomm_tree_barrier_free( barrier_t* barrier );
void bli_thrcomm_tree_barrier( barrier_t* barack );
#endif
void bli_l3_thread_decorator_thread_check
(
dim_t n_threads,
dim_t tid,
thrcomm_t* gl_comm,
rntm_t* rntm
);
#endif
#endif

View File

@@ -138,217 +138,5 @@ void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm )
#endif
// A data structure to assist in passing operands to additional threads.
// bli_l3_thread_decorator() fills in one instance per thread id and passes
// its address (as a void*) to bli_l3_thread_entry(), which unpacks it.
typedef struct thread_data
{
// The level-3 function that bli_l3_thread_entry() invokes.
l3int_t func;
// Operation family; forwarded to bli_l3_cntl_create_if().
opid_t family;
// Pack schemas read from A and B by the decorator before it reset them
// to BLIS_NOT_PACKED; forwarded to bli_l3_cntl_create_if().
pack_t schema_a;
pack_t schema_b;
// Operands. A, B, and C are aliased into thread-local objects by the
// entry function; alpha and beta are forwarded to func as-is.
obj_t* alpha;
obj_t* a;
obj_t* b;
obj_t* beta;
obj_t* c;
// Context; forwarded to func.
cntx_t* cntx;
// The caller's rntm_t; the entry function copies it into a thread-local
// rntm_t rather than using it directly.
rntm_t* rntm;
// Control tree argument; forwarded to bli_l3_cntl_create_if().
cntl_t* cntl;
// This thread's id (0 for the thread that runs the entry function
// directly from the decorator).
dim_t tid;
// Global communicator used when creating the root thrinfo_t.
thrcomm_t* gl_comm;
// The array_t checked out from the sba, from which the entry function
// selects this thread's pool_t via tid.
array_t* array;
} thread_data_t;
// Entry point run by every thread that participates in a level-3
// operation. data_void must point to a fully populated thread_data_t.
// The function builds the thread-local state (rntm_t copy, operand
// aliases, control tree, root thrinfo_t), invokes the level-3 function,
// and then frees the control tree and thrinfo_t. Always returns NULL.
void* bli_l3_thread_entry( void* data_void )
{
	const thread_data_t* args = data_void;

	// Work on a private copy of the shared rntm_t so that this thread can
	// track its own small block pool_t as it executes down the function
	// stack.
	rntm_t           rntm_local = *( args->rntm );
	rntm_t* restrict rntm_p     = &rntm_local;

	// Select this thread's pool_t* from the array_t (by thread id) and
	// store it in the sba_pool field of the local rntm_t. If the pool_t*
	// element is NULL, it will first be allocated/initialized.
	bli_sba_rntm_set_pool( args->tid, args->array, rntm_p );

	// Alias A, B, and C into thread-local objects. These are what get
	// passed down the algorithmic function stack, so that a thread may
	// change an object's properties without affecting other threads.
	obj_t a_local, b_local, c_local;

	bli_obj_alias_to( args->a, &a_local );
	bli_obj_alias_to( args->b, &b_local );
	bli_obj_alias_to( args->c, &c_local );

	// Create a default control tree for the operation, if needed.
	cntl_t* cntl_use;

	bli_l3_cntl_create_if( args->family, args->schema_a, args->schema_b,
	                       &a_local, &b_local, &c_local,
	                       rntm_p, args->cntl, &cntl_use );

	// Create the root node of this thread's thrinfo_t structure.
	thrinfo_t* thread;

	bli_l3_thrinfo_create_root( args->tid, args->gl_comm, rntm_p,
	                            cntl_use, &thread );

	// Execute the operation.
	args->func
	(
	  args->alpha,
	  &a_local,
	  &b_local,
	  args->beta,
	  &c_local,
	  args->cntx,
	  rntm_p,
	  cntl_use,
	  thread
	);

	// Release the thread-local control tree, then the thrinfo_t structure.
	bli_l3_cntl_free( rntm_p, cntl_use, thread );
	bli_l3_thrinfo_free( rntm_p, thread );

	return NULL;
}
// pthreads implementation of the level-3 thread decorator.
//
// Runs func on n_threads threads (as queried from rntm): threads with ids
// n_threads-1 down to 1 are created via bli_pthread_create(), and thread 0
// is the calling thread itself. Every thread executes
// bli_l3_thread_entry() with its own thread_data_t. The caller's thread
// then joins the spawned threads before returning.
//
// Side effects visible to the caller: the pack schema fields of a and b are
// read and then reset to BLIS_NOT_PACKED, and the sba pool and membrk
// fields of rntm are set.
void bli_l3_thread_decorator
     (
       l3int_t func,
       opid_t  family,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     )
{
	// Part of a hack to support mixed domain in bli_gemm_front():
	// non-standard schemas for A and B are transmitted via the schema
	// field of the obj_t's instead of as function parameters. Read them,
	// then immediately restore the values expected of unpacked objects.
	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
	bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );

	// Total number of threads, as recorded in the rntm_t.
	const dim_t n_threads = bli_rntm_num_threads( rntm );

	// Check out an array_t from the small block allocator (sba), which was
	// initialized in bli_init(). The check-out is performed under a lock
	// internal to the sba so that only one application thread accesses it
	// at a time, and the array_t is resized automatically if necessary.
	array_t* restrict array = bli_sba_checkout_array( n_threads );

	// Embed thread 0's pool_t* into the rntm now so that the
	// rntm_t.sba_pool field is ready for the creation of the global
	// communicator below.
	bli_sba_rntm_set_pool( 0, array, rntm );

	// Set the packing block allocator field of the rntm. The threads
	// inherit it when they make their local copies of the rntm.
	bli_membrk_rntm_set_membrk( rntm );

	// Allocate a global communicator for the root thrinfo_t structures.
	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );

	// Allocate the pthread objects and the per-thread argument structs.
	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_l3_thread_decorator().pth: " );
	#endif
	bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_l3_thread_decorator().pth: " );
	#endif
	thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads );

	// NOTE: Iterate from the highest thread id down to 0 so that the chief
	// thread (id 0) has spawned every other thread before it begins its
	// own share of the computation.
	for ( dim_t tid = n_threads - 1; tid >= 0; --tid )
	{
		thread_data_t* d = &datas[tid];

		// Populate this thread's argument struct.
		d->func     = func;
		d->family   = family;
		d->schema_a = schema_a;
		d->schema_b = schema_b;
		d->alpha    = alpha;
		d->a        = a;
		d->b        = b;
		d->beta     = beta;
		d->c        = c;
		d->cntx     = cntx;
		d->rntm     = rntm;
		d->cntl     = cntl;
		d->tid      = tid;
		d->gl_comm  = gl_comm;
		d->array    = array;

		if ( tid == 0 )
		{
			// Thread 0 is the calling thread: run the entry point inline.
			bli_l3_thread_entry( d );
		}
		else
		{
			// Every non-zero thread id gets its own pthread.
			bli_pthread_create( &pthreads[tid], NULL, bli_l3_thread_entry, d );
		}
	}

	// The global communicator is not freed here; that was already done by
	// the communicator's chief thread in bli_l3_thrinfo_free() (called
	// from the thread entry function).

	// Thread 0 waits for the spawned threads to finish.
	for ( dim_t tid = 1; tid < n_threads; ++tid )
	{
		bli_pthread_join( pthreads[tid], NULL );
	}

	// Return the array_t to the sba. As with the check-out, this uses the
	// lock embedded within the sba to ensure mutual exclusion.
	bli_sba_checkin_array( array );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_l3_thread_decorator().pth: " );
	#endif
	bli_free_intl( pthreads );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_l3_thread_decorator().pth: " );
	#endif
	bli_free_intl( datas );
}
#endif

View File

@@ -84,119 +84,5 @@ void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm )
return;
}
// Dummy definition of bli_l3_thread_entry(). The real entry point is only
// needed by the pthreads implementation; providing a stub here avoids an
// unresolved symbol when building Windows DLLs with OpenMP enabled or with
// multithreading disabled. The argument is ignored and NULL is returned.
void* bli_l3_thread_entry( void* data_void )
{
	( void )data_void;

	return NULL;
}
// Single-threaded implementation of the level-3 thread decorator.
//
// Sets up the sba pool, packing block allocator, global communicator,
// control tree, and root thrinfo_t for one thread (id 0), invokes func on
// the calling thread, and then tears the control tree and thrinfo_t down.
//
// Side effects visible to the caller: the pack schema fields of a and b are
// read and then reset to BLIS_NOT_PACKED, and the sba pool and membrk
// fields of rntm are set. The caller's rntm and operand objects are used
// directly (no local copies or aliases are made here).
void bli_l3_thread_decorator
     (
       l3int_t func,
       opid_t  family,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     )
{
	// Part of a hack to support mixed domain in bli_gemm_front():
	// non-standard schemas for A and B are transmitted via the schema
	// field of the obj_t's instead of as function parameters. Read them,
	// then immediately restore the values expected of unpacked objects.
	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
	bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );

	// Sequential execution: exactly one thread, whose id is 0.
	const dim_t n_threads = 1;
	const dim_t tid       = 0;

	// Check out an array_t from the small block allocator (sba), which was
	// initialized in bli_init(). The check-out is performed under a lock
	// internal to the sba so that only one application thread accesses it
	// at a time, and the array_t is resized automatically if necessary.
	array_t* restrict array = bli_sba_checkout_array( n_threads );

	// Embed thread 0's pool_t* into the rntm up-front so that the global
	// communicator can be created below. (Since there is only one thread,
	// this does not need to be repeated per thread.)
	bli_sba_rntm_set_pool( 0, array, rntm );

	// Set the packing block allocator field of the rntm.
	bli_membrk_rntm_set_membrk( rntm );

	// Allocate a global communicator for the root thrinfo_t structure.
	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );

	// NOTE: No additional copy of the rntm_t is made here, since it was
	// already copied in one of the high-level oapi functions. Likewise, A,
	// B, and C are not aliased here since they were already aliased in
	// bli_*_front().

	// Create a default control tree for the operation, if needed.
	cntl_t* cntl_use;

	bli_l3_cntl_create_if( family, schema_a, schema_b,
	                       a, b, c, rntm, cntl, &cntl_use );

	// Create the root node of the thread's thrinfo_t structure.
	thrinfo_t* thread;

	bli_l3_thrinfo_create_root( tid, gl_comm, rntm, cntl_use, &thread );

	// Execute the operation.
	func( alpha, a, b, beta, c, cntx, rntm, cntl_use, thread );

	// Release the control tree, then the thrinfo_t structure. The global
	// communicator is not freed separately; that is done by its chief
	// thread inside bli_l3_thrinfo_free().
	bli_l3_cntl_free( rntm, cntl_use, thread );
	bli_l3_thrinfo_free( rntm, thread );

	// Return the array_t to the sba. As with the check-out, this uses the
	// lock embedded within the sba to ensure mutual exclusion.
	bli_sba_checkin_array( array );
}
#endif

View File

@@ -39,8 +39,12 @@ thrinfo_t BLIS_PACKM_SINGLE_THREADED = {};
thrinfo_t BLIS_GEMM_SINGLE_THREADED = {};
thrcomm_t BLIS_SINGLE_COMM = {};
// The global rntm_t structure, which holds the global thread settings.
static rntm_t global_rntm;
// The global rntm_t structure. (The definition resides in bli_rntm.c.)
extern rntm_t global_rntm;
// A mutex to allow synchronous access to global_rntm. (The definition
// resides in bli_rntm.c.)
extern bli_pthread_mutex_t global_rntm_mutex;
// -----------------------------------------------------------------------------
@@ -1198,63 +1202,6 @@ dim_t bli_ipow( dim_t base, dim_t power )
return p;
}
// -----------------------------------------------------------------------------
// Reads the environment variable named by env and interprets its value as
// a base-10 integer.
//
// Parameters:
//   env      - name of the environment variable to query.
//   fallback - value to return when the variable cannot be used.
//
// Returns the parsed integer when the variable is set and its value begins
// with a valid base-10 integer (leading whitespace and an optional sign are
// accepted, and any characters following the digits are ignored, as per
// strtol()). Returns fallback when the variable is unset, or when its value
// is empty or does not begin with an integer.
dim_t bli_thread_get_env( const char* env, dim_t fallback )
{
	// Query the environment variable.
	const char* str = getenv( env );

	// The variable is not set: use the fallback.
	if ( str == NULL ) return fallback;

	// Convert the string to an integer, keeping track of where parsing
	// stopped so that a failed conversion can be detected.
	char* end = NULL;
	long  val = strtol( str, &end, 10 );

	// strtol() leaves end equal to str when no digits were consumed (for
	// example, for "" or "abc"). Without this check such values would be
	// silently treated as 0 rather than as "not specified".
	if ( end == str ) return fallback;

	return ( dim_t )val;
}
#if 0
// NOTE: This function is disabled (compiled out).
// Sets the environment variable named by env to the decimal string form of
// value, overwriting any existing value. If bli_setenv() reports failure
// (-1), the error string for errno is printed via bli_print_msg().
void bli_thread_set_env( const char* env, dim_t value )
{
dim_t r_val;
char value_str[32];
// NOTE(review): both format specifiers are for unsigned types, while value
// is a dim_t -- confirm that these match dim_t's signedness and width
// before re-enabling this code.
const char* fs_32 = "%u";
const char* fs_64 = "%lu";
// Convert the integer to a string, but vary the format specifier
// depending on the integer type size.
if ( bli_info_get_int_type_size() == 32 ) sprintf( value_str, fs_32, value );
else sprintf( value_str, fs_64, value );
// Set the environment variable using the string we just wrote to via
// sprintf(). (The 'TRUE' argument means we want to overwrite the current
// value if the environment variable already exists.)
r_val = bli_setenv( env, value_str, TRUE );
// Check the return value in case something went horribly wrong.
if ( r_val == -1 )
{
char err_str[128];
// Query the human-readable error string corresponding to errno.
// NOTE(review): strerror_r() has two incompatible variants (XSI and
// GNU), and the GNU one may not write to err_str at all -- confirm which
// variant is in effect before re-enabling this code.
strerror_r( errno, err_str, 128 );
// Print the error message.
bli_print_msg( err_str, __FILE__, __LINE__ );
}
}
#endif
// -----------------------------------------------------------------------------
@@ -1308,9 +1255,6 @@ dim_t bli_thread_get_num_threads( void )
// ----------------------------------------------------------------------------
// A mutex to allow synchronous access to global_rntm.
static bli_pthread_mutex_t global_rntm_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;
void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir )
{
// We must ensure that global_rntm has been initialized.
@@ -1341,22 +1285,6 @@ void bli_thread_set_num_threads( dim_t n_threads )
// ----------------------------------------------------------------------------
// Initialize the caller's rntm_t by copying the current contents of
// global_rntm into *rntm. The copy is performed while holding
// global_rntm_mutex.
void bli_thread_init_rntm( rntm_t* rntm )
{
// We must ensure that global_rntm has been initialized.
bli_init_once();
// Acquire the mutex protecting global_rntm.
bli_pthread_mutex_lock( &global_rntm_mutex );
// Copy the whole structure by value into the caller's rntm_t.
*rntm = global_rntm;
// Release the mutex protecting global_rntm.
bli_pthread_mutex_unlock( &global_rntm_mutex );
}
// ----------------------------------------------------------------------------
void bli_thread_init_rntm_from_env
(
rntm_t* rntm
@@ -1373,19 +1301,19 @@ void bli_thread_init_rntm_from_env
#ifdef BLIS_ENABLE_MULTITHREADING
// Try to read BLIS_NUM_THREADS first.
nt = bli_thread_get_env( "BLIS_NUM_THREADS", -1 );
nt = bli_env_get_var( "BLIS_NUM_THREADS", -1 );
// If BLIS_NUM_THREADS was not set, try to read OMP_NUM_THREADS.
if ( nt == -1 )
nt = bli_thread_get_env( "OMP_NUM_THREADS", -1 );
nt = bli_env_get_var( "OMP_NUM_THREADS", -1 );
// Read the environment variables for the number of threads (ways
// of parallelism) for each individual loop.
jc = bli_thread_get_env( "BLIS_JC_NT", -1 );
pc = bli_thread_get_env( "BLIS_PC_NT", -1 );
ic = bli_thread_get_env( "BLIS_IC_NT", -1 );
jr = bli_thread_get_env( "BLIS_JR_NT", -1 );
ir = bli_thread_get_env( "BLIS_IR_NT", -1 );
jc = bli_env_get_var( "BLIS_JC_NT", -1 );
pc = bli_env_get_var( "BLIS_PC_NT", -1 );
ic = bli_env_get_var( "BLIS_IC_NT", -1 );
jr = bli_env_get_var( "BLIS_JR_NT", -1 );
ir = bli_env_get_var( "BLIS_IR_NT", -1 );
// If any BLIS_*_NT environment variable was set, then we ignore the
// value of BLIS_NUM_THREADS or OMP_NUM_THREADS and use the

View File

@@ -49,6 +49,14 @@
#include "bli_packm_thrinfo.h"
#include "bli_l3_thrinfo.h"
// Include the level-3 thread decorator and related definitions and prototypes
// for the conventional code path.
#include "bli_l3_decor.h"
// Include the level-3 thread decorator and related definitions and prototypes
// for the sup code path.
#include "bli_l3_sup_decor.h"
// Initialization-related prototypes.
void bli_thread_init( void );
void bli_thread_finalize( void );
@@ -143,37 +151,6 @@ siz_t bli_thread_range_weighted_sub
dim_t* restrict j_end_thr
);
// Level-3 internal function type: a pointer to a function returning void
// that takes the five operand objects (alpha, a, b, beta, c) followed by a
// cntx_t*, a rntm_t*, a cntl_t*, and a thrinfo_t*.
typedef void (*l3int_t)
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
);
// Level-3 thread decorator prototype. Accepts an l3int_t function pointer
// and an operation family id (opid_t), plus the same operand, cntx_t*,
// rntm_t*, and cntl_t* arguments that l3int_t takes (everything except the
// thrinfo_t*).
// NOTE(review): presumably the decorator supplies the thrinfo_t* itself
// when it invokes func -- confirm against the definition.
void bli_l3_thread_decorator
(
l3int_t func,
opid_t family,
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl
);
// -----------------------------------------------------------------------------
// Factorization and partitioning prototypes
@@ -205,9 +182,6 @@ dim_t bli_ipow( dim_t base, dim_t power );
// -----------------------------------------------------------------------------
BLIS_EXPORT_BLIS dim_t bli_thread_get_env( const char* env, dim_t fallback );
//void bli_thread_set_env( const char* env, dim_t value );
BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void );
BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void );
BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void );

View File

@@ -54,6 +54,12 @@
Therefore, this (r)ow-preferential microkernel is well-suited for
a dot-product-based accumulation that performs vector loads from
both A and B.
NOTE: These kernels implicitly support column-oriented IO, implemented
via a high-level transposition of the entire operation. A and B will
effectively remain row- and column-stored, respectively, but C will then
effectively appear column-stored. Thus, this kernel may be used for both
rrc and crc cases.
*/
// Prototype reference microkernels.

View File

@@ -54,6 +54,12 @@
Therefore, this (r)ow-preferential microkernel is well-suited for
a dot-product-based accumulation that performs vector loads from
both A and B.
NOTE: These kernels implicitly support column-oriented IO, implemented
via a high-level transposition of the entire operation. A and B will
effectively remain row- and column-stored, respectively, but C will then
effectively appear column-stored. Thus, this kernel may be used for both
rrc and crc cases.
*/
// Prototype reference microkernels.

View File

@@ -156,12 +156,44 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
beta, cij, rs_c0, cs_c0, data, cntx
);
#else
bli_dgemv_ex
(
BLIS_NO_TRANSPOSE, conjb, m0, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0,
beta, cij, rs_c0, cntx, NULL
);
dim_t ps_a0 = bli_auxinfo_ps_a( data );
if ( ps_a0 == 6 * rs_a0 )
{
// Since A is not packed, we can use one gemv.
bli_dgemv_ex
(
BLIS_NO_TRANSPOSE, conjb, m0, k0,
alpha, ai, rs_a0, cs_a0, bj, rs_b0,
beta, cij, rs_c0, cntx, NULL
);
}
else
{
const dim_t mr = 6;
// Since A is packed into row panels, we must use a loop over
// gemv.
dim_t m_iter = ( m0 + mr - 1 ) / mr;
dim_t m_left = m0 % mr;
double* restrict ai_ii = ai;
double* restrict cij_ii = cij;
for ( dim_t ii = 0; ii < m_iter; ii += 1 )
{
dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left )
? mr : m_left );
bli_dgemv_ex
(
BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0,
beta, cij_ii, rs_c0, cntx, NULL
);
cij_ii += mr*rs_c0; ai_ii += ps_a0;
}
}
#endif
}
return;
@@ -185,6 +217,10 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
// Query the panel stride of A and convert it to units of bytes.
uint64_t ps_a = bli_auxinfo_ps_a( data );
uint64_t ps_a8 = ps_a * sizeof( double );
if ( m_iter == 0 ) goto consider_edge_cases;
// -------------------------------------------------------------------------
@@ -819,8 +855,10 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
lea(mem(r12, rdi, 4), r12) //
lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c
lea(mem(r14, r8, 4), r14) //
lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a
//lea(mem(r14, r8, 4), r14) //
//lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a
mov(var(ps_a8), rax) // load ps_a8
lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a8
dec(r11) // ii -= 1;
jne(.DLOOP6X8I) // iterate again if ii != 0.
@@ -841,6 +879,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
[a] "m" (a),
[rs_a] "m" (rs_a),
[cs_a] "m" (cs_a),
[ps_a8] "m" (ps_a8),
[b] "m" (b),
[rs_b] "m" (rs_b),
[cs_b] "m" (cs_b),
@@ -870,7 +909,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
const dim_t i_edge = m0 - ( dim_t )m_left;
double* restrict cij = c + i_edge*rs_c;
double* restrict ai = a + i_edge*rs_a;
//double* restrict ai = a + i_edge*rs_a;
//double* restrict ai = a + ( i_edge / 6 ) * ps_a;
double* restrict ai = a + m_iter * ps_a;
double* restrict bj = b;
#if 0
@@ -979,6 +1020,10 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
// Query the panel stride of A and convert it to units of bytes.
uint64_t ps_a = bli_auxinfo_ps_a( data );
uint64_t ps_a8 = ps_a * sizeof( double );
if ( m_iter == 0 ) goto consider_edge_cases;
// -------------------------------------------------------------------------
@@ -1591,8 +1636,10 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
lea(mem(r12, rdi, 4), r12) //
lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c
lea(mem(r14, r8, 4), r14) //
lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a
//lea(mem(r14, r8, 4), r14) //
//lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a
mov(var(ps_a8), rax) // load ps_a8
lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a8
dec(r11) // ii -= 1;
jne(.DLOOP6X8I) // iterate again if ii != 0.
@@ -1613,6 +1660,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
[a] "m" (a),
[rs_a] "m" (rs_a),
[cs_a] "m" (cs_a),
[ps_a8] "m" (ps_a8),
[b] "m" (b),
[rs_b] "m" (rs_b),
[cs_b] "m" (cs_b),
@@ -1642,7 +1690,9 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
const dim_t i_edge = m0 - ( dim_t )m_left;
double* restrict cij = c + i_edge*rs_c;
double* restrict ai = a + i_edge*rs_a;
//double* restrict ai = a + i_edge*rs_a;
//double* restrict ai = a + ( i_edge / 6 ) * ps_a;
double* restrict ai = a + m_iter * ps_a;
double* restrict bj = b;
#if 0
@@ -1751,6 +1801,10 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
// Query the panel stride of A and convert it to units of bytes.
uint64_t ps_a = bli_auxinfo_ps_a( data );
uint64_t ps_a8 = ps_a * sizeof( double );
if ( m_iter == 0 ) goto consider_edge_cases;
// -------------------------------------------------------------------------
@@ -2241,8 +2295,10 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
lea(mem(r12, rdi, 4), r12) //
lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c
lea(mem(r14, r8, 4), r14) //
lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a
//lea(mem(r14, r8, 4), r14) //
//lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a
mov(var(ps_a8), rax) // load ps_a8
lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a8
dec(r11) // ii -= 1;
jne(.DLOOP6X4I) // iterate again if ii != 0.
@@ -2263,6 +2319,7 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
[a] "m" (a),
[rs_a] "m" (rs_a),
[cs_a] "m" (cs_a),
[ps_a8] "m" (ps_a8),
[b] "m" (b),
[rs_b] "m" (rs_b),
[cs_b] "m" (cs_b),
@@ -2292,7 +2349,9 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
const dim_t i_edge = m0 - ( dim_t )m_left;
double* restrict cij = c + i_edge*rs_c;
double* restrict ai = a + i_edge*rs_a;
//double* restrict ai = a + i_edge*rs_a;
//double* restrict ai = a + ( i_edge / 6 ) * ps_a;
double* restrict ai = a + m_iter * ps_a;
double* restrict bj = b;
#if 0
@@ -2401,6 +2460,10 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
// Query the panel stride of A and convert it to units of bytes.
uint64_t ps_a = bli_auxinfo_ps_a( data );
uint64_t ps_a8 = ps_a * sizeof( double );
if ( m_iter == 0 ) goto consider_edge_cases;
// -------------------------------------------------------------------------
@@ -2867,8 +2930,10 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
lea(mem(r12, rdi, 4), r12) //
lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c
lea(mem(r14, r8, 4), r14) //
lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a
//lea(mem(r14, r8, 4), r14) //
//lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a
mov(var(ps_a8), rax) // load ps_a8
lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a8
dec(r11) // ii -= 1;
jne(.DLOOP6X2I) // iterate again if ii != 0.
@@ -2889,6 +2954,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
[a] "m" (a),
[rs_a] "m" (rs_a),
[cs_a] "m" (cs_a),
[ps_a8] "m" (ps_a8),
[b] "m" (b),
[rs_b] "m" (rs_b),
[cs_b] "m" (cs_b),
@@ -2918,7 +2984,9 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
const dim_t i_edge = m0 - ( dim_t )m_left;
double* restrict cij = c + i_edge*rs_c;
double* restrict ai = a + i_edge*rs_a;
//double* restrict ai = a + i_edge*rs_a;
//double* restrict ai = a + ( i_edge / 6 ) * ps_a;
double* restrict ai = a + m_iter * ps_a;
double* restrict bj = b;
#if 0

View File

@@ -195,6 +195,10 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
// Query the panel stride of B and convert it to units of bytes.
uint64_t ps_b = bli_auxinfo_ps_b( data );
uint64_t ps_b8 = ps_b * sizeof( double );
if ( n_iter == 0 ) goto consider_edge_cases;
// -------------------------------------------------------------------------
@@ -853,6 +857,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
[b] "m" (b),
[rs_b] "m" (rs_b),
[cs_b] "m" (cs_b),
[ps_b8] "m" (ps_b8),
[alpha] "m" (alpha),
[beta] "m" (beta),
[c] "m" (c),
@@ -880,7 +885,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
double* restrict cij = c + j_edge*cs_c;
double* restrict ai = a;
double* restrict bj = b + j_edge*cs_b;
//double* restrict bj = b + j_edge*cs_b;
//double* restrict bj = b + ( j_edge / 8 ) * ps_b;
double* restrict bj = b + n_iter * ps_b;
if ( 6 <= n_left )
{
@@ -977,6 +984,10 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
// Query the panel stride of B and convert it to units of bytes.
uint64_t ps_b = bli_auxinfo_ps_b( data );
uint64_t ps_b8 = ps_b * sizeof( double );
if ( n_iter == 0 ) goto consider_edge_cases;
// -------------------------------------------------------------------------
@@ -1596,6 +1607,7 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
[b] "m" (b),
[rs_b] "m" (rs_b),
[cs_b] "m" (cs_b),
[ps_b8] "m" (ps_b8),
[alpha] "m" (alpha),
[beta] "m" (beta),
[c] "m" (c),
@@ -1623,7 +1635,9 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
double* restrict cij = c + j_edge*cs_c;
double* restrict ai = a;
double* restrict bj = b + j_edge*cs_b;
//double* restrict bj = b + j_edge*cs_b;
//double* restrict bj = b + ( j_edge / 8 ) * ps_b;
double* restrict bj = b + n_iter * ps_b;
if ( 6 <= n_left )
{
@@ -1720,6 +1734,10 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
// Query the panel stride of B and convert it to units of bytes.
uint64_t ps_b = bli_auxinfo_ps_b( data );
uint64_t ps_b8 = ps_b * sizeof( double );
if ( n_iter == 0 ) goto consider_edge_cases;
// -------------------------------------------------------------------------
@@ -2248,6 +2266,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
[b] "m" (b),
[rs_b] "m" (rs_b),
[cs_b] "m" (cs_b),
[ps_b8] "m" (ps_b8),
[alpha] "m" (alpha),
[beta] "m" (beta),
[c] "m" (c),
@@ -2275,7 +2294,9 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
double* restrict cij = c + j_edge*cs_c;
double* restrict ai = a;
double* restrict bj = b + j_edge*cs_b;
//double* restrict bj = b + j_edge*cs_b;
//double* restrict bj = b + ( j_edge / 8 ) * ps_b;
double* restrict bj = b + n_iter * ps_b;
if ( 6 <= n_left )
{
@@ -2363,6 +2384,10 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
// Query the panel stride of B and convert it to units of bytes.
uint64_t ps_b = bli_auxinfo_ps_b( data );
uint64_t ps_b8 = ps_b * sizeof( double );
if ( n_iter == 0 ) goto consider_edge_cases;
// -------------------------------------------------------------------------
@@ -2921,6 +2946,7 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
[b] "m" (b),
[rs_b] "m" (rs_b),
[cs_b] "m" (cs_b),
[ps_b8] "m" (ps_b8),
[alpha] "m" (alpha),
[beta] "m" (beta),
[c] "m" (c),
@@ -2948,7 +2974,9 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
double* restrict cij = c + j_edge*cs_c;
double* restrict ai = a;
double* restrict bj = b + j_edge*cs_b;
//double* restrict bj = b + j_edge*cs_b;
//double* restrict bj = b + ( j_edge / 8 ) * ps_b;
double* restrict bj = b + n_iter * ps_b;
if ( 6 <= n_left )
{
@@ -3036,6 +3064,10 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
// Query the panel stride of B and convert it to units of bytes.
uint64_t ps_b = bli_auxinfo_ps_b( data );
uint64_t ps_b8 = ps_b * sizeof( double );
if ( n_iter == 0 ) goto consider_edge_cases;
// -------------------------------------------------------------------------
@@ -3475,6 +3507,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
[b] "m" (b),
[rs_b] "m" (rs_b),
[cs_b] "m" (cs_b),
[ps_b8] "m" (ps_b8),
[alpha] "m" (alpha),
[beta] "m" (beta),
[c] "m" (c),
@@ -3502,7 +3535,9 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
double* restrict cij = c + j_edge*cs_c;
double* restrict ai = a;
double* restrict bj = b + j_edge*cs_b;
//double* restrict bj = b + j_edge*cs_b;
//double* restrict bj = b + ( j_edge / 8 ) * ps_b;
double* restrict bj = b + n_iter * ps_b;
if ( 6 <= n_left )
{
@@ -3590,6 +3625,10 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;
// Query the panel stride of B and convert it to units of bytes.
uint64_t ps_b = bli_auxinfo_ps_b( data );
uint64_t ps_b8 = ps_b * sizeof( double );
if ( n_iter == 0 ) goto consider_edge_cases;
// -------------------------------------------------------------------------
@@ -3993,6 +4032,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
[b] "m" (b),
[rs_b] "m" (rs_b),
[cs_b] "m" (cs_b),
[ps_b8] "m" (ps_b8),
[alpha] "m" (alpha),
[beta] "m" (beta),
[c] "m" (c),
@@ -4020,7 +4060,9 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
double* restrict cij = c + j_edge*cs_c;
double* restrict ai = a;
double* restrict bj = b + j_edge*cs_b;
//double* restrict bj = b + j_edge*cs_b;
//double* restrict bj = b + ( j_edge / 8 ) * ps_b;
double* restrict bj = b + n_iter * ps_b;
if ( 6 <= n_left )
{

View File

@@ -829,12 +829,12 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
rntm_t gemm, herk, trmm_l, trmm_r, trsm_l, trsm_r;
dim_t m = 1000, n = 1000, k = 1000;
bli_thread_init_rntm( &gemm );
bli_thread_init_rntm( &herk );
bli_thread_init_rntm( &trmm_l );
bli_thread_init_rntm( &trmm_r );
bli_thread_init_rntm( &trsm_l );
bli_thread_init_rntm( &trsm_r );
bli_rntm_init_from_global( &gemm );
bli_rntm_init_from_global( &herk );
bli_rntm_init_from_global( &trmm_l );
bli_rntm_init_from_global( &trmm_r );
bli_rntm_init_from_global( &trsm_l );
bli_rntm_init_from_global( &trsm_r );
bli_rntm_set_ways_for_op( BLIS_GEMM, BLIS_LEFT, m, n, k, &gemm );
bli_rntm_set_ways_for_op( BLIS_HERK, BLIS_LEFT, m, n, k, &herk );