mirror of
https://github.com/amd/blis.git
synced 2026-04-20 07:38:53 +00:00
"Merge Selective Packing code from amd branch flame/blis"
Change-Id: Ifbdf49735f56a66fbbc96dab6d3ca6069302daed
This commit is contained in:
committed by
dzambare
parent
307ddc3110
commit
6b5c68b9ed
@@ -176,6 +176,16 @@ void bli_cntx_init_haswell( cntx_t* cntx )
|
||||
cntx
|
||||
);
|
||||
|
||||
#if 0
|
||||
// Initialize the context with the sup handlers.
|
||||
bli_cntx_set_l3_sup_handlers
|
||||
(
|
||||
1,
|
||||
BLIS_GEMM, bli_gemmsup_ref,
|
||||
cntx
|
||||
);
|
||||
#endif
|
||||
|
||||
// Update the context with optimized small/unpacked gemm kernels.
|
||||
bli_cntx_set_l3_sup_kers
|
||||
(
|
||||
|
||||
@@ -186,6 +186,14 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
cntx
|
||||
);
|
||||
|
||||
// Initialize the context with the sup handlers.
|
||||
bli_cntx_set_l3_sup_handlers
|
||||
(
|
||||
1,
|
||||
BLIS_GEMM, bli_gemmsup_ref,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Update the context with optimized small/unpacked gemm kernels.
|
||||
bli_cntx_set_l3_sup_kers
|
||||
(
|
||||
|
||||
@@ -73,7 +73,11 @@
|
||||
|
||||
// Prototype reference implementation of small/unpacked matrix handler.
|
||||
#include "bli_l3_sup_ref.h"
|
||||
#include "bli_l3_sup_int.h"
|
||||
#include "bli_l3_sup_vars.h"
|
||||
#include "bli_l3_sup_packm_a.h"
|
||||
#include "bli_l3_sup_packm_b.h"
|
||||
#include "bli_l3_sup_packm_var.h"
|
||||
|
||||
// Prototype microkernel wrapper APIs.
|
||||
#include "bli_l3_ukr_oapi.h"
|
||||
|
||||
@@ -104,14 +104,6 @@ err_t bli_gemmsup
|
||||
// that function assumes the context pointer is valid.
|
||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
|
||||
#if 0
|
||||
// Initialize a local runtime with global settings if necessary. Note
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_l;
|
||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; }
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; }
|
||||
#endif
|
||||
|
||||
// Return early if a microkernel preference-induced transposition would
|
||||
// have been performed and shifted the dimensions outside of the space
|
||||
// of sup-handled problems.
|
||||
@@ -138,6 +130,12 @@ err_t bli_gemmsup
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize a local runtime with global settings if necessary. Note
|
||||
// that in the case that a runtime is passed in, we make a local copy.
|
||||
rntm_t rntm_l;
|
||||
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; }
|
||||
|
||||
#if 0
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
const dim_t m = bli_obj_length( c );
|
||||
|
||||
173
frame/3/bli_l3_sup_int.c
Normal file
173
frame/3/bli_l3_sup_int.c
Normal file
@@ -0,0 +1,173 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
err_t bli_gemmsup_int
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
#if 0
|
||||
//bli_gemmsup_ref_var2
|
||||
//bli_gemmsup_ref_var1
|
||||
#if 0
|
||||
bli_gemmsup_ref_var1n
|
||||
#else
|
||||
#endif
|
||||
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
|
||||
const bool_t is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
|
||||
stor_id == BLIS_RRC ||
|
||||
stor_id == BLIS_RCR ||
|
||||
stor_id == BLIS_CRR );
|
||||
if ( is_rrr_rrc_rcr_crr )
|
||||
{
|
||||
bli_gemmsup_ref_var2m
|
||||
(
|
||||
BLIS_NO_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
bli_gemmsup_ref_var2m
|
||||
(
|
||||
BLIS_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm
|
||||
);
|
||||
}
|
||||
|
||||
return BLIS_SUCCESS;
|
||||
#endif
|
||||
|
||||
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
|
||||
|
||||
// Don't use the small/unpacked implementation if one of the matrices
|
||||
// uses general stride.
|
||||
if ( stor_id == BLIS_XXX ) return BLIS_FAILURE;
|
||||
|
||||
const bool_t is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
|
||||
stor_id == BLIS_RRC ||
|
||||
stor_id == BLIS_RCR ||
|
||||
stor_id == BLIS_CRR );
|
||||
const bool_t is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;
|
||||
|
||||
const num_t dt = bli_obj_dt( c );
|
||||
const bool_t row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
|
||||
|
||||
const bool_t is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
|
||||
: is_rcc_crc_ccr_ccc );
|
||||
|
||||
if ( is_primary )
|
||||
{
|
||||
// This branch handles:
|
||||
// - rrr rrc rcr crr for row-preferential kernels
|
||||
// - rcc crc ccr ccc for column-preferential kernels
|
||||
|
||||
const dim_t m = bli_obj_length( c );
|
||||
const dim_t n = bli_obj_width( c );
|
||||
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||
const dim_t mu = m / MR;
|
||||
const dim_t nu = n / NR;
|
||||
|
||||
if ( mu >= nu )
|
||||
//if ( m % 2 == 1 && n % 2 == 1 )
|
||||
{
|
||||
#ifdef TRACEVAR
|
||||
printf( "bli_l3_sup_int(): var2m primary\n" );
|
||||
#endif
|
||||
// block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2()
|
||||
bli_gemmsup_ref_var2m( BLIS_NO_TRANSPOSE,
|
||||
alpha, a, b, beta, c,
|
||||
stor_id, cntx, rntm, cntl, thread );
|
||||
}
|
||||
else // if ( mu < nu )
|
||||
{
|
||||
#ifdef TRACEVAR
|
||||
printf( "bli_l3_sup_int(): var1n primary\n" );
|
||||
#endif
|
||||
// panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1()
|
||||
bli_gemmsup_ref_var1n( BLIS_NO_TRANSPOSE,
|
||||
alpha, a, b, beta, c,
|
||||
stor_id, cntx, rntm, cntl, thread );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// This branch handles:
|
||||
// - rrr rrc rcr crr for column-preferential kernels
|
||||
// - rcc crc ccr ccc for row-preferential kernels
|
||||
|
||||
const dim_t mt = bli_obj_width( c );
|
||||
const dim_t nt = bli_obj_length( c );
|
||||
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||
const dim_t mu = mt / MR;
|
||||
const dim_t nu = nt / NR;
|
||||
|
||||
if ( mu >= nu )
|
||||
//if ( mt % 2 == 1 && nt % 2 == 1 )
|
||||
{
|
||||
#ifdef TRACEVAR
|
||||
printf( "bli_l3_sup_int(): var2m non-primary\n" );
|
||||
#endif
|
||||
// panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans
|
||||
bli_gemmsup_ref_var2m( BLIS_TRANSPOSE,
|
||||
alpha, a, b, beta, c,
|
||||
stor_id, cntx, rntm, cntl, thread );
|
||||
}
|
||||
else // if ( mu < nu )
|
||||
{
|
||||
#ifdef TRACEVAR
|
||||
printf( "bli_l3_sup_int(): var1n non-primary\n" );
|
||||
#endif
|
||||
// block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans
|
||||
bli_gemmsup_ref_var1n( BLIS_TRANSPOSE,
|
||||
alpha, a, b, beta, c,
|
||||
stor_id, cntx, rntm, cntl, thread );
|
||||
}
|
||||
// *requires nudging of mc,nc up to be a multiple of nr,mr.
|
||||
}
|
||||
|
||||
// Return success so that the caller knows that we computed the solution.
|
||||
return BLIS_SUCCESS;
|
||||
}
|
||||
|
||||
46
frame/3/bli_l3_sup_int.h
Normal file
46
frame/3/bli_l3_sup_int.h
Normal file
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
err_t bli_gemmsup_int
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
115
frame/3/bli_l3_sup_packm_a.h
Normal file
115
frame/3/bli_l3_sup_packm_a.h
Normal file
@@ -0,0 +1,115 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
bool_t will_pack, \
|
||||
packbuf_t pack_buf_type, \
|
||||
stor3_t stor_id, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
dim_t mr, \
|
||||
cntx_t* restrict cntx, \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
); \
|
||||
|
||||
INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a )
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
bool_t did_pack, \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
); \
|
||||
|
||||
INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a )
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
bool_t will_pack, \
|
||||
stor3_t stor_id, \
|
||||
pack_t* restrict schema, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
dim_t mr, \
|
||||
dim_t* restrict m_max, \
|
||||
dim_t* restrict k_max, \
|
||||
ctype* x, inc_t rs_x, inc_t cs_x, \
|
||||
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||
dim_t* restrict pd_p, inc_t* restrict ps_p, \
|
||||
cntx_t* restrict cntx, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
); \
|
||||
|
||||
INSERT_GENTPROT_BASIC0( packm_sup_init_a )
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
bool_t will_pack, \
|
||||
stor3_t stor_id, \
|
||||
trans_t transc, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
dim_t mr, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||
inc_t* restrict ps_p, \
|
||||
cntx_t* restrict cntx, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
); \
|
||||
|
||||
INSERT_GENTPROT_BASIC0( packm_sup_a )
|
||||
|
||||
115
frame/3/bli_l3_sup_packm_b.h
Normal file
115
frame/3/bli_l3_sup_packm_b.h
Normal file
@@ -0,0 +1,115 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
bool_t will_pack, \
|
||||
packbuf_t pack_buf_type, \
|
||||
stor3_t stor_id, \
|
||||
dim_t k, \
|
||||
dim_t n, \
|
||||
dim_t nr, \
|
||||
cntx_t* restrict cntx, \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
); \
|
||||
|
||||
INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b )
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
bool_t did_pack, \
|
||||
rntm_t* restrict rntm, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
); \
|
||||
|
||||
INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b )
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
bool_t will_pack, \
|
||||
stor3_t stor_id, \
|
||||
pack_t* restrict schema, \
|
||||
dim_t k, \
|
||||
dim_t n, \
|
||||
dim_t nr, \
|
||||
dim_t* restrict k_max, \
|
||||
dim_t* restrict n_max, \
|
||||
ctype* x, inc_t rs_x, inc_t cs_x, \
|
||||
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||
dim_t* restrict pd_p, inc_t* restrict ps_p, \
|
||||
cntx_t* restrict cntx, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
); \
|
||||
|
||||
INSERT_GENTPROT_BASIC0( packm_sup_init_b )
|
||||
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname) \
|
||||
( \
|
||||
bool_t will_pack, \
|
||||
stor3_t stor_id, \
|
||||
trans_t transc, \
|
||||
dim_t k, \
|
||||
dim_t n, \
|
||||
dim_t nr, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict x, inc_t rs_x, inc_t cs_x, \
|
||||
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||
inc_t* restrict ps_p, \
|
||||
cntx_t* restrict cntx, \
|
||||
mem_t* restrict mem, \
|
||||
thrinfo_t* restrict thread \
|
||||
); \
|
||||
|
||||
INSERT_GENTPROT_BASIC0( packm_sup_b )
|
||||
|
||||
329
frame/3/bli_l3_sup_packm_var.c
Normal file
329
frame/3/bli_l3_sup_packm_var.c
Normal file
@@ -0,0 +1,329 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces to the variants.
|
||||
//
|
||||
|
||||
#undef GENTFUNCR
|
||||
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
trans_t transc, \
|
||||
pack_t schema, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
cntx_t* restrict cntx, \
|
||||
thrinfo_t* restrict thread \
|
||||
) \
|
||||
{ \
|
||||
ctype* restrict kappa_cast = kappa; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict p_cast = p; \
|
||||
\
|
||||
dim_t iter_dim; \
|
||||
dim_t n_iter; \
|
||||
dim_t it, ic; \
|
||||
dim_t ic0; \
|
||||
doff_t ic_inc; \
|
||||
dim_t panel_len_full; \
|
||||
dim_t panel_len_i; \
|
||||
dim_t panel_len_max; \
|
||||
dim_t panel_len_max_i; \
|
||||
dim_t panel_dim_i; \
|
||||
dim_t panel_dim_max; \
|
||||
inc_t vs_c; \
|
||||
inc_t ldc; \
|
||||
inc_t ldp, p_inc; \
|
||||
conj_t conjc; \
|
||||
\
|
||||
\
|
||||
/* Extract the conjugation bit from the transposition argument. */ \
|
||||
conjc = bli_extract_conj( transc ); \
|
||||
\
|
||||
/* If c needs a transposition, induce it so that we can more simply
|
||||
express the remaining parameters and code. */ \
|
||||
if ( bli_does_trans( transc ) ) \
|
||||
{ \
|
||||
bli_swap_incs( &rs_c, &cs_c ); \
|
||||
bli_toggle_trans( &transc ); \
|
||||
} \
|
||||
\
|
||||
/* Create flags to incidate row or column storage. Note that the
|
||||
schema bit that encodes row or column is describing the form of
|
||||
micro-panel, not the storage in the micro-panel. Hence the
|
||||
mismatch in "row" and "column" semantics. */ \
|
||||
bool_t row_stored = bli_is_col_packed( schema ); \
|
||||
/*bool_t col_stored = bli_is_row_packed( schema );*/ \
|
||||
\
|
||||
/* If the row storage flag indicates row storage, then we are packing
|
||||
to column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( row_stored ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panels. */ \
|
||||
iter_dim = n; \
|
||||
panel_len_full = m; \
|
||||
panel_len_max = m_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
vs_c = cs_c; \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
} \
|
||||
else /* if ( col_stored ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panels. */ \
|
||||
iter_dim = m; \
|
||||
panel_len_full = n; \
|
||||
panel_len_max = n_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
vs_c = rs_c; \
|
||||
ldc = cs_c; \
|
||||
ldp = cs_p; \
|
||||
} \
|
||||
\
|
||||
/* Compute the total number of iterations we'll need. */ \
|
||||
n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
|
||||
\
|
||||
/* Set the initial values and increments for indices related to C and P
|
||||
based on whether reverse iteration was requested. */ \
|
||||
{ \
|
||||
ic0 = 0; \
|
||||
ic_inc = panel_dim_max; \
|
||||
} \
|
||||
\
|
||||
ctype* restrict p_begin = p_cast; \
|
||||
\
|
||||
/* Query the number of threads and thread ids from the current thread's
|
||||
packm thrinfo_t node. */ \
|
||||
const dim_t nt = bli_thread_n_way( thread ); \
|
||||
const dim_t tid = bli_thread_work_id( thread ); \
|
||||
\
|
||||
/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
|
||||
( void )nt; \
|
||||
( void )tid; \
|
||||
\
|
||||
dim_t it_start, it_end, it_inc; \
|
||||
\
|
||||
/* Determine the thread range and increment using the current thread's
|
||||
packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
|
||||
will depend on whether slab or round-robin partitioning was requested
|
||||
at configure-time. */ \
|
||||
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
|
||||
\
|
||||
/* Iterate over every logical micropanel in the source matrix. */ \
|
||||
for ( ic = ic0, it = 0; it < n_iter; \
|
||||
ic += ic_inc, it += 1 ) \
|
||||
{ \
|
||||
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
|
||||
\
|
||||
ctype* restrict c_begin = c_cast + (ic )*vs_c; \
|
||||
\
|
||||
ctype* restrict c_use = c_begin; \
|
||||
ctype* restrict p_use = p_begin; \
|
||||
\
|
||||
{ \
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
/* The definition of bli_packm_my_iter() will depend on whether slab
|
||||
or round-robin partitioning was requested at configure-time. */ \
|
||||
if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,packm_cxk) \
|
||||
( \
|
||||
conjc, \
|
||||
schema, \
|
||||
panel_dim_i, \
|
||||
panel_dim_max, \
|
||||
panel_len_i, \
|
||||
panel_len_max_i, \
|
||||
kappa_cast, \
|
||||
c_use, vs_c, ldc, \
|
||||
p_use, ldp, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ps_p; \
|
||||
} \
|
||||
\
|
||||
p_begin += p_inc; \
|
||||
\
|
||||
/*
|
||||
if ( row_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: b packed", panel_len_max, panel_dim_max, \
|
||||
p_use, rs_p, cs_p, "%5.2f", "" ); \
|
||||
if ( !row_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: a packed", panel_dim_max, panel_len_max, \
|
||||
p_use, rs_p, cs_p, "%5.2f", "" ); \
|
||||
*/ \
|
||||
} \
|
||||
\
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCR_BASIC( packm, packm_sup_var1 )
|
||||
|
||||
|
||||
|
||||
/*
|
||||
if ( row_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
|
||||
c_cast, rs_c, cs_c, "%4.1f", "" ); \
|
||||
if ( col_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
|
||||
c_cast, rs_c, cs_c, "%4.1f", "" ); \
|
||||
*/
|
||||
/*
|
||||
if ( row_stored ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b packed", *m_panel_max, *n_panel_max, \
|
||||
p_use, rs_p, cs_p, "%5.2f", "" ); \
|
||||
else \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a packed", *m_panel_max, *n_panel_max, \
|
||||
p_use, rs_p, cs_p, "%5.2f", "" ); \
|
||||
*/ \
|
||||
\
|
||||
/*
|
||||
if ( col_stored ) { \
|
||||
if ( bli_thread_work_id( thread ) == 0 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
fflush( stdout ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
|
||||
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
if ( bli_thread_work_id( thread ) == 1 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
fflush( stdout ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
|
||||
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
} \
|
||||
else { \
|
||||
if ( bli_thread_work_id( thread ) == 0 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
fflush( stdout ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
|
||||
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
if ( bli_thread_work_id( thread ) == 1 ) \
|
||||
{ \
|
||||
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||
fflush( stdout ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
|
||||
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
|
||||
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
fflush( stdout ); \
|
||||
} \
|
||||
bli_thread_obarrier( thread ); \
|
||||
} \
|
||||
*/
|
||||
/*
|
||||
if ( bli_is_4mi_packed( schema ) ) { \
|
||||
printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \
|
||||
if ( col_stored ) { \
|
||||
if ( 0 ) \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
if ( row_stored ) { \
|
||||
if ( 0 ) \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
} \
|
||||
*/
|
||||
/*
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
*/
|
||||
/*
|
||||
if ( row_stored ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
|
||||
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
inc_t is_b = rs_p * *m_panel_max; \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/
|
||||
/*
|
||||
if ( col_stored ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
|
||||
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/
|
||||
60
frame/3/bli_l3_sup_packm_var.h
Normal file
60
frame/3/bli_l3_sup_packm_var.h
Normal file
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces to the variants.
|
||||
//
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
trans_t transc, \
|
||||
pack_t schema, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p, \
|
||||
cntx_t* restrict cntx, \
|
||||
thrinfo_t* restrict thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC0( packm_sup_var1 )
|
||||
|
||||
@@ -45,6 +45,11 @@ err_t bli_gemmsup_ref
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
// This function implements the default gemmsup handler. If you are a
|
||||
// BLIS developer and wish to use a different gemmsup handler, please
|
||||
// register a different function pointer in the context in your
|
||||
// sub-configuration's bli_cntx_init_*() function.
|
||||
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_gemm_check( alpha, a, b, beta, c, cntx );
|
||||
@@ -85,6 +90,14 @@ err_t bli_gemmsup_ref
|
||||
//bli_rntm_set_pack_a( 0, rntm );
|
||||
//bli_rntm_set_pack_b( 0, rntm );
|
||||
#endif
|
||||
//bli_rntm_set_pack_a( 0, rntm );
|
||||
//bli_rntm_set_pack_b( 0, rntm );
|
||||
|
||||
// May not need these here since packm_sup infers the schemas based
|
||||
// on the stor3_t id. (This would also mean that they don't need to
|
||||
// be passed into the thread decorator below.)
|
||||
//pack_t schema_a = BLIS_PACKED_ROW_PANELS;
|
||||
//pack_t schema_b = BLIS_PACKED_COL_PANELS;
|
||||
|
||||
return
|
||||
bli_l3_sup_thread_decorator
|
||||
|
||||
@@ -119,6 +119,9 @@ void bli_gemmsup_ref_var1n
|
||||
const bool packa = bli_rntm_pack_a( rntm );
|
||||
const bool packb = bli_rntm_pack_b( rntm );
|
||||
|
||||
const bool_t packa = bli_rntm_pack_a( rntm );
|
||||
const bool_t packb = bli_rntm_pack_b( rntm );
|
||||
|
||||
const conj_t conja = bli_obj_conj_status( a );
|
||||
const conj_t conjb = bli_obj_conj_status( b );
|
||||
|
||||
@@ -186,6 +189,8 @@ void bli_gemmsup_ref_var1n
|
||||
// Invoke the function.
|
||||
f
|
||||
(
|
||||
packa,
|
||||
packb,
|
||||
conja,
|
||||
conjb,
|
||||
m,
|
||||
@@ -207,6 +212,8 @@ void bli_gemmsup_ref_var1n
|
||||
// Invoke the function (transposing the operation).
|
||||
f
|
||||
(
|
||||
packb,
|
||||
packa,
|
||||
conjb, // swap the conj values.
|
||||
conja,
|
||||
n, // swap the m and n dimensions.
|
||||
@@ -249,6 +256,8 @@ void PASTEMAC(ch,varname) \
|
||||
thrinfo_t* restrict thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* If m or n is zero, return immediately. */ \
|
||||
if ( bli_zero_dim2( m, n ) ) return; \
|
||||
\
|
||||
@@ -270,16 +279,16 @@ void PASTEMAC(ch,varname) \
|
||||
} \
|
||||
return; \
|
||||
} \
|
||||
\
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* This transposition of the stor3_t id value is inherent to variant 1.
|
||||
The reason: we assume that variant 2 is the "main" variant. The
|
||||
consequence of this is that we assume that the millikernels that
|
||||
iterate over m are registered to the kernel group associated with
|
||||
the kernel preference. So, regardless of whether the mkernels are
|
||||
row- or column-preferential, millikernels that iterate over n are
|
||||
always placed in the slots for the opposite kernel group. */ \
|
||||
iterate over m are registered to the "primary" kernel group associated
|
||||
with the kernel IO preference; similarly, mkernels that iterate over
|
||||
n are assumed to be registered to the "non-primary" group associated
|
||||
with the ("non-primary") anti-preference. Note that this pattern holds
|
||||
regardless of whether the mkernel set has a row or column preference.)
|
||||
See bli_l3_sup_int.c for a higher-level view of how this choice is made. */ \
|
||||
stor_id = bli_stor3_trans( stor_id ); \
|
||||
\
|
||||
/* Query the context for various blocksizes. */ \
|
||||
@@ -326,7 +335,9 @@ void PASTEMAC(ch,varname) \
|
||||
else KC = (( KC0 / 5 ) / 4 ) * 4; \
|
||||
} \
|
||||
\
|
||||
/* Nudge NC up to a multiple of MR and MC up to a multiple of NR. */ \
|
||||
/* Nudge NC up to a multiple of MR and MC up to a multiple of NR.
|
||||
NOTE: This is unique to variant 1 (ie: not performed in variant 2)
|
||||
because MC % MR == 0 and NC % NR == 0 is already enforced at runtime. */ \
|
||||
const dim_t NC = bli_align_dim_to_mult( NC0, MR ); \
|
||||
const dim_t MC = bli_align_dim_to_mult( MC0, NR ); \
|
||||
\
|
||||
@@ -346,7 +357,11 @@ void PASTEMAC(ch,varname) \
|
||||
const inc_t icstep_b = cs_b; \
|
||||
\
|
||||
const inc_t jrstep_c = rs_c * MR; \
|
||||
\
|
||||
/*
|
||||
const inc_t jrstep_a = rs_a * MR; \
|
||||
( void )jrstep_a; \
|
||||
*/ \
|
||||
\
|
||||
const inc_t irstep_c = cs_c * NR; \
|
||||
const inc_t irstep_b = cs_b * NR; \
|
||||
@@ -435,6 +450,45 @@ void PASTEMAC(ch,varname) \
|
||||
/* Compute number of primary and leftover components of the JC loop. */ \
|
||||
/*const dim_t jc_iter = ( m_local + NC - 1 ) / NC;*/ \
|
||||
const dim_t jc_left = m_local % NC; \
|
||||
\
|
||||
/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
|
||||
needed for the matrix we will be packing (if any), but we do it
|
||||
unconditionally to be safe. An alternative way of initializing the
|
||||
mem_t entries is:
|
||||
|
||||
bli_mem_clear( &mem_a ); \
|
||||
bli_mem_clear( &mem_b ); \
|
||||
*/ \
|
||||
mem_t mem_a = BLIS_MEM_INITIALIZER; \
|
||||
mem_t mem_b = BLIS_MEM_INITIALIZER; \
|
||||
\
|
||||
/* Prepare the packing destination buffer. If packing is not requested for
|
||||
matrix A, this function will reduce to a no-op. */ \
|
||||
PASTEMAC(ch,packm_sup_init_mem_a) \
|
||||
( \
|
||||
packa, \
|
||||
BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix A to a "panel of B". */ \
|
||||
stor_id, \
|
||||
NC, KC, MR, /* Note this "panel of B" is NC x KC. */ \
|
||||
cntx, \
|
||||
rntm, \
|
||||
&mem_a, \
|
||||
thread \
|
||||
); \
|
||||
\
|
||||
/* Prepare the packing destination buffer. If packing is not requested for
|
||||
matrix B, this function will reduce to a no-op. */ \
|
||||
PASTEMAC(ch,packm_sup_init_mem_b) \
|
||||
( \
|
||||
packb, \
|
||||
BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix B to a "block of A". */ \
|
||||
stor_id, \
|
||||
KC, MC, NR, /* Note this "block of A" is KC x MC. */ \
|
||||
cntx, \
|
||||
rntm, \
|
||||
&mem_b, \
|
||||
thread \
|
||||
); \
|
||||
\
|
||||
/* Loop over the m dimension (NC rows/columns at a time). */ \
|
||||
/*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \
|
||||
@@ -537,6 +591,39 @@ void PASTEMAC(ch,varname) \
|
||||
/* Compute number of primary and leftover components of the IC loop. */ \
|
||||
/*const dim_t ic_iter = ( n_local + MC - 1 ) / MC;*/ \
|
||||
const dim_t ic_left = n_local % MC; \
|
||||
\
|
||||
ctype* a_use; \
|
||||
inc_t rs_a_use, cs_a_use, ps_a_use; \
|
||||
\
|
||||
/* Determine the packing buffer and related parameters for matrix
|
||||
A. (If A will not be packed, then a_use will be set to point to
|
||||
a and the _a_use strides will be set accordingly.) Then call
|
||||
the packm sup variant chooser, which will call the appropriate
|
||||
implementation based on the schema deduced from the stor_id. */ \
|
||||
PASTEMAC(ch,packm_sup_a) \
|
||||
( \
|
||||
packa, \
|
||||
stor_id, \
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
nc_cur, kc_cur, MR, \
|
||||
one, \
|
||||
a_pc, rs_a, cs_a, \
|
||||
&a_use, &rs_a_use, &cs_a_use, \
|
||||
&ps_a_use, \
|
||||
cntx, \
|
||||
&mem_a, \
|
||||
thread \
|
||||
); \
|
||||
\
|
||||
/* Alias a_use so that it's clear this is our current block of
|
||||
matrix A. */ \
|
||||
ctype* restrict a_pc_use = a_use; \
|
||||
\
|
||||
/* We don't need to embed the panel stride of A within the auxinfo_t
|
||||
object because this variant iterates through A in the jr loop,
|
||||
which occurs here, within the macrokernel, not within the
|
||||
millikernel. */ \
|
||||
/*bli_auxinfo_set_ps_a( ps_a_use, &aux );*/ \
|
||||
\
|
||||
/* Loop over the n dimension (MC rows at a time). */ \
|
||||
/*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \
|
||||
@@ -622,6 +709,41 @@ void PASTEMAC(ch,varname) \
|
||||
/* Compute the JR loop thread range for the current thread. */ \
|
||||
dim_t jr_start, jr_end; \
|
||||
bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
|
||||
\
|
||||
ctype* b_use; \
|
||||
inc_t rs_b_use, cs_b_use, ps_b_use; \
|
||||
\
|
||||
/* Determine the packing buffer and related parameters for matrix
|
||||
B. (If B will not be packed, then b_use will be set to point to
|
||||
b and the _b_use strides will be set accordingly.) Then call
|
||||
the packm sup variant chooser, which will call the appropriate
|
||||
implementation based on the schema deduced from the stor_id.
|
||||
NOTE: packing matrix B in this panel-block algorithm corresponds
|
||||
to packing matrix A in the block-panel algorithm. */ \
|
||||
PASTEMAC(ch,packm_sup_b) \
|
||||
( \
|
||||
packb, \
|
||||
stor_id, \
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
kc_cur, mc_cur, NR, \
|
||||
one, \
|
||||
b_ic, rs_b, cs_b, \
|
||||
&b_use, &rs_b_use, &cs_b_use, \
|
||||
&ps_b_use, \
|
||||
cntx, \
|
||||
&mem_b, \
|
||||
thread \
|
||||
); \
|
||||
\
|
||||
/* Alias b_use so that it's clear this is our current block of
|
||||
matrix B. */ \
|
||||
ctype* restrict b_ic_use = b_use; \
|
||||
\
|
||||
/* Embed the panel stride of B within the auxinfo_t object. The
|
||||
millikernel will query and use this to iterate through
|
||||
micropanels of B. */ \
|
||||
bli_auxinfo_set_ps_b( ps_b_use, &aux ); \
|
||||
\
|
||||
\
|
||||
/* Loop over the m dimension (NR columns at a time). */ \
|
||||
/*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \
|
||||
@@ -651,10 +773,10 @@ void PASTEMAC(ch,varname) \
|
||||
mc_cur, /* Recall: mc_cur partitions the n dimension! */ \
|
||||
kc_cur, \
|
||||
alpha_cast, \
|
||||
a_jr, rs_a, cs_a, \
|
||||
b_ic, rs_b, cs_b, \
|
||||
a_jr, rs_a_use, cs_a_use, \
|
||||
b_ic_use, rs_b_use, cs_b_use, \
|
||||
beta_use, \
|
||||
c_jr, rs_c, cs_c, \
|
||||
c_jr, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
@@ -757,6 +879,9 @@ void bli_gemmsup_ref_var2m
|
||||
const bool packa = bli_rntm_pack_a( rntm );
|
||||
const bool packb = bli_rntm_pack_b( rntm );
|
||||
|
||||
const bool_t packa = bli_rntm_pack_a( rntm );
|
||||
const bool_t packb = bli_rntm_pack_b( rntm );
|
||||
|
||||
const conj_t conja = bli_obj_conj_status( a );
|
||||
const conj_t conjb = bli_obj_conj_status( b );
|
||||
|
||||
@@ -824,6 +949,8 @@ void bli_gemmsup_ref_var2m
|
||||
// Invoke the function.
|
||||
f
|
||||
(
|
||||
packa,
|
||||
packb,
|
||||
conja,
|
||||
conjb,
|
||||
m,
|
||||
@@ -845,6 +972,8 @@ void bli_gemmsup_ref_var2m
|
||||
// Invoke the function (transposing the operation).
|
||||
f
|
||||
(
|
||||
packb, // swap the pack values.
|
||||
packa,
|
||||
conjb, // swap the conj values.
|
||||
conja,
|
||||
n, // swap the m and n dimensions.
|
||||
@@ -887,6 +1016,8 @@ void PASTEMAC(ch,varname) \
|
||||
thrinfo_t* restrict thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* If m or n is zero, return immediately. */ \
|
||||
if ( bli_zero_dim2( m, n ) ) return; \
|
||||
\
|
||||
@@ -908,8 +1039,6 @@ void PASTEMAC(ch,varname) \
|
||||
} \
|
||||
return; \
|
||||
} \
|
||||
\
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Query the context for various blocksizes. */ \
|
||||
const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||
@@ -972,6 +1101,8 @@ void PASTEMAC(ch,varname) \
|
||||
const inc_t icstep_a = rs_a; \
|
||||
\
|
||||
const inc_t jrstep_c = cs_c * NR; \
|
||||
\
|
||||
/*
|
||||
const inc_t jrstep_b = cs_b * NR; \
|
||||
( void )jrstep_b; \
|
||||
\
|
||||
@@ -1051,6 +1182,45 @@ void PASTEMAC(ch,varname) \
|
||||
/* Compute number of primary and leftover components of the JC loop. */ \
|
||||
/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
|
||||
const dim_t jc_left = n_local % NC; \
|
||||
\
|
||||
/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
|
||||
needed for the matrix we will be packing (if any), but we do it
|
||||
unconditionally to be safe. An alternative way of initializing the
|
||||
mem_t entries is:
|
||||
|
||||
bli_mem_clear( &mem_a ); \
|
||||
bli_mem_clear( &mem_b ); \
|
||||
*/ \
|
||||
mem_t mem_a = BLIS_MEM_INITIALIZER; \
|
||||
mem_t mem_b = BLIS_MEM_INITIALIZER; \
|
||||
\
|
||||
/* Prepare the packing destination buffer. If packing is not requested for
|
||||
matrix A, this function will reduce to a no-op. */ \
|
||||
PASTEMAC(ch,packm_sup_init_mem_a) \
|
||||
( \
|
||||
packa, \
|
||||
BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to a "block of A". */ \
|
||||
stor_id, \
|
||||
MC, KC, MR, /* Note this "block of A" is MC x KC. */ \
|
||||
cntx, \
|
||||
rntm, \
|
||||
&mem_a, \
|
||||
thread \
|
||||
); \
|
||||
\
|
||||
/* Prepare the packing destination buffer. If packing is not requested for
|
||||
matrix B, this function will reduce to a no-op. */ \
|
||||
PASTEMAC(ch,packm_sup_init_mem_b) \
|
||||
( \
|
||||
packb, \
|
||||
BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix B to a "panel of B". */ \
|
||||
stor_id, \
|
||||
KC, NC, NR, /* Note this "panel of B" is KC x NC. */ \
|
||||
cntx, \
|
||||
rntm, \
|
||||
&mem_b, \
|
||||
thread \
|
||||
); \
|
||||
\
|
||||
/* Loop over the n dimension (NC rows/columns at a time). */ \
|
||||
/*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \
|
||||
@@ -1151,6 +1321,39 @@ void PASTEMAC(ch,varname) \
|
||||
/* Compute number of primary and leftover components of the IC loop. */ \
|
||||
/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
|
||||
const dim_t ic_left = m_local % MC; \
|
||||
\
|
||||
ctype* b_use; \
|
||||
inc_t rs_b_use, cs_b_use, ps_b_use; \
|
||||
\
|
||||
/* Determine the packing buffer and related parameters for matrix
|
||||
B. (If B will not be packed, then b_use will be set to point to
|
||||
b and the _b_use strides will be set accordingly.) Then call
|
||||
the packm sup variant chooser, which will call the appropriate
|
||||
implementation based on the schema deduced from the stor_id. */ \
|
||||
PASTEMAC(ch,packm_sup_b) \
|
||||
( \
|
||||
packb, \
|
||||
stor_id, \
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
kc_cur, nc_cur, NR, \
|
||||
one, \
|
||||
b_pc, rs_b, cs_b, \
|
||||
&b_use, &rs_b_use, &cs_b_use, \
|
||||
&ps_b_use, \
|
||||
cntx, \
|
||||
&mem_b, \
|
||||
thread \
|
||||
); \
|
||||
\
|
||||
/* Alias b_use so that it's clear this is our current block of
|
||||
matrix B. */ \
|
||||
ctype* restrict b_pc_use = b_use; \
|
||||
\
|
||||
/* We don't need to embed the panel stride of B within the auxinfo_t
|
||||
object because this variant iterates through B in the jr loop,
|
||||
which occurs here, within the macrokernel, not within the
|
||||
millikernel. */ \
|
||||
/*bli_auxinfo_set_ps_b( ps_b_use, &aux );*/ \
|
||||
\
|
||||
/* Loop over the m dimension (MC rows at a time). */ \
|
||||
/*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \
|
||||
@@ -1234,6 +1437,38 @@ void PASTEMAC(ch,varname) \
|
||||
/* Compute the JR loop thread range for the current thread. */ \
|
||||
dim_t jr_start, jr_end; \
|
||||
bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
|
||||
\
|
||||
ctype* a_use; \
|
||||
inc_t rs_a_use, cs_a_use, ps_a_use; \
|
||||
\
|
||||
/* Determine the packing buffer and related parameters for matrix
|
||||
A. (If A will not be packed, then a_use will be set to point to
|
||||
a and the _a_use strides will be set accordingly.) Then call
|
||||
the packm sup variant chooser, which will call the appropriate
|
||||
implementation based on the schema deduced from the stor_id. */ \
|
||||
PASTEMAC(ch,packm_sup_a) \
|
||||
( \
|
||||
packa, \
|
||||
stor_id, \
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
mc_cur, kc_cur, MR, \
|
||||
one, \
|
||||
a_ic, rs_a, cs_a, \
|
||||
&a_use, &rs_a_use, &cs_a_use, \
|
||||
&ps_a_use, \
|
||||
cntx, \
|
||||
&mem_a, \
|
||||
thread \
|
||||
); \
|
||||
\
|
||||
/* Alias a_use so that it's clear this is our current block of
|
||||
matrix A. */ \
|
||||
ctype* restrict a_ic_use = a_use; \
|
||||
\
|
||||
/* Embed the panel stride of A within the auxinfo_t object. The
|
||||
millikernel will query and use this to iterate through
|
||||
micropanels of A (if needed). */ \
|
||||
bli_auxinfo_set_ps_a( ps_a_use, &aux ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
/*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \
|
||||
@@ -1263,10 +1498,10 @@ void PASTEMAC(ch,varname) \
|
||||
nr_cur, \
|
||||
kc_cur, \
|
||||
alpha_cast, \
|
||||
a_ic, rs_a, cs_a, \
|
||||
b_jr, rs_b, cs_b, \
|
||||
a_ic_use, rs_a_use, cs_a_use, \
|
||||
b_jr, rs_b_use, cs_b_use, \
|
||||
beta_use, \
|
||||
c_jr, rs_c, cs_c, \
|
||||
c_jr, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
|
||||
821
frame/3/old/bli_l3_sup_var1n2m.c
Normal file
821
frame/3/old/bli_l3_sup_var1n2m.c
Normal file
@@ -0,0 +1,821 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T gemmsup_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
conj_t conja,
|
||||
conj_t conjb,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* restrict alpha,
|
||||
void* restrict a, inc_t rs_a, inc_t cs_a,
|
||||
void* restrict b, inc_t rs_b, inc_t cs_b,
|
||||
void* restrict beta,
|
||||
void* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
stor3_t eff_id,
|
||||
cntx_t* restrict cntx,
|
||||
rntm_t* restrict rntm,
|
||||
cntl_t* restrict cntl,
|
||||
thrinfo_t* restrict thread
|
||||
);
|
||||
|
||||
//
|
||||
// -- var1n --------------------------------------------------------------------
|
||||
//
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes_var1n,gemmsup_ref_var1n);
|
||||
|
||||
void bli_gemmsup_ref_var1n
|
||||
(
|
||||
trans_t trans,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
stor3_t eff_id,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
#if 0
|
||||
obj_t at, bt;
|
||||
|
||||
bli_obj_alias_to( a, &at );
|
||||
bli_obj_alias_to( b, &bt );
|
||||
|
||||
// Induce transpositions on A and/or B if either object is marked for
|
||||
// transposition. We can induce "fast" transpositions since the objects
|
||||
// are guaranteed to not have structure or be packed.
|
||||
if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); }
|
||||
if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); }
|
||||
|
||||
const num_t dt_exec = bli_obj_dt( c );
|
||||
|
||||
const conj_t conja = bli_obj_conj_status( a );
|
||||
const conj_t conjb = bli_obj_conj_status( b );
|
||||
|
||||
const dim_t m = bli_obj_length( c );
|
||||
const dim_t n = bli_obj_width( c );
|
||||
|
||||
const dim_t k = bli_obj_width( &at );
|
||||
|
||||
void* restrict buf_a = bli_obj_buffer_at_off( &at );
|
||||
const inc_t rs_a = bli_obj_row_stride( &at );
|
||||
const inc_t cs_a = bli_obj_col_stride( &at );
|
||||
|
||||
void* restrict buf_b = bli_obj_buffer_at_off( &bt );
|
||||
const inc_t rs_b = bli_obj_row_stride( &bt );
|
||||
const inc_t cs_b = bli_obj_col_stride( &bt );
|
||||
|
||||
void* restrict buf_c = bli_obj_buffer_at_off( c );
|
||||
const inc_t rs_c = bli_obj_row_stride( c );
|
||||
const inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
|
||||
void* restrict buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta );
|
||||
|
||||
#else
|
||||
|
||||
const num_t dt_exec = bli_obj_dt( c );
|
||||
|
||||
const conj_t conja = bli_obj_conj_status( a );
|
||||
const conj_t conjb = bli_obj_conj_status( b );
|
||||
|
||||
const dim_t m = bli_obj_length( c );
|
||||
const dim_t n = bli_obj_width( c );
|
||||
dim_t k;
|
||||
|
||||
void* restrict buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t rs_a;
|
||||
inc_t cs_a;
|
||||
|
||||
void* restrict buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b;
|
||||
inc_t cs_b;
|
||||
|
||||
if ( bli_obj_has_notrans( a ) )
|
||||
{
|
||||
k = bli_obj_width( a );
|
||||
|
||||
rs_a = bli_obj_row_stride( a );
|
||||
cs_a = bli_obj_col_stride( a );
|
||||
}
|
||||
else // if ( bli_obj_has_trans( a ) )
|
||||
{
|
||||
// Assign the variables with an implicit transposition.
|
||||
k = bli_obj_length( a );
|
||||
|
||||
rs_a = bli_obj_col_stride( a );
|
||||
cs_a = bli_obj_row_stride( a );
|
||||
}
|
||||
|
||||
if ( bli_obj_has_notrans( b ) )
|
||||
{
|
||||
rs_b = bli_obj_row_stride( b );
|
||||
cs_b = bli_obj_col_stride( b );
|
||||
}
|
||||
else // if ( bli_obj_has_trans( b ) )
|
||||
{
|
||||
// Assign the variables with an implicit transposition.
|
||||
rs_b = bli_obj_col_stride( b );
|
||||
cs_b = bli_obj_row_stride( b );
|
||||
}
|
||||
|
||||
void* restrict buf_c = bli_obj_buffer_at_off( c );
|
||||
const inc_t rs_c = bli_obj_row_stride( c );
|
||||
const inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
|
||||
void* restrict buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta );
|
||||
|
||||
#endif
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
FUNCPTR_T f = ftypes_var1n[dt_exec];
|
||||
|
||||
if ( bli_is_notrans( trans ) )
|
||||
{
|
||||
// Invoke the function.
|
||||
f
|
||||
(
|
||||
conja,
|
||||
conjb,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, rs_a, cs_a,
|
||||
buf_b, rs_b, cs_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
eff_id,
|
||||
cntx,
|
||||
rntm,
|
||||
cntl,
|
||||
thread
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Invoke the function (transposing the operation).
|
||||
f
|
||||
(
|
||||
conjb, // swap the conj values.
|
||||
conja,
|
||||
n, // swap the m and n dimensions.
|
||||
m,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_b, cs_b, rs_b, // swap the positions of A and B.
|
||||
buf_a, cs_a, rs_a, // swap the strides of A and B.
|
||||
buf_beta,
|
||||
buf_c, cs_c, rs_c, // swap the strides of C.
|
||||
bli_stor3_trans( eff_id ), // transpose the stor3_t id.
|
||||
cntx,
|
||||
rntm,
|
||||
cntl,
|
||||
thread
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
conj_t conjb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* restrict alpha, \
|
||||
void* restrict a, inc_t rs_a, inc_t cs_a, \
|
||||
void* restrict b, inc_t rs_b, inc_t cs_b, \
|
||||
void* restrict beta, \
|
||||
void* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
stor3_t stor_id, \
|
||||
cntx_t* restrict cntx, \
|
||||
rntm_t* restrict rntm, \
|
||||
cntl_t* restrict cntl, \
|
||||
thrinfo_t* restrict thread \
|
||||
) \
|
||||
{ \
|
||||
/* If m or n is zero, return immediately. */ \
|
||||
if ( bli_zero_dim2( m, n ) ) return; \
|
||||
\
|
||||
/* If k < 1 or alpha is zero, scale by beta and return. */ \
|
||||
if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scalm) \
|
||||
( \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m, n, \
|
||||
beta, \
|
||||
c, rs_c, cs_c \
|
||||
); \
|
||||
return; \
|
||||
} \
|
||||
\
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* This transposition of the stor3_t id value is inherent to variant 1.
|
||||
The reason: we assume that variant 2 is the "main" variant. The
|
||||
consequence of this is that we assume that the millikernels that
|
||||
iterate over m are registered to the kernel group associated with
|
||||
the kernel preference. So, regardless of whether the mkernels are
|
||||
row- or column-preferential, millikernels that iterate over n are
|
||||
always placed in the slots for the opposite kernel group. */ \
|
||||
stor_id = bli_stor3_trans( stor_id ); \
|
||||
\
|
||||
/* Query the context for various blocksizes. */ \
|
||||
const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||
const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||
const dim_t NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \
|
||||
const dim_t MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
|
||||
const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
|
||||
\
|
||||
dim_t KC; \
|
||||
if ( FALSE ) KC = KC0; \
|
||||
else if ( stor_id == BLIS_RRC || \
|
||||
stor_id == BLIS_CRC ) KC = KC0; \
|
||||
else if ( m <= MR && n <= NR ) KC = KC0; \
|
||||
else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \
|
||||
else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \
|
||||
else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \
|
||||
else KC = (( KC0 / 5 ) / 4 ) * 4; \
|
||||
\
|
||||
/* Nudge NC up to a multiple of MR and MC up to a multiple of NR. */ \
|
||||
const dim_t NC = bli_align_dim_to_mult( NC0, MR ); \
|
||||
const dim_t MC = bli_align_dim_to_mult( MC0, NR ); \
|
||||
\
|
||||
/* Query the maximum blocksize for MR, which implies a maximum blocksize
|
||||
extension for the final iteration. */ \
|
||||
const dim_t MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx ); \
|
||||
const dim_t MRE = MRM - MR; \
|
||||
\
|
||||
/* Compute partitioning step values for each matrix of each loop. */ \
|
||||
const inc_t jcstep_c = rs_c * NC; \
|
||||
const inc_t jcstep_a = rs_a * NC; \
|
||||
\
|
||||
const inc_t pcstep_a = cs_a * KC; \
|
||||
const inc_t pcstep_b = rs_b * KC; \
|
||||
\
|
||||
const inc_t icstep_c = cs_c * MC; \
|
||||
const inc_t icstep_b = cs_b * MC; \
|
||||
\
|
||||
const inc_t jrstep_c = rs_c * MR; \
|
||||
const inc_t jrstep_a = rs_a * MR; \
|
||||
\
|
||||
/*
|
||||
const inc_t irstep_c = cs_c * NR; \
|
||||
const inc_t irstep_b = cs_b * NR; \
|
||||
*/ \
|
||||
\
|
||||
/* Query the context for the sup microkernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemmsup_ker_ft) \
|
||||
gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
|
||||
\
|
||||
ctype* restrict a_00 = a; \
|
||||
ctype* restrict b_00 = b; \
|
||||
ctype* restrict c_00 = c; \
|
||||
ctype* restrict alpha_cast = alpha; \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
\
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
\
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the outer
|
||||
dimensions.
|
||||
NOTE: Functionally speaking, we compute jc_iter as:
|
||||
jc_iter = m / NC; if ( jc_left ) ++jc_iter;
|
||||
However, this is implemented as:
|
||||
jc_iter = ( m + NC - 1 ) / NC;
|
||||
This avoids a branch at the cost of two additional integer instructions.
|
||||
The pc_iter, mc_iter, nr_iter, and mr_iter variables are computed in
|
||||
similar manner. */ \
|
||||
const dim_t jc_iter = ( m + NC - 1 ) / NC; \
|
||||
const dim_t jc_left = m % NC; \
|
||||
\
|
||||
const dim_t pc_iter = ( k + KC - 1 ) / KC; \
|
||||
const dim_t pc_left = k % KC; \
|
||||
\
|
||||
const dim_t ic_iter = ( n + MC - 1 ) / MC; \
|
||||
const dim_t ic_left = n % MC; \
|
||||
\
|
||||
const dim_t jc_inc = 1; \
|
||||
const dim_t pc_inc = 1; \
|
||||
const dim_t ic_inc = 1; \
|
||||
const dim_t jr_inc = 1; \
|
||||
/*
|
||||
const dim_t ir_inc = 1; \
|
||||
*/ \
|
||||
\
|
||||
/* Loop over the m dimension (NC rows/columns at a time). */ \
|
||||
for ( dim_t jj = 0; jj < jc_iter; jj += jc_inc ) \
|
||||
{ \
|
||||
const dim_t nc_cur = ( bli_is_not_edge_f( jj, jc_iter, jc_left ) ? NC : jc_left ); \
|
||||
\
|
||||
ctype* restrict a_jc = a_00 + jj * jcstep_a; \
|
||||
ctype* restrict c_jc = c_00 + jj * jcstep_c; \
|
||||
\
|
||||
dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \
|
||||
dim_t jr_left = nc_cur % MR; \
|
||||
\
|
||||
/* An optimization: allow the last jr iteration to contain up to MRE
|
||||
rows of C and A. (If MRE > MR, the mkernel has agreed to handle
|
||||
these cases.) Note that this prevents us from declaring jr_iter and
|
||||
jr_left as const. */ \
|
||||
if ( 1 ) \
|
||||
if ( MRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= MRE ) \
|
||||
{ \
|
||||
jr_iter--; jr_left += MR; \
|
||||
} \
|
||||
\
|
||||
/* Loop over the k dimension (KC rows/columns at a time). */ \
|
||||
for ( dim_t pp = 0; pp < pc_iter; pp += pc_inc ) \
|
||||
{ \
|
||||
const dim_t kc_cur = ( bli_is_not_edge_f( pp, pc_iter, pc_left ) ? KC : pc_left ); \
|
||||
\
|
||||
ctype* restrict a_pc = a_jc + pp * pcstep_a; \
|
||||
ctype* restrict b_pc = b_00 + pp * pcstep_b; \
|
||||
\
|
||||
/* Only apply beta to the first iteration of the pc loop. */ \
|
||||
ctype* restrict beta_use = ( pp == 0 ? beta_cast : one ); \
|
||||
\
|
||||
/* Loop over the n dimension (MC rows at a time). */ \
|
||||
for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \
|
||||
{ \
|
||||
const dim_t mc_cur = ( bli_is_not_edge_f( ii, ic_iter, ic_left ) ? MC : ic_left ); \
|
||||
\
|
||||
ctype* restrict b_ic = b_pc + ii * icstep_b; \
|
||||
ctype* restrict c_ic = c_jc + ii * icstep_c; \
|
||||
\
|
||||
/*
|
||||
const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; \
|
||||
const dim_t ir_left = mc_cur % NR; \
|
||||
*/ \
|
||||
\
|
||||
/* Loop over the m dimension (NR columns at a time). */ \
|
||||
for ( dim_t j = 0; j < jr_iter; j += jr_inc ) \
|
||||
{ \
|
||||
const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? MR : jr_left ); \
|
||||
\
|
||||
ctype* restrict a_jr = a_pc + j * jrstep_a; \
|
||||
ctype* restrict c_jr = c_ic + j * jrstep_c; \
|
||||
\
|
||||
/* Loop over the n dimension (MR rows at a time). */ \
|
||||
{ \
|
||||
/* Invoke the gemmsup millikernel. */ \
|
||||
gemmsup_ker \
|
||||
( \
|
||||
conja, \
|
||||
conjb, \
|
||||
nr_cur, /* Notice: nr_cur <= MR. */ \
|
||||
mc_cur, /* Recall: mc_cur partitions the n dimension! */ \
|
||||
kc_cur, \
|
||||
alpha_cast, \
|
||||
a_jr, rs_a, cs_a, \
|
||||
b_ic, rs_b, cs_b, \
|
||||
beta_use, \
|
||||
c_jr, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \
|
||||
*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( gemmsup_ref_var1n )
|
||||
|
||||
|
||||
//
|
||||
// -- var2m --------------------------------------------------------------------
|
||||
//
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes_var2m,gemmsup_ref_var2m);
|
||||
|
||||
void bli_gemmsup_ref_var2m
|
||||
(
|
||||
trans_t trans,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
stor3_t eff_id,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
#if 0
|
||||
obj_t at, bt;
|
||||
|
||||
bli_obj_alias_to( a, &at );
|
||||
bli_obj_alias_to( b, &bt );
|
||||
|
||||
// Induce transpositions on A and/or B if either object is marked for
|
||||
// transposition. We can induce "fast" transpositions since they objects
|
||||
// are guaranteed to not have structure or be packed.
|
||||
if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); }
|
||||
if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); }
|
||||
|
||||
const num_t dt_exec = bli_obj_dt( c );
|
||||
|
||||
const conj_t conja = bli_obj_conj_status( a );
|
||||
const conj_t conjb = bli_obj_conj_status( b );
|
||||
|
||||
const dim_t m = bli_obj_length( c );
|
||||
const dim_t n = bli_obj_width( c );
|
||||
|
||||
const dim_t k = bli_obj_width( &at );
|
||||
|
||||
void* restrict buf_a = bli_obj_buffer_at_off( &at );
|
||||
const inc_t rs_a = bli_obj_row_stride( &at );
|
||||
const inc_t cs_a = bli_obj_col_stride( &at );
|
||||
|
||||
void* restrict buf_b = bli_obj_buffer_at_off( &bt );
|
||||
const inc_t rs_b = bli_obj_row_stride( &bt );
|
||||
const inc_t cs_b = bli_obj_col_stride( &bt );
|
||||
|
||||
void* restrict buf_c = bli_obj_buffer_at_off( c );
|
||||
const inc_t rs_c = bli_obj_row_stride( c );
|
||||
const inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
|
||||
void* restrict buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta );
|
||||
|
||||
#else
|
||||
const num_t dt_exec = bli_obj_dt( c );
|
||||
|
||||
const conj_t conja = bli_obj_conj_status( a );
|
||||
const conj_t conjb = bli_obj_conj_status( b );
|
||||
|
||||
const dim_t m = bli_obj_length( c );
|
||||
const dim_t n = bli_obj_width( c );
|
||||
dim_t k;
|
||||
|
||||
void* restrict buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t rs_a;
|
||||
inc_t cs_a;
|
||||
|
||||
void* restrict buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b;
|
||||
inc_t cs_b;
|
||||
|
||||
if ( bli_obj_has_notrans( a ) )
|
||||
{
|
||||
k = bli_obj_width( a );
|
||||
|
||||
rs_a = bli_obj_row_stride( a );
|
||||
cs_a = bli_obj_col_stride( a );
|
||||
}
|
||||
else // if ( bli_obj_has_trans( a ) )
|
||||
{
|
||||
// Assign the variables with an implicit transposition.
|
||||
k = bli_obj_length( a );
|
||||
|
||||
rs_a = bli_obj_col_stride( a );
|
||||
cs_a = bli_obj_row_stride( a );
|
||||
}
|
||||
|
||||
if ( bli_obj_has_notrans( b ) )
|
||||
{
|
||||
rs_b = bli_obj_row_stride( b );
|
||||
cs_b = bli_obj_col_stride( b );
|
||||
}
|
||||
else // if ( bli_obj_has_trans( b ) )
|
||||
{
|
||||
// Assign the variables with an implicit transposition.
|
||||
rs_b = bli_obj_col_stride( b );
|
||||
cs_b = bli_obj_row_stride( b );
|
||||
}
|
||||
|
||||
void* restrict buf_c = bli_obj_buffer_at_off( c );
|
||||
const inc_t rs_c = bli_obj_row_stride( c );
|
||||
const inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
|
||||
void* restrict buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta );
|
||||
|
||||
#endif
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
FUNCPTR_T f = ftypes_var2m[dt_exec];
|
||||
|
||||
if ( bli_is_notrans( trans ) )
|
||||
{
|
||||
// Invoke the function.
|
||||
f
|
||||
(
|
||||
conja,
|
||||
conjb,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, rs_a, cs_a,
|
||||
buf_b, rs_b, cs_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
eff_id,
|
||||
cntx,
|
||||
rntm,
|
||||
cntl,
|
||||
thread
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Invoke the function (transposing the operation).
|
||||
f
|
||||
(
|
||||
conjb, // swap the conj values.
|
||||
conja,
|
||||
n, // swap the m and n dimensions.
|
||||
m,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_b, cs_b, rs_b, // swap the positions of A and B.
|
||||
buf_a, cs_a, rs_a, // swap the strides of A and B.
|
||||
buf_beta,
|
||||
buf_c, cs_c, rs_c, // swap the strides of C.
|
||||
bli_stor3_trans( eff_id ), // transpose the stor3_t id.
|
||||
cntx,
|
||||
rntm,
|
||||
cntl,
|
||||
thread
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
conj_t conja, \
|
||||
conj_t conjb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* restrict alpha, \
|
||||
void* restrict a, inc_t rs_a, inc_t cs_a, \
|
||||
void* restrict b, inc_t rs_b, inc_t cs_b, \
|
||||
void* restrict beta, \
|
||||
void* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
stor3_t stor_id, \
|
||||
cntx_t* restrict cntx, \
|
||||
rntm_t* restrict rntm, \
|
||||
cntl_t* restrict cntl, \
|
||||
thrinfo_t* restrict thread \
|
||||
) \
|
||||
{ \
|
||||
/* If m or n is zero, return immediately. */ \
|
||||
if ( bli_zero_dim2( m, n ) ) return; \
|
||||
\
|
||||
/* If k < 1 or alpha is zero, scale by beta and return. */ \
|
||||
if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scalm) \
|
||||
( \
|
||||
BLIS_NO_CONJUGATE, \
|
||||
0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m, n, \
|
||||
beta, \
|
||||
c, rs_c, cs_c \
|
||||
); \
|
||||
return; \
|
||||
} \
|
||||
\
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Query the context for various blocksizes. */ \
|
||||
const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||
const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||
const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \
|
||||
const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
|
||||
const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
|
||||
\
|
||||
dim_t KC; \
|
||||
if ( stor_id == BLIS_RRR || \
|
||||
stor_id == BLIS_CCC ) KC = KC0; \
|
||||
else if ( stor_id == BLIS_RRC || \
|
||||
stor_id == BLIS_CRC ) KC = KC0; \
|
||||
else if ( m <= MR && n <= NR ) KC = KC0; \
|
||||
else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \
|
||||
else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \
|
||||
else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \
|
||||
else KC = (( KC0 / 5 ) / 4 ) * 4; \
|
||||
\
|
||||
/* Query the maximum blocksize for NR, which implies a maximum blocksize
|
||||
extension for the final iteration. */ \
|
||||
const dim_t NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \
|
||||
const dim_t NRE = NRM - NR; \
|
||||
\
|
||||
/* Compute partitioning step values for each matrix of each loop. */ \
|
||||
const inc_t jcstep_c = cs_c * NC; \
|
||||
const inc_t jcstep_b = cs_b * NC; \
|
||||
\
|
||||
const inc_t pcstep_a = cs_a * KC; \
|
||||
const inc_t pcstep_b = rs_b * KC; \
|
||||
\
|
||||
const inc_t icstep_c = rs_c * MC; \
|
||||
const inc_t icstep_a = rs_a * MC; \
|
||||
\
|
||||
const inc_t jrstep_c = cs_c * NR; \
|
||||
const inc_t jrstep_b = cs_b * NR; \
|
||||
\
|
||||
/*
|
||||
const inc_t irstep_c = rs_c * MR; \
|
||||
const inc_t irstep_a = rs_a * MR; \
|
||||
*/ \
|
||||
\
|
||||
/* Query the context for the sup microkernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemmsup_ker_ft) \
|
||||
gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
|
||||
\
|
||||
ctype* restrict a_00 = a; \
|
||||
ctype* restrict b_00 = b; \
|
||||
ctype* restrict c_00 = c; \
|
||||
ctype* restrict alpha_cast = alpha; \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
\
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
\
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the outer
|
||||
dimensions.
|
||||
NOTE: Functionally speaking, we compute jc_iter as:
|
||||
jc_iter = n / NC; if ( jc_left ) ++jc_iter;
|
||||
However, this is implemented as:
|
||||
jc_iter = ( n + NC - 1 ) / NC;
|
||||
This avoids a branch at the cost of two additional integer instructions.
|
||||
The pc_iter, mc_iter, nr_iter, and mr_iter variables are computed in
|
||||
similar manner. */ \
|
||||
const dim_t jc_iter = ( n + NC - 1 ) / NC; \
|
||||
const dim_t jc_left = n % NC; \
|
||||
\
|
||||
const dim_t pc_iter = ( k + KC - 1 ) / KC; \
|
||||
const dim_t pc_left = k % KC; \
|
||||
\
|
||||
const dim_t ic_iter = ( m + MC - 1 ) / MC; \
|
||||
const dim_t ic_left = m % MC; \
|
||||
\
|
||||
const dim_t jc_inc = 1; \
|
||||
const dim_t pc_inc = 1; \
|
||||
const dim_t ic_inc = 1; \
|
||||
const dim_t jr_inc = 1; \
|
||||
/*
|
||||
const dim_t ir_inc = 1; \
|
||||
*/ \
|
||||
\
|
||||
/* Loop over the n dimension (NC rows/columns at a time). */ \
|
||||
for ( dim_t jj = 0; jj < jc_iter; jj += jc_inc ) \
|
||||
{ \
|
||||
const dim_t nc_cur = ( bli_is_not_edge_f( jj, jc_iter, jc_left ) ? NC : jc_left ); \
|
||||
\
|
||||
ctype* restrict b_jc = b_00 + jj * jcstep_b; \
|
||||
ctype* restrict c_jc = c_00 + jj * jcstep_c; \
|
||||
\
|
||||
dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
|
||||
dim_t jr_left = nc_cur % NR; \
|
||||
\
|
||||
/* An optimization: allow the last jr iteration to contain up to NRE
|
||||
columns of C and B. (If NRE > NR, the mkernel has agreed to handle
|
||||
these cases.) Note that this prevents us from declaring jr_iter and
|
||||
jr_left as const. */ \
|
||||
if ( 1 ) \
|
||||
if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) \
|
||||
{ \
|
||||
jr_iter--; jr_left += NR; \
|
||||
} \
|
||||
\
|
||||
/* Loop over the k dimension (KC rows/columns at a time). */ \
|
||||
for ( dim_t pp = 0; pp < pc_iter; pp += pc_inc ) \
|
||||
{ \
|
||||
const dim_t kc_cur = ( bli_is_not_edge_f( pp, pc_iter, pc_left ) ? KC : pc_left ); \
|
||||
\
|
||||
ctype* restrict a_pc = a_00 + pp * pcstep_a; \
|
||||
ctype* restrict b_pc = b_jc + pp * pcstep_b; \
|
||||
\
|
||||
/* Only apply beta to the first iteration of the pc loop. */ \
|
||||
ctype* restrict beta_use = ( pp == 0 ? beta_cast : one ); \
|
||||
\
|
||||
/* Loop over the m dimension (MC rows at a time). */ \
|
||||
for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \
|
||||
{ \
|
||||
const dim_t mc_cur = ( bli_is_not_edge_f( ii, ic_iter, ic_left ) ? MC : ic_left ); \
|
||||
\
|
||||
ctype* restrict a_ic = a_pc + ii * icstep_a; \
|
||||
ctype* restrict c_ic = c_jc + ii * icstep_c; \
|
||||
\
|
||||
/*
|
||||
const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
|
||||
const dim_t ir_left = mc_cur % MR; \
|
||||
*/ \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( dim_t j = 0; j < jr_iter; j += jr_inc ) \
|
||||
{ \
|
||||
const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
|
||||
\
|
||||
ctype* restrict b_jr = b_pc + j * jrstep_b; \
|
||||
ctype* restrict c_jr = c_ic + j * jrstep_c; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
{ \
|
||||
/* Invoke the gemmsup millikernel. */ \
|
||||
gemmsup_ker \
|
||||
( \
|
||||
conja, \
|
||||
conjb, \
|
||||
mc_cur, \
|
||||
nr_cur, \
|
||||
kc_cur, \
|
||||
alpha_cast, \
|
||||
a_ic, rs_a, cs_a, \
|
||||
b_jr, rs_b, cs_b, \
|
||||
beta_use, \
|
||||
c_jr, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \
|
||||
*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( gemmsup_ref_var2m )
|
||||
|
||||
@@ -959,8 +959,7 @@ void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... )
|
||||
// Process each operation id tuple provided.
|
||||
for ( i = 0; i < n_ops; ++i )
|
||||
{
|
||||
// Read the current ukernel id, ukernel datatype, and ukernel function
|
||||
// pointer.
|
||||
// Read the current operation id and handler function pointer.
|
||||
const opid_t op_id = op_ids[ i ];
|
||||
void* op_fp = op_fps[ i ];
|
||||
|
||||
|
||||
95
frame/base/bli_env.c
Normal file
95
frame/base/bli_env.c
Normal file
@@ -0,0 +1,95 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
// Return the value of the environment variable 'env' parsed as a base-10
// integer, or 'fallback' if the variable is not set.
dim_t bli_env_get_var( const char* env, dim_t fallback )
{
	// getenv() returns NULL when the variable does not exist.
	const char* str = getenv( env );

	// An unset variable yields the caller's fallback, unchanged.
	if ( str == NULL ) return fallback;

	// Otherwise convert the string. No end pointer is passed to strtol(), so
	// the contents of the string are not validated here.
	return ( dim_t )strtol( str, NULL, 10 );
}
|
||||
|
||||
#if 0
// Set the environment variable 'env' to the decimal representation of
// 'value', overwriting any existing value. If bli_setenv() reports failure
// (-1), the error text for errno is printed via bli_print_msg().
//
// NOTE: This function is currently compiled out, and its prototype in
// bli_env.h is commented out as well.
void bli_env_set_var( const char* env, dim_t value )
{
	dim_t r_val;
	char  value_str[32];

	// Convert the integer to a string. Widening to long long allows one
	// format specifier to be used regardless of the configured integer size,
	// and snprintf() bounds the write to the size of the buffer.
	// NOTE(review): assumes dim_t is a signed integer type no wider than
	// long long -- confirm against its definition.
	snprintf( value_str, sizeof( value_str ), "%lld", ( long long )value );

	// Set the environment variable using the string we just wrote. (The
	// 'TRUE' argument means we want to overwrite the current value if the
	// environment variable already exists.)
	r_val = bli_setenv( env, value_str, TRUE );

	// Check the return value in case something went horribly wrong.
	if ( r_val == -1 )
	{
		char err_str[128];

		// Query the human-readable error string corresponding to errno.
		// NOTE(review): strerror_r() has two incompatible variants; this
		// call assumes the one that always writes the message into the
		// supplied buffer -- confirm before re-enabling this function.
		strerror_r( errno, err_str, sizeof( err_str ) );

		// Print the error message.
		bli_print_msg( err_str, __FILE__, __LINE__ );
	}
}
#endif
|
||||
|
||||
44
frame/base/bli_env.h
Normal file
44
frame/base/bli_env.h
Normal file
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_ENV_H
|
||||
#define BLIS_ENV_H
|
||||
|
||||
dim_t bli_env_get_var( const char* env, dim_t fallback );
|
||||
//void bli_env_set_var( const char* env, dim_t value );
|
||||
|
||||
#endif
|
||||
|
||||
@@ -34,11 +34,32 @@
|
||||
|
||||
*/
|
||||
|
||||
|
||||
#ifndef BLIS_MEM_H
|
||||
#define BLIS_MEM_H
|
||||
|
||||
|
||||
// Mem entry query
|
||||
// mem_t object type (defined in bli_type_defs.h)
|
||||
|
||||
/*
|
||||
typedef struct mem_s
|
||||
{
|
||||
pblk_t pblk;
|
||||
packbuf_t buf_type;
|
||||
pool_t* pool;
|
||||
siz_t size;
|
||||
} mem_t;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
void* buf;
|
||||
siz_t block_size;
|
||||
} pblk_t;
|
||||
*/
|
||||
|
||||
//
|
||||
// -- mem_t query --------------------------------------------------------------
|
||||
//
|
||||
|
||||
BLIS_INLINE pblk_t* bli_mem_pblk( mem_t* mem )
|
||||
{
|
||||
@@ -78,7 +99,9 @@ BLIS_INLINE bool bli_mem_is_unalloc( mem_t* mem )
|
||||
}
|
||||
|
||||
|
||||
// Mem entry modification
|
||||
//
|
||||
// -- mem_t modification -------------------------------------------------------
|
||||
//
|
||||
|
||||
BLIS_INLINE void bli_mem_set_pblk( pblk_t* pblk, mem_t* mem )
|
||||
{
|
||||
|
||||
157
frame/base/bli_pack.c
Normal file
157
frame/base/bli_pack.c
Normal file
@@ -0,0 +1,157 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// The global rntm_t structure. (The definition resides in bli_rntm.c.)
|
||||
extern rntm_t global_rntm;
|
||||
|
||||
// A mutex to allow synchronous access to global_rntm. (The definition
|
||||
// resides in bli_rntm.c.)
|
||||
extern bli_pthread_mutex_t global_rntm_mutex;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_pack_init( void )
|
||||
{
|
||||
// Read the environment variables and use them to initialize the
|
||||
// global runtime object.
|
||||
bli_pack_init_rntm_from_env( &global_rntm );
|
||||
}
|
||||
|
||||
// Counterpart to bli_pack_init().
void bli_pack_finalize( void )
{
	// Intentionally empty: no teardown is performed here.
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
dim_t bli_pack_get_pack_a( void )
|
||||
{
|
||||
// We must ensure that global_rntm has been initialized.
|
||||
bli_init_once();
|
||||
|
||||
return bli_rntm_pack_a( &global_rntm );
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
dim_t bli_pack_get_pack_b( void )
|
||||
{
|
||||
// We must ensure that global_rntm has been initialized.
|
||||
bli_init_once();
|
||||
|
||||
return bli_rntm_pack_b( &global_rntm );
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
// Store 'pack_a' as the pack_a setting of the global runtime object.
void bli_pack_set_pack_a( bool_t pack_a )
{
	// global_rntm must have been initialized before it is modified.
	bli_init_once();

	// The update is made while holding the mutex that protects global_rntm.
	bli_pthread_mutex_lock( &global_rntm_mutex );
	bli_rntm_set_pack_a( pack_a, &global_rntm );
	bli_pthread_mutex_unlock( &global_rntm_mutex );
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
// Store 'pack_b' as the pack_b setting of the global runtime object.
void bli_pack_set_pack_b( bool_t pack_b )
{
	// We must ensure that global_rntm has been initialized.
	bli_init_once();

	// Acquire the mutex protecting global_rntm.
	bli_pthread_mutex_lock( &global_rntm_mutex );

	// Write the pack_b field. (This previously called bli_rntm_set_pack_a(),
	// which overwrote the pack_a setting and left pack_b unchanged.)
	bli_rntm_set_pack_b( pack_b, &global_rntm );

	// Release the mutex protecting global_rntm.
	bli_pthread_mutex_unlock( &global_rntm_mutex );
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
void bli_pack_init_rntm_from_env
|
||||
(
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
// NOTE: We don't need to acquire the global_rntm_mutex here because this
|
||||
// function is only called from bli_pack_init(), which is only called
|
||||
// by bli_init_once().
|
||||
|
||||
bool_t pack_a;
|
||||
bool_t pack_b;
|
||||
|
||||
#if 1 //def BLIS_ENABLE_SELECTIVE_PACKING
|
||||
|
||||
// Try to read BLIS_PACK_A and BLIS_PACK_B. For each variable, default to
|
||||
// -1 if it is unset.
|
||||
pack_a = bli_env_get_var( "BLIS_PACK_A", -1 );
|
||||
pack_b = bli_env_get_var( "BLIS_PACK_B", -1 );
|
||||
|
||||
// Enforce the default behavior first, then check for affirmative FALSE, and
|
||||
// finally assume anything else is TRUE.
|
||||
if ( pack_a == -1 ) pack_a = FALSE; // default behavior
|
||||
else if ( pack_a == 0 ) pack_a = FALSE; // zero is FALSE
|
||||
else pack_a = TRUE; // anything else is TRUE
|
||||
|
||||
if ( pack_b == -1 ) pack_b = FALSE; // default behavior
|
||||
else if ( pack_b == 0 ) pack_b = FALSE; // zero is FALSE
|
||||
else pack_b = TRUE; // anything else is TRUE
|
||||
|
||||
#else
|
||||
|
||||
pack_a = TRUE;
|
||||
pack_b = TRUE;
|
||||
|
||||
#endif
|
||||
|
||||
// Save the results back in the runtime object.
|
||||
bli_rntm_set_pack_a( pack_a, rntm );
|
||||
bli_rntm_set_pack_b( pack_b, rntm );
|
||||
|
||||
#if 0
|
||||
printf( "bli_pack_init_rntm_from_env()\n" );
|
||||
bli_rntm_print( rntm );
|
||||
#endif
|
||||
}
|
||||
|
||||
49
frame/base/bli_pack.h
Normal file
49
frame/base/bli_pack.h
Normal file
@@ -0,0 +1,49 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_PACK_H
|
||||
#define BLIS_PACK_H
|
||||
|
||||
void bli_pack_init( void );
|
||||
void bli_pack_finalize( void );
|
||||
|
||||
BLIS_EXPORT_BLIS dim_t bli_pack_get_pack_a( void );
|
||||
BLIS_EXPORT_BLIS dim_t bli_pack_get_pack_b( void );
|
||||
BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool_t pack_a );
|
||||
BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool_t pack_b );
|
||||
|
||||
void bli_pack_init_rntm_from_env( rntm_t* rntm );
|
||||
|
||||
#endif
|
||||
|
||||
@@ -34,6 +34,29 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// The global rntm_t structure, which holds the global thread settings
|
||||
// along with a few other key parameters.
|
||||
rntm_t global_rntm;
|
||||
|
||||
// A mutex to allow synchronous access to global_rntm.
|
||||
bli_pthread_mutex_t global_rntm_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
// Copy the current contents of the global runtime object into '*rntm'.
void bli_rntm_init_from_global( rntm_t* rntm )
{
	// global_rntm must have been initialized before it is copied.
	bli_init_once();

	// The struct copy is made while holding the mutex that protects
	// global_rntm.
	bli_pthread_mutex_lock( &global_rntm_mutex );
	*rntm = global_rntm;
	bli_pthread_mutex_unlock( &global_rntm_mutex );
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_rntm_set_ways_for_op
|
||||
|
||||
@@ -52,11 +52,8 @@ typedef struct rntm_s
|
||||
bool l3_sup;
|
||||
|
||||
pool_t* sba_pool;
|
||||
|
||||
membrk_t* membrk;
|
||||
|
||||
bool_t l3_sup;
|
||||
|
||||
} rntm_t;
|
||||
*/
|
||||
|
||||
@@ -229,10 +226,6 @@ BLIS_INLINE void bli_rntm_clear_membrk( rntm_t* rntm )
|
||||
{
|
||||
bli_rntm_set_membrk( NULL, rntm );
|
||||
}
|
||||
static void bli_rntm_clear_l3_sup( rntm_t* rntm )
|
||||
{
|
||||
bli_rntm_set_l3_sup( 1, rntm );
|
||||
}
|
||||
|
||||
//
|
||||
// -- rntm_t modification (public API) -----------------------------------------
|
||||
@@ -321,7 +314,6 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm )
|
||||
.l3_sup = TRUE, \
|
||||
.sba_pool = NULL, \
|
||||
.membrk = NULL, \
|
||||
.l3_sup = 1 \
|
||||
} \
|
||||
|
||||
BLIS_INLINE void bli_rntm_init( rntm_t* rntm )
|
||||
@@ -330,11 +322,12 @@ BLIS_INLINE void bli_rntm_init( rntm_t* rntm )
|
||||
|
||||
bli_rntm_clear_num_threads_only( rntm );
|
||||
bli_rntm_clear_ways_only( rntm );
|
||||
bli_rntm_clear_pack_a( rntm );
|
||||
bli_rntm_clear_pack_b( rntm );
|
||||
bli_rntm_clear_l3_sup( rntm );
|
||||
|
||||
bli_rntm_clear_sba_pool( rntm );
|
||||
bli_rntm_clear_membrk( rntm );
|
||||
|
||||
bli_rntm_clear_l3_sup( rntm );
|
||||
}
|
||||
|
||||
// -- rntm_t total thread calculation ------------------------------------------
|
||||
@@ -359,6 +352,8 @@ BLIS_INLINE dim_t bli_rntm_calc_num_threads
|
||||
|
||||
// Function prototypes
|
||||
|
||||
BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm );
|
||||
|
||||
BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op
|
||||
(
|
||||
opid_t l3_op,
|
||||
|
||||
@@ -1185,6 +1185,13 @@ typedef struct
|
||||
inc_t is_a;
|
||||
inc_t is_b;
|
||||
|
||||
// The panel strides of A and B.
|
||||
// NOTE: These are only used in situations where iteration over the
|
||||
// micropanels takes place in part within the kernel code (e.g. sup
|
||||
// millikernels).
|
||||
inc_t ps_a;
|
||||
inc_t ps_b;
|
||||
|
||||
// The type to convert to on output.
|
||||
//num_t dt_on_output;
|
||||
|
||||
@@ -1441,6 +1448,9 @@ typedef struct cntx_s
|
||||
|
||||
// -- Runtime type --
|
||||
|
||||
// NOTE: The order of these fields must be kept consistent with the definition
|
||||
// of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h.
|
||||
|
||||
typedef struct rntm_s
|
||||
{
|
||||
// "External" fields: these may be queried by the end-user.
|
||||
@@ -1460,9 +1470,6 @@ typedef struct rntm_s
|
||||
// The packing block allocator, which is attached in the l3 thread decorator.
|
||||
membrk_t* membrk;
|
||||
|
||||
// A switch to enable/disable small/unpacked matrix handling in level-3 ops.
|
||||
bool_t l3_sup;
|
||||
|
||||
} rntm_t;
|
||||
|
||||
|
||||
|
||||
@@ -130,6 +130,8 @@ extern "C" {
|
||||
#include "bli_getopt.h"
|
||||
#include "bli_opid.h"
|
||||
#include "bli_cntl.h"
|
||||
#include "bli_env.h"
|
||||
#include "bli_pack.h"
|
||||
#include "bli_info.h"
|
||||
#include "bli_arch.h"
|
||||
#include "bli_cpuid.h"
|
||||
|
||||
@@ -98,8 +98,8 @@ void PASTEMAC(opname,imeth) \
|
||||
/* Initialize a local runtime with global settings if necessary. Note
|
||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||
rntm_t rntm_l; \
|
||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
\
|
||||
/* Some induced methods execute in multiple "stages". */ \
|
||||
for ( i = 0; i < nstage; ++i ) \
|
||||
@@ -191,8 +191,8 @@ void PASTEMAC(opname,imeth) \
|
||||
/* Initialize a local runtime with global settings if necessary. Note
|
||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||
rntm_t rntm_l; \
|
||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
\
|
||||
/* Some induced methods execute in multiple "stages". */ \
|
||||
for ( i = 0; i < nstage; ++i ) \
|
||||
@@ -282,8 +282,8 @@ void PASTEMAC(opname,imeth) \
|
||||
/* Initialize a local runtime with global settings if necessary. Note
|
||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||
rntm_t rntm_l; \
|
||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
\
|
||||
/* Some induced methods execute in multiple "stages". */ \
|
||||
for ( i = 0; i < nstage; ++i ) \
|
||||
@@ -358,8 +358,8 @@ void PASTEMAC(opname,imeth) \
|
||||
/* Initialize a local runtime with global settings if necessary. Note
|
||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||
rntm_t rntm_l; \
|
||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
\
|
||||
/* Some induced methods execute in multiple "stages". */ \
|
||||
for ( i = 0; i < nstage; ++i ) \
|
||||
@@ -420,8 +420,8 @@ void PASTEMAC(opname,imeth) \
|
||||
/* Initialize a local runtime with global settings if necessary. Note
|
||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||
rntm_t rntm_l; \
|
||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
\
|
||||
{ \
|
||||
/* NOTE: trsm cannot be implemented via any induced method that
|
||||
|
||||
@@ -60,8 +60,8 @@ void PASTEMAC(opname,imeth) \
|
||||
/* Initialize a local runtime with global settings if necessary. Note
|
||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||
rntm_t rntm_l; \
|
||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
\
|
||||
func( alpha, a, b, beta, c, cntx, rntm ); \
|
||||
}
|
||||
@@ -97,8 +97,8 @@ void PASTEMAC(opname,imeth) \
|
||||
/* Initialize a local runtime with global settings if necessary. Note
|
||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||
rntm_t rntm_l; \
|
||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
\
|
||||
func( side, alpha, a, b, beta, c, cntx, rntm ); \
|
||||
}
|
||||
@@ -131,8 +131,8 @@ void PASTEMAC(opname,imeth) \
|
||||
/* Initialize a local runtime with global settings if necessary. Note
|
||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||
rntm_t rntm_l; \
|
||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
\
|
||||
func( alpha, a, beta, c, cntx, rntm ); \
|
||||
}
|
||||
@@ -164,8 +164,8 @@ void PASTEMAC(opname,imeth) \
|
||||
/* Initialize a local runtime with global settings if necessary. Note
|
||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||
rntm_t rntm_l; \
|
||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
\
|
||||
func( side, alpha, a, b, cntx, rntm ); \
|
||||
}
|
||||
|
||||
@@ -66,8 +66,8 @@ void PASTEMAC(opname,imeth) \
|
||||
/* Initialize a local runtime with global settings if necessary. Note
|
||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||
rntm_t rntm_l; \
|
||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
\
|
||||
/* Invoke the operation's front end. */ \
|
||||
PASTEMAC(opname,_front) \
|
||||
@@ -112,8 +112,8 @@ void PASTEMAC(opname,imeth) \
|
||||
/* Initialize a local runtime with global settings if necessary. Note
|
||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||
rntm_t rntm_l; \
|
||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
\
|
||||
/* Invoke the operation's front end. */ \
|
||||
PASTEMAC(opname,_front) \
|
||||
@@ -150,8 +150,8 @@ void PASTEMAC(opname,imeth) \
|
||||
/* Initialize a local runtime with global settings if necessary. Note
|
||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||
rntm_t rntm_l; \
|
||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
\
|
||||
/* Invoke the operation's front end. */ \
|
||||
PASTEMAC(opname,_front) \
|
||||
@@ -187,8 +187,8 @@ void PASTEMAC(opname,imeth) \
|
||||
/* Initialize a local runtime with global settings if necessary. Note
|
||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||
rntm_t rntm_l; \
|
||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
\
|
||||
/* Invoke the operation's front end. */ \
|
||||
PASTEMAC(opname,_front) \
|
||||
@@ -223,8 +223,8 @@ void PASTEMAC(opname,imeth) \
|
||||
/* Initialize a local runtime with global settings if necessary. Note
|
||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||
rntm_t rntm_l; \
|
||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||
\
|
||||
/* Invoke the operation's front end. */ \
|
||||
PASTEMAC(opname,_front) \
|
||||
|
||||
77
frame/thread/bli_l3_decor.h
Normal file
77
frame/thread/bli_l3_decor.h
Normal file
@@ -0,0 +1,77 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_L3_DECOR_H
|
||||
#define BLIS_L3_DECOR_H
|
||||
|
||||
// -- conventional definitions -------------------------------------------------
|
||||
|
||||
// Level-3 internal function type.
|
||||
typedef void (*l3int_t)
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
// Level-3 thread decorator prototype.
|
||||
void bli_l3_thread_decorator
|
||||
(
|
||||
l3int_t func,
|
||||
opid_t family,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
// Include definitions specific to the method of multithreading for the
|
||||
// conventional code path.
|
||||
#include "bli_l3_decor_single.h"
|
||||
#include "bli_l3_decor_openmp.h"
|
||||
#include "bli_l3_decor_pthreads.h"
|
||||
|
||||
#endif
|
||||
|
||||
248
frame/thread/bli_l3_decor_openmp.c
Normal file
248
frame/thread/bli_l3_decor_openmp.c
Normal file
@@ -0,0 +1,248 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
|
||||
// Define a dummy function bli_l3_thread_entry(), which is needed in the
|
||||
// pthreads version, so that when building Windows DLLs (with OpenMP enabled
|
||||
// or no multithreading) we don't risk having an unresolved symbol.
|
||||
void* bli_l3_thread_entry( void* data_void )
{
	// Stub: the argument is intentionally ignored and nothing is done.
	( void )data_void;

	return NULL;
}
|
||||
|
||||
//#define PRINT_THRINFO
|
||||
|
||||
void bli_l3_thread_decorator
|
||||
(
|
||||
l3int_t func,
|
||||
opid_t family,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl
|
||||
)
|
||||
{
|
||||
// This is part of a hack to support mixed domain in bli_gemm_front().
|
||||
// Sometimes we need to specify a non-standard schema for A and B, and
|
||||
// we decided to transmit them via the schema field in the obj_t's
|
||||
// rather than pass them in as function parameters. Once the values
|
||||
// have been read, we immediately reset them back to their expected
|
||||
// values for unpacked objects.
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
|
||||
bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
|
||||
|
||||
// Query the total number of threads from the rntm_t object.
|
||||
const dim_t n_threads = bli_rntm_num_threads( rntm );
|
||||
|
||||
#ifdef PRINT_THRINFO
|
||||
thrinfo_t** threads = bli_malloc_intl( n_threads * sizeof( thrinfo_t* ) );
|
||||
#endif
|
||||
|
||||
// NOTE: The sba was initialized in bli_init().
|
||||
|
||||
// Check out an array_t from the small block allocator. This is done
|
||||
// with an internal lock to ensure only one application thread accesses
|
||||
// the sba at a time. bli_sba_checkout_array() will also automatically
|
||||
// resize the array_t, if necessary.
|
||||
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
||||
|
||||
// Access the pool_t* for thread 0 and embed it into the rntm. We do
|
||||
// this up-front only so that we have the rntm_t.sba_pool field
|
||||
// initialized and ready for the global communicator creation below.
|
||||
bli_sba_rntm_set_pool( 0, array, rntm );
|
||||
|
||||
// Set the packing block allocator field of the rntm. This will be
|
||||
// inherited by all of the child threads when they make local copies of
|
||||
// the rntm below.
|
||||
bli_membrk_rntm_set_membrk( rntm );
|
||||
|
||||
// Allocate a global communicator for the root thrinfo_t structures.
|
||||
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
||||
|
||||
|
||||
_Pragma( "omp parallel num_threads(n_threads)" )
|
||||
{
|
||||
// Create a thread-local copy of the master thread's rntm_t. This is
|
||||
// necessary since we want each thread to be able to track its own
|
||||
// small block pool_t as it executes down the function stack.
|
||||
rntm_t rntm_l = *rntm;
|
||||
rntm_t* restrict rntm_p = &rntm_l;
|
||||
|
||||
// Query the thread's id from OpenMP.
|
||||
const dim_t tid = omp_get_thread_num();
|
||||
|
||||
// Check for a somewhat obscure OpenMP thread-mistmatch issue.
|
||||
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
|
||||
|
||||
// Use the thread id to access the appropriate pool_t* within the
|
||||
// array_t, and use it to set the sba_pool field within the rntm_t.
|
||||
// If the pool_t* element within the array_t is NULL, it will first
|
||||
// be allocated/initialized.
|
||||
bli_sba_rntm_set_pool( tid, array, rntm_p );
|
||||
|
||||
|
||||
obj_t a_t, b_t, c_t;
|
||||
cntl_t* cntl_use;
|
||||
thrinfo_t* thread;
|
||||
|
||||
// Alias thread-local copies of A, B, and C. These will be the objects
|
||||
// we pass down the algorithmic function stack. Making thread-local
|
||||
// alaises is highly recommended in case a thread needs to change any
|
||||
// of the properties of an object without affecting other threads'
|
||||
// objects.
|
||||
bli_obj_alias_to( a, &a_t );
|
||||
bli_obj_alias_to( b, &b_t );
|
||||
bli_obj_alias_to( c, &c_t );
|
||||
|
||||
// Create a default control tree for the operation, if needed.
|
||||
bli_l3_cntl_create_if( family, schema_a, schema_b,
|
||||
&a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use );
|
||||
|
||||
// Create the root node of the current thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
|
||||
|
||||
#if 1
|
||||
func
|
||||
(
|
||||
alpha,
|
||||
&a_t,
|
||||
&b_t,
|
||||
beta,
|
||||
&c_t,
|
||||
cntx,
|
||||
rntm_p,
|
||||
cntl_use,
|
||||
thread
|
||||
);
|
||||
#else
|
||||
bli_thrinfo_grow_tree
|
||||
(
|
||||
rntm_p,
|
||||
cntl_use,
|
||||
thread
|
||||
);
|
||||
#endif
|
||||
|
||||
// Free the thread's local control tree.
|
||||
bli_l3_cntl_free( rntm_p, cntl_use, thread );
|
||||
|
||||
#ifdef PRINT_THRINFO
|
||||
threads[tid] = thread;
|
||||
#else
|
||||
// Free the current thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_free( rntm_p, thread );
|
||||
#endif
|
||||
}
|
||||
|
||||
// We shouldn't free the global communicator since it was already freed
|
||||
// by the global communicator's chief thread in bli_l3_thrinfo_free()
|
||||
// (called above).
|
||||
|
||||
#ifdef PRINT_THRINFO
|
||||
if ( family != BLIS_TRSM ) bli_l3_thrinfo_print_gemm_paths( threads );
|
||||
else bli_l3_thrinfo_print_trsm_paths( threads );
|
||||
exit(1);
|
||||
#endif
|
||||
|
||||
// Check the array_t back into the small block allocator. Similar to the
|
||||
// check-out, this is done using a lock embedded within the sba to ensure
|
||||
// mutual exclusion.
|
||||
bli_sba_checkin_array( array );
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_l3_thread_decorator_thread_check
|
||||
(
|
||||
dim_t n_threads,
|
||||
dim_t tid,
|
||||
thrcomm_t* gl_comm,
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
dim_t n_threads_real = omp_get_num_threads();
|
||||
|
||||
// Check if the number of OpenMP threads created within this parallel
|
||||
// region is different from the number of threads that were requested
|
||||
// of BLIS. This inequality may trigger when, for example, the
|
||||
// following conditions are satisfied:
|
||||
// - an application is executing an OpenMP parallel region in which
|
||||
// BLIS is invoked,
|
||||
// - BLIS is configured for multithreading via OpenMP,
|
||||
// - OMP_NUM_THREADS = t > 1,
|
||||
// - the number of threads requested of BLIS (regardless of method)
|
||||
// is p <= t,
|
||||
// - OpenMP nesting is disabled.
|
||||
// In this situation, the application spawns t threads. Each application
|
||||
// thread calls gemm (for example). Each gemm will attempt to spawn p
|
||||
// threads via OpenMP. However, since nesting is disabled, the OpenMP
|
||||
// implementation finds that t >= p threads are already spawned, and
|
||||
// thus it doesn't spawn *any* additional threads for each gemm.
|
||||
if ( n_threads_real != n_threads )
|
||||
{
|
||||
// If the number of threads active in the current region is not
|
||||
// equal to the number requested of BLIS, we then only continue
|
||||
// if the number of threads in the current region is 1. If, for
|
||||
// example, BLIS requested 4 threads but only got 3, then we
|
||||
// abort().
|
||||
//if ( tid == 0 )
|
||||
//{
|
||||
if ( n_threads_real != 1 )
|
||||
{
|
||||
bli_print_msg( "A different number of threads was "
|
||||
"created than was requested.",
|
||||
__FILE__, __LINE__ );
|
||||
bli_abort();
|
||||
}
|
||||
|
||||
//n_threads = 1; // not needed since it has no effect?
|
||||
bli_thrcomm_init( 1, gl_comm );
|
||||
bli_rntm_set_num_threads_only( 1, rntm );
|
||||
bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm );
|
||||
//}
|
||||
|
||||
// Synchronize all threads and continue.
|
||||
_Pragma( "omp barrier" )
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
53
frame/thread/bli_l3_decor_openmp.h
Normal file
53
frame/thread/bli_l3_decor_openmp.h
Normal file
@@ -0,0 +1,53 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_L3_DECOR_OPENMP_H
|
||||
#define BLIS_L3_DECOR_OPENMP_H
|
||||
|
||||
// Definitions specific to situations when OpenMP multithreading is enabled.
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
|
||||
void bli_l3_thread_decorator_thread_check
|
||||
(
|
||||
dim_t n_threads,
|
||||
dim_t tid,
|
||||
thrcomm_t* gl_comm,
|
||||
rntm_t* rntm
|
||||
);
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
252
frame/thread/bli_l3_decor_pthreads.c
Normal file
252
frame/thread/bli_l3_decor_pthreads.c
Normal file
@@ -0,0 +1,252 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#ifdef BLIS_ENABLE_PTHREADS
|
||||
|
||||
// A data structure to assist in passing operands to additional threads.
|
||||
typedef struct thread_data
|
||||
{
|
||||
l3int_t func;
|
||||
opid_t family;
|
||||
pack_t schema_a;
|
||||
pack_t schema_b;
|
||||
obj_t* alpha;
|
||||
obj_t* a;
|
||||
obj_t* b;
|
||||
obj_t* beta;
|
||||
obj_t* c;
|
||||
cntx_t* cntx;
|
||||
rntm_t* rntm;
|
||||
cntl_t* cntl;
|
||||
dim_t tid;
|
||||
thrcomm_t* gl_comm;
|
||||
array_t* array;
|
||||
} thread_data_t;
|
||||
|
||||
// Entry point for additional threads
|
||||
void* bli_l3_thread_entry( void* data_void )
|
||||
{
|
||||
thread_data_t* data = data_void;
|
||||
|
||||
l3int_t func = data->func;
|
||||
opid_t family = data->family;
|
||||
pack_t schema_a = data->schema_a;
|
||||
pack_t schema_b = data->schema_b;
|
||||
obj_t* alpha = data->alpha;
|
||||
obj_t* a = data->a;
|
||||
obj_t* b = data->b;
|
||||
obj_t* beta = data->beta;
|
||||
obj_t* c = data->c;
|
||||
cntx_t* cntx = data->cntx;
|
||||
rntm_t* rntm = data->rntm;
|
||||
cntl_t* cntl = data->cntl;
|
||||
dim_t tid = data->tid;
|
||||
array_t* array = data->array;
|
||||
thrcomm_t* gl_comm = data->gl_comm;
|
||||
|
||||
// Create a thread-local copy of the master thread's rntm_t. This is
|
||||
// necessary since we want each thread to be able to track its own
|
||||
// small block pool_t as it executes down the function stack.
|
||||
rntm_t rntm_l = *rntm;
|
||||
rntm_t* restrict rntm_p = &rntm_l;
|
||||
|
||||
// Use the thread id to access the appropriate pool_t* within the
|
||||
// array_t, and use it to set the sba_pool field within the rntm_t.
|
||||
// If the pool_t* element within the array_t is NULL, it will first
|
||||
// be allocated/initialized.
|
||||
bli_sba_rntm_set_pool( tid, array, rntm_p );
|
||||
|
||||
obj_t a_t, b_t, c_t;
|
||||
cntl_t* cntl_use;
|
||||
thrinfo_t* thread;
|
||||
|
||||
// Alias thread-local copies of A, B, and C. These will be the objects
|
||||
// we pass down the algorithmic function stack. Making thread-local
|
||||
// alaises is highly recommended in case a thread needs to change any
|
||||
// of the properties of an object without affecting other threads'
|
||||
// objects.
|
||||
bli_obj_alias_to( a, &a_t );
|
||||
bli_obj_alias_to( b, &b_t );
|
||||
bli_obj_alias_to( c, &c_t );
|
||||
|
||||
// Create a default control tree for the operation, if needed.
|
||||
bli_l3_cntl_create_if( family, schema_a, schema_b,
|
||||
&a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use );
|
||||
|
||||
// Create the root node of the current thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
|
||||
|
||||
func
|
||||
(
|
||||
alpha,
|
||||
&a_t,
|
||||
&b_t,
|
||||
beta,
|
||||
&c_t,
|
||||
cntx,
|
||||
rntm_p,
|
||||
cntl_use,
|
||||
thread
|
||||
);
|
||||
|
||||
// Free the thread's local control tree.
|
||||
bli_l3_cntl_free( rntm_p, cntl_use, thread );
|
||||
|
||||
// Free the current thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_free( rntm_p, thread );
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void bli_l3_thread_decorator
|
||||
(
|
||||
l3int_t func,
|
||||
opid_t family,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl
|
||||
)
|
||||
{
|
||||
// This is part of a hack to support mixed domain in bli_gemm_front().
|
||||
// Sometimes we need to specify a non-standard schema for A and B, and
|
||||
// we decided to transmit them via the schema field in the obj_t's
|
||||
// rather than pass them in as function parameters. Once the values
|
||||
// have been read, we immediately reset them back to their expected
|
||||
// values for unpacked objects.
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
|
||||
bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
|
||||
|
||||
// Query the total number of threads from the context.
|
||||
const dim_t n_threads = bli_rntm_num_threads( rntm );
|
||||
|
||||
// NOTE: The sba was initialized in bli_init().
|
||||
|
||||
// Check out an array_t from the small block allocator. This is done
|
||||
// with an internal lock to ensure only one application thread accesses
|
||||
// the sba at a time. bli_sba_checkout_array() will also automatically
|
||||
// resize the array_t, if necessary.
|
||||
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
||||
|
||||
// Access the pool_t* for thread 0 and embed it into the rntm. We do
|
||||
// this up-front only so that we have the rntm_t.sba_pool field
|
||||
// initialized and ready for the global communicator creation below.
|
||||
bli_sba_rntm_set_pool( 0, array, rntm );
|
||||
|
||||
// Set the packing block allocator field of the rntm. This will be
|
||||
// inherited by all of the child threads when they make local copies of
|
||||
// the rntm below.
|
||||
bli_membrk_rntm_set_membrk( rntm );
|
||||
|
||||
// Allocate a global communicator for the root thrinfo_t structures.
|
||||
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
||||
|
||||
// Allocate an array of pthread objects and auxiliary data structs to pass
|
||||
// to the thread entry functions.
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_l3_thread_decorator().pth: " );
|
||||
#endif
|
||||
bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads );
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_l3_thread_decorator().pth: " );
|
||||
#endif
|
||||
thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads );
|
||||
|
||||
// NOTE: We must iterate backwards so that the chief thread (thread id 0)
|
||||
// can spawn all other threads before proceeding with its own computation.
|
||||
for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- )
|
||||
{
|
||||
// Set up thread data for additional threads (beyond thread 0).
|
||||
datas[tid].func = func;
|
||||
datas[tid].family = family;
|
||||
datas[tid].schema_a = schema_a;
|
||||
datas[tid].schema_b = schema_b;
|
||||
datas[tid].alpha = alpha;
|
||||
datas[tid].a = a;
|
||||
datas[tid].b = b;
|
||||
datas[tid].beta = beta;
|
||||
datas[tid].c = c;
|
||||
datas[tid].cntx = cntx;
|
||||
datas[tid].rntm = rntm;
|
||||
datas[tid].cntl = cntl;
|
||||
datas[tid].tid = tid;
|
||||
datas[tid].gl_comm = gl_comm;
|
||||
datas[tid].array = array;
|
||||
|
||||
// Spawn additional threads for ids greater than 1.
|
||||
if ( tid != 0 )
|
||||
bli_pthread_create( &pthreads[tid], NULL, &bli_l3_thread_entry, &datas[tid] );
|
||||
else
|
||||
bli_l3_thread_entry( ( void* )(&datas[0]) );
|
||||
}
|
||||
|
||||
// We shouldn't free the global communicator since it was already freed
|
||||
// by the global communicator's chief thread in bli_l3_thrinfo_free()
|
||||
// (called from the thread entry function).
|
||||
|
||||
// Thread 0 waits for additional threads to finish.
|
||||
for ( dim_t tid = 1; tid < n_threads; tid++ )
|
||||
{
|
||||
bli_pthread_join( pthreads[tid], NULL );
|
||||
}
|
||||
|
||||
// Check the array_t back into the small block allocator. Similar to the
|
||||
// check-out, this is done using a lock embedded within the sba to ensure
|
||||
// mutual exclusion.
|
||||
bli_sba_checkin_array( array );
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_l3_thread_decorator().pth: " );
|
||||
#endif
|
||||
bli_free_intl( pthreads );
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_l3_thread_decorator().pth: " );
|
||||
#endif
|
||||
bli_free_intl( datas );
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
47
frame/thread/bli_l3_decor_pthreads.h
Normal file
47
frame/thread/bli_l3_decor_pthreads.h
Normal file
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_L3_DECOR_PTHREADS_H
|
||||
#define BLIS_L3_DECOR_PTHREADS_H
|
||||
|
||||
// Definitions specific to situations when POSIX multithreading is enabled.
|
||||
#ifdef BLIS_ENABLE_PTHREADS
|
||||
|
||||
// Thread entry point prototype.
|
||||
// The pthreads implementation of bli_l3_thread_decorator() hands this function
// to bli_pthread_create() for every non-zero thread id and calls it directly
// for thread id 0. In both cases data_void is the address of that thread's
// thread_data_t record.
void* bli_l3_thread_entry( void* data_void );
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
150
frame/thread/bli_l3_decor_single.c
Normal file
150
frame/thread/bli_l3_decor_single.c
Normal file
@@ -0,0 +1,150 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#ifndef BLIS_ENABLE_MULTITHREADING
|
||||
|
||||
void bli_l3_thread_decorator
|
||||
(
|
||||
l3int_t func,
|
||||
opid_t family,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl
|
||||
)
|
||||
{
|
||||
// This is part of a hack to support mixed domain in bli_gemm_front().
|
||||
// Sometimes we need to specify a non-standard schema for A and B, and
|
||||
// we decided to transmit them via the schema field in the obj_t's
|
||||
// rather than pass them in as function parameters. Once the values
|
||||
// have been read, we immediately reset them back to their expected
|
||||
// values for unpacked objects.
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
|
||||
bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
|
||||
|
||||
// For sequential execution, we use only one thread.
|
||||
const dim_t n_threads = 1;
|
||||
|
||||
// NOTE: The sba was initialized in bli_init().
|
||||
|
||||
// Check out an array_t from the small block allocator. This is done
|
||||
// with an internal lock to ensure only one application thread accesses
|
||||
// the sba at a time. bli_sba_checkout_array() will also automatically
|
||||
// resize the array_t, if necessary.
|
||||
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
||||
|
||||
// Access the pool_t* for thread 0 and embed it into the rntm. We do
|
||||
// this up-front only so that we can create the global comm below.
|
||||
bli_sba_rntm_set_pool( 0, array, rntm );
|
||||
|
||||
// Set the packing block allocator field of the rntm.
|
||||
bli_membrk_rntm_set_membrk( rntm );
|
||||
|
||||
// Allcoate a global communicator for the root thrinfo_t structures.
|
||||
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
||||
|
||||
|
||||
{
|
||||
// NOTE: We don't need to create another copy of the rntm_t since
|
||||
// it was already copied in one of the high-level oapi functions.
|
||||
rntm_t* restrict rntm_p = rntm;
|
||||
|
||||
cntl_t* cntl_use;
|
||||
thrinfo_t* thread;
|
||||
|
||||
const dim_t tid = 0;
|
||||
|
||||
// Use the thread id to access the appropriate pool_t* within the
|
||||
// array_t, and use it to set the sba_pool field within the rntm_t.
|
||||
// If the pool_t* element within the array_t is NULL, it will first
|
||||
// be allocated/initialized.
|
||||
// NOTE: This is commented out because, in the single-threaded case,
|
||||
// this is redundant since it's already been done above.
|
||||
//bli_sba_rntm_set_pool( tid, array, rntm_p );
|
||||
|
||||
// NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't
|
||||
// need to alias objects for A, B, and C since they were already aliased
|
||||
// in bli_*_front(). However, we may add aliasing here in the future so
|
||||
// that, with all three (_single.c, _openmp.c, _pthreads.c) implementations
|
||||
// consistently providing local aliases, we can then eliminate aliasing
|
||||
// elsewhere.
|
||||
|
||||
// Create a default control tree for the operation, if needed.
|
||||
bli_l3_cntl_create_if( family, schema_a, schema_b,
|
||||
a, b, c, rntm_p, cntl, &cntl_use );
|
||||
|
||||
// Create the root node of the thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
|
||||
|
||||
func
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm_p,
|
||||
cntl_use,
|
||||
thread
|
||||
);
|
||||
|
||||
// Free the thread's local control tree.
|
||||
bli_l3_cntl_free( rntm_p, cntl_use, thread );
|
||||
|
||||
// Free the current thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_free( rntm_p, thread );
|
||||
}
|
||||
|
||||
// We shouldn't free the global communicator since it was already freed
|
||||
// by the global communicator's chief thread in bli_l3_thrinfo_free()
|
||||
// (called above).
|
||||
|
||||
// Check the array_t back into the small block allocator. Similar to the
|
||||
// check-out, this is done using a lock embedded within the sba to ensure
|
||||
// mutual exclusion.
|
||||
bli_sba_checkin_array( array );
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
44
frame/thread/bli_l3_decor_single.h
Normal file
44
frame/thread/bli_l3_decor_single.h
Normal file
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_L3_DECOR_SINGLE_H
|
||||
#define BLIS_L3_DECOR_SINGLE_H
|
||||
|
||||
// Definitions specific to situations when multithreading is disabled.
|
||||
#ifndef BLIS_ENABLE_MULTITHREADING
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
78
frame/thread/bli_l3_sup_decor.h
Normal file
78
frame/thread/bli_l3_sup_decor.h
Normal file
@@ -0,0 +1,78 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_L3_SUP_DECOR_H
|
||||
#define BLIS_L3_SUP_DECOR_H
|
||||
|
||||
// -- sup definitions ----------------------------------------------------------
|
||||
|
||||
// Level-3 sup internal function type.
|
||||
// Pointer to a level-3 sup internal function. Such a function receives the
// scalar objects (alpha, beta), the operand objects (a, b, c), a context, a
// runtime, a control tree pointer, and a thrinfo_t pointer, and reports its
// outcome as an err_t. bli_l3_sup_thread_decorator() accepts a function of
// this type as its 'func' argument.
typedef err_t (*l3supint_t)
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
// Level-3 sup thread decorator prototype.
|
||||
// Wraps the invocation of a level-3 sup internal function (func) on the given
// scalars, operands, context, and runtime, and returns an err_t to the caller.
// 'family' carries the operation family id.
// NOTE(review): the schema_a/schema_b parameters below are commented out, so
// pack schemas are not part of this interface -- confirm whether they are
// meant to be reinstated.
err_t bli_l3_sup_thread_decorator
|
||||
(
|
||||
l3supint_t func,
|
||||
opid_t family,
|
||||
//pack_t schema_a,
|
||||
//pack_t schema_b,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
);
|
||||
|
||||
// Include definitions specific to the method of multithreading for the
|
||||
// sup code path.
|
||||
#include "bli_l3_sup_decor_single.h"
|
||||
#include "bli_l3_sup_decor_openmp.h"
|
||||
#include "bli_l3_sup_decor_pthreads.h"
|
||||
|
||||
#endif
|
||||
|
||||
190
frame/thread/bli_l3_sup_decor_openmp.c
Normal file
190
frame/thread/bli_l3_sup_decor_openmp.c
Normal file
@@ -0,0 +1,190 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
|
||||
// Define a dummy function bli_l3_sup_thread_entry(), which is needed in the
|
||||
// pthreads version, so that when building Windows DLLs (with OpenMP enabled
|
||||
// or no multithreading) we don't risk having an unresolved symbol.
|
||||
//void* bli_l3_sup_thread_entry( void* data_void ) { return NULL; }
|
||||
|
||||
|
||||
|
||||
err_t bli_l3_sup_thread_decorator
|
||||
(
|
||||
l3supint_t func,
|
||||
opid_t family,
|
||||
//pack_t schema_a,
|
||||
//pack_t schema_b,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
#if 0
|
||||
|
||||
return
|
||||
bli_gemmsup_int
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm,
|
||||
0
|
||||
);
|
||||
|
||||
#else
|
||||
|
||||
// This is part of a hack to support mixed domain in bli_gemm_front().
|
||||
// Sometimes we need to specify a non-standard schema for A and B, and
|
||||
// we decided to transmit them via the schema field in the obj_t's
|
||||
// rather than pass them in as function parameters. Once the values
|
||||
// have been read, we immediately reset them back to their expected
|
||||
// values for unpacked objects.
|
||||
//pack_t schema_a = bli_obj_pack_schema( a );
|
||||
//pack_t schema_b = bli_obj_pack_schema( b );
|
||||
//bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
|
||||
//bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
|
||||
|
||||
// For sequential execution, we use only one thread.
|
||||
const dim_t n_threads = 1;
|
||||
|
||||
// NOTE: The sba was initialized in bli_init().
|
||||
|
||||
// Check out an array_t from the small block allocator. This is done
|
||||
// with an internal lock to ensure only one application thread accesses
|
||||
// the sba at a time. bli_sba_checkout_array() will also automatically
|
||||
// resize the array_t, if necessary.
|
||||
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
||||
|
||||
// Access the pool_t* for thread 0 and embed it into the rntm. We do
|
||||
// this up-front only so that we can create the global comm below.
|
||||
bli_sba_rntm_set_pool( 0, array, rntm );
|
||||
|
||||
// Set the packing block allocator field of the rntm.
|
||||
bli_membrk_rntm_set_membrk( rntm );
|
||||
|
||||
#if 0
|
||||
// Allcoate a global communicator for the root thrinfo_t structures.
|
||||
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
||||
#endif
|
||||
|
||||
|
||||
{
|
||||
// NOTE: We don't need to create another copy of the rntm_t since
|
||||
// it was already copied in one of the high-level oapi functions.
|
||||
rntm_t* restrict rntm_p = rntm;
|
||||
|
||||
cntl_t* cntl_use = NULL;
|
||||
//thrinfo_t* thread = NULL;
|
||||
thrinfo_t* thread = &BLIS_PACKM_SINGLE_THREADED;
|
||||
|
||||
const dim_t tid = 0;
|
||||
|
||||
// Use the thread id to access the appropriate pool_t* within the
|
||||
// array_t, and use it to set the sba_pool field within the rntm_t.
|
||||
// If the pool_t* element within the array_t is NULL, it will first
|
||||
// be allocated/initialized.
|
||||
// NOTE: This is commented out because, in the single-threaded case,
|
||||
// this is redundant since it's already been done above.
|
||||
//bli_sba_rntm_set_pool( tid, array, rntm_p );
|
||||
|
||||
// NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't
|
||||
// need to alias objects for A, B, and C since they were already aliased
|
||||
// in bli_*_front(). However, we may add aliasing here in the future so
|
||||
// that, with all three (_single.c, _openmp.c, _pthreads.c) implementations
|
||||
// consistently providing local aliases, we can then eliminate aliasing
|
||||
// elsewhere.
|
||||
|
||||
// Create a default control tree for the operation, if needed.
|
||||
//bli_l3_cntl_create_if( family, schema_a, schema_b,
|
||||
// a, b, c, rntm_p, cntl, &cntl_use );
|
||||
#if 0
|
||||
cntl_use = bli_gemm_cntl_create( rntm_p, family, schema_a, schema_b );
|
||||
|
||||
// Create the root node of the thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
|
||||
#endif
|
||||
|
||||
( void )tid;
|
||||
|
||||
func
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm_p,
|
||||
cntl_use,
|
||||
thread
|
||||
);
|
||||
|
||||
#if 0
|
||||
// Free the thread's local control tree.
|
||||
//bli_l3_cntl_free( rntm_p, cntl_use, thread );
|
||||
bli_gemm_cntl_free( rntm_p, cntl_use, thread );
|
||||
|
||||
// Free the current thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_free( rntm_p, thread );
|
||||
#endif
|
||||
}
|
||||
|
||||
// We shouldn't free the global communicator since it was already freed
|
||||
// by the global communicator's chief thread in bli_l3_thrinfo_free()
|
||||
// (called above).
|
||||
|
||||
// Check the array_t back into the small block allocator. Similar to the
|
||||
// check-out, this is done using a lock embedded within the sba to ensure
|
||||
// mutual exclusion.
|
||||
bli_sba_checkin_array( array );
|
||||
|
||||
return BLIS_SUCCESS;
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
44
frame/thread/bli_l3_sup_decor_openmp.h
Normal file
44
frame/thread/bli_l3_sup_decor_openmp.h
Normal file
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_L3_SUP_DECOR_OPENMP_H
|
||||
#define BLIS_L3_SUP_DECOR_OPENMP_H
|
||||
|
||||
// Definitions specific to situations when OpenMP multithreading is enabled.
|
||||
#ifdef BLIS_ENABLE_OPENMP
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
183
frame/thread/bli_l3_sup_decor_pthreads.c
Normal file
183
frame/thread/bli_l3_sup_decor_pthreads.c
Normal file
@@ -0,0 +1,183 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#ifdef BLIS_ENABLE_PTHREADS
|
||||
|
||||
err_t bli_l3_sup_thread_decorator
|
||||
(
|
||||
l3supint_t func,
|
||||
opid_t family,
|
||||
//pack_t schema_a,
|
||||
//pack_t schema_b,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
#if 0
|
||||
|
||||
return
|
||||
bli_gemmsup_int
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm,
|
||||
0
|
||||
);
|
||||
|
||||
#else
|
||||
|
||||
// This is part of a hack to support mixed domain in bli_gemm_front().
|
||||
// Sometimes we need to specify a non-standard schema for A and B, and
|
||||
// we decided to transmit them via the schema field in the obj_t's
|
||||
// rather than pass them in as function parameters. Once the values
|
||||
// have been read, we immediately reset them back to their expected
|
||||
// values for unpacked objects.
|
||||
//pack_t schema_a = bli_obj_pack_schema( a );
|
||||
//pack_t schema_b = bli_obj_pack_schema( b );
|
||||
//bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
|
||||
//bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
|
||||
|
||||
// For sequential execution, we use only one thread.
|
||||
const dim_t n_threads = 1;
|
||||
|
||||
// NOTE: The sba was initialized in bli_init().
|
||||
|
||||
// Check out an array_t from the small block allocator. This is done
|
||||
// with an internal lock to ensure only one application thread accesses
|
||||
// the sba at a time. bli_sba_checkout_array() will also automatically
|
||||
// resize the array_t, if necessary.
|
||||
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
||||
|
||||
// Access the pool_t* for thread 0 and embed it into the rntm. We do
|
||||
// this up-front only so that we can create the global comm below.
|
||||
bli_sba_rntm_set_pool( 0, array, rntm );
|
||||
|
||||
// Set the packing block allocator field of the rntm.
|
||||
bli_membrk_rntm_set_membrk( rntm );
|
||||
|
||||
#if 0
|
||||
// Allcoate a global communicator for the root thrinfo_t structures.
|
||||
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
||||
#endif
|
||||
|
||||
|
||||
{
|
||||
// NOTE: We don't need to create another copy of the rntm_t since
|
||||
// it was already copied in one of the high-level oapi functions.
|
||||
rntm_t* restrict rntm_p = rntm;
|
||||
|
||||
cntl_t* cntl_use = NULL;
|
||||
//thrinfo_t* thread = NULL;
|
||||
thrinfo_t* thread = &BLIS_PACKM_SINGLE_THREADED;
|
||||
|
||||
const dim_t tid = 0;
|
||||
|
||||
// Use the thread id to access the appropriate pool_t* within the
|
||||
// array_t, and use it to set the sba_pool field within the rntm_t.
|
||||
// If the pool_t* element within the array_t is NULL, it will first
|
||||
// be allocated/initialized.
|
||||
// NOTE: This is commented out because, in the single-threaded case,
|
||||
// this is redundant since it's already been done above.
|
||||
//bli_sba_rntm_set_pool( tid, array, rntm_p );
|
||||
|
||||
// NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't
|
||||
// need to alias objects for A, B, and C since they were already aliased
|
||||
// in bli_*_front(). However, we may add aliasing here in the future so
|
||||
// that, with all three (_single.c, _openmp.c, _pthreads.c) implementations
|
||||
// consistently providing local aliases, we can then eliminate aliasing
|
||||
// elsewhere.
|
||||
|
||||
// Create a default control tree for the operation, if needed.
|
||||
//bli_l3_cntl_create_if( family, schema_a, schema_b,
|
||||
// a, b, c, rntm_p, cntl, &cntl_use );
|
||||
#if 0
|
||||
cntl_use = bli_gemm_cntl_create( rntm_p, family, schema_a, schema_b );
|
||||
|
||||
// Create the root node of the thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
|
||||
#endif
|
||||
|
||||
( void )tid;
|
||||
|
||||
func
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm_p,
|
||||
cntl_use,
|
||||
thread
|
||||
);
|
||||
|
||||
#if 0
|
||||
// Free the thread's local control tree.
|
||||
//bli_l3_cntl_free( rntm_p, cntl_use, thread );
|
||||
bli_gemm_cntl_free( rntm_p, cntl_use, thread );
|
||||
|
||||
// Free the current thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_free( rntm_p, thread );
|
||||
#endif
|
||||
}
|
||||
|
||||
// We shouldn't free the global communicator since it was already freed
|
||||
// by the global communicator's chief thread in bli_l3_thrinfo_free()
|
||||
// (called above).
|
||||
|
||||
// Check the array_t back into the small block allocator. Similar to the
|
||||
// check-out, this is done using a lock embedded within the sba to ensure
|
||||
// mutual exclusion.
|
||||
bli_sba_checkin_array( array );
|
||||
|
||||
return BLIS_SUCCESS;
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
47
frame/thread/bli_l3_sup_decor_pthreads.h
Normal file
47
frame/thread/bli_l3_sup_decor_pthreads.h
Normal file
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_L3_SUP_DECOR_PTHREADS_H
|
||||
#define BLIS_L3_SUP_DECOR_PTHREADS_H
|
||||
|
||||
// Definitions specific to situations when POSIX multithreading is enabled.
|
||||
#ifdef BLIS_ENABLE_PTHREADS
|
||||
|
||||
// Thread entry point prototype.
|
||||
void* bli_l3_sup_thread_entry( void* data_void );
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
183
frame/thread/bli_l3_sup_decor_single.c
Normal file
183
frame/thread/bli_l3_sup_decor_single.c
Normal file
@@ -0,0 +1,183 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#ifndef BLIS_ENABLE_MULTITHREADING
|
||||
|
||||
err_t bli_l3_sup_thread_decorator
|
||||
(
|
||||
l3supint_t func,
|
||||
opid_t family,
|
||||
//pack_t schema_a,
|
||||
//pack_t schema_b,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
#if 0
|
||||
|
||||
return
|
||||
bli_gemmsup_int
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm,
|
||||
0
|
||||
);
|
||||
|
||||
#else
|
||||
|
||||
// This is part of a hack to support mixed domain in bli_gemm_front().
|
||||
// Sometimes we need to specify a non-standard schema for A and B, and
|
||||
// we decided to transmit them via the schema field in the obj_t's
|
||||
// rather than pass them in as function parameters. Once the values
|
||||
// have been read, we immediately reset them back to their expected
|
||||
// values for unpacked objects.
|
||||
//pack_t schema_a = bli_obj_pack_schema( a );
|
||||
//pack_t schema_b = bli_obj_pack_schema( b );
|
||||
//bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
|
||||
//bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
|
||||
|
||||
// For sequential execution, we use only one thread.
|
||||
const dim_t n_threads = 1;
|
||||
|
||||
// NOTE: The sba was initialized in bli_init().
|
||||
|
||||
// Check out an array_t from the small block allocator. This is done
|
||||
// with an internal lock to ensure only one application thread accesses
|
||||
// the sba at a time. bli_sba_checkout_array() will also automatically
|
||||
// resize the array_t, if necessary.
|
||||
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
||||
|
||||
// Access the pool_t* for thread 0 and embed it into the rntm. We do
|
||||
// this up-front only so that we can create the global comm below.
|
||||
bli_sba_rntm_set_pool( 0, array, rntm );
|
||||
|
||||
// Set the packing block allocator field of the rntm.
|
||||
bli_membrk_rntm_set_membrk( rntm );
|
||||
|
||||
#if 0
|
||||
// Allcoate a global communicator for the root thrinfo_t structures.
|
||||
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
||||
#endif
|
||||
|
||||
|
||||
{
|
||||
// NOTE: We don't need to create another copy of the rntm_t since
|
||||
// it was already copied in one of the high-level oapi functions.
|
||||
rntm_t* restrict rntm_p = rntm;
|
||||
|
||||
cntl_t* cntl_use = NULL;
|
||||
//thrinfo_t* thread = NULL;
|
||||
thrinfo_t* thread = &BLIS_PACKM_SINGLE_THREADED;
|
||||
|
||||
const dim_t tid = 0;
|
||||
|
||||
// Use the thread id to access the appropriate pool_t* within the
|
||||
// array_t, and use it to set the sba_pool field within the rntm_t.
|
||||
// If the pool_t* element within the array_t is NULL, it will first
|
||||
// be allocated/initialized.
|
||||
// NOTE: This is commented out because, in the single-threaded case,
|
||||
// this is redundant since it's already been done above.
|
||||
//bli_sba_rntm_set_pool( tid, array, rntm_p );
|
||||
|
||||
// NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't
|
||||
// need to alias objects for A, B, and C since they were already aliased
|
||||
// in bli_*_front(). However, we may add aliasing here in the future so
|
||||
// that, with all three (_single.c, _openmp.c, _pthreads.c) implementations
|
||||
// consistently providing local aliases, we can then eliminate aliasing
|
||||
// elsewhere.
|
||||
|
||||
// Create a default control tree for the operation, if needed.
|
||||
//bli_l3_cntl_create_if( family, schema_a, schema_b,
|
||||
// a, b, c, rntm_p, cntl, &cntl_use );
|
||||
#if 0
|
||||
cntl_use = bli_gemm_cntl_create( rntm_p, family, schema_a, schema_b );
|
||||
|
||||
// Create the root node of the thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
|
||||
#endif
|
||||
|
||||
( void )tid;
|
||||
|
||||
func
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm_p,
|
||||
cntl_use,
|
||||
thread
|
||||
);
|
||||
|
||||
#if 0
|
||||
// Free the thread's local control tree.
|
||||
//bli_l3_cntl_free( rntm_p, cntl_use, thread );
|
||||
bli_gemm_cntl_free( rntm_p, cntl_use, thread );
|
||||
|
||||
// Free the current thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_free( rntm_p, thread );
|
||||
#endif
|
||||
}
|
||||
|
||||
// We shouldn't free the global communicator since it was already freed
|
||||
// by the global communicator's chief thread in bli_l3_thrinfo_free()
|
||||
// (called above).
|
||||
|
||||
// Check the array_t back into the small block allocator. Similar to the
|
||||
// check-out, this is done using a lock embedded within the sba to ensure
|
||||
// mutual exclusion.
|
||||
bli_sba_checkin_array( array );
|
||||
|
||||
return BLIS_SUCCESS;
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
44
frame/thread/bli_l3_sup_decor_single.h
Normal file
44
frame/thread/bli_l3_sup_decor_single.h
Normal file
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef BLIS_L3_SUP_DECOR_SINGLE_H
|
||||
#define BLIS_L3_SUP_DECOR_SINGLE_H
|
||||
|
||||
// Definitions specific to situations when multithreading is disabled.
|
||||
#ifndef BLIS_ENABLE_MULTITHREADING
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -43,10 +43,6 @@
|
||||
#include "bli_thrcomm_pthreads.h"
|
||||
|
||||
|
||||
// thread entry point prototype.
|
||||
void* bli_l3_thread_entry( void* data_void );
|
||||
|
||||
|
||||
// thrcomm_t query (field only)
|
||||
|
||||
BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm )
|
||||
|
||||
@@ -214,212 +214,5 @@ void bli_thrcomm_tree_barrier( barrier_t* barack )
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
// Define a dummy function bli_l3_thread_entry(), which is needed in the
|
||||
// pthreads version, so that when building Windows DLLs (with OpenMP enabled
|
||||
// or no multithreading) we don't risk having an unresolved symbol.
|
||||
void* bli_l3_thread_entry( void* data_void )
{
	// Placeholder definition: the argument is ignored and no work is done.
	( void )data_void;

	return NULL;
}
|
||||
|
||||
//#define PRINT_THRINFO
|
||||
|
||||
void bli_l3_thread_decorator
|
||||
(
|
||||
l3int_t func,
|
||||
opid_t family,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl
|
||||
)
|
||||
{
|
||||
// This is part of a hack to support mixed domain in bli_gemm_front().
|
||||
// Sometimes we need to specify a non-standard schema for A and B, and
|
||||
// we decided to transmit them via the schema field in the obj_t's
|
||||
// rather than pass them in as function parameters. Once the values
|
||||
// have been read, we immediately reset them back to their expected
|
||||
// values for unpacked objects.
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
|
||||
bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
|
||||
|
||||
// Query the total number of threads from the rntm_t object.
|
||||
const dim_t n_threads = bli_rntm_num_threads( rntm );
|
||||
|
||||
#ifdef PRINT_THRINFO
|
||||
thrinfo_t** threads = bli_malloc_intl( n_threads * sizeof( thrinfo_t* ) );
|
||||
#endif
|
||||
|
||||
// NOTE: The sba was initialized in bli_init().
|
||||
|
||||
// Check out an array_t from the small block allocator. This is done
|
||||
// with an internal lock to ensure only one application thread accesses
|
||||
// the sba at a time. bli_sba_checkout_array() will also automatically
|
||||
// resize the array_t, if necessary.
|
||||
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
||||
|
||||
// Access the pool_t* for thread 0 and embed it into the rntm. We do
|
||||
// this up-front only so that we have the rntm_t.sba_pool field
|
||||
// initialized and ready for the global communicator creation below.
|
||||
bli_sba_rntm_set_pool( 0, array, rntm );
|
||||
|
||||
// Set the packing block allocator field of the rntm. This will be
|
||||
// inherited by all of the child threads when they make local copies of
|
||||
// the rntm below.
|
||||
bli_membrk_rntm_set_membrk( rntm );
|
||||
|
||||
// Allocate a global communicator for the root thrinfo_t structures.
|
||||
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
||||
|
||||
|
||||
_Pragma( "omp parallel num_threads(n_threads)" )
|
||||
{
|
||||
// Create a thread-local copy of the master thread's rntm_t. This is
|
||||
// necessary since we want each thread to be able to track its own
|
||||
// small block pool_t as it executes down the function stack.
|
||||
rntm_t rntm_l = *rntm;
|
||||
rntm_t* restrict rntm_p = &rntm_l;
|
||||
|
||||
// Query the thread's id from OpenMP.
|
||||
const dim_t tid = omp_get_thread_num();
|
||||
|
||||
// Check for a somewhat obscure OpenMP thread-mistmatch issue.
|
||||
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
|
||||
|
||||
// Use the thread id to access the appropriate pool_t* within the
|
||||
// array_t, and use it to set the sba_pool field within the rntm_t.
|
||||
// If the pool_t* element within the array_t is NULL, it will first
|
||||
// be allocated/initialized.
|
||||
bli_sba_rntm_set_pool( tid, array, rntm_p );
|
||||
|
||||
|
||||
obj_t a_t, b_t, c_t;
|
||||
cntl_t* cntl_use;
|
||||
thrinfo_t* thread;
|
||||
|
||||
// Alias thread-local copies of A, B, and C. These will be the objects
|
||||
// we pass down the algorithmic function stack. Making thread-local
|
||||
// alaises is highly recommended in case a thread needs to change any
|
||||
// of the properties of an object without affecting other threads'
|
||||
// objects.
|
||||
bli_obj_alias_to( a, &a_t );
|
||||
bli_obj_alias_to( b, &b_t );
|
||||
bli_obj_alias_to( c, &c_t );
|
||||
|
||||
// Create a default control tree for the operation, if needed.
|
||||
bli_l3_cntl_create_if( family, schema_a, schema_b,
|
||||
&a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use );
|
||||
|
||||
// Create the root node of the current thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
|
||||
|
||||
#if 1
|
||||
func
|
||||
(
|
||||
alpha,
|
||||
&a_t,
|
||||
&b_t,
|
||||
beta,
|
||||
&c_t,
|
||||
cntx,
|
||||
rntm_p,
|
||||
cntl_use,
|
||||
thread
|
||||
);
|
||||
#else
|
||||
bli_thrinfo_grow_tree
|
||||
(
|
||||
rntm_p,
|
||||
cntl_use,
|
||||
thread
|
||||
);
|
||||
#endif
|
||||
|
||||
// Free the thread's local control tree.
|
||||
bli_l3_cntl_free( rntm_p, cntl_use, thread );
|
||||
|
||||
#ifdef PRINT_THRINFO
|
||||
threads[tid] = thread;
|
||||
#else
|
||||
// Free the current thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_free( rntm_p, thread );
|
||||
#endif
|
||||
}
|
||||
|
||||
// We shouldn't free the global communicator since it was already freed
|
||||
// by the global communicator's chief thread in bli_l3_thrinfo_free()
|
||||
// (called above).
|
||||
|
||||
#ifdef PRINT_THRINFO
|
||||
if ( family != BLIS_TRSM ) bli_l3_thrinfo_print_gemm_paths( threads );
|
||||
else bli_l3_thrinfo_print_trsm_paths( threads );
|
||||
exit(1);
|
||||
#endif
|
||||
|
||||
// Check the array_t back into the small block allocator. Similar to the
|
||||
// check-out, this is done using a lock embedded within the sba to ensure
|
||||
// mutual exclusion.
|
||||
bli_sba_checkin_array( array );
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_l3_thread_decorator_thread_check
|
||||
(
|
||||
dim_t n_threads,
|
||||
dim_t tid,
|
||||
thrcomm_t* gl_comm,
|
||||
rntm_t* rntm
|
||||
)
|
||||
{
|
||||
dim_t n_threads_real = omp_get_num_threads();
|
||||
|
||||
// Check if the number of OpenMP threads created within this parallel
|
||||
// region is different from the number of threads that were requested
|
||||
// of BLIS. This inequality may trigger when, for example, the
|
||||
// following conditions are satisfied:
|
||||
// - an application is executing an OpenMP parallel region in which
|
||||
// BLIS is invoked,
|
||||
// - BLIS is configured for multithreading via OpenMP,
|
||||
// - OMP_NUM_THREADS = t > 1,
|
||||
// - the number of threads requested of BLIS (regardless of method)
|
||||
// is p <= t,
|
||||
// - OpenMP nesting is disabled.
|
||||
// In this situation, the application spawns t threads. Each application
|
||||
// thread calls gemm (for example). Each gemm will attempt to spawn p
|
||||
// threads via OpenMP. However, since nesting is disabled, the OpenMP
|
||||
// implementation finds that t >= p threads are already spawned, and
|
||||
// thus it doesn't spawn *any* additional threads for each gemm.
|
||||
if ( n_threads_real != n_threads )
|
||||
{
|
||||
// If the number of threads active in the current region is not
|
||||
// equal to the number requested of BLIS, we then only continue
|
||||
// if the number of threads in the current region is 1. If, for
|
||||
// example, BLIS requested 4 threads but only got 3, then we
|
||||
// abort().
|
||||
//if ( tid == 0 )
|
||||
//{
|
||||
if ( n_threads_real != 1 )
|
||||
{
|
||||
bli_print_msg( "A different number of threads was "
|
||||
"created than was requested.",
|
||||
__FILE__, __LINE__ );
|
||||
bli_abort();
|
||||
}
|
||||
|
||||
//n_threads = 1; // not needed since it has no effect?
|
||||
bli_thrcomm_init( 1, gl_comm );
|
||||
bli_rntm_set_num_threads_only( 1, rntm );
|
||||
bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm );
|
||||
//}
|
||||
|
||||
// Synchronize all threads and continue.
|
||||
_Pragma( "omp barrier" )
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -85,14 +85,6 @@ void bli_thrcomm_tree_barrier_free( barrier_t* barrier );
|
||||
void bli_thrcomm_tree_barrier( barrier_t* barack );
|
||||
#endif
|
||||
|
||||
void bli_l3_thread_decorator_thread_check
|
||||
(
|
||||
dim_t n_threads,
|
||||
dim_t tid,
|
||||
thrcomm_t* gl_comm,
|
||||
rntm_t* rntm
|
||||
);
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -138,217 +138,5 @@ void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm )
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
// A data structure to assist in passing operands to additional threads.
|
||||
typedef struct thread_data
|
||||
{
|
||||
l3int_t func;
|
||||
opid_t family;
|
||||
pack_t schema_a;
|
||||
pack_t schema_b;
|
||||
obj_t* alpha;
|
||||
obj_t* a;
|
||||
obj_t* b;
|
||||
obj_t* beta;
|
||||
obj_t* c;
|
||||
cntx_t* cntx;
|
||||
rntm_t* rntm;
|
||||
cntl_t* cntl;
|
||||
dim_t tid;
|
||||
thrcomm_t* gl_comm;
|
||||
array_t* array;
|
||||
} thread_data_t;
|
||||
|
||||
// Entry point for additional threads
|
||||
void* bli_l3_thread_entry( void* data_void )
|
||||
{
|
||||
thread_data_t* data = data_void;
|
||||
|
||||
l3int_t func = data->func;
|
||||
opid_t family = data->family;
|
||||
pack_t schema_a = data->schema_a;
|
||||
pack_t schema_b = data->schema_b;
|
||||
obj_t* alpha = data->alpha;
|
||||
obj_t* a = data->a;
|
||||
obj_t* b = data->b;
|
||||
obj_t* beta = data->beta;
|
||||
obj_t* c = data->c;
|
||||
cntx_t* cntx = data->cntx;
|
||||
rntm_t* rntm = data->rntm;
|
||||
cntl_t* cntl = data->cntl;
|
||||
dim_t tid = data->tid;
|
||||
array_t* array = data->array;
|
||||
thrcomm_t* gl_comm = data->gl_comm;
|
||||
|
||||
// Create a thread-local copy of the master thread's rntm_t. This is
|
||||
// necessary since we want each thread to be able to track its own
|
||||
// small block pool_t as it executes down the function stack.
|
||||
rntm_t rntm_l = *rntm;
|
||||
rntm_t* restrict rntm_p = &rntm_l;
|
||||
|
||||
// Use the thread id to access the appropriate pool_t* within the
|
||||
// array_t, and use it to set the sba_pool field within the rntm_t.
|
||||
// If the pool_t* element within the array_t is NULL, it will first
|
||||
// be allocated/initialized.
|
||||
bli_sba_rntm_set_pool( tid, array, rntm_p );
|
||||
|
||||
obj_t a_t, b_t, c_t;
|
||||
cntl_t* cntl_use;
|
||||
thrinfo_t* thread;
|
||||
|
||||
// Alias thread-local copies of A, B, and C. These will be the objects
|
||||
// we pass down the algorithmic function stack. Making thread-local
|
||||
// alaises is highly recommended in case a thread needs to change any
|
||||
// of the properties of an object without affecting other threads'
|
||||
// objects.
|
||||
bli_obj_alias_to( a, &a_t );
|
||||
bli_obj_alias_to( b, &b_t );
|
||||
bli_obj_alias_to( c, &c_t );
|
||||
|
||||
// Create a default control tree for the operation, if needed.
|
||||
bli_l3_cntl_create_if( family, schema_a, schema_b,
|
||||
&a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use );
|
||||
|
||||
// Create the root node of the current thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
|
||||
|
||||
func
|
||||
(
|
||||
alpha,
|
||||
&a_t,
|
||||
&b_t,
|
||||
beta,
|
||||
&c_t,
|
||||
cntx,
|
||||
rntm_p,
|
||||
cntl_use,
|
||||
thread
|
||||
);
|
||||
|
||||
// Free the thread's local control tree.
|
||||
bli_l3_cntl_free( rntm_p, cntl_use, thread );
|
||||
|
||||
// Free the current thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_free( rntm_p, thread );
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void bli_l3_thread_decorator
|
||||
(
|
||||
l3int_t func,
|
||||
opid_t family,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl
|
||||
)
|
||||
{
|
||||
// This is part of a hack to support mixed domain in bli_gemm_front().
|
||||
// Sometimes we need to specify a non-standard schema for A and B, and
|
||||
// we decided to transmit them via the schema field in the obj_t's
|
||||
// rather than pass them in as function parameters. Once the values
|
||||
// have been read, we immediately reset them back to their expected
|
||||
// values for unpacked objects.
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
|
||||
bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
|
||||
|
||||
// Query the total number of threads from the context.
|
||||
const dim_t n_threads = bli_rntm_num_threads( rntm );
|
||||
|
||||
// NOTE: The sba was initialized in bli_init().
|
||||
|
||||
// Check out an array_t from the small block allocator. This is done
|
||||
// with an internal lock to ensure only one application thread accesses
|
||||
// the sba at a time. bli_sba_checkout_array() will also automatically
|
||||
// resize the array_t, if necessary.
|
||||
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
||||
|
||||
// Access the pool_t* for thread 0 and embed it into the rntm. We do
|
||||
// this up-front only so that we have the rntm_t.sba_pool field
|
||||
// initialized and ready for the global communicator creation below.
|
||||
bli_sba_rntm_set_pool( 0, array, rntm );
|
||||
|
||||
// Set the packing block allocator field of the rntm. This will be
|
||||
// inherited by all of the child threads when they make local copies of
|
||||
// the rntm below.
|
||||
bli_membrk_rntm_set_membrk( rntm );
|
||||
|
||||
// Allocate a global communicator for the root thrinfo_t structures.
|
||||
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
||||
|
||||
// Allocate an array of pthread objects and auxiliary data structs to pass
|
||||
// to the thread entry functions.
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_l3_thread_decorator().pth: " );
|
||||
#endif
|
||||
bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads );
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_l3_thread_decorator().pth: " );
|
||||
#endif
|
||||
thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads );
|
||||
|
||||
// NOTE: We must iterate backwards so that the chief thread (thread id 0)
|
||||
// can spawn all other threads before proceeding with its own computation.
|
||||
for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- )
|
||||
{
|
||||
// Set up thread data for additional threads (beyond thread 0).
|
||||
datas[tid].func = func;
|
||||
datas[tid].family = family;
|
||||
datas[tid].schema_a = schema_a;
|
||||
datas[tid].schema_b = schema_b;
|
||||
datas[tid].alpha = alpha;
|
||||
datas[tid].a = a;
|
||||
datas[tid].b = b;
|
||||
datas[tid].beta = beta;
|
||||
datas[tid].c = c;
|
||||
datas[tid].cntx = cntx;
|
||||
datas[tid].rntm = rntm;
|
||||
datas[tid].cntl = cntl;
|
||||
datas[tid].tid = tid;
|
||||
datas[tid].gl_comm = gl_comm;
|
||||
datas[tid].array = array;
|
||||
|
||||
// Spawn additional threads for ids greater than 1.
|
||||
if ( tid != 0 )
|
||||
bli_pthread_create( &pthreads[tid], NULL, &bli_l3_thread_entry, &datas[tid] );
|
||||
else
|
||||
bli_l3_thread_entry( ( void* )(&datas[0]) );
|
||||
}
|
||||
|
||||
// We shouldn't free the global communicator since it was already freed
|
||||
// by the global communicator's chief thread in bli_l3_thrinfo_free()
|
||||
// (called from the thread entry function).
|
||||
|
||||
// Thread 0 waits for additional threads to finish.
|
||||
for ( dim_t tid = 1; tid < n_threads; tid++ )
|
||||
{
|
||||
bli_pthread_join( pthreads[tid], NULL );
|
||||
}
|
||||
|
||||
// Check the array_t back into the small block allocator. Similar to the
|
||||
// check-out, this is done using a lock embedded within the sba to ensure
|
||||
// mutual exclusion.
|
||||
bli_sba_checkin_array( array );
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_l3_thread_decorator().pth: " );
|
||||
#endif
|
||||
bli_free_intl( pthreads );
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_l3_thread_decorator().pth: " );
|
||||
#endif
|
||||
bli_free_intl( datas );
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -84,119 +84,5 @@ void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm )
|
||||
return;
|
||||
}
|
||||
|
||||
// Define a dummy function bli_l3_thread_entry(), which is needed in the
|
||||
// pthreads version, so that when building Windows DLLs (with OpenMP enabled
|
||||
// or no multithreading) we don't risk having an unresolved symbol.
|
||||
void* bli_l3_thread_entry( void* data_void )
{
	// Placeholder definition: the argument is ignored and no work is done.
	( void )data_void;

	return NULL;
}
|
||||
|
||||
void bli_l3_thread_decorator
|
||||
(
|
||||
l3int_t func,
|
||||
opid_t family,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl
|
||||
)
|
||||
{
|
||||
// This is part of a hack to support mixed domain in bli_gemm_front().
|
||||
// Sometimes we need to specify a non-standard schema for A and B, and
|
||||
// we decided to transmit them via the schema field in the obj_t's
|
||||
// rather than pass them in as function parameters. Once the values
|
||||
// have been read, we immediately reset them back to their expected
|
||||
// values for unpacked objects.
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
|
||||
bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
|
||||
|
||||
// For sequential execution, we use only one thread.
|
||||
const dim_t n_threads = 1;
|
||||
|
||||
// NOTE: The sba was initialized in bli_init().
|
||||
|
||||
// Check out an array_t from the small block allocator. This is done
|
||||
// with an internal lock to ensure only one application thread accesses
|
||||
// the sba at a time. bli_sba_checkout_array() will also automatically
|
||||
// resize the array_t, if necessary.
|
||||
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
||||
|
||||
// Access the pool_t* for thread 0 and embed it into the rntm. We do
|
||||
// this up-front only so that we can create the global comm below.
|
||||
bli_sba_rntm_set_pool( 0, array, rntm );
|
||||
|
||||
// Set the packing block allocator field of the rntm.
|
||||
bli_membrk_rntm_set_membrk( rntm );
|
||||
|
||||
// Allocate a global communicator for the root thrinfo_t structures.
|
||||
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
||||
|
||||
|
||||
{
|
||||
// NOTE: We don't need to create another copy of the rntm_t since
|
||||
// it was already copied in one of the high-level oapi functions.
|
||||
rntm_t* restrict rntm_p = rntm;
|
||||
|
||||
cntl_t* cntl_use;
|
||||
thrinfo_t* thread;
|
||||
|
||||
const dim_t tid = 0;
|
||||
|
||||
// Use the thread id to access the appropriate pool_t* within the
|
||||
// array_t, and use it to set the sba_pool field within the rntm_t.
|
||||
// If the pool_t* element within the array_t is NULL, it will first
|
||||
// be allocated/initialized.
|
||||
// NOTE: This is commented out because, in the single-threaded case,
|
||||
// this is redundant since it's already been done above.
|
||||
//bli_sba_rntm_set_pool( tid, array, rntm_p );
|
||||
|
||||
// NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't
|
||||
// need to alias objects for A, B, and C since they were already aliased
|
||||
// in bli_*_front(). However, we may add aliasing here in the future so
|
||||
// that, with all three (_single.c, _openmp.c, _pthreads.c) implementations
|
||||
// consistently providing local aliases, we can then eliminate aliasing
|
||||
// elsewhere.
|
||||
|
||||
// Create a default control tree for the operation, if needed.
|
||||
bli_l3_cntl_create_if( family, schema_a, schema_b,
|
||||
a, b, c, rntm_p, cntl, &cntl_use );
|
||||
|
||||
// Create the root node of the thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
|
||||
|
||||
func
|
||||
(
|
||||
alpha,
|
||||
a,
|
||||
b,
|
||||
beta,
|
||||
c,
|
||||
cntx,
|
||||
rntm_p,
|
||||
cntl_use,
|
||||
thread
|
||||
);
|
||||
|
||||
// Free the thread's local control tree.
|
||||
bli_l3_cntl_free( rntm_p, cntl_use, thread );
|
||||
|
||||
// Free the current thread's thrinfo_t structure.
|
||||
bli_l3_thrinfo_free( rntm_p, thread );
|
||||
}
|
||||
|
||||
// We shouldn't free the global communicator since it was already freed
|
||||
// by the global communicator's chief thread in bli_l3_thrinfo_free()
|
||||
// (called above).
|
||||
|
||||
// Check the array_t back into the small block allocator. Similar to the
|
||||
// check-out, this is done using a lock embedded within the sba to ensure
|
||||
// mutual exclusion.
|
||||
bli_sba_checkin_array( array );
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -39,8 +39,12 @@ thrinfo_t BLIS_PACKM_SINGLE_THREADED = {};
|
||||
thrinfo_t BLIS_GEMM_SINGLE_THREADED = {};
|
||||
thrcomm_t BLIS_SINGLE_COMM = {};
|
||||
|
||||
// The global rntm_t structure, which holds the global thread settings.
|
||||
static rntm_t global_rntm;
|
||||
// The global rntm_t structure. (The definition resides in bli_rntm.c.)
|
||||
extern rntm_t global_rntm;
|
||||
|
||||
// A mutex to allow synchronous access to global_rntm. (The definition
|
||||
// resides in bli_rntm.c.)
|
||||
extern bli_pthread_mutex_t global_rntm_mutex;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
@@ -1198,63 +1202,6 @@ dim_t bli_ipow( dim_t base, dim_t power )
|
||||
|
||||
return p;
|
||||
}
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
dim_t bli_thread_get_env( const char* env, dim_t fallback )
|
||||
{
|
||||
dim_t r_val;
|
||||
char* str;
|
||||
|
||||
// Query the environment variable and store the result in str.
|
||||
str = getenv( env );
|
||||
|
||||
// Set the return value based on the string obtained from getenv().
|
||||
if ( str != NULL )
|
||||
{
|
||||
// If there was no error, convert the string to an integer and
|
||||
// prepare to return that integer.
|
||||
r_val = strtol( str, NULL, 10 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// If there was an error, use the "fallback" as the return value.
|
||||
r_val = fallback;
|
||||
}
|
||||
|
||||
return r_val;
|
||||
}
|
||||
|
||||
#if 0
|
||||
void bli_thread_set_env( const char* env, dim_t value )
|
||||
{
|
||||
dim_t r_val;
|
||||
char value_str[32];
|
||||
const char* fs_32 = "%u";
|
||||
const char* fs_64 = "%lu";
|
||||
|
||||
// Convert the integer to a string, but vary the format specifier
|
||||
// depending on the integer type size.
|
||||
if ( bli_info_get_int_type_size() == 32 ) sprintf( value_str, fs_32, value );
|
||||
else sprintf( value_str, fs_64, value );
|
||||
|
||||
// Set the environment variable using the string we just wrote to via
|
||||
// sprintf(). (The 'TRUE' argument means we want to overwrite the current
|
||||
// value if the environment variable already exists.)
|
||||
r_val = bli_setenv( env, value_str, TRUE );
|
||||
|
||||
// Check the return value in case something went horribly wrong.
|
||||
if ( r_val == -1 )
|
||||
{
|
||||
char err_str[128];
|
||||
|
||||
// Query the human-readable error string corresponding to errno.
|
||||
strerror_r( errno, err_str, 128 );
|
||||
|
||||
// Print the error message.
|
||||
bli_print_msg( err_str, __FILE__, __LINE__ );
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
@@ -1308,9 +1255,6 @@ dim_t bli_thread_get_num_threads( void )
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
// A mutex to allow synchronous access to global_rntm.
|
||||
static bli_pthread_mutex_t global_rntm_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;
|
||||
|
||||
void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir )
|
||||
{
|
||||
// We must ensure that global_rntm has been initialized.
|
||||
@@ -1341,22 +1285,6 @@ void bli_thread_set_num_threads( dim_t n_threads )
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
void bli_thread_init_rntm( rntm_t* rntm )
|
||||
{
|
||||
// We must ensure that global_rntm has been initialized.
|
||||
bli_init_once();
|
||||
|
||||
// Acquire the mutex protecting global_rntm.
|
||||
bli_pthread_mutex_lock( &global_rntm_mutex );
|
||||
|
||||
*rntm = global_rntm;
|
||||
|
||||
// Release the mutex protecting global_rntm.
|
||||
bli_pthread_mutex_unlock( &global_rntm_mutex );
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
void bli_thread_init_rntm_from_env
|
||||
(
|
||||
rntm_t* rntm
|
||||
@@ -1373,19 +1301,19 @@ void bli_thread_init_rntm_from_env
|
||||
#ifdef BLIS_ENABLE_MULTITHREADING
|
||||
|
||||
// Try to read BLIS_NUM_THREADS first.
|
||||
nt = bli_thread_get_env( "BLIS_NUM_THREADS", -1 );
|
||||
nt = bli_env_get_var( "BLIS_NUM_THREADS", -1 );
|
||||
|
||||
// If BLIS_NUM_THREADS was not set, try to read OMP_NUM_THREADS.
|
||||
if ( nt == -1 )
|
||||
nt = bli_thread_get_env( "OMP_NUM_THREADS", -1 );
|
||||
nt = bli_env_get_var( "OMP_NUM_THREADS", -1 );
|
||||
|
||||
// Read the environment variables for the number of threads (ways
|
||||
// of parallelism) for each individual loop.
|
||||
jc = bli_thread_get_env( "BLIS_JC_NT", -1 );
|
||||
pc = bli_thread_get_env( "BLIS_PC_NT", -1 );
|
||||
ic = bli_thread_get_env( "BLIS_IC_NT", -1 );
|
||||
jr = bli_thread_get_env( "BLIS_JR_NT", -1 );
|
||||
ir = bli_thread_get_env( "BLIS_IR_NT", -1 );
|
||||
jc = bli_env_get_var( "BLIS_JC_NT", -1 );
|
||||
pc = bli_env_get_var( "BLIS_PC_NT", -1 );
|
||||
ic = bli_env_get_var( "BLIS_IC_NT", -1 );
|
||||
jr = bli_env_get_var( "BLIS_JR_NT", -1 );
|
||||
ir = bli_env_get_var( "BLIS_IR_NT", -1 );
|
||||
|
||||
// If any BLIS_*_NT environment variable was set, then we ignore the
|
||||
// value of BLIS_NUM_THREADS or OMP_NUM_THREADS and use the
|
||||
|
||||
@@ -49,6 +49,14 @@
|
||||
#include "bli_packm_thrinfo.h"
|
||||
#include "bli_l3_thrinfo.h"
|
||||
|
||||
// Include the level-3 thread decorator and related definitions and prototypes
|
||||
// for the conventional code path.
|
||||
#include "bli_l3_decor.h"
|
||||
|
||||
// Include the level-3 thread decorator and related definitions and prototypes
|
||||
// for the sup code path.
|
||||
#include "bli_l3_sup_decor.h"
|
||||
|
||||
// Initialization-related prototypes.
|
||||
void bli_thread_init( void );
|
||||
void bli_thread_finalize( void );
|
||||
@@ -143,37 +151,6 @@ siz_t bli_thread_range_weighted_sub
|
||||
dim_t* restrict j_end_thr
|
||||
);
|
||||
|
||||
|
||||
|
||||
// Level-3 internal function type
|
||||
typedef void (*l3int_t)
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
// Level-3 thread decorator prototype
|
||||
void bli_l3_thread_decorator
|
||||
(
|
||||
l3int_t func,
|
||||
opid_t family,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
// Factorization and partitioning prototypes
|
||||
@@ -205,9 +182,6 @@ dim_t bli_ipow( dim_t base, dim_t power );
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
BLIS_EXPORT_BLIS dim_t bli_thread_get_env( const char* env, dim_t fallback );
|
||||
//void bli_thread_set_env( const char* env, dim_t value );
|
||||
|
||||
BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void );
|
||||
BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void );
|
||||
BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void );
|
||||
|
||||
@@ -54,6 +54,12 @@
|
||||
Therefore, this (r)ow-preferential microkernel is well-suited for
|
||||
a dot-product-based accumulation that performs vector loads from
|
||||
both A and B.
|
||||
|
||||
NOTE: These kernels implicitly support column-oriented IO, implemented
|
||||
via a high-level transposition of the entire operation. A and B will
|
||||
effectively remain row- and column-stored, respectively, but C will then
|
||||
effectively appear column-stored. Thus, this kernel may be used for both
|
||||
rrc and crc cases.
|
||||
*/
|
||||
|
||||
// Prototype reference microkernels.
|
||||
|
||||
@@ -54,6 +54,12 @@
|
||||
Therefore, this (r)ow-preferential microkernel is well-suited for
|
||||
a dot-product-based accumulation that performs vector loads from
|
||||
both A and B.
|
||||
|
||||
NOTE: These kernels implicitly support column-oriented IO, implemented
|
||||
via a high-level transposition of the entire operation. A and B will
|
||||
effectively remain row- and column-stored, respectively, but C will then
|
||||
effectively appear column-stored. Thus, this kernel may be used for both
|
||||
rrc and crc cases.
|
||||
*/
|
||||
|
||||
// Prototype reference microkernels.
|
||||
|
||||
@@ -156,12 +156,44 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
|
||||
beta, cij, rs_c0, cs_c0, data, cntx
|
||||
);
|
||||
#else
|
||||
bli_dgemv_ex
|
||||
(
|
||||
BLIS_NO_TRANSPOSE, conjb, m0, k0,
|
||||
alpha, ai, rs_a0, cs_a0, bj, rs_b0,
|
||||
beta, cij, rs_c0, cntx, NULL
|
||||
);
|
||||
dim_t ps_a0 = bli_auxinfo_ps_a( data );
|
||||
|
||||
if ( ps_a0 == 6 * rs_a0 )
|
||||
{
|
||||
// Since A is not packed, we can use one gemv.
|
||||
bli_dgemv_ex
|
||||
(
|
||||
BLIS_NO_TRANSPOSE, conjb, m0, k0,
|
||||
alpha, ai, rs_a0, cs_a0, bj, rs_b0,
|
||||
beta, cij, rs_c0, cntx, NULL
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
const dim_t mr = 6;
|
||||
|
||||
// Since A is packed into row panels, we must use a loop over
|
||||
// gemv.
|
||||
dim_t m_iter = ( m0 + mr - 1 ) / mr;
|
||||
dim_t m_left = m0 % mr;
|
||||
|
||||
double* restrict ai_ii = ai;
|
||||
double* restrict cij_ii = cij;
|
||||
|
||||
for ( dim_t ii = 0; ii < m_iter; ii += 1 )
|
||||
{
|
||||
dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left )
|
||||
? mr : m_left );
|
||||
|
||||
bli_dgemv_ex
|
||||
(
|
||||
BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
|
||||
alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0,
|
||||
beta, cij_ii, rs_c0, cntx, NULL
|
||||
);
|
||||
cij_ii += mr*rs_c0; ai_ii += ps_a0;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
return;
|
||||
@@ -185,6 +217,10 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
// Query the panel stride of A and convert it to units of bytes.
|
||||
uint64_t ps_a = bli_auxinfo_ps_a( data );
|
||||
uint64_t ps_a8 = ps_a * sizeof( double );
|
||||
|
||||
if ( m_iter == 0 ) goto consider_edge_cases;
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
@@ -819,8 +855,10 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
|
||||
lea(mem(r12, rdi, 4), r12) //
|
||||
lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c
|
||||
|
||||
lea(mem(r14, r8, 4), r14) //
|
||||
lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a
|
||||
//lea(mem(r14, r8, 4), r14) //
|
||||
//lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a
|
||||
mov(var(ps_a8), rax) // load ps_a8
|
||||
lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a8
|
||||
|
||||
dec(r11) // ii -= 1;
|
||||
jne(.DLOOP6X8I) // iterate again if ii != 0.
|
||||
@@ -841,6 +879,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
|
||||
[a] "m" (a),
|
||||
[rs_a] "m" (rs_a),
|
||||
[cs_a] "m" (cs_a),
|
||||
[ps_a8] "m" (ps_a8),
|
||||
[b] "m" (b),
|
||||
[rs_b] "m" (rs_b),
|
||||
[cs_b] "m" (cs_b),
|
||||
@@ -870,7 +909,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
|
||||
const dim_t i_edge = m0 - ( dim_t )m_left;
|
||||
|
||||
double* restrict cij = c + i_edge*rs_c;
|
||||
double* restrict ai = a + i_edge*rs_a;
|
||||
//double* restrict ai = a + i_edge*rs_a;
|
||||
//double* restrict ai = a + ( i_edge / 6 ) * ps_a;
|
||||
double* restrict ai = a + m_iter * ps_a;
|
||||
double* restrict bj = b;
|
||||
|
||||
#if 0
|
||||
@@ -979,6 +1020,10 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
// Query the panel stride of A and convert it to units of bytes.
|
||||
uint64_t ps_a = bli_auxinfo_ps_a( data );
|
||||
uint64_t ps_a8 = ps_a * sizeof( double );
|
||||
|
||||
if ( m_iter == 0 ) goto consider_edge_cases;
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
@@ -1591,8 +1636,10 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
|
||||
lea(mem(r12, rdi, 4), r12) //
|
||||
lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c
|
||||
|
||||
lea(mem(r14, r8, 4), r14) //
|
||||
lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a
|
||||
//lea(mem(r14, r8, 4), r14) //
|
||||
//lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a
|
||||
mov(var(ps_a8), rax) // load ps_a8
|
||||
lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a8
|
||||
|
||||
dec(r11) // ii -= 1;
|
||||
jne(.DLOOP6X8I) // iterate again if ii != 0.
|
||||
@@ -1613,6 +1660,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
|
||||
[a] "m" (a),
|
||||
[rs_a] "m" (rs_a),
|
||||
[cs_a] "m" (cs_a),
|
||||
[ps_a8] "m" (ps_a8),
|
||||
[b] "m" (b),
|
||||
[rs_b] "m" (rs_b),
|
||||
[cs_b] "m" (cs_b),
|
||||
@@ -1642,7 +1690,9 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
|
||||
const dim_t i_edge = m0 - ( dim_t )m_left;
|
||||
|
||||
double* restrict cij = c + i_edge*rs_c;
|
||||
double* restrict ai = a + i_edge*rs_a;
|
||||
//double* restrict ai = a + i_edge*rs_a;
|
||||
//double* restrict ai = a + ( i_edge / 6 ) * ps_a;
|
||||
double* restrict ai = a + m_iter * ps_a;
|
||||
double* restrict bj = b;
|
||||
|
||||
#if 0
|
||||
@@ -1751,6 +1801,10 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
// Query the panel stride of A and convert it to units of bytes.
|
||||
uint64_t ps_a = bli_auxinfo_ps_a( data );
|
||||
uint64_t ps_a8 = ps_a * sizeof( double );
|
||||
|
||||
if ( m_iter == 0 ) goto consider_edge_cases;
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
@@ -2241,8 +2295,10 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
|
||||
lea(mem(r12, rdi, 4), r12) //
|
||||
lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c
|
||||
|
||||
lea(mem(r14, r8, 4), r14) //
|
||||
lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a
|
||||
//lea(mem(r14, r8, 4), r14) //
|
||||
//lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a
|
||||
mov(var(ps_a8), rax) // load ps_a8
|
||||
lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a8
|
||||
|
||||
dec(r11) // ii -= 1;
|
||||
jne(.DLOOP6X4I) // iterate again if ii != 0.
|
||||
@@ -2263,6 +2319,7 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
|
||||
[a] "m" (a),
|
||||
[rs_a] "m" (rs_a),
|
||||
[cs_a] "m" (cs_a),
|
||||
[ps_a8] "m" (ps_a8),
|
||||
[b] "m" (b),
|
||||
[rs_b] "m" (rs_b),
|
||||
[cs_b] "m" (cs_b),
|
||||
@@ -2292,7 +2349,9 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
|
||||
const dim_t i_edge = m0 - ( dim_t )m_left;
|
||||
|
||||
double* restrict cij = c + i_edge*rs_c;
|
||||
double* restrict ai = a + i_edge*rs_a;
|
||||
//double* restrict ai = a + i_edge*rs_a;
|
||||
//double* restrict ai = a + ( i_edge / 6 ) * ps_a;
|
||||
double* restrict ai = a + m_iter * ps_a;
|
||||
double* restrict bj = b;
|
||||
|
||||
#if 0
|
||||
@@ -2401,6 +2460,10 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
// Query the panel stride of A and convert it to units of bytes.
|
||||
uint64_t ps_a = bli_auxinfo_ps_a( data );
|
||||
uint64_t ps_a8 = ps_a * sizeof( double );
|
||||
|
||||
if ( m_iter == 0 ) goto consider_edge_cases;
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
@@ -2867,8 +2930,10 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
|
||||
lea(mem(r12, rdi, 4), r12) //
|
||||
lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c
|
||||
|
||||
lea(mem(r14, r8, 4), r14) //
|
||||
lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a
|
||||
//lea(mem(r14, r8, 4), r14) //
|
||||
//lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a
|
||||
mov(var(ps_a8), rax) // load ps_a8
|
||||
lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a8
|
||||
|
||||
dec(r11) // ii -= 1;
|
||||
jne(.DLOOP6X2I) // iterate again if ii != 0.
|
||||
@@ -2889,6 +2954,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
|
||||
[a] "m" (a),
|
||||
[rs_a] "m" (rs_a),
|
||||
[cs_a] "m" (cs_a),
|
||||
[ps_a8] "m" (ps_a8),
|
||||
[b] "m" (b),
|
||||
[rs_b] "m" (rs_b),
|
||||
[cs_b] "m" (cs_b),
|
||||
@@ -2918,7 +2984,9 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
|
||||
const dim_t i_edge = m0 - ( dim_t )m_left;
|
||||
|
||||
double* restrict cij = c + i_edge*rs_c;
|
||||
double* restrict ai = a + i_edge*rs_a;
|
||||
//double* restrict ai = a + i_edge*rs_a;
|
||||
//double* restrict ai = a + ( i_edge / 6 ) * ps_a;
|
||||
double* restrict ai = a + m_iter * ps_a;
|
||||
double* restrict bj = b;
|
||||
|
||||
#if 0
|
||||
|
||||
@@ -195,6 +195,10 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
// Query the panel stride of B and convert it to units of bytes.
|
||||
uint64_t ps_b = bli_auxinfo_ps_b( data );
|
||||
uint64_t ps_b8 = ps_b * sizeof( double );
|
||||
|
||||
if ( n_iter == 0 ) goto consider_edge_cases;
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
@@ -853,6 +857,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
|
||||
[b] "m" (b),
|
||||
[rs_b] "m" (rs_b),
|
||||
[cs_b] "m" (cs_b),
|
||||
[ps_b8] "m" (ps_b8),
|
||||
[alpha] "m" (alpha),
|
||||
[beta] "m" (beta),
|
||||
[c] "m" (c),
|
||||
@@ -880,7 +885,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
|
||||
|
||||
double* restrict cij = c + j_edge*cs_c;
|
||||
double* restrict ai = a;
|
||||
double* restrict bj = b + j_edge*cs_b;
|
||||
//double* restrict bj = b + j_edge*cs_b;
|
||||
//double* restrict bj = b + ( j_edge / 8 ) * ps_b;
|
||||
double* restrict bj = b + n_iter * ps_b;
|
||||
|
||||
if ( 6 <= n_left )
|
||||
{
|
||||
@@ -977,6 +984,10 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
// Query the panel stride of B and convert it to units of bytes.
|
||||
uint64_t ps_b = bli_auxinfo_ps_b( data );
|
||||
uint64_t ps_b8 = ps_b * sizeof( double );
|
||||
|
||||
if ( n_iter == 0 ) goto consider_edge_cases;
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
@@ -1596,6 +1607,7 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
|
||||
[b] "m" (b),
|
||||
[rs_b] "m" (rs_b),
|
||||
[cs_b] "m" (cs_b),
|
||||
[ps_b8] "m" (ps_b8),
|
||||
[alpha] "m" (alpha),
|
||||
[beta] "m" (beta),
|
||||
[c] "m" (c),
|
||||
@@ -1623,7 +1635,9 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
|
||||
|
||||
double* restrict cij = c + j_edge*cs_c;
|
||||
double* restrict ai = a;
|
||||
double* restrict bj = b + j_edge*cs_b;
|
||||
//double* restrict bj = b + j_edge*cs_b;
|
||||
//double* restrict bj = b + ( j_edge / 8 ) * ps_b;
|
||||
double* restrict bj = b + n_iter * ps_b;
|
||||
|
||||
if ( 6 <= n_left )
|
||||
{
|
||||
@@ -1720,6 +1734,10 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
// Query the panel stride of B and convert it to units of bytes.
|
||||
uint64_t ps_b = bli_auxinfo_ps_b( data );
|
||||
uint64_t ps_b8 = ps_b * sizeof( double );
|
||||
|
||||
if ( n_iter == 0 ) goto consider_edge_cases;
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
@@ -2248,6 +2266,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
|
||||
[b] "m" (b),
|
||||
[rs_b] "m" (rs_b),
|
||||
[cs_b] "m" (cs_b),
|
||||
[ps_b8] "m" (ps_b8),
|
||||
[alpha] "m" (alpha),
|
||||
[beta] "m" (beta),
|
||||
[c] "m" (c),
|
||||
@@ -2275,7 +2294,9 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
|
||||
|
||||
double* restrict cij = c + j_edge*cs_c;
|
||||
double* restrict ai = a;
|
||||
double* restrict bj = b + j_edge*cs_b;
|
||||
//double* restrict bj = b + j_edge*cs_b;
|
||||
//double* restrict bj = b + ( j_edge / 8 ) * ps_b;
|
||||
double* restrict bj = b + n_iter * ps_b;
|
||||
|
||||
if ( 6 <= n_left )
|
||||
{
|
||||
@@ -2363,6 +2384,10 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
// Query the panel stride of B and convert it to units of bytes.
|
||||
uint64_t ps_b = bli_auxinfo_ps_b( data );
|
||||
uint64_t ps_b8 = ps_b * sizeof( double );
|
||||
|
||||
if ( n_iter == 0 ) goto consider_edge_cases;
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
@@ -2921,6 +2946,7 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
|
||||
[b] "m" (b),
|
||||
[rs_b] "m" (rs_b),
|
||||
[cs_b] "m" (cs_b),
|
||||
[ps_b8] "m" (ps_b8),
|
||||
[alpha] "m" (alpha),
|
||||
[beta] "m" (beta),
|
||||
[c] "m" (c),
|
||||
@@ -2948,7 +2974,9 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
|
||||
|
||||
double* restrict cij = c + j_edge*cs_c;
|
||||
double* restrict ai = a;
|
||||
double* restrict bj = b + j_edge*cs_b;
|
||||
//double* restrict bj = b + j_edge*cs_b;
|
||||
//double* restrict bj = b + ( j_edge / 8 ) * ps_b;
|
||||
double* restrict bj = b + n_iter * ps_b;
|
||||
|
||||
if ( 6 <= n_left )
|
||||
{
|
||||
@@ -3036,6 +3064,10 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
// Query the panel stride of B and convert it to units of bytes.
|
||||
uint64_t ps_b = bli_auxinfo_ps_b( data );
|
||||
uint64_t ps_b8 = ps_b * sizeof( double );
|
||||
|
||||
if ( n_iter == 0 ) goto consider_edge_cases;
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
@@ -3475,6 +3507,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
|
||||
[b] "m" (b),
|
||||
[rs_b] "m" (rs_b),
|
||||
[cs_b] "m" (cs_b),
|
||||
[ps_b8] "m" (ps_b8),
|
||||
[alpha] "m" (alpha),
|
||||
[beta] "m" (beta),
|
||||
[c] "m" (c),
|
||||
@@ -3502,7 +3535,9 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
|
||||
|
||||
double* restrict cij = c + j_edge*cs_c;
|
||||
double* restrict ai = a;
|
||||
double* restrict bj = b + j_edge*cs_b;
|
||||
//double* restrict bj = b + j_edge*cs_b;
|
||||
//double* restrict bj = b + ( j_edge / 8 ) * ps_b;
|
||||
double* restrict bj = b + n_iter * ps_b;
|
||||
|
||||
if ( 6 <= n_left )
|
||||
{
|
||||
@@ -3590,6 +3625,10 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
|
||||
uint64_t rs_c = rs_c0;
|
||||
uint64_t cs_c = cs_c0;
|
||||
|
||||
// Query the panel stride of B and convert it to units of bytes.
|
||||
uint64_t ps_b = bli_auxinfo_ps_b( data );
|
||||
uint64_t ps_b8 = ps_b * sizeof( double );
|
||||
|
||||
if ( n_iter == 0 ) goto consider_edge_cases;
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
@@ -3993,6 +4032,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
|
||||
[b] "m" (b),
|
||||
[rs_b] "m" (rs_b),
|
||||
[cs_b] "m" (cs_b),
|
||||
[ps_b8] "m" (ps_b8),
|
||||
[alpha] "m" (alpha),
|
||||
[beta] "m" (beta),
|
||||
[c] "m" (c),
|
||||
@@ -4020,7 +4060,9 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
|
||||
|
||||
double* restrict cij = c + j_edge*cs_c;
|
||||
double* restrict ai = a;
|
||||
double* restrict bj = b + j_edge*cs_b;
|
||||
//double* restrict bj = b + j_edge*cs_b;
|
||||
//double* restrict bj = b + ( j_edge / 8 ) * ps_b;
|
||||
double* restrict bj = b + n_iter * ps_b;
|
||||
|
||||
if ( 6 <= n_left )
|
||||
{
|
||||
|
||||
@@ -829,12 +829,12 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
|
||||
rntm_t gemm, herk, trmm_l, trmm_r, trsm_l, trsm_r;
|
||||
dim_t m = 1000, n = 1000, k = 1000;
|
||||
|
||||
bli_thread_init_rntm( &gemm );
|
||||
bli_thread_init_rntm( &herk );
|
||||
bli_thread_init_rntm( &trmm_l );
|
||||
bli_thread_init_rntm( &trmm_r );
|
||||
bli_thread_init_rntm( &trsm_l );
|
||||
bli_thread_init_rntm( &trsm_r );
|
||||
bli_rntm_init_from_global( &gemm );
|
||||
bli_rntm_init_from_global( &herk );
|
||||
bli_rntm_init_from_global( &trmm_l );
|
||||
bli_rntm_init_from_global( &trmm_r );
|
||||
bli_rntm_init_from_global( &trsm_l );
|
||||
bli_rntm_init_from_global( &trsm_r );
|
||||
|
||||
bli_rntm_set_ways_for_op( BLIS_GEMM, BLIS_LEFT, m, n, k, &gemm );
|
||||
bli_rntm_set_ways_for_op( BLIS_HERK, BLIS_LEFT, m, n, k, &herk );
|
||||
|
||||
Reference in New Issue
Block a user