mirror of
https://github.com/amd/blis.git
synced 2026-05-12 18:15:37 +00:00
Added support for selective packing to gemmsup.
Details: - Implemented optional packing for A or B (or both) within the sup framework (which currently only supports gemm). The request for packing either matrix A or matrix B can be made via setting environment variables BLIS_PACK_A or BLIS_PACK_B (to any non-zero value; if set, zero means "disable packing"). It can also be made globally at runtime via bli_pack_set_pack_a() and bli_pack_set_pack_b() or with individual rntm_t objects via bli_rntm_set_pack_a() and bli_rntm_set_pack_b() if using the expert interface of either the BLIS typed or object APIs. (If using the BLAS API, environment variables are the only way to communicate the packing request.) - One caveat (for now) with the current implementation of selective packing is that any blocksize extension registered in the _cntx_init function (such as is currently used by haswell and zen subconfigs) will be ignored if the affected matrix is packed. The reason is simply that I didn't get around to implementing the necessary logic to pack a larger edge-case micropanel, though this is entirely possible and should be done in the future. - Spun off the variant-choosing portion of bli_gemmsup_ref() into bli_gemmsup_int(), in bli_l3_sup_int.c. - Added new files, bli_l3_sup_packm_a.c, bli_l3_sup_packm_b.c, along with corresponding headers, in which higher-level packm-related functions are defined for use within the sup framework. The actual packm variant code resides in bli_l3_sup_packm_var.c. - Pass the following new parameters into var1n and var2m: packa, packb bool_t's, pointer to a rntm_t, pointer to a cntl_t (which is for now always NULL), and pointer to a thrinfo_t* (which for now is the address of the global single-threaded packm thread control node). - Added panel strides ps_a and ps_b to the auxinfo_t structure so that the millikernel can query the panel stride of the packed matrix and step through it accordingly. 
If the matrix isn't packed, the panel stride of interest for the given millikernel will be set to the appropriate value so that the mkernel may step through the unpacked matrix as it normally would. - Modified the rv_6x8m and rv_6x8n millikernels to read the appropriate panel strides (ps_a and ps_b, respectively) instead of computing them on the fly. - Spun off the environment variable getting and setting functions into a new file, bli_env.c (with a corresponding prototype header). These functions are now used by the threading infrastructure (e.g. BLIS_NUM_THREADS, BLIS_JC_NT, etc.) as well as the selective packing infrastructure (e.g. BLIS_PACK_A, BLIS_PACK_B). - Added a static initializer for mem_t objects, BLIS_MEM_INITIALIZER. - Added a static initializer for pblk_t objects, BLIS_PBLK_INITIALIZER, for use within the definition of BLIS_MEM_INITIALIZER. - Moved the global_rntm object to bli_rntm.c and extern it where needed. This means that the function bli_thread_init_rntm() was renamed to bli_rntm_init_from_global() and relocated accordingly. - Added a new file, bli_pack.c, which serves as the home for functions that manage the pack_a and pack_b fields of the global rntm_t, including from environment variables, just as we have functions to manage the threading fields of the global rntm_t in bli_thread.c. - Reorganized naming for files in frame/thread, which mostly involved spinning off the bli_l3_thread_decorator() functions into their own files. This change makes more sense when considering the further addition of bli_l3_sup_thread_decorator() functions (for now limited only to the single-threaded form found in the _single.c file). - Explicitly initialize the reference sup handlers in both bli_cntx_init_haswell.c and bli_cntx_init_zen.c so that it's more obvious how to customize to a different handler, if desired. - Removed various snippets of disabled code. - Various comment updates.
This commit is contained in:
@@ -174,6 +174,16 @@ void bli_cntx_init_haswell( cntx_t* cntx )
|
|||||||
cntx
|
cntx
|
||||||
);
|
);
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
// Initialize the context with the sup handlers.
|
||||||
|
bli_cntx_set_l3_sup_handlers
|
||||||
|
(
|
||||||
|
1,
|
||||||
|
BLIS_GEMM, bli_gemmsup_ref,
|
||||||
|
cntx
|
||||||
|
);
|
||||||
|
#endif
|
||||||
|
|
||||||
// Update the context with optimized small/unpacked gemm kernels.
|
// Update the context with optimized small/unpacked gemm kernels.
|
||||||
bli_cntx_set_l3_sup_kers
|
bli_cntx_set_l3_sup_kers
|
||||||
(
|
(
|
||||||
|
|||||||
@@ -184,6 +184,14 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
|||||||
cntx
|
cntx
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Initialize the context with the sup handlers.
|
||||||
|
bli_cntx_set_l3_sup_handlers
|
||||||
|
(
|
||||||
|
1,
|
||||||
|
BLIS_GEMM, bli_gemmsup_ref,
|
||||||
|
cntx
|
||||||
|
);
|
||||||
|
|
||||||
// Update the context with optimized small/unpacked gemm kernels.
|
// Update the context with optimized small/unpacked gemm kernels.
|
||||||
bli_cntx_set_l3_sup_kers
|
bli_cntx_set_l3_sup_kers
|
||||||
(
|
(
|
||||||
|
|||||||
@@ -73,7 +73,11 @@
|
|||||||
|
|
||||||
// Prototype reference implementation of small/unpacked matrix handler.
|
// Prototype reference implementation of small/unpacked matrix handler.
|
||||||
#include "bli_l3_sup_ref.h"
|
#include "bli_l3_sup_ref.h"
|
||||||
|
#include "bli_l3_sup_int.h"
|
||||||
#include "bli_l3_sup_vars.h"
|
#include "bli_l3_sup_vars.h"
|
||||||
|
#include "bli_l3_sup_packm_a.h"
|
||||||
|
#include "bli_l3_sup_packm_b.h"
|
||||||
|
#include "bli_l3_sup_packm_var.h"
|
||||||
|
|
||||||
// Prototype microkernel wrapper APIs.
|
// Prototype microkernel wrapper APIs.
|
||||||
#include "bli_l3_ukr_oapi.h"
|
#include "bli_l3_ukr_oapi.h"
|
||||||
|
|||||||
@@ -60,14 +60,6 @@ err_t bli_gemmsup
|
|||||||
// that function assumes the context pointer is valid.
|
// that function assumes the context pointer is valid.
|
||||||
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
if ( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||||
|
|
||||||
#if 0
|
|
||||||
// Initialize a local runtime with global settings if necessary. Note
|
|
||||||
// that in the case that a runtime is passed in, we make a local copy.
|
|
||||||
rntm_t rntm_l;
|
|
||||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; }
|
|
||||||
else { rntm_l = *rntm; rntm = &rntm_l; }
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Return early if a microkernel preference-induced transposition would
|
// Return early if a microkernel preference-induced transposition would
|
||||||
// have been performed and shifted the dimensions outside of the space
|
// have been performed and shifted the dimensions outside of the space
|
||||||
// of sup-handled problems.
|
// of sup-handled problems.
|
||||||
@@ -94,6 +86,12 @@ err_t bli_gemmsup
|
|||||||
return BLIS_FAILURE;
|
return BLIS_FAILURE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Initialize a local runtime with global settings if necessary. Note
|
||||||
|
// that in the case that a runtime is passed in, we make a local copy.
|
||||||
|
rntm_t rntm_l;
|
||||||
|
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
|
||||||
|
else { rntm_l = *rntm; rntm = &rntm_l; }
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
const num_t dt = bli_obj_dt( c );
|
const num_t dt = bli_obj_dt( c );
|
||||||
const dim_t m = bli_obj_length( c );
|
const dim_t m = bli_obj_length( c );
|
||||||
|
|||||||
173
frame/3/bli_l3_sup_int.c
Normal file
173
frame/3/bli_l3_sup_int.c
Normal file
@@ -0,0 +1,173 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "blis.h"
|
||||||
|
|
||||||
|
err_t bli_gemmsup_int
|
||||||
|
(
|
||||||
|
obj_t* alpha,
|
||||||
|
obj_t* a,
|
||||||
|
obj_t* b,
|
||||||
|
obj_t* beta,
|
||||||
|
obj_t* c,
|
||||||
|
cntx_t* cntx,
|
||||||
|
rntm_t* rntm,
|
||||||
|
cntl_t* cntl,
|
||||||
|
thrinfo_t* thread
|
||||||
|
)
|
||||||
|
{
|
||||||
|
#if 0
|
||||||
|
//bli_gemmsup_ref_var2
|
||||||
|
//bli_gemmsup_ref_var1
|
||||||
|
#if 0
|
||||||
|
bli_gemmsup_ref_var1n
|
||||||
|
#else
|
||||||
|
#endif
|
||||||
|
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
|
||||||
|
const bool_t is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
|
||||||
|
stor_id == BLIS_RRC ||
|
||||||
|
stor_id == BLIS_RCR ||
|
||||||
|
stor_id == BLIS_CRR );
|
||||||
|
if ( is_rrr_rrc_rcr_crr )
|
||||||
|
{
|
||||||
|
bli_gemmsup_ref_var2m
|
||||||
|
(
|
||||||
|
BLIS_NO_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm
|
||||||
|
);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
bli_gemmsup_ref_var2m
|
||||||
|
(
|
||||||
|
BLIS_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
return BLIS_SUCCESS;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
|
||||||
|
|
||||||
|
// Don't use the small/unpacked implementation if one of the matrices
|
||||||
|
// uses general stride.
|
||||||
|
if ( stor_id == BLIS_XXX ) return BLIS_FAILURE;
|
||||||
|
|
||||||
|
const bool_t is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
|
||||||
|
stor_id == BLIS_RRC ||
|
||||||
|
stor_id == BLIS_RCR ||
|
||||||
|
stor_id == BLIS_CRR );
|
||||||
|
const bool_t is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;
|
||||||
|
|
||||||
|
const num_t dt = bli_obj_dt( c );
|
||||||
|
const bool_t row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
|
||||||
|
|
||||||
|
const bool_t is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
|
||||||
|
: is_rcc_crc_ccr_ccc );
|
||||||
|
|
||||||
|
if ( is_primary )
|
||||||
|
{
|
||||||
|
// This branch handles:
|
||||||
|
// - rrr rrc rcr crr for row-preferential kernels
|
||||||
|
// - rcc crc ccr ccc for column-preferential kernels
|
||||||
|
|
||||||
|
const dim_t m = bli_obj_length( c );
|
||||||
|
const dim_t n = bli_obj_width( c );
|
||||||
|
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||||
|
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||||
|
const dim_t mu = m / MR;
|
||||||
|
const dim_t nu = n / NR;
|
||||||
|
|
||||||
|
if ( mu >= nu )
|
||||||
|
//if ( m % 2 == 1 && n % 2 == 1 )
|
||||||
|
{
|
||||||
|
#ifdef TRACEVAR
|
||||||
|
printf( "bli_l3_sup_int(): var2m primary\n" );
|
||||||
|
#endif
|
||||||
|
// block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2()
|
||||||
|
bli_gemmsup_ref_var2m( BLIS_NO_TRANSPOSE,
|
||||||
|
alpha, a, b, beta, c,
|
||||||
|
stor_id, cntx, rntm, cntl, thread );
|
||||||
|
}
|
||||||
|
else // if ( mu < nu )
|
||||||
|
{
|
||||||
|
#ifdef TRACEVAR
|
||||||
|
printf( "bli_l3_sup_int(): var1n primary\n" );
|
||||||
|
#endif
|
||||||
|
// panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1()
|
||||||
|
bli_gemmsup_ref_var1n( BLIS_NO_TRANSPOSE,
|
||||||
|
alpha, a, b, beta, c,
|
||||||
|
stor_id, cntx, rntm, cntl, thread );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// This branch handles:
|
||||||
|
// - rrr rrc rcr crr for column-preferential kernels
|
||||||
|
// - rcc crc ccr ccc for row-preferential kernels
|
||||||
|
|
||||||
|
const dim_t mt = bli_obj_width( c );
|
||||||
|
const dim_t nt = bli_obj_length( c );
|
||||||
|
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||||
|
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||||
|
const dim_t mu = mt / MR;
|
||||||
|
const dim_t nu = nt / NR;
|
||||||
|
|
||||||
|
if ( mu >= nu )
|
||||||
|
//if ( mt % 2 == 1 && nt % 2 == 1 )
|
||||||
|
{
|
||||||
|
#ifdef TRACEVAR
|
||||||
|
printf( "bli_l3_sup_int(): var2m non-primary\n" );
|
||||||
|
#endif
|
||||||
|
// panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans
|
||||||
|
bli_gemmsup_ref_var2m( BLIS_TRANSPOSE,
|
||||||
|
alpha, a, b, beta, c,
|
||||||
|
stor_id, cntx, rntm, cntl, thread );
|
||||||
|
}
|
||||||
|
else // if ( mu < nu )
|
||||||
|
{
|
||||||
|
#ifdef TRACEVAR
|
||||||
|
printf( "bli_l3_sup_int(): var1n non-primary\n" );
|
||||||
|
#endif
|
||||||
|
// block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans
|
||||||
|
bli_gemmsup_ref_var1n( BLIS_TRANSPOSE,
|
||||||
|
alpha, a, b, beta, c,
|
||||||
|
stor_id, cntx, rntm, cntl, thread );
|
||||||
|
}
|
||||||
|
// *requires nudging of mc,nc up to be a multiple of nr,mr.
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return success so that the caller knows that we computed the solution.
|
||||||
|
return BLIS_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
46
frame/3/bli_l3_sup_int.h
Normal file
46
frame/3/bli_l3_sup_int.h
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Prototype for bli_gemmsup_int(). The function takes the gemm operands
// (alpha, a, b, beta, c) as obj_t pointers, followed by pointers to a
// context, a runtime, a control tree node, and a thread info node, and
// reports its outcome as an err_t.
err_t bli_gemmsup_int
|
||||||
|
(
|
||||||
|
obj_t* alpha,
|
||||||
|
obj_t* a,
|
||||||
|
obj_t* b,
|
||||||
|
obj_t* beta,
|
||||||
|
obj_t* c,
|
||||||
|
cntx_t* cntx,
|
||||||
|
rntm_t* rntm,
|
||||||
|
cntl_t* cntl,
|
||||||
|
thrinfo_t* thread
|
||||||
|
);
|
||||||
384
frame/3/bli_l3_sup_packm_a.c
Normal file
384
frame/3/bli_l3_sup_packm_a.c
Normal file
@@ -0,0 +1,384 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2014, The University of Texas at Austin
|
||||||
|
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "blis.h"
|
||||||
|
|
||||||
|
#undef GENTFUNC
|
||||||
|
#define GENTFUNC( ctype, ch, opname ) \
|
||||||
|
\
|
||||||
|
void PASTEMAC(ch,opname) \
|
||||||
|
( \
|
||||||
|
bool_t will_pack, \
|
||||||
|
packbuf_t pack_buf_type, \
|
||||||
|
stor3_t stor_id, \
|
||||||
|
dim_t m, \
|
||||||
|
dim_t k, \
|
||||||
|
dim_t mr, \
|
||||||
|
cntx_t* restrict cntx, \
|
||||||
|
rntm_t* restrict rntm, \
|
||||||
|
mem_t* restrict mem, \
|
||||||
|
thrinfo_t* restrict thread \
|
||||||
|
) \
|
||||||
|
{ \
|
||||||
|
/* Inspect whether we are going to be packing matrix A. */ \
|
||||||
|
if ( will_pack == FALSE ) \
|
||||||
|
{ \
|
||||||
|
} \
|
||||||
|
else /* if ( will_pack == TRUE ) */ \
|
||||||
|
{ \
|
||||||
|
packbuf_t pack_buf_type_use; \
|
||||||
|
\
|
||||||
|
/* NOTE: This is "rounding up" of the last upanel is actually optional
|
||||||
|
for the rrc/crc cases, but absolutely necessary for the other cases
|
||||||
|
since we NEED that last micropanel to have the same ldim (cs_p) as
|
||||||
|
the other micropanels. Why? So that millikernels can use the same
|
||||||
|
upanel ldim for all iterations of the ir loop. */ \
|
||||||
|
const dim_t m_pack = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \
|
||||||
|
const dim_t k_pack = k; \
|
||||||
|
\
|
||||||
|
/* Determine the dimensions and strides for the packed matrix A. */ \
|
||||||
|
if ( stor_id == BLIS_RRC || \
|
||||||
|
stor_id == BLIS_CRC ) \
|
||||||
|
{ \
|
||||||
|
/* stor3_t id values _RRC and _CRC: pack A to plain row storage,
|
||||||
|
which can use packing buffer type for general usage. */ \
|
||||||
|
pack_buf_type_use = BLIS_BUFFER_FOR_GEN_USE; \
|
||||||
|
} \
|
||||||
|
else \
|
||||||
|
{ \
|
||||||
|
/* All other stor3_t ids: pack A to column-stored row-panels
|
||||||
|
using the packing buffer type as specified by the caller. */ \
|
||||||
|
/*pack_buf_type_use = BLIS_BUFFER_FOR_A_BLOCK;*/ \
|
||||||
|
pack_buf_type_use = pack_buf_type; \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
/* Compute the size of the memory block eneded. */ \
|
||||||
|
siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \
|
||||||
|
\
|
||||||
|
/* Check the mem_t entry provided by the caller. If it is unallocated,
|
||||||
|
then we need to acquire a block from the memory broker. */ \
|
||||||
|
if ( bli_mem_is_unalloc( mem ) ) \
|
||||||
|
{ \
|
||||||
|
bli_membrk_acquire_m \
|
||||||
|
( \
|
||||||
|
rntm, \
|
||||||
|
size_needed, \
|
||||||
|
pack_buf_type_use, \
|
||||||
|
mem \
|
||||||
|
); \
|
||||||
|
} \
|
||||||
|
else \
|
||||||
|
{ \
|
||||||
|
/* NOTE: This shouldn't execute since the sup code path calls this
|
||||||
|
function only once, before *any* loops of the gemm algorithm are
|
||||||
|
encountered. */ \
|
||||||
|
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||||
|
\
|
||||||
|
/* If the mem_t entry provided by the caller does NOT contain a NULL
|
||||||
|
buffer, then a block has already been acquired from the memory
|
||||||
|
broker and cached by the caller. */ \
|
||||||
|
\
|
||||||
|
/* As a sanity check, we should make sure that the mem_t object isn't
|
||||||
|
associated with a block that is too small compared to the size of
|
||||||
|
the packed matrix buffer that is needed, according to the value
|
||||||
|
computed above. */ \
|
||||||
|
siz_t mem_size = bli_mem_size( mem ); \
|
||||||
|
\
|
||||||
|
if ( mem_size < size_needed ) \
|
||||||
|
{ \
|
||||||
|
bli_membrk_release \
|
||||||
|
( \
|
||||||
|
rntm, \
|
||||||
|
mem \
|
||||||
|
); \
|
||||||
|
bli_membrk_acquire_m \
|
||||||
|
( \
|
||||||
|
rntm, \
|
||||||
|
size_needed, \
|
||||||
|
pack_buf_type_use, \
|
||||||
|
mem \
|
||||||
|
); \
|
||||||
|
} \
|
||||||
|
else \
|
||||||
|
{ \
|
||||||
|
/* If the mem_t entry is already allocated and sufficiently large,
|
||||||
|
then we use it as-is. No action is needed. */ \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
INSERT_GENTFUNC_BASIC0( packm_sup_init_mem_a )
|
||||||
|
|
||||||
|
|
||||||
|
#undef GENTFUNC
|
||||||
|
#define GENTFUNC( ctype, ch, opname ) \
|
||||||
|
\
|
||||||
|
void PASTEMAC(ch,opname) \
|
||||||
|
( \
|
||||||
|
bool_t did_pack, \
|
||||||
|
rntm_t* restrict rntm, \
|
||||||
|
mem_t* restrict mem, \
|
||||||
|
thrinfo_t* restrict thread \
|
||||||
|
) \
|
||||||
|
{ \
|
||||||
|
/* Inspect whether we previously packed matrix A. */ \
|
||||||
|
if ( did_pack == FALSE ) \
|
||||||
|
{ \
|
||||||
|
/* If we didn't pack matrix A, there's nothing to be done. */ \
|
||||||
|
} \
|
||||||
|
else /* if ( did_pack == TRUE ) */ \
|
||||||
|
{ \
|
||||||
|
/* Check the mem_t entry provided by the caller. Only proceed if it
|
||||||
|
is allocated, which it should be. */ \
|
||||||
|
if ( bli_mem_is_alloc( mem ) ) \
|
||||||
|
{ \
|
||||||
|
bli_membrk_release \
|
||||||
|
( \
|
||||||
|
rntm, \
|
||||||
|
mem \
|
||||||
|
); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
INSERT_GENTFUNC_BASIC0( packm_sup_finalize_mem_a )
|
||||||
|
|
||||||
|
|
||||||
|
#undef GENTFUNC
|
||||||
|
#define GENTFUNC( ctype, ch, opname ) \
|
||||||
|
\
|
||||||
|
void PASTEMAC(ch,opname) \
|
||||||
|
( \
|
||||||
|
bool_t will_pack, \
|
||||||
|
stor3_t stor_id, \
|
||||||
|
pack_t* restrict schema, \
|
||||||
|
dim_t m, \
|
||||||
|
dim_t k, \
|
||||||
|
dim_t mr, \
|
||||||
|
dim_t* restrict m_max, \
|
||||||
|
dim_t* restrict k_max, \
|
||||||
|
ctype* x, inc_t rs_x, inc_t cs_x, \
|
||||||
|
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||||
|
dim_t* restrict pd_p, inc_t* restrict ps_p, \
|
||||||
|
cntx_t* restrict cntx, \
|
||||||
|
mem_t* restrict mem, \
|
||||||
|
thrinfo_t* restrict thread \
|
||||||
|
) \
|
||||||
|
{ \
|
||||||
|
/* Inspect whether we are going to be packing matrix A. */ \
|
||||||
|
if ( will_pack == FALSE ) \
|
||||||
|
{ \
|
||||||
|
*m_max = m; \
|
||||||
|
*k_max = k; \
|
||||||
|
\
|
||||||
|
/* Set the parameters for use with no packing of A (ie: using the
|
||||||
|
source matrix A directly). */ \
|
||||||
|
{ \
|
||||||
|
/* Use the strides of the source matrix as the final values. */ \
|
||||||
|
*rs_p = rs_x; \
|
||||||
|
*cs_p = cs_x; \
|
||||||
|
\
|
||||||
|
*pd_p = mr; \
|
||||||
|
*ps_p = mr * rs_x; \
|
||||||
|
\
|
||||||
|
/* Set the schema to "not packed" to indicate that packing will be
|
||||||
|
skipped. */ \
|
||||||
|
*schema = BLIS_NOT_PACKED; \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
/* Since we won't be packing, simply update the buffer address provided
|
||||||
|
by the caller to point to source matrix. */ \
|
||||||
|
*p = x; \
|
||||||
|
} \
|
||||||
|
else /* if ( will_pack == TRUE ) */ \
|
||||||
|
{ \
|
||||||
|
/* NOTE: This is "rounding up" of the last upanel is actually optional
|
||||||
|
for the rrc/crc cases, but absolutely necessary for the other cases
|
||||||
|
since we NEED that last micropanel to have the same ldim (cs_p) as
|
||||||
|
the other micropanels. Why? So that millikernels can use the same
|
||||||
|
upanel ldim for all iterations of the ir loop. */ \
|
||||||
|
*m_max = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \
|
||||||
|
*k_max = k; \
|
||||||
|
\
|
||||||
|
/* Determine the dimensions and strides for the packed matrix A. */ \
|
||||||
|
if ( stor_id == BLIS_RRC || \
|
||||||
|
stor_id == BLIS_CRC ) \
|
||||||
|
{ \
|
||||||
|
/* stor3_t id values _RRC and _CRC: pack A to plain row storage. */ \
|
||||||
|
*rs_p = k; \
|
||||||
|
*cs_p = 1; \
|
||||||
|
\
|
||||||
|
*pd_p = mr; \
|
||||||
|
*ps_p = mr * k; \
|
||||||
|
\
|
||||||
|
/* Set the schema to "row packed" to indicate packing to plain
|
||||||
|
row storage. */ \
|
||||||
|
*schema = BLIS_PACKED_ROWS; \
|
||||||
|
} \
|
||||||
|
else \
|
||||||
|
{ \
|
||||||
|
/* All other stor3_t ids: pack A to column-stored row-panels. */ \
|
||||||
|
*rs_p = 1; \
|
||||||
|
*cs_p = mr; \
|
||||||
|
\
|
||||||
|
*pd_p = mr; \
|
||||||
|
*ps_p = mr * k; \
|
||||||
|
\
|
||||||
|
/* Set the schema to "packed row panels" to indicate packing to
|
||||||
|
conventional column-stored row panels. */ \
|
||||||
|
*schema = BLIS_PACKED_ROW_PANELS; \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
/* Set the buffer address provided by the caller to point to the
|
||||||
|
memory associated with the mem_t entry acquired from the memory
|
||||||
|
broker. */ \
|
||||||
|
*p = bli_mem_buffer( mem ); \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
INSERT_GENTFUNC_BASIC0( packm_sup_init_a )
|
||||||
|
|
||||||
|
|
||||||
|
//
|
||||||
|
// Define BLAS-like interfaces to the variant chooser.
|
||||||
|
//
|
||||||
|
|
||||||
|
#undef GENTFUNC
|
||||||
|
#define GENTFUNC( ctype, ch, opname ) \
|
||||||
|
\
|
||||||
|
void PASTEMAC(ch,opname) \
|
||||||
|
( \
|
||||||
|
bool_t will_pack, \
|
||||||
|
stor3_t stor_id, \
|
||||||
|
trans_t transc, \
|
||||||
|
dim_t m, \
|
||||||
|
dim_t k, \
|
||||||
|
dim_t mr, \
|
||||||
|
ctype* restrict kappa, \
|
||||||
|
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
|
||||||
|
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||||
|
inc_t* restrict ps_p, \
|
||||||
|
cntx_t* restrict cntx, \
|
||||||
|
mem_t* restrict mem, \
|
||||||
|
thrinfo_t* restrict thread \
|
||||||
|
) \
|
||||||
|
{ \
|
||||||
|
pack_t schema; \
|
||||||
|
dim_t m_max; \
|
||||||
|
dim_t k_max; \
|
||||||
|
dim_t pd_p; \
|
||||||
|
\
|
||||||
|
/* Determine the packing buffer and related parameters for matrix A. If A
|
||||||
|
will not be packed, then a_use will be set to point to a and the _a_use
|
||||||
|
strides will be set accordingly. */ \
|
||||||
|
PASTEMAC(ch,packm_sup_init_a) \
|
||||||
|
( \
|
||||||
|
will_pack, \
|
||||||
|
stor_id, \
|
||||||
|
&schema, \
|
||||||
|
m, k, mr, \
|
||||||
|
&m_max, &k_max, \
|
||||||
|
a, rs_a, cs_a, \
|
||||||
|
p, rs_p, cs_p, \
|
||||||
|
&pd_p, ps_p, \
|
||||||
|
cntx, \
|
||||||
|
mem, \
|
||||||
|
thread \
|
||||||
|
); \
|
||||||
|
\
|
||||||
|
/* Inspect whether we are going to be packing matrix A. */ \
|
||||||
|
if ( will_pack == FALSE ) \
|
||||||
|
{ \
|
||||||
|
/* If we aren't going to pack matrix A, then there's nothing to do. */ \
|
||||||
|
/*
|
||||||
|
printf( "blis_ packm_sup_a: not packing A.\n" ); \
|
||||||
|
*/ \
|
||||||
|
} \
|
||||||
|
else /* if ( will_pack == TRUE ) */ \
|
||||||
|
{ \
|
||||||
|
if ( schema == BLIS_PACKED_ROWS ) \
|
||||||
|
{ \
|
||||||
|
/* For plain packing by rows, use copym.
|
||||||
|
NOTE: We assume kappa = 1; otherwise, we need scal2m. */ \
|
||||||
|
\
|
||||||
|
/* NOTE: This call to copym must be replaced by a proper packm
|
||||||
|
variant, implemented as a loop over copym, once multithreading
|
||||||
|
support is added. */ \
|
||||||
|
\
|
||||||
|
/*
|
||||||
|
printf( "blis_ packm_sup_a: packing A to rows.\n" ); \
|
||||||
|
*/ \
|
||||||
|
PASTEMAC2(ch,copym,BLIS_TAPI_EX_SUF) \
|
||||||
|
( \
|
||||||
|
0, \
|
||||||
|
BLIS_NONUNIT_DIAG, \
|
||||||
|
BLIS_DENSE, \
|
||||||
|
transc, \
|
||||||
|
m, \
|
||||||
|
k, \
|
||||||
|
a, rs_a, cs_a, \
|
||||||
|
*p, *rs_p, *cs_p, \
|
||||||
|
cntx, \
|
||||||
|
NULL \
|
||||||
|
); \
|
||||||
|
} \
|
||||||
|
else /* if ( schema == BLIS_PACKED_ROW_PANELS ) */ \
|
||||||
|
{ \
|
||||||
|
/*
|
||||||
|
printf( "blis_ packm_sup_a: packing A to row panels.\n" ); \
|
||||||
|
*/ \
|
||||||
|
/* For packing to column-stored row panels, use var1. */ \
|
||||||
|
PASTEMAC(ch,packm_sup_var1) \
|
||||||
|
( \
|
||||||
|
transc, \
|
||||||
|
schema, \
|
||||||
|
m, \
|
||||||
|
k, \
|
||||||
|
m_max, \
|
||||||
|
k_max, \
|
||||||
|
kappa, \
|
||||||
|
a, rs_a, cs_a, \
|
||||||
|
*p, *rs_p, *cs_p, \
|
||||||
|
pd_p, *ps_p, \
|
||||||
|
cntx, \
|
||||||
|
thread \
|
||||||
|
); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
INSERT_GENTFUNC_BASIC0( packm_sup_a )
|
||||||
|
|
||||||
115
frame/3/bli_l3_sup_packm_a.h
Normal file
115
frame/3/bli_l3_sup_packm_a.h
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2014, The University of Texas at Austin
|
||||||
|
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#undef GENTPROT
|
||||||
|
#define GENTPROT( ctype, ch, opname ) \
|
||||||
|
\
|
||||||
|
void PASTEMAC(ch,opname) \
|
||||||
|
( \
|
||||||
|
bool_t will_pack, \
|
||||||
|
packbuf_t pack_buf_type, \
|
||||||
|
stor3_t stor_id, \
|
||||||
|
dim_t m, \
|
||||||
|
dim_t k, \
|
||||||
|
dim_t mr, \
|
||||||
|
cntx_t* restrict cntx, \
|
||||||
|
rntm_t* restrict rntm, \
|
||||||
|
mem_t* restrict mem, \
|
||||||
|
thrinfo_t* restrict thread \
|
||||||
|
); \
|
||||||
|
|
||||||
|
INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a )
|
||||||
|
|
||||||
|
|
||||||
|
#undef GENTPROT
|
||||||
|
#define GENTPROT( ctype, ch, opname ) \
|
||||||
|
\
|
||||||
|
void PASTEMAC(ch,opname) \
|
||||||
|
( \
|
||||||
|
bool_t did_pack, \
|
||||||
|
rntm_t* restrict rntm, \
|
||||||
|
mem_t* restrict mem, \
|
||||||
|
thrinfo_t* restrict thread \
|
||||||
|
); \
|
||||||
|
|
||||||
|
INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a )
|
||||||
|
|
||||||
|
|
||||||
|
#undef GENTPROT
|
||||||
|
#define GENTPROT( ctype, ch, opname ) \
|
||||||
|
\
|
||||||
|
void PASTEMAC(ch,opname) \
|
||||||
|
( \
|
||||||
|
bool_t will_pack, \
|
||||||
|
stor3_t stor_id, \
|
||||||
|
pack_t* restrict schema, \
|
||||||
|
dim_t m, \
|
||||||
|
dim_t k, \
|
||||||
|
dim_t mr, \
|
||||||
|
dim_t* restrict m_max, \
|
||||||
|
dim_t* restrict k_max, \
|
||||||
|
ctype* x, inc_t rs_x, inc_t cs_x, \
|
||||||
|
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||||
|
dim_t* restrict pd_p, inc_t* restrict ps_p, \
|
||||||
|
cntx_t* restrict cntx, \
|
||||||
|
mem_t* restrict mem, \
|
||||||
|
thrinfo_t* restrict thread \
|
||||||
|
); \
|
||||||
|
|
||||||
|
INSERT_GENTPROT_BASIC0( packm_sup_init_a )
|
||||||
|
|
||||||
|
|
||||||
|
#undef GENTPROT
|
||||||
|
#define GENTPROT( ctype, ch, opname ) \
|
||||||
|
\
|
||||||
|
void PASTEMAC(ch,opname) \
|
||||||
|
( \
|
||||||
|
bool_t will_pack, \
|
||||||
|
stor3_t stor_id, \
|
||||||
|
trans_t transc, \
|
||||||
|
dim_t m, \
|
||||||
|
dim_t k, \
|
||||||
|
dim_t mr, \
|
||||||
|
ctype* restrict kappa, \
|
||||||
|
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
|
||||||
|
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||||
|
inc_t* restrict ps_p, \
|
||||||
|
cntx_t* restrict cntx, \
|
||||||
|
mem_t* restrict mem, \
|
||||||
|
thrinfo_t* restrict thread \
|
||||||
|
); \
|
||||||
|
|
||||||
|
INSERT_GENTPROT_BASIC0( packm_sup_a )
|
||||||
|
|
||||||
384
frame/3/bli_l3_sup_packm_b.c
Normal file
384
frame/3/bli_l3_sup_packm_b.c
Normal file
@@ -0,0 +1,384 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2014, The University of Texas at Austin
|
||||||
|
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "blis.h"
|
||||||
|
|
||||||
|
#undef GENTFUNC
|
||||||
|
#define GENTFUNC( ctype, ch, opname ) \
|
||||||
|
\
|
||||||
|
void PASTEMAC(ch,opname) \
|
||||||
|
( \
|
||||||
|
bool_t will_pack, \
|
||||||
|
packbuf_t pack_buf_type, \
|
||||||
|
stor3_t stor_id, \
|
||||||
|
dim_t k, \
|
||||||
|
dim_t n, \
|
||||||
|
dim_t nr, \
|
||||||
|
cntx_t* restrict cntx, \
|
||||||
|
rntm_t* restrict rntm, \
|
||||||
|
mem_t* restrict mem, \
|
||||||
|
thrinfo_t* restrict thread \
|
||||||
|
) \
|
||||||
|
{ \
|
||||||
|
/* Inspect whether we are going to be packing matrix B. */ \
|
||||||
|
if ( will_pack == FALSE ) \
|
||||||
|
{ \
|
||||||
|
} \
|
||||||
|
else /* if ( will_pack == TRUE ) */ \
|
||||||
|
{ \
|
||||||
|
packbuf_t pack_buf_type_use; \
|
||||||
|
\
|
||||||
|
/* NOTE: This is "rounding up" of the last upanel is actually optional
|
||||||
|
for the rrc/crc cases, but absolutely necessary for the other cases
|
||||||
|
since we NEED that last micropanel to have the same ldim (cs_p) as
|
||||||
|
the other micropanels. Why? So that millikernels can use the same
|
||||||
|
upanel ldim for all iterations of the ir loop. */ \
|
||||||
|
const dim_t k_pack = k; \
|
||||||
|
const dim_t n_pack = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \
|
||||||
|
\
|
||||||
|
/* Determine the dimensions and strides for the packed matrix B. */ \
|
||||||
|
if ( stor_id == BLIS_RRC || \
|
||||||
|
stor_id == BLIS_CRC ) \
|
||||||
|
{ \
|
||||||
|
/* stor3_t id values _RRC and _CRC: pack B to plain column storage,
|
||||||
|
which can use packing buffer type for general usage. */ \
|
||||||
|
pack_buf_type_use = BLIS_BUFFER_FOR_GEN_USE; \
|
||||||
|
} \
|
||||||
|
else \
|
||||||
|
{ \
|
||||||
|
/* All other stor3_t ids: pack A to row-stored column-panels
|
||||||
|
using the packing buffer type as specified by the caller. */ \
|
||||||
|
/*pack_buf_type_use = BLIS_BUFFER_FOR_B_PANEL;*/ \
|
||||||
|
pack_buf_type_use = pack_buf_type; \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
/* Compute the size of the memory block eneded. */ \
|
||||||
|
siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \
|
||||||
|
\
|
||||||
|
/* Check the mem_t entry provided by the caller. If it is unallocated,
|
||||||
|
then we need to acquire a block from the memory broker. */ \
|
||||||
|
if ( bli_mem_is_unalloc( mem ) ) \
|
||||||
|
{ \
|
||||||
|
bli_membrk_acquire_m \
|
||||||
|
( \
|
||||||
|
rntm, \
|
||||||
|
size_needed, \
|
||||||
|
pack_buf_type_use, \
|
||||||
|
mem \
|
||||||
|
); \
|
||||||
|
} \
|
||||||
|
else \
|
||||||
|
{ \
|
||||||
|
/* NOTE: This shouldn't execute since the sup code path calls this
|
||||||
|
function only once, before *any* loops of the gemm algorithm are
|
||||||
|
encountered. */ \
|
||||||
|
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||||
|
\
|
||||||
|
/* If the mem_t entry provided by the caller does NOT contain a NULL
|
||||||
|
buffer, then a block has already been acquired from the memory
|
||||||
|
broker and cached by the caller. */ \
|
||||||
|
\
|
||||||
|
/* As a sanity check, we should make sure that the mem_t object isn't
|
||||||
|
associated with a block that is too small compared to the size of
|
||||||
|
the packed matrix buffer that is needed, according to the value
|
||||||
|
computed above. */ \
|
||||||
|
siz_t mem_size = bli_mem_size( mem ); \
|
||||||
|
\
|
||||||
|
if ( mem_size < size_needed ) \
|
||||||
|
{ \
|
||||||
|
bli_membrk_release \
|
||||||
|
( \
|
||||||
|
rntm, \
|
||||||
|
mem \
|
||||||
|
); \
|
||||||
|
bli_membrk_acquire_m \
|
||||||
|
( \
|
||||||
|
rntm, \
|
||||||
|
size_needed, \
|
||||||
|
pack_buf_type_use, \
|
||||||
|
mem \
|
||||||
|
); \
|
||||||
|
} \
|
||||||
|
else \
|
||||||
|
{ \
|
||||||
|
/* If the mem_t entry is already allocated and sufficiently large,
|
||||||
|
then we use it as-is. No action is needed. */ \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
INSERT_GENTFUNC_BASIC0( packm_sup_init_mem_b )
|
||||||
|
|
||||||
|
|
||||||
|
#undef GENTFUNC
|
||||||
|
#define GENTFUNC( ctype, ch, opname ) \
|
||||||
|
\
|
||||||
|
void PASTEMAC(ch,opname) \
|
||||||
|
( \
|
||||||
|
bool_t did_pack, \
|
||||||
|
rntm_t* restrict rntm, \
|
||||||
|
mem_t* restrict mem, \
|
||||||
|
thrinfo_t* restrict thread \
|
||||||
|
) \
|
||||||
|
{ \
|
||||||
|
/* Inspect whether we previously packed matrix A. */ \
|
||||||
|
if ( did_pack == FALSE ) \
|
||||||
|
{ \
|
||||||
|
/* If we didn't pack matrix A, there's nothing to be done. */ \
|
||||||
|
} \
|
||||||
|
else /* if ( did_pack == TRUE ) */ \
|
||||||
|
{ \
|
||||||
|
/* Check the mem_t entry provided by the caller. Only proceed if it
|
||||||
|
is allocated, which it should be. */ \
|
||||||
|
if ( bli_mem_is_alloc( mem ) ) \
|
||||||
|
{ \
|
||||||
|
bli_membrk_release \
|
||||||
|
( \
|
||||||
|
rntm, \
|
||||||
|
mem \
|
||||||
|
); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
INSERT_GENTFUNC_BASIC0( packm_sup_finalize_mem_b )
|
||||||
|
|
||||||
|
|
||||||
|
#undef GENTFUNC
|
||||||
|
#define GENTFUNC( ctype, ch, opname ) \
|
||||||
|
\
|
||||||
|
void PASTEMAC(ch,opname) \
|
||||||
|
( \
|
||||||
|
bool_t will_pack, \
|
||||||
|
stor3_t stor_id, \
|
||||||
|
pack_t* restrict schema, \
|
||||||
|
dim_t k, \
|
||||||
|
dim_t n, \
|
||||||
|
dim_t nr, \
|
||||||
|
dim_t* restrict k_max, \
|
||||||
|
dim_t* restrict n_max, \
|
||||||
|
ctype* x, inc_t rs_x, inc_t cs_x, \
|
||||||
|
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||||
|
dim_t* restrict pd_p, inc_t* restrict ps_p, \
|
||||||
|
cntx_t* restrict cntx, \
|
||||||
|
mem_t* restrict mem, \
|
||||||
|
thrinfo_t* restrict thread \
|
||||||
|
) \
|
||||||
|
{ \
|
||||||
|
/* Inspect whether we are going to be packing matrix B. */ \
|
||||||
|
if ( will_pack == FALSE ) \
|
||||||
|
{ \
|
||||||
|
*k_max = k; \
|
||||||
|
*n_max = n; \
|
||||||
|
\
|
||||||
|
/* Set the parameters for use with no packing of B (ie: using the
|
||||||
|
source matrix B directly). */ \
|
||||||
|
{ \
|
||||||
|
/* Use the strides of the source matrix as the final values. */ \
|
||||||
|
*rs_p = rs_x; \
|
||||||
|
*cs_p = cs_x; \
|
||||||
|
\
|
||||||
|
*pd_p = nr; \
|
||||||
|
*ps_p = nr * cs_x; \
|
||||||
|
\
|
||||||
|
/* Set the schema to "not packed" to indicate that packing will be
|
||||||
|
skipped. */ \
|
||||||
|
*schema = BLIS_NOT_PACKED; \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
/* Since we won't be packing, simply update the buffer address provided
|
||||||
|
by the caller to point to source matrix. */ \
|
||||||
|
*p = x; \
|
||||||
|
} \
|
||||||
|
else /* if ( will_pack == TRUE ) */ \
|
||||||
|
{ \
|
||||||
|
/* NOTE: This is "rounding up" of the last upanel is actually optional
|
||||||
|
for the rrc/crc cases, but absolutely necessary for the other cases
|
||||||
|
since we NEED that last micropanel to have the same ldim (cs_p) as
|
||||||
|
the other micropanels. Why? So that millikernels can use the same
|
||||||
|
upanel ldim for all iterations of the ir loop. */ \
|
||||||
|
*k_max = k; \
|
||||||
|
*n_max = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \
|
||||||
|
\
|
||||||
|
/* Determine the dimensions and strides for the packed matrix B. */ \
|
||||||
|
if ( stor_id == BLIS_RRC || \
|
||||||
|
stor_id == BLIS_CRC ) \
|
||||||
|
{ \
|
||||||
|
/* stor3_t id values _RRC and _CRC: pack B to plain row storage. */ \
|
||||||
|
*rs_p = 1; \
|
||||||
|
*cs_p = k; \
|
||||||
|
\
|
||||||
|
*pd_p = nr; \
|
||||||
|
*ps_p = k * nr; \
|
||||||
|
\
|
||||||
|
/* Set the schema to "column packed" to indicate packing to plain
|
||||||
|
column storage. */ \
|
||||||
|
*schema = BLIS_PACKED_COLUMNS; \
|
||||||
|
} \
|
||||||
|
else \
|
||||||
|
{ \
|
||||||
|
/* All other stor3_t ids: pack A to column-stored row-panels. */ \
|
||||||
|
*rs_p = nr; \
|
||||||
|
*cs_p = 1; \
|
||||||
|
\
|
||||||
|
*pd_p = nr; \
|
||||||
|
*ps_p = k * nr; \
|
||||||
|
\
|
||||||
|
/* Set the schema to "packed row panels" to indicate packing to
|
||||||
|
conventional column-stored row panels. */ \
|
||||||
|
*schema = BLIS_PACKED_COL_PANELS; \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
/* Set the buffer address provided by the caller to point to the
|
||||||
|
memory associated with the mem_t entry acquired from the memory
|
||||||
|
broker. */ \
|
||||||
|
*p = bli_mem_buffer( mem ); \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
INSERT_GENTFUNC_BASIC0( packm_sup_init_b )
|
||||||
|
|
||||||
|
|
||||||
|
//
|
||||||
|
// Define BLAS-like interfaces to the variant chooser.
|
||||||
|
//
|
||||||
|
|
||||||
|
#undef GENTFUNC
|
||||||
|
#define GENTFUNC( ctype, ch, opname ) \
|
||||||
|
\
|
||||||
|
void PASTEMAC(ch,opname) \
|
||||||
|
( \
|
||||||
|
bool_t will_pack, \
|
||||||
|
stor3_t stor_id, \
|
||||||
|
trans_t transc, \
|
||||||
|
dim_t k, \
|
||||||
|
dim_t n, \
|
||||||
|
dim_t nr, \
|
||||||
|
ctype* restrict kappa, \
|
||||||
|
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
|
||||||
|
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||||
|
inc_t* restrict ps_p, \
|
||||||
|
cntx_t* restrict cntx, \
|
||||||
|
mem_t* restrict mem, \
|
||||||
|
thrinfo_t* restrict thread \
|
||||||
|
) \
|
||||||
|
{ \
|
||||||
|
pack_t schema; \
|
||||||
|
dim_t k_max; \
|
||||||
|
dim_t n_max; \
|
||||||
|
dim_t pd_p; \
|
||||||
|
\
|
||||||
|
/* Determine the packing buffer and related parameters for matrix B. If B
|
||||||
|
will not be packed, then b_use will be set to point to b and the _b_use
|
||||||
|
strides will be set accordingly. */ \
|
||||||
|
PASTEMAC(ch,packm_sup_init_b) \
|
||||||
|
( \
|
||||||
|
will_pack, \
|
||||||
|
stor_id, \
|
||||||
|
&schema, \
|
||||||
|
k, n, nr, \
|
||||||
|
&k_max, &n_max, \
|
||||||
|
b, rs_b, cs_b, \
|
||||||
|
p, rs_p, cs_p, \
|
||||||
|
&pd_p, ps_p, \
|
||||||
|
cntx, \
|
||||||
|
mem, \
|
||||||
|
thread \
|
||||||
|
); \
|
||||||
|
\
|
||||||
|
/* Inspect whether we are going to be packing matrix B. */ \
|
||||||
|
if ( will_pack == FALSE ) \
|
||||||
|
{ \
|
||||||
|
/* If we aren't going to pack matrix B, then there's nothing to do. */ \
|
||||||
|
/*
|
||||||
|
printf( "blis_ packm_sup_b: not packing B.\n" ); \
|
||||||
|
*/ \
|
||||||
|
} \
|
||||||
|
else /* if ( will_pack == TRUE ) */ \
|
||||||
|
{ \
|
||||||
|
if ( schema == BLIS_PACKED_COLUMNS ) \
|
||||||
|
{ \
|
||||||
|
/* For plain packing by columns, use copym.
|
||||||
|
NOTE: We assume kappa = 1; otherwise, we need scal2m. */ \
|
||||||
|
\
|
||||||
|
/* NOTE: This call to copym must be replaced by a proper packm
|
||||||
|
variant, implemented as a loop over copym, once multithreading
|
||||||
|
support is added. */ \
|
||||||
|
\
|
||||||
|
/*
|
||||||
|
printf( "blis_ packm_sup_b: packing B to columns.\n" ); \
|
||||||
|
*/ \
|
||||||
|
PASTEMAC2(ch,copym,BLIS_TAPI_EX_SUF) \
|
||||||
|
( \
|
||||||
|
0, \
|
||||||
|
BLIS_NONUNIT_DIAG, \
|
||||||
|
BLIS_DENSE, \
|
||||||
|
transc, \
|
||||||
|
k, \
|
||||||
|
n, \
|
||||||
|
b, rs_b, cs_b, \
|
||||||
|
*p, *rs_p, *cs_p, \
|
||||||
|
cntx, \
|
||||||
|
NULL \
|
||||||
|
); \
|
||||||
|
} \
|
||||||
|
else /* if ( schema == BLIS_PACKED_COL_PANELS ) */ \
|
||||||
|
{ \
|
||||||
|
/*
|
||||||
|
printf( "blis_ packm_sup_b: packing B to col panels.\n" ); \
|
||||||
|
*/ \
|
||||||
|
/* For packing to row-stored column panels, use var1. */ \
|
||||||
|
PASTEMAC(ch,packm_sup_var1) \
|
||||||
|
( \
|
||||||
|
transc, \
|
||||||
|
schema, \
|
||||||
|
k, \
|
||||||
|
n, \
|
||||||
|
k_max, \
|
||||||
|
n_max, \
|
||||||
|
kappa, \
|
||||||
|
b, rs_b, cs_b, \
|
||||||
|
*p, *rs_p, *cs_p, \
|
||||||
|
pd_p, *ps_p, \
|
||||||
|
cntx, \
|
||||||
|
thread \
|
||||||
|
); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
INSERT_GENTFUNC_BASIC0( packm_sup_b )
|
||||||
|
|
||||||
115
frame/3/bli_l3_sup_packm_b.h
Normal file
115
frame/3/bli_l3_sup_packm_b.h
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2014, The University of Texas at Austin
|
||||||
|
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#undef GENTPROT
|
||||||
|
#define GENTPROT( ctype, ch, opname ) \
|
||||||
|
\
|
||||||
|
void PASTEMAC(ch,opname) \
|
||||||
|
( \
|
||||||
|
bool_t will_pack, \
|
||||||
|
packbuf_t pack_buf_type, \
|
||||||
|
stor3_t stor_id, \
|
||||||
|
dim_t k, \
|
||||||
|
dim_t n, \
|
||||||
|
dim_t nr, \
|
||||||
|
cntx_t* restrict cntx, \
|
||||||
|
rntm_t* restrict rntm, \
|
||||||
|
mem_t* restrict mem, \
|
||||||
|
thrinfo_t* restrict thread \
|
||||||
|
); \
|
||||||
|
|
||||||
|
INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b )
|
||||||
|
|
||||||
|
|
||||||
|
#undef GENTPROT
|
||||||
|
#define GENTPROT( ctype, ch, opname ) \
|
||||||
|
\
|
||||||
|
void PASTEMAC(ch,opname) \
|
||||||
|
( \
|
||||||
|
bool_t did_pack, \
|
||||||
|
rntm_t* restrict rntm, \
|
||||||
|
mem_t* restrict mem, \
|
||||||
|
thrinfo_t* restrict thread \
|
||||||
|
); \
|
||||||
|
|
||||||
|
INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b )
|
||||||
|
|
||||||
|
|
||||||
|
#undef GENTPROT
|
||||||
|
#define GENTPROT( ctype, ch, opname ) \
|
||||||
|
\
|
||||||
|
void PASTEMAC(ch,opname) \
|
||||||
|
( \
|
||||||
|
bool_t will_pack, \
|
||||||
|
stor3_t stor_id, \
|
||||||
|
pack_t* restrict schema, \
|
||||||
|
dim_t k, \
|
||||||
|
dim_t n, \
|
||||||
|
dim_t nr, \
|
||||||
|
dim_t* restrict k_max, \
|
||||||
|
dim_t* restrict n_max, \
|
||||||
|
ctype* x, inc_t rs_x, inc_t cs_x, \
|
||||||
|
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||||
|
dim_t* restrict pd_p, inc_t* restrict ps_p, \
|
||||||
|
cntx_t* restrict cntx, \
|
||||||
|
mem_t* restrict mem, \
|
||||||
|
thrinfo_t* restrict thread \
|
||||||
|
); \
|
||||||
|
|
||||||
|
INSERT_GENTPROT_BASIC0( packm_sup_init_b )
|
||||||
|
|
||||||
|
|
||||||
|
#undef GENTPROT
|
||||||
|
#define GENTPROT( ctype, ch, opname ) \
|
||||||
|
\
|
||||||
|
void PASTEMAC(ch,opname) \
|
||||||
|
( \
|
||||||
|
bool_t will_pack, \
|
||||||
|
stor3_t stor_id, \
|
||||||
|
trans_t transc, \
|
||||||
|
dim_t k, \
|
||||||
|
dim_t n, \
|
||||||
|
dim_t nr, \
|
||||||
|
ctype* restrict kappa, \
|
||||||
|
ctype* restrict x, inc_t rs_x, inc_t cs_x, \
|
||||||
|
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
|
||||||
|
inc_t* restrict ps_p, \
|
||||||
|
cntx_t* restrict cntx, \
|
||||||
|
mem_t* restrict mem, \
|
||||||
|
thrinfo_t* restrict thread \
|
||||||
|
); \
|
||||||
|
|
||||||
|
INSERT_GENTPROT_BASIC0( packm_sup_b )
|
||||||
|
|
||||||
329
frame/3/bli_l3_sup_packm_var.c
Normal file
329
frame/3/bli_l3_sup_packm_var.c
Normal file
@@ -0,0 +1,329 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2014, The University of Texas at Austin
|
||||||
|
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "blis.h"
|
||||||
|
|
||||||
|
//
|
||||||
|
// Define BLAS-like interfaces to the variants.
|
||||||
|
//
|
||||||
|
|
||||||
|
#undef GENTFUNCR
|
||||||
|
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
|
||||||
|
\
|
||||||
|
void PASTEMAC(ch,varname) \
|
||||||
|
( \
|
||||||
|
trans_t transc, \
|
||||||
|
pack_t schema, \
|
||||||
|
dim_t m, \
|
||||||
|
dim_t n, \
|
||||||
|
dim_t m_max, \
|
||||||
|
dim_t n_max, \
|
||||||
|
ctype* restrict kappa, \
|
||||||
|
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||||
|
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||||
|
dim_t pd_p, inc_t ps_p, \
|
||||||
|
cntx_t* restrict cntx, \
|
||||||
|
thrinfo_t* restrict thread \
|
||||||
|
) \
|
||||||
|
{ \
|
||||||
|
ctype* restrict kappa_cast = kappa; \
|
||||||
|
ctype* restrict c_cast = c; \
|
||||||
|
ctype* restrict p_cast = p; \
|
||||||
|
\
|
||||||
|
dim_t iter_dim; \
|
||||||
|
dim_t n_iter; \
|
||||||
|
dim_t it, ic; \
|
||||||
|
dim_t ic0; \
|
||||||
|
doff_t ic_inc; \
|
||||||
|
dim_t panel_len_full; \
|
||||||
|
dim_t panel_len_i; \
|
||||||
|
dim_t panel_len_max; \
|
||||||
|
dim_t panel_len_max_i; \
|
||||||
|
dim_t panel_dim_i; \
|
||||||
|
dim_t panel_dim_max; \
|
||||||
|
inc_t vs_c; \
|
||||||
|
inc_t ldc; \
|
||||||
|
inc_t ldp, p_inc; \
|
||||||
|
conj_t conjc; \
|
||||||
|
\
|
||||||
|
\
|
||||||
|
/* Extract the conjugation bit from the transposition argument. */ \
|
||||||
|
conjc = bli_extract_conj( transc ); \
|
||||||
|
\
|
||||||
|
/* If c needs a transposition, induce it so that we can more simply
|
||||||
|
express the remaining parameters and code. */ \
|
||||||
|
if ( bli_does_trans( transc ) ) \
|
||||||
|
{ \
|
||||||
|
bli_swap_incs( &rs_c, &cs_c ); \
|
||||||
|
bli_toggle_trans( &transc ); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
/* Create flags to incidate row or column storage. Note that the
|
||||||
|
schema bit that encodes row or column is describing the form of
|
||||||
|
micro-panel, not the storage in the micro-panel. Hence the
|
||||||
|
mismatch in "row" and "column" semantics. */ \
|
||||||
|
bool_t row_stored = bli_is_col_packed( schema ); \
|
||||||
|
/*bool_t col_stored = bli_is_row_packed( schema );*/ \
|
||||||
|
\
|
||||||
|
/* If the row storage flag indicates row storage, then we are packing
|
||||||
|
to column panels; otherwise, if the strides indicate column storage,
|
||||||
|
we are packing to row panels. */ \
|
||||||
|
if ( row_stored ) \
|
||||||
|
{ \
|
||||||
|
/* Prepare to pack to row-stored column panels. */ \
|
||||||
|
iter_dim = n; \
|
||||||
|
panel_len_full = m; \
|
||||||
|
panel_len_max = m_max; \
|
||||||
|
panel_dim_max = pd_p; \
|
||||||
|
vs_c = cs_c; \
|
||||||
|
ldc = rs_c; \
|
||||||
|
ldp = rs_p; \
|
||||||
|
} \
|
||||||
|
else /* if ( col_stored ) */ \
|
||||||
|
{ \
|
||||||
|
/* Prepare to pack to column-stored row panels. */ \
|
||||||
|
iter_dim = m; \
|
||||||
|
panel_len_full = n; \
|
||||||
|
panel_len_max = n_max; \
|
||||||
|
panel_dim_max = pd_p; \
|
||||||
|
vs_c = rs_c; \
|
||||||
|
ldc = cs_c; \
|
||||||
|
ldp = cs_p; \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
/* Compute the total number of iterations we'll need. */ \
|
||||||
|
n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
|
||||||
|
\
|
||||||
|
/* Set the initial values and increments for indices related to C and P
|
||||||
|
based on whether reverse iteration was requested. */ \
|
||||||
|
{ \
|
||||||
|
ic0 = 0; \
|
||||||
|
ic_inc = panel_dim_max; \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
ctype* restrict p_begin = p_cast; \
|
||||||
|
\
|
||||||
|
/* Query the number of threads and thread ids from the current thread's
|
||||||
|
packm thrinfo_t node. */ \
|
||||||
|
const dim_t nt = bli_thread_n_way( thread ); \
|
||||||
|
const dim_t tid = bli_thread_work_id( thread ); \
|
||||||
|
\
|
||||||
|
/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
|
||||||
|
( void )nt; \
|
||||||
|
( void )tid; \
|
||||||
|
\
|
||||||
|
dim_t it_start, it_end, it_inc; \
|
||||||
|
\
|
||||||
|
/* Determine the thread range and increment using the current thread's
|
||||||
|
packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
|
||||||
|
will depend on whether slab or round-robin partitioning was requested
|
||||||
|
at configure-time. */ \
|
||||||
|
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
|
||||||
|
\
|
||||||
|
/* Iterate over every logical micropanel in the source matrix. */ \
|
||||||
|
for ( ic = ic0, it = 0; it < n_iter; \
|
||||||
|
ic += ic_inc, it += 1 ) \
|
||||||
|
{ \
|
||||||
|
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
|
||||||
|
\
|
||||||
|
ctype* restrict c_begin = c_cast + (ic )*vs_c; \
|
||||||
|
\
|
||||||
|
ctype* restrict c_use = c_begin; \
|
||||||
|
ctype* restrict p_use = p_begin; \
|
||||||
|
\
|
||||||
|
{ \
|
||||||
|
panel_len_i = panel_len_full; \
|
||||||
|
panel_len_max_i = panel_len_max; \
|
||||||
|
\
|
||||||
|
/* The definition of bli_packm_my_iter() will depend on whether slab
|
||||||
|
or round-robin partitioning was requested at configure-time. */ \
|
||||||
|
if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
|
||||||
|
{ \
|
||||||
|
PASTEMAC(ch,packm_cxk) \
|
||||||
|
( \
|
||||||
|
conjc, \
|
||||||
|
schema, \
|
||||||
|
panel_dim_i, \
|
||||||
|
panel_dim_max, \
|
||||||
|
panel_len_i, \
|
||||||
|
panel_len_max_i, \
|
||||||
|
kappa_cast, \
|
||||||
|
c_use, vs_c, ldc, \
|
||||||
|
p_use, ldp, \
|
||||||
|
cntx \
|
||||||
|
); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
/* NOTE: This value is equivalent to ps_p. */ \
|
||||||
|
p_inc = ps_p; \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
p_begin += p_inc; \
|
||||||
|
\
|
||||||
|
/*
|
||||||
|
if ( row_stored ) \
|
||||||
|
PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: b packed", panel_len_max, panel_dim_max, \
|
||||||
|
p_use, rs_p, cs_p, "%5.2f", "" ); \
|
||||||
|
if ( !row_stored ) \
|
||||||
|
PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: a packed", panel_dim_max, panel_len_max, \
|
||||||
|
p_use, rs_p, cs_p, "%5.2f", "" ); \
|
||||||
|
*/ \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
}
|
||||||
|
|
||||||
|
INSERT_GENTFUNCR_BASIC( packm, packm_sup_var1 )
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
if ( row_stored ) \
|
||||||
|
PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
|
||||||
|
c_cast, rs_c, cs_c, "%4.1f", "" ); \
|
||||||
|
if ( col_stored ) \
|
||||||
|
PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
|
||||||
|
c_cast, rs_c, cs_c, "%4.1f", "" ); \
|
||||||
|
*/
|
||||||
|
/*
|
||||||
|
if ( row_stored ) \
|
||||||
|
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b packed", *m_panel_max, *n_panel_max, \
|
||||||
|
p_use, rs_p, cs_p, "%5.2f", "" ); \
|
||||||
|
else \
|
||||||
|
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a packed", *m_panel_max, *n_panel_max, \
|
||||||
|
p_use, rs_p, cs_p, "%5.2f", "" ); \
|
||||||
|
*/ \
|
||||||
|
\
|
||||||
|
/*
|
||||||
|
if ( col_stored ) { \
|
||||||
|
if ( bli_thread_work_id( thread ) == 0 ) \
|
||||||
|
{ \
|
||||||
|
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||||
|
fflush( stdout ); \
|
||||||
|
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
|
||||||
|
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
|
||||||
|
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
|
||||||
|
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||||
|
fflush( stdout ); \
|
||||||
|
} \
|
||||||
|
bli_thread_obarrier( thread ); \
|
||||||
|
if ( bli_thread_work_id( thread ) == 1 ) \
|
||||||
|
{ \
|
||||||
|
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||||
|
fflush( stdout ); \
|
||||||
|
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
|
||||||
|
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
|
||||||
|
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
|
||||||
|
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||||
|
fflush( stdout ); \
|
||||||
|
} \
|
||||||
|
bli_thread_obarrier( thread ); \
|
||||||
|
} \
|
||||||
|
else { \
|
||||||
|
if ( bli_thread_work_id( thread ) == 0 ) \
|
||||||
|
{ \
|
||||||
|
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||||
|
fflush( stdout ); \
|
||||||
|
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
|
||||||
|
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
|
||||||
|
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
|
||||||
|
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||||
|
fflush( stdout ); \
|
||||||
|
} \
|
||||||
|
bli_thread_obarrier( thread ); \
|
||||||
|
if ( bli_thread_work_id( thread ) == 1 ) \
|
||||||
|
{ \
|
||||||
|
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
|
||||||
|
fflush( stdout ); \
|
||||||
|
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
|
||||||
|
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
|
||||||
|
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
|
||||||
|
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||||
|
fflush( stdout ); \
|
||||||
|
} \
|
||||||
|
bli_thread_obarrier( thread ); \
|
||||||
|
} \
|
||||||
|
*/
|
||||||
|
/*
|
||||||
|
if ( bli_is_4mi_packed( schema ) ) { \
|
||||||
|
printf( "packm_var2: is_p_use = %lu\n", is_p_use ); \
|
||||||
|
if ( col_stored ) { \
|
||||||
|
if ( 0 ) \
|
||||||
|
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_use, *n_panel_use, \
|
||||||
|
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||||
|
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
|
||||||
|
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||||
|
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
|
||||||
|
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||||
|
} \
|
||||||
|
if ( row_stored ) { \
|
||||||
|
if ( 0 ) \
|
||||||
|
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_use, *n_panel_use, \
|
||||||
|
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||||
|
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
|
||||||
|
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||||
|
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
|
||||||
|
( ctype_r* )p_use + is_p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
*/
|
||||||
|
/*
|
||||||
|
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
|
||||||
|
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||||
|
*/
|
||||||
|
/*
|
||||||
|
if ( row_stored ) { \
|
||||||
|
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
|
||||||
|
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||||
|
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
|
||||||
|
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||||
|
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
|
||||||
|
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||||
|
inc_t is_b = rs_p * *m_panel_max; \
|
||||||
|
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
|
||||||
|
( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
|
||||||
|
} \
|
||||||
|
*/
|
||||||
|
/*
|
||||||
|
if ( col_stored ) { \
|
||||||
|
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
|
||||||
|
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||||
|
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
|
||||||
|
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
|
||||||
|
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
|
||||||
|
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
|
||||||
|
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
|
||||||
|
( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
|
||||||
|
} \
|
||||||
|
*/
|
||||||
60
frame/3/bli_l3_sup_packm_var.h
Normal file
60
frame/3/bli_l3_sup_packm_var.h
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2014, The University of Texas at Austin
|
||||||
|
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
//
|
||||||
|
// Prototype BLAS-like interfaces to the variants.
|
||||||
|
//
|
||||||
|
|
||||||
|
#undef GENTPROT
|
||||||
|
#define GENTPROT( ctype, ch, varname ) \
|
||||||
|
\
|
||||||
|
void PASTEMAC(ch,varname) \
|
||||||
|
( \
|
||||||
|
trans_t transc, \
|
||||||
|
pack_t schema, \
|
||||||
|
dim_t m, \
|
||||||
|
dim_t n, \
|
||||||
|
dim_t m_max, \
|
||||||
|
dim_t n_max, \
|
||||||
|
ctype* restrict kappa, \
|
||||||
|
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||||
|
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
|
||||||
|
dim_t pd_p, inc_t ps_p, \
|
||||||
|
cntx_t* restrict cntx, \
|
||||||
|
thrinfo_t* restrict thread \
|
||||||
|
);
|
||||||
|
|
||||||
|
INSERT_GENTPROT_BASIC0( packm_sup_var1 )
|
||||||
|
|
||||||
@@ -45,22 +45,15 @@ err_t bli_gemmsup_ref
|
|||||||
rntm_t* rntm
|
rntm_t* rntm
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
|
// This function implements the default gemmsup handler. If you are a
|
||||||
|
// BLIS developer and wish to use a different gemmsup handler, please
|
||||||
|
// register a different function pointer in the context in your
|
||||||
|
// sub-configuration's bli_cntx_init_*() function.
|
||||||
|
|
||||||
// Check parameters.
|
// Check parameters.
|
||||||
if ( bli_error_checking_is_enabled() )
|
if ( bli_error_checking_is_enabled() )
|
||||||
bli_gemm_check( alpha, a, b, beta, c, cntx );
|
bli_gemm_check( alpha, a, b, beta, c, cntx );
|
||||||
|
|
||||||
#if 0
|
|
||||||
// FGVZ: The datatype-specific variant is now responsible for checking for
|
|
||||||
// alpha == 0.0.
|
|
||||||
|
|
||||||
// If alpha is zero, scale by beta and return.
|
|
||||||
if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
|
|
||||||
{
|
|
||||||
bli_scalm( beta, c );
|
|
||||||
return BLIS_SUCCESS;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
// FGVZ: Will this be needed for constructing thrinfo_t's (recall: the
|
// FGVZ: Will this be needed for constructing thrinfo_t's (recall: the
|
||||||
// sba needs to be attached to the rntm; see below)? Or will those nodes
|
// sba needs to be attached to the rntm; see below)? Or will those nodes
|
||||||
@@ -85,125 +78,33 @@ err_t bli_gemmsup_ref
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
// FGVZ: The datatype-specific variant is now responsible for inducing a
|
printf( "rntm.pack_a = %d\n", ( int )bli_rntm_pack_a( rntm ) );
|
||||||
// transposition, if needed.
|
printf( "rntm.pack_b = %d\n", ( int )bli_rntm_pack_b( rntm ) );
|
||||||
|
|
||||||
// Induce transpositions on A and/or B if either object is marked for
|
|
||||||
transposition. We can induce "fast" transpositions since the objects
|
|
||||||
// are guaranteed to not have structure or be packed.
|
|
||||||
if ( bli_obj_has_trans( a ) )
|
|
||||||
{
|
|
||||||
bli_obj_induce_fast_trans( a );
|
|
||||||
bli_obj_toggle_trans( a );
|
|
||||||
}
|
|
||||||
if ( bli_obj_has_trans( b ) )
|
|
||||||
{
|
|
||||||
bli_obj_induce_fast_trans( b );
|
|
||||||
bli_obj_toggle_trans( b );
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
|
//bli_rntm_set_pack_a( 0, rntm );
|
||||||
|
//bli_rntm_set_pack_b( 0, rntm );
|
||||||
|
|
||||||
#if 0
|
// May not need these here since packm_sup infers the schemas based
|
||||||
//bli_gemmsup_ref_var2
|
// on the stor3_t id. (This would also mean that they don't need to
|
||||||
//bli_gemmsup_ref_var1
|
// be passed into the thread decorator below.)
|
||||||
#if 0
|
//pack_t schema_a = BLIS_PACKED_ROW_PANELS;
|
||||||
bli_gemmsup_ref_var1n
|
//pack_t schema_b = BLIS_PACKED_COL_PANELS;
|
||||||
#else
|
|
||||||
#endif
|
|
||||||
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
|
|
||||||
const bool_t is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
|
|
||||||
stor_id == BLIS_RRC ||
|
|
||||||
stor_id == BLIS_RCR ||
|
|
||||||
stor_id == BLIS_CRR );
|
|
||||||
if ( is_rrr_rrc_rcr_crr )
|
|
||||||
{
|
|
||||||
bli_gemmsup_ref_var2m
|
|
||||||
(
|
|
||||||
BLIS_NO_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm
|
|
||||||
);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
bli_gemmsup_ref_var2m
|
|
||||||
(
|
|
||||||
BLIS_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm
|
|
||||||
);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
|
|
||||||
|
|
||||||
// Don't use the small/unpacked implementation if one of the matrices
|
|
||||||
// uses general stride.
|
|
||||||
if ( stor_id == BLIS_XXX ) return BLIS_FAILURE;
|
|
||||||
|
|
||||||
const bool_t is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
|
return
|
||||||
stor_id == BLIS_RRC ||
|
bli_l3_sup_thread_decorator
|
||||||
stor_id == BLIS_RCR ||
|
(
|
||||||
stor_id == BLIS_CRR );
|
bli_gemmsup_int,
|
||||||
const bool_t is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;
|
BLIS_GEMM, // operation family id
|
||||||
|
//schema_a,
|
||||||
const num_t dt = bli_obj_dt( c );
|
//schema_b,
|
||||||
const bool_t row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
|
alpha,
|
||||||
|
a,
|
||||||
const bool_t is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
|
b,
|
||||||
: is_rcc_crc_ccr_ccc );
|
beta,
|
||||||
|
c,
|
||||||
if ( is_primary )
|
cntx,
|
||||||
{
|
rntm
|
||||||
// This branch handles:
|
);
|
||||||
// - rrr rrc rcr crr for row-preferential kernels
|
|
||||||
// - rcc crc ccr ccc for column-preferential kernels
|
|
||||||
|
|
||||||
const dim_t m = bli_obj_length( c );
|
|
||||||
const dim_t n = bli_obj_width( c );
|
|
||||||
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
|
||||||
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
|
||||||
const dim_t mu = m / MR;
|
|
||||||
const dim_t nu = n / NR;
|
|
||||||
|
|
||||||
if ( mu >= nu )
|
|
||||||
{
|
|
||||||
// block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2()
|
|
||||||
bli_gemmsup_ref_var2m( BLIS_NO_TRANSPOSE,
|
|
||||||
alpha, a, b, beta, c, stor_id, cntx, rntm );
|
|
||||||
}
|
|
||||||
else // if ( mu < nu )
|
|
||||||
{
|
|
||||||
// panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1()
|
|
||||||
bli_gemmsup_ref_var1n( BLIS_NO_TRANSPOSE,
|
|
||||||
alpha, a, b, beta, c, stor_id, cntx, rntm );
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// This branch handles:
|
|
||||||
// - rrr rrc rcr crr for column-preferential kernels
|
|
||||||
// - rcc crc ccr ccc for row-preferential kernels
|
|
||||||
|
|
||||||
const dim_t mt = bli_obj_width( c );
|
|
||||||
const dim_t nt = bli_obj_length( c );
|
|
||||||
const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
|
||||||
const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
|
||||||
const dim_t mu = mt / MR;
|
|
||||||
const dim_t nu = nt / NR;
|
|
||||||
|
|
||||||
if ( mu >= nu )
|
|
||||||
{
|
|
||||||
// panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans
|
|
||||||
bli_gemmsup_ref_var2m( BLIS_TRANSPOSE,
|
|
||||||
alpha, a, b, beta, c, stor_id, cntx, rntm );
|
|
||||||
}
|
|
||||||
else // if ( mu < nu )
|
|
||||||
{
|
|
||||||
// block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans
|
|
||||||
bli_gemmsup_ref_var1n( BLIS_TRANSPOSE,
|
|
||||||
alpha, a, b, beta, c, stor_id, cntx, rntm );
|
|
||||||
}
|
|
||||||
// *requires nudging of mc,nc up to be a multiple of nr,mr.
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Return success so that the caller knows that we computed the solution.
|
|
||||||
return BLIS_SUCCESS;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -38,6 +38,8 @@
|
|||||||
|
|
||||||
typedef void (*FUNCPTR_T)
|
typedef void (*FUNCPTR_T)
|
||||||
(
|
(
|
||||||
|
bool_t packa,
|
||||||
|
bool_t packb,
|
||||||
conj_t conja,
|
conj_t conja,
|
||||||
conj_t conjb,
|
conj_t conjb,
|
||||||
dim_t m,
|
dim_t m,
|
||||||
@@ -50,7 +52,9 @@ typedef void (*FUNCPTR_T)
|
|||||||
void* restrict c, inc_t rs_c, inc_t cs_c,
|
void* restrict c, inc_t rs_c, inc_t cs_c,
|
||||||
stor3_t eff_id,
|
stor3_t eff_id,
|
||||||
cntx_t* restrict cntx,
|
cntx_t* restrict cntx,
|
||||||
rntm_t* restrict rntm
|
rntm_t* restrict rntm,
|
||||||
|
cntl_t* restrict cntl,
|
||||||
|
thrinfo_t* restrict thread
|
||||||
);
|
);
|
||||||
|
|
||||||
//
|
//
|
||||||
@@ -69,7 +73,9 @@ void bli_gemmsup_ref_var1n
|
|||||||
obj_t* c,
|
obj_t* c,
|
||||||
stor3_t eff_id,
|
stor3_t eff_id,
|
||||||
cntx_t* cntx,
|
cntx_t* cntx,
|
||||||
rntm_t* rntm
|
rntm_t* rntm,
|
||||||
|
cntl_t* cntl,
|
||||||
|
thrinfo_t* thread
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
#if 0
|
#if 0
|
||||||
@@ -110,9 +116,11 @@ void bli_gemmsup_ref_var1n
|
|||||||
void* restrict buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta );
|
void* restrict buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta );
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
const num_t dt_exec = bli_obj_dt( c );
|
const num_t dt_exec = bli_obj_dt( c );
|
||||||
|
|
||||||
|
const bool_t packa = bli_rntm_pack_a( rntm );
|
||||||
|
const bool_t packb = bli_rntm_pack_b( rntm );
|
||||||
|
|
||||||
const conj_t conja = bli_obj_conj_status( a );
|
const conj_t conja = bli_obj_conj_status( a );
|
||||||
const conj_t conjb = bli_obj_conj_status( b );
|
const conj_t conjb = bli_obj_conj_status( b );
|
||||||
|
|
||||||
@@ -174,6 +182,8 @@ void bli_gemmsup_ref_var1n
|
|||||||
// Invoke the function.
|
// Invoke the function.
|
||||||
f
|
f
|
||||||
(
|
(
|
||||||
|
packa,
|
||||||
|
packb,
|
||||||
conja,
|
conja,
|
||||||
conjb,
|
conjb,
|
||||||
m,
|
m,
|
||||||
@@ -186,7 +196,9 @@ void bli_gemmsup_ref_var1n
|
|||||||
buf_c, rs_c, cs_c,
|
buf_c, rs_c, cs_c,
|
||||||
eff_id,
|
eff_id,
|
||||||
cntx,
|
cntx,
|
||||||
rntm
|
rntm,
|
||||||
|
cntl,
|
||||||
|
thread
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -194,6 +206,8 @@ void bli_gemmsup_ref_var1n
|
|||||||
// Invoke the function (transposing the operation).
|
// Invoke the function (transposing the operation).
|
||||||
f
|
f
|
||||||
(
|
(
|
||||||
|
packb,
|
||||||
|
packa,
|
||||||
conjb, // swap the conj values.
|
conjb, // swap the conj values.
|
||||||
conja,
|
conja,
|
||||||
n, // swap the m and n dimensions.
|
n, // swap the m and n dimensions.
|
||||||
@@ -206,7 +220,9 @@ void bli_gemmsup_ref_var1n
|
|||||||
buf_c, cs_c, rs_c, // swap the strides of C.
|
buf_c, cs_c, rs_c, // swap the strides of C.
|
||||||
bli_stor3_trans( eff_id ), // transpose the stor3_t id.
|
bli_stor3_trans( eff_id ), // transpose the stor3_t id.
|
||||||
cntx,
|
cntx,
|
||||||
rntm
|
rntm,
|
||||||
|
cntl,
|
||||||
|
thread
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -217,6 +233,8 @@ void bli_gemmsup_ref_var1n
|
|||||||
\
|
\
|
||||||
void PASTEMAC(ch,varname) \
|
void PASTEMAC(ch,varname) \
|
||||||
( \
|
( \
|
||||||
|
bool_t packa, \
|
||||||
|
bool_t packb, \
|
||||||
conj_t conja, \
|
conj_t conja, \
|
||||||
conj_t conjb, \
|
conj_t conjb, \
|
||||||
dim_t m, \
|
dim_t m, \
|
||||||
@@ -229,9 +247,13 @@ void PASTEMAC(ch,varname) \
|
|||||||
void* restrict c, inc_t rs_c, inc_t cs_c, \
|
void* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||||
stor3_t stor_id, \
|
stor3_t stor_id, \
|
||||||
cntx_t* restrict cntx, \
|
cntx_t* restrict cntx, \
|
||||||
rntm_t* restrict rntm \
|
rntm_t* restrict rntm, \
|
||||||
|
cntl_t* restrict cntl, \
|
||||||
|
thrinfo_t* restrict thread \
|
||||||
) \
|
) \
|
||||||
{ \
|
{ \
|
||||||
|
const num_t dt = PASTEMAC(ch,type); \
|
||||||
|
\
|
||||||
/* If m or n is zero, return immediately. */ \
|
/* If m or n is zero, return immediately. */ \
|
||||||
if ( bli_zero_dim2( m, n ) ) return; \
|
if ( bli_zero_dim2( m, n ) ) return; \
|
||||||
\
|
\
|
||||||
@@ -250,16 +272,16 @@ void PASTEMAC(ch,varname) \
|
|||||||
); \
|
); \
|
||||||
return; \
|
return; \
|
||||||
} \
|
} \
|
||||||
\
|
|
||||||
const num_t dt = PASTEMAC(ch,type); \
|
|
||||||
\
|
\
|
||||||
/* This transposition of the stor3_t id value is inherent to variant 1.
|
/* This transposition of the stor3_t id value is inherent to variant 1.
|
||||||
The reason: we assume that variant 2 is the "main" variant. The
|
The reason: we assume that variant 2 is the "main" variant. The
|
||||||
consequence of this is that we assume that the millikernels that
|
consequence of this is that we assume that the millikernels that
|
||||||
iterate over m are registered to the kernel group associated with
|
iterate over m are registered to the "primary" kernel group associated
|
||||||
the kernel preference. So, regardless of whether the mkernels are
|
with the kernel IO preference; similarly, mkernels that iterate over
|
||||||
row- or column-preferential, millikernels that iterate over n are
|
n are assumed to be registered to the "non-primary" group associated
|
||||||
always placed in the slots for the opposite kernel group. */ \
|
with the ("non-primary") anti-preference. (Note that this pattern holds
|
||||||
|
regardless of whether the mkernel set has a row or column preference.)
|
||||||
|
See bli_l3_sup_int.c for a higher-level view of how this choice is made. */ \
|
||||||
stor_id = bli_stor3_trans( stor_id ); \
|
stor_id = bli_stor3_trans( stor_id ); \
|
||||||
\
|
\
|
||||||
/* Query the context for various blocksizes. */ \
|
/* Query the context for various blocksizes. */ \
|
||||||
@@ -279,7 +301,9 @@ void PASTEMAC(ch,varname) \
|
|||||||
else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \
|
else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \
|
||||||
else KC = (( KC0 / 5 ) / 4 ) * 4; \
|
else KC = (( KC0 / 5 ) / 4 ) * 4; \
|
||||||
\
|
\
|
||||||
/* Nudge NC up to a multiple of MR and MC up to a multiple of NR. */ \
|
/* Nudge NC up to a multiple of MR and MC up to a multiple of NR.
|
||||||
|
NOTE: This is unique to variant 1 (ie: not performed in variant 2)
|
||||||
|
because MC % MR == 0 and NC % NR == 0 is already enforced at runtime. */ \
|
||||||
const dim_t NC = bli_align_dim_to_mult( NC0, MR ); \
|
const dim_t NC = bli_align_dim_to_mult( NC0, MR ); \
|
||||||
const dim_t MC = bli_align_dim_to_mult( MC0, NR ); \
|
const dim_t MC = bli_align_dim_to_mult( MC0, NR ); \
|
||||||
\
|
\
|
||||||
@@ -299,7 +323,11 @@ void PASTEMAC(ch,varname) \
|
|||||||
const inc_t icstep_b = cs_b * MC; \
|
const inc_t icstep_b = cs_b * MC; \
|
||||||
\
|
\
|
||||||
const inc_t jrstep_c = rs_c * MR; \
|
const inc_t jrstep_c = rs_c * MR; \
|
||||||
|
\
|
||||||
|
/*
|
||||||
const inc_t jrstep_a = rs_a * MR; \
|
const inc_t jrstep_a = rs_a * MR; \
|
||||||
|
( void )jrstep_a; \
|
||||||
|
*/ \
|
||||||
\
|
\
|
||||||
/*
|
/*
|
||||||
const inc_t irstep_c = cs_c * NR; \
|
const inc_t irstep_c = cs_c * NR; \
|
||||||
@@ -346,6 +374,45 @@ void PASTEMAC(ch,varname) \
|
|||||||
/*
|
/*
|
||||||
const dim_t ir_inc = 1; \
|
const dim_t ir_inc = 1; \
|
||||||
*/ \
|
*/ \
|
||||||
|
\
|
||||||
|
/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
|
||||||
|
needed for the matrix we will be packing (if any), but we do it
|
||||||
|
unconditionally to be safe. An alternative way of initializing the
|
||||||
|
mem_t entries is:
|
||||||
|
|
||||||
|
bli_mem_clear( &mem_a ); \
|
||||||
|
bli_mem_clear( &mem_b ); \
|
||||||
|
*/ \
|
||||||
|
mem_t mem_a = BLIS_MEM_INITIALIZER; \
|
||||||
|
mem_t mem_b = BLIS_MEM_INITIALIZER; \
|
||||||
|
\
|
||||||
|
/* Prepare the packing destination buffer. If packing is not requested for
|
||||||
|
matrix A, this function will reduce to a no-op. */ \
|
||||||
|
PASTEMAC(ch,packm_sup_init_mem_a) \
|
||||||
|
( \
|
||||||
|
packa, \
|
||||||
|
BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix A to a "panel of B". */ \
|
||||||
|
stor_id, \
|
||||||
|
NC, KC, MR, /* Note this "panel of B" is NC x KC. */ \
|
||||||
|
cntx, \
|
||||||
|
rntm, \
|
||||||
|
&mem_a, \
|
||||||
|
thread \
|
||||||
|
); \
|
||||||
|
\
|
||||||
|
/* Prepare the packing destination buffer. If packing is not requested for
|
||||||
|
matrix B, this function will reduce to a no-op. */ \
|
||||||
|
PASTEMAC(ch,packm_sup_init_mem_b) \
|
||||||
|
( \
|
||||||
|
packb, \
|
||||||
|
BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix B to a "block of A". */ \
|
||||||
|
stor_id, \
|
||||||
|
KC, MC, NR, /* Note this "block of A" is KC x MC. */ \
|
||||||
|
cntx, \
|
||||||
|
rntm, \
|
||||||
|
&mem_b, \
|
||||||
|
thread \
|
||||||
|
); \
|
||||||
\
|
\
|
||||||
/* Loop over the m dimension (NC rows/columns at a time). */ \
|
/* Loop over the m dimension (NC rows/columns at a time). */ \
|
||||||
for ( dim_t jj = 0; jj < jc_iter; jj += jc_inc ) \
|
for ( dim_t jj = 0; jj < jc_iter; jj += jc_inc ) \
|
||||||
@@ -358,11 +425,12 @@ void PASTEMAC(ch,varname) \
|
|||||||
dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \
|
dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \
|
||||||
dim_t jr_left = nc_cur % MR; \
|
dim_t jr_left = nc_cur % MR; \
|
||||||
\
|
\
|
||||||
/* An optimization: allow the last jr iteration to contain up to MRX
|
/* An optimization: allow the last jr iteration to contain up to MRE
|
||||||
rows of C and A. (If MRX > MR, the mkernel has agreed to handle
|
rows of C and A. (If MRE > MR, the mkernel has agreed to handle
|
||||||
these cases.) Note that this prevents us from declaring jr_iter and
|
these cases.) Note that this prevents us from declaring jr_iter and
|
||||||
jr_left as const. */ \
|
jr_left as const. NOTE: We forgo this optimization when packing A
|
||||||
if ( 1 ) \
|
since packing an extended edge case is not yet supported. */ \
|
||||||
|
if ( !packa ) \
|
||||||
if ( MRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= MRE ) \
|
if ( MRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= MRE ) \
|
||||||
{ \
|
{ \
|
||||||
jr_iter--; jr_left += MR; \
|
jr_iter--; jr_left += MR; \
|
||||||
@@ -378,6 +446,39 @@ void PASTEMAC(ch,varname) \
|
|||||||
\
|
\
|
||||||
/* Only apply beta to the first iteration of the pc loop. */ \
|
/* Only apply beta to the first iteration of the pc loop. */ \
|
||||||
ctype* restrict beta_use = ( pp == 0 ? beta_cast : one ); \
|
ctype* restrict beta_use = ( pp == 0 ? beta_cast : one ); \
|
||||||
|
\
|
||||||
|
ctype* a_use; \
|
||||||
|
inc_t rs_a_use, cs_a_use, ps_a_use; \
|
||||||
|
\
|
||||||
|
/* Determine the packing buffer and related parameters for matrix
|
||||||
|
A. (If A will not be packed, then a_use will be set to point to
|
||||||
|
a and the _a_use strides will be set accordingly.) Then call
|
||||||
|
the packm sup variant chooser, which will call the appropriate
|
||||||
|
implementation based on the schema deduced from the stor_id. */ \
|
||||||
|
PASTEMAC(ch,packm_sup_a) \
|
||||||
|
( \
|
||||||
|
packa, \
|
||||||
|
stor_id, \
|
||||||
|
BLIS_NO_TRANSPOSE, \
|
||||||
|
nc_cur, kc_cur, MR, \
|
||||||
|
one, \
|
||||||
|
a_pc, rs_a, cs_a, \
|
||||||
|
&a_use, &rs_a_use, &cs_a_use, \
|
||||||
|
&ps_a_use, \
|
||||||
|
cntx, \
|
||||||
|
&mem_a, \
|
||||||
|
thread \
|
||||||
|
); \
|
||||||
|
\
|
||||||
|
/* Alias a_use so that it's clear this is our current block of
|
||||||
|
matrix A. */ \
|
||||||
|
ctype* restrict a_pc_use = a_use; \
|
||||||
|
\
|
||||||
|
/* We don't need to embed the panel stride of A within the auxinfo_t
|
||||||
|
object because this variant iterates through A in the jr loop,
|
||||||
|
which occurs here, within the macrokernel, not within the
|
||||||
|
millikernel. */ \
|
||||||
|
/*bli_auxinfo_set_ps_a( ps_a_use, &aux );*/ \
|
||||||
\
|
\
|
||||||
/* Loop over the n dimension (MC rows at a time). */ \
|
/* Loop over the n dimension (MC rows at a time). */ \
|
||||||
for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \
|
for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \
|
||||||
@@ -391,14 +492,52 @@ void PASTEMAC(ch,varname) \
|
|||||||
const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; \
|
const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; \
|
||||||
const dim_t ir_left = mc_cur % NR; \
|
const dim_t ir_left = mc_cur % NR; \
|
||||||
*/ \
|
*/ \
|
||||||
|
\
|
||||||
|
ctype* b_use; \
|
||||||
|
inc_t rs_b_use, cs_b_use, ps_b_use; \
|
||||||
|
\
|
||||||
|
/* Determine the packing buffer and related parameters for matrix
|
||||||
|
B. (If B will not be packed, then b_use will be set to point to
|
||||||
|
b and the _b_use strides will be set accordingly.) Then call
|
||||||
|
the packm sup variant chooser, which will call the appropriate
|
||||||
|
implementation based on the schema deduced from the stor_id.
|
||||||
|
NOTE: packing matrix B in this panel-block algorithm corresponds
|
||||||
|
to packing matrix A in the block-panel algorithm. */ \
|
||||||
|
PASTEMAC(ch,packm_sup_b) \
|
||||||
|
( \
|
||||||
|
packb, \
|
||||||
|
stor_id, \
|
||||||
|
BLIS_NO_TRANSPOSE, \
|
||||||
|
kc_cur, mc_cur, NR, \
|
||||||
|
one, \
|
||||||
|
b_ic, rs_b, cs_b, \
|
||||||
|
&b_use, &rs_b_use, &cs_b_use, \
|
||||||
|
&ps_b_use, \
|
||||||
|
cntx, \
|
||||||
|
&mem_b, \
|
||||||
|
thread \
|
||||||
|
); \
|
||||||
|
\
|
||||||
|
/* Alias b_use so that it's clear this is our current block of
|
||||||
|
matrix B. */ \
|
||||||
|
ctype* restrict b_ic_use = b_use; \
|
||||||
|
\
|
||||||
|
/* Embed the panel stride of B within the auxinfo_t object. The
|
||||||
|
millikernel will query and use this to iterate through
|
||||||
|
micropanels of B. */ \
|
||||||
|
bli_auxinfo_set_ps_b( ps_b_use, &aux ); \
|
||||||
|
\
|
||||||
\
|
\
|
||||||
/* Loop over the m dimension (NR columns at a time). */ \
|
/* Loop over the m dimension (NR columns at a time). */ \
|
||||||
for ( dim_t j = 0; j < jr_iter; j += jr_inc ) \
|
for ( dim_t j = 0; j < jr_iter; j += jr_inc ) \
|
||||||
{ \
|
{ \
|
||||||
const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? MR : jr_left ); \
|
const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? MR : jr_left ); \
|
||||||
\
|
\
|
||||||
|
/*
|
||||||
ctype* restrict a_jr = a_pc + j * jrstep_a; \
|
ctype* restrict a_jr = a_pc + j * jrstep_a; \
|
||||||
ctype* restrict c_jr = c_ic + j * jrstep_c; \
|
*/ \
|
||||||
|
ctype* restrict a_jr = a_pc_use + j * ps_a_use; \
|
||||||
|
ctype* restrict c_jr = c_ic + j * jrstep_c; \
|
||||||
\
|
\
|
||||||
/* Loop over the n dimension (MR rows at a time). */ \
|
/* Loop over the n dimension (MR rows at a time). */ \
|
||||||
{ \
|
{ \
|
||||||
@@ -411,10 +550,10 @@ void PASTEMAC(ch,varname) \
|
|||||||
mc_cur, /* Recall: mc_cur partitions the n dimension! */ \
|
mc_cur, /* Recall: mc_cur partitions the n dimension! */ \
|
||||||
kc_cur, \
|
kc_cur, \
|
||||||
alpha_cast, \
|
alpha_cast, \
|
||||||
a_jr, rs_a, cs_a, \
|
a_jr, rs_a_use, cs_a_use, \
|
||||||
b_ic, rs_b, cs_b, \
|
b_ic_use, rs_b_use, cs_b_use, \
|
||||||
beta_use, \
|
beta_use, \
|
||||||
c_jr, rs_c, cs_c, \
|
c_jr, rs_c, cs_c, \
|
||||||
&aux, \
|
&aux, \
|
||||||
cntx \
|
cntx \
|
||||||
); \
|
); \
|
||||||
@@ -423,6 +562,22 @@ void PASTEMAC(ch,varname) \
|
|||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
|
\
|
||||||
|
/* Release any memory that was acquired for packing matrices A and B. */ \
|
||||||
|
PASTEMAC(ch,packm_sup_finalize_mem_a) \
|
||||||
|
( \
|
||||||
|
packa, \
|
||||||
|
rntm, \
|
||||||
|
&mem_a, \
|
||||||
|
thread \
|
||||||
|
); \
|
||||||
|
PASTEMAC(ch,packm_sup_finalize_mem_b) \
|
||||||
|
( \
|
||||||
|
packb, \
|
||||||
|
rntm, \
|
||||||
|
&mem_b, \
|
||||||
|
thread \
|
||||||
|
); \
|
||||||
\
|
\
|
||||||
/*
|
/*
|
||||||
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
|
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
|
||||||
@@ -450,7 +605,9 @@ void bli_gemmsup_ref_var2m
|
|||||||
obj_t* c,
|
obj_t* c,
|
||||||
stor3_t eff_id,
|
stor3_t eff_id,
|
||||||
cntx_t* cntx,
|
cntx_t* cntx,
|
||||||
rntm_t* rntm
|
rntm_t* rntm,
|
||||||
|
cntl_t* cntl,
|
||||||
|
thrinfo_t* thread
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
#if 0
|
#if 0
|
||||||
@@ -493,6 +650,9 @@ void bli_gemmsup_ref_var2m
|
|||||||
#else
|
#else
|
||||||
const num_t dt_exec = bli_obj_dt( c );
|
const num_t dt_exec = bli_obj_dt( c );
|
||||||
|
|
||||||
|
const bool_t packa = bli_rntm_pack_a( rntm );
|
||||||
|
const bool_t packb = bli_rntm_pack_b( rntm );
|
||||||
|
|
||||||
const conj_t conja = bli_obj_conj_status( a );
|
const conj_t conja = bli_obj_conj_status( a );
|
||||||
const conj_t conjb = bli_obj_conj_status( b );
|
const conj_t conjb = bli_obj_conj_status( b );
|
||||||
|
|
||||||
@@ -554,6 +714,8 @@ void bli_gemmsup_ref_var2m
|
|||||||
// Invoke the function.
|
// Invoke the function.
|
||||||
f
|
f
|
||||||
(
|
(
|
||||||
|
packa,
|
||||||
|
packb,
|
||||||
conja,
|
conja,
|
||||||
conjb,
|
conjb,
|
||||||
m,
|
m,
|
||||||
@@ -566,7 +728,9 @@ void bli_gemmsup_ref_var2m
|
|||||||
buf_c, rs_c, cs_c,
|
buf_c, rs_c, cs_c,
|
||||||
eff_id,
|
eff_id,
|
||||||
cntx,
|
cntx,
|
||||||
rntm
|
rntm,
|
||||||
|
cntl,
|
||||||
|
thread
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -574,6 +738,8 @@ void bli_gemmsup_ref_var2m
|
|||||||
// Invoke the function (transposing the operation).
|
// Invoke the function (transposing the operation).
|
||||||
f
|
f
|
||||||
(
|
(
|
||||||
|
packb, // swap the pack values.
|
||||||
|
packa,
|
||||||
conjb, // swap the conj values.
|
conjb, // swap the conj values.
|
||||||
conja,
|
conja,
|
||||||
n, // swap the m and n dimensions.
|
n, // swap the m and n dimensions.
|
||||||
@@ -586,7 +752,9 @@ void bli_gemmsup_ref_var2m
|
|||||||
buf_c, cs_c, rs_c, // swap the strides of C.
|
buf_c, cs_c, rs_c, // swap the strides of C.
|
||||||
bli_stor3_trans( eff_id ), // transpose the stor3_t id.
|
bli_stor3_trans( eff_id ), // transpose the stor3_t id.
|
||||||
cntx,
|
cntx,
|
||||||
rntm
|
rntm,
|
||||||
|
cntl,
|
||||||
|
thread
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -597,6 +765,8 @@ void bli_gemmsup_ref_var2m
|
|||||||
\
|
\
|
||||||
void PASTEMAC(ch,varname) \
|
void PASTEMAC(ch,varname) \
|
||||||
( \
|
( \
|
||||||
|
bool_t packa, \
|
||||||
|
bool_t packb, \
|
||||||
conj_t conja, \
|
conj_t conja, \
|
||||||
conj_t conjb, \
|
conj_t conjb, \
|
||||||
dim_t m, \
|
dim_t m, \
|
||||||
@@ -609,9 +779,13 @@ void PASTEMAC(ch,varname) \
|
|||||||
void* restrict c, inc_t rs_c, inc_t cs_c, \
|
void* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||||
stor3_t stor_id, \
|
stor3_t stor_id, \
|
||||||
cntx_t* restrict cntx, \
|
cntx_t* restrict cntx, \
|
||||||
rntm_t* restrict rntm \
|
rntm_t* restrict rntm, \
|
||||||
|
cntl_t* restrict cntl, \
|
||||||
|
thrinfo_t* restrict thread \
|
||||||
) \
|
) \
|
||||||
{ \
|
{ \
|
||||||
|
const num_t dt = PASTEMAC(ch,type); \
|
||||||
|
\
|
||||||
/* If m or n is zero, return immediately. */ \
|
/* If m or n is zero, return immediately. */ \
|
||||||
if ( bli_zero_dim2( m, n ) ) return; \
|
if ( bli_zero_dim2( m, n ) ) return; \
|
||||||
\
|
\
|
||||||
@@ -630,8 +804,6 @@ void PASTEMAC(ch,varname) \
|
|||||||
); \
|
); \
|
||||||
return; \
|
return; \
|
||||||
} \
|
} \
|
||||||
\
|
|
||||||
const num_t dt = PASTEMAC(ch,type); \
|
|
||||||
\
|
\
|
||||||
/* Query the context for various blocksizes. */ \
|
/* Query the context for various blocksizes. */ \
|
||||||
const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||||
@@ -667,7 +839,11 @@ void PASTEMAC(ch,varname) \
|
|||||||
const inc_t icstep_a = rs_a * MC; \
|
const inc_t icstep_a = rs_a * MC; \
|
||||||
\
|
\
|
||||||
const inc_t jrstep_c = cs_c * NR; \
|
const inc_t jrstep_c = cs_c * NR; \
|
||||||
|
\
|
||||||
|
/*
|
||||||
const inc_t jrstep_b = cs_b * NR; \
|
const inc_t jrstep_b = cs_b * NR; \
|
||||||
|
( void )jrstep_b; \
|
||||||
|
*/ \
|
||||||
\
|
\
|
||||||
/*
|
/*
|
||||||
const inc_t irstep_c = rs_c * MR; \
|
const inc_t irstep_c = rs_c * MR; \
|
||||||
@@ -714,6 +890,45 @@ void PASTEMAC(ch,varname) \
|
|||||||
/*
|
/*
|
||||||
const dim_t ir_inc = 1; \
|
const dim_t ir_inc = 1; \
|
||||||
*/ \
|
*/ \
|
||||||
|
\
|
||||||
|
/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
|
||||||
|
needed for the matrix we will be packing (if any), but we do it
|
||||||
|
unconditionally to be safe. An alternative way of initializing the
|
||||||
|
mem_t entries is:
|
||||||
|
|
||||||
|
bli_mem_clear( &mem_a ); \
|
||||||
|
bli_mem_clear( &mem_b ); \
|
||||||
|
*/ \
|
||||||
|
mem_t mem_a = BLIS_MEM_INITIALIZER; \
|
||||||
|
mem_t mem_b = BLIS_MEM_INITIALIZER; \
|
||||||
|
\
|
||||||
|
/* Prepare the packing destination buffer. If packing is not requested for
|
||||||
|
matrix A, this function will reduce to a no-op. */ \
|
||||||
|
PASTEMAC(ch,packm_sup_init_mem_a) \
|
||||||
|
( \
|
||||||
|
packa, \
|
||||||
|
BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to a "block of A". */ \
|
||||||
|
stor_id, \
|
||||||
|
MC, KC, MR, /* Note this "block of A" is MC x KC. */ \
|
||||||
|
cntx, \
|
||||||
|
rntm, \
|
||||||
|
&mem_a, \
|
||||||
|
thread \
|
||||||
|
); \
|
||||||
|
\
|
||||||
|
/* Prepare the packing destination buffer. If packing is not requested for
|
||||||
|
matrix B, this function will reduce to a no-op. */ \
|
||||||
|
PASTEMAC(ch,packm_sup_init_mem_b) \
|
||||||
|
( \
|
||||||
|
packb, \
|
||||||
|
BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix B to a "panel of B". */ \
|
||||||
|
stor_id, \
|
||||||
|
KC, NC, NR, /* Note this "panel of B" is KC x NC. */ \
|
||||||
|
cntx, \
|
||||||
|
rntm, \
|
||||||
|
&mem_b, \
|
||||||
|
thread \
|
||||||
|
); \
|
||||||
\
|
\
|
||||||
/* Loop over the n dimension (NC rows/columns at a time). */ \
|
/* Loop over the n dimension (NC rows/columns at a time). */ \
|
||||||
for ( dim_t jj = 0; jj < jc_iter; jj += jc_inc ) \
|
for ( dim_t jj = 0; jj < jc_iter; jj += jc_inc ) \
|
||||||
@@ -726,11 +941,12 @@ void PASTEMAC(ch,varname) \
|
|||||||
dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
|
dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
|
||||||
dim_t jr_left = nc_cur % NR; \
|
dim_t jr_left = nc_cur % NR; \
|
||||||
\
|
\
|
||||||
/* An optimization: allow the last jr iteration to contain up to NRX
|
/* An optimization: allow the last jr iteration to contain up to NRE
|
||||||
columns of C and B. (If NRX > NR, the mkernel has agreed to handle
|
columns of C and B. (If NRE > NR, the mkernel has agreed to handle
|
||||||
these cases.) Note that this prevents us from declaring jr_iter and
|
these cases.) Note that this prevents us from declaring jr_iter and
|
||||||
jr_left as const. */ \
|
jr_left as const. NOTE: We forgo this optimization when packing B
|
||||||
if ( 1 ) \
|
since packing an extended edge case is not yet supported. */ \
|
||||||
|
if ( !packb ) \
|
||||||
if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) \
|
if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) \
|
||||||
{ \
|
{ \
|
||||||
jr_iter--; jr_left += NR; \
|
jr_iter--; jr_left += NR; \
|
||||||
@@ -746,6 +962,39 @@ void PASTEMAC(ch,varname) \
|
|||||||
\
|
\
|
||||||
/* Only apply beta to the first iteration of the pc loop. */ \
|
/* Only apply beta to the first iteration of the pc loop. */ \
|
||||||
ctype* restrict beta_use = ( pp == 0 ? beta_cast : one ); \
|
ctype* restrict beta_use = ( pp == 0 ? beta_cast : one ); \
|
||||||
|
\
|
||||||
|
ctype* b_use; \
|
||||||
|
inc_t rs_b_use, cs_b_use, ps_b_use; \
|
||||||
|
\
|
||||||
|
/* Determine the packing buffer and related parameters for matrix
|
||||||
|
B. (If B will not be packed, then b_use will be set to point to
|
||||||
|
b and the _b_use strides will be set accordingly.) Then call
|
||||||
|
the packm sup variant chooser, which will call the appropriate
|
||||||
|
implementation based on the schema deduced from the stor_id. */ \
|
||||||
|
PASTEMAC(ch,packm_sup_b) \
|
||||||
|
( \
|
||||||
|
packb, \
|
||||||
|
stor_id, \
|
||||||
|
BLIS_NO_TRANSPOSE, \
|
||||||
|
kc_cur, nc_cur, NR, \
|
||||||
|
one, \
|
||||||
|
b_pc, rs_b, cs_b, \
|
||||||
|
&b_use, &rs_b_use, &cs_b_use, \
|
||||||
|
&ps_b_use, \
|
||||||
|
cntx, \
|
||||||
|
&mem_b, \
|
||||||
|
thread \
|
||||||
|
); \
|
||||||
|
\
|
||||||
|
/* Alias b_use so that it's clear this is our current block of
|
||||||
|
matrix B. */ \
|
||||||
|
ctype* restrict b_pc_use = b_use; \
|
||||||
|
\
|
||||||
|
/* We don't need to embed the panel stride of B within the auxinfo_t
|
||||||
|
object because this variant iterates through B in the jr loop,
|
||||||
|
which occurs here, within the macrokernel, not within the
|
||||||
|
millikernel. */ \
|
||||||
|
/*bli_auxinfo_set_ps_b( ps_b_use, &aux );*/ \
|
||||||
\
|
\
|
||||||
/* Loop over the m dimension (MC rows at a time). */ \
|
/* Loop over the m dimension (MC rows at a time). */ \
|
||||||
for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \
|
for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \
|
||||||
@@ -759,14 +1008,49 @@ void PASTEMAC(ch,varname) \
|
|||||||
const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
|
const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
|
||||||
const dim_t ir_left = mc_cur % MR; \
|
const dim_t ir_left = mc_cur % MR; \
|
||||||
*/ \
|
*/ \
|
||||||
|
\
|
||||||
|
ctype* a_use; \
|
||||||
|
inc_t rs_a_use, cs_a_use, ps_a_use; \
|
||||||
|
\
|
||||||
|
/* Determine the packing buffer and related parameters for matrix
|
||||||
|
A. (If A will not be packed, then a_use will be set to point to
|
||||||
|
a and the _a_use strides will be set accordingly.) Then call
|
||||||
|
the packm sup variant chooser, which will call the appropriate
|
||||||
|
implementation based on the schema deduced from the stor_id. */ \
|
||||||
|
PASTEMAC(ch,packm_sup_a) \
|
||||||
|
( \
|
||||||
|
packa, \
|
||||||
|
stor_id, \
|
||||||
|
BLIS_NO_TRANSPOSE, \
|
||||||
|
mc_cur, kc_cur, MR, \
|
||||||
|
one, \
|
||||||
|
a_ic, rs_a, cs_a, \
|
||||||
|
&a_use, &rs_a_use, &cs_a_use, \
|
||||||
|
&ps_a_use, \
|
||||||
|
cntx, \
|
||||||
|
&mem_a, \
|
||||||
|
thread \
|
||||||
|
); \
|
||||||
|
\
|
||||||
|
/* Alias a_use so that it's clear this is our current block of
|
||||||
|
matrix A. */ \
|
||||||
|
ctype* restrict a_ic_use = a_use; \
|
||||||
|
\
|
||||||
|
/* Embed the panel stride of A within the auxinfo_t object. The
|
||||||
|
millikernel will query and use this to iterate through
|
||||||
|
micropanels of A (if needed). */ \
|
||||||
|
bli_auxinfo_set_ps_a( ps_a_use, &aux ); \
|
||||||
\
|
\
|
||||||
/* Loop over the n dimension (NR columns at a time). */ \
|
/* Loop over the n dimension (NR columns at a time). */ \
|
||||||
for ( dim_t j = 0; j < jr_iter; j += jr_inc ) \
|
for ( dim_t j = 0; j < jr_iter; j += jr_inc ) \
|
||||||
{ \
|
{ \
|
||||||
const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
|
const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
|
||||||
\
|
\
|
||||||
ctype* restrict b_jr = b_pc + j * jrstep_b; \
|
/*
|
||||||
ctype* restrict c_jr = c_ic + j * jrstep_c; \
|
ctype* restrict b_jr = b_pc_use + j * jrstep_b; \
|
||||||
|
*/ \
|
||||||
|
ctype* restrict b_jr = b_pc_use + j * ps_b_use; \
|
||||||
|
ctype* restrict c_jr = c_ic + j * jrstep_c; \
|
||||||
\
|
\
|
||||||
/* Loop over the m dimension (MR rows at a time). */ \
|
/* Loop over the m dimension (MR rows at a time). */ \
|
||||||
{ \
|
{ \
|
||||||
@@ -779,10 +1063,10 @@ void PASTEMAC(ch,varname) \
|
|||||||
nr_cur, \
|
nr_cur, \
|
||||||
kc_cur, \
|
kc_cur, \
|
||||||
alpha_cast, \
|
alpha_cast, \
|
||||||
a_ic, rs_a, cs_a, \
|
a_ic_use, rs_a_use, cs_a_use, \
|
||||||
b_jr, rs_b, cs_b, \
|
b_jr, rs_b_use, cs_b_use, \
|
||||||
beta_use, \
|
beta_use, \
|
||||||
c_jr, rs_c, cs_c, \
|
c_jr, rs_c, cs_c, \
|
||||||
&aux, \
|
&aux, \
|
||||||
cntx \
|
cntx \
|
||||||
); \
|
); \
|
||||||
@@ -791,6 +1075,22 @@ void PASTEMAC(ch,varname) \
|
|||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
|
\
|
||||||
|
/* Release any memory that was acquired for packing matrices A and B. */ \
|
||||||
|
PASTEMAC(ch,packm_sup_finalize_mem_a) \
|
||||||
|
( \
|
||||||
|
packa, \
|
||||||
|
rntm, \
|
||||||
|
&mem_a, \
|
||||||
|
thread \
|
||||||
|
); \
|
||||||
|
PASTEMAC(ch,packm_sup_finalize_mem_b) \
|
||||||
|
( \
|
||||||
|
packb, \
|
||||||
|
rntm, \
|
||||||
|
&mem_b, \
|
||||||
|
thread \
|
||||||
|
); \
|
||||||
\
|
\
|
||||||
/*
|
/*
|
||||||
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
|
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
|
||||||
|
|||||||
@@ -50,7 +50,9 @@ void PASTEMAC0(opname) \
|
|||||||
obj_t* c, \
|
obj_t* c, \
|
||||||
stor3_t eff_id, \
|
stor3_t eff_id, \
|
||||||
cntx_t* cntx, \
|
cntx_t* cntx, \
|
||||||
rntm_t* rntm \
|
rntm_t* rntm, \
|
||||||
|
cntl_t* cntl, \
|
||||||
|
thrinfo_t* thread \
|
||||||
);
|
);
|
||||||
|
|
||||||
GENPROT( gemmsup_ref_var1 )
|
GENPROT( gemmsup_ref_var1 )
|
||||||
@@ -81,12 +83,38 @@ void PASTEMAC(ch,varname) \
|
|||||||
void* restrict c, inc_t rs_c, inc_t cs_c, \
|
void* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||||
stor3_t eff_id, \
|
stor3_t eff_id, \
|
||||||
cntx_t* restrict cntx, \
|
cntx_t* restrict cntx, \
|
||||||
rntm_t* restrict rntm \
|
rntm_t* restrict rntm, \
|
||||||
|
cntl_t* restrict cntl, \
|
||||||
|
thrinfo_t* restrict thread \
|
||||||
);
|
);
|
||||||
|
|
||||||
INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 )
|
INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 )
|
||||||
INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 )
|
INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 )
|
||||||
|
|
||||||
|
#undef GENTPROT
|
||||||
|
#define GENTPROT( ctype, ch, varname ) \
|
||||||
|
\
|
||||||
|
void PASTEMAC(ch,varname) \
|
||||||
|
( \
|
||||||
|
bool_t packa, \
|
||||||
|
bool_t packb, \
|
||||||
|
conj_t conja, \
|
||||||
|
conj_t conjb, \
|
||||||
|
dim_t m, \
|
||||||
|
dim_t n, \
|
||||||
|
dim_t k, \
|
||||||
|
void* restrict alpha, \
|
||||||
|
void* restrict a, inc_t rs_a, inc_t cs_a, \
|
||||||
|
void* restrict b, inc_t rs_b, inc_t cs_b, \
|
||||||
|
void* restrict beta, \
|
||||||
|
void* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||||
|
stor3_t eff_id, \
|
||||||
|
cntx_t* restrict cntx, \
|
||||||
|
rntm_t* restrict rntm, \
|
||||||
|
cntl_t* restrict cntl, \
|
||||||
|
thrinfo_t* restrict thread \
|
||||||
|
);
|
||||||
|
|
||||||
INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n )
|
INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n )
|
||||||
INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m )
|
INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m )
|
||||||
|
|
||||||
|
|||||||
821
frame/3/old/bli_l3_sup_var1n2m.c
Normal file
821
frame/3/old/bli_l3_sup_var1n2m.c
Normal file
@@ -0,0 +1,821 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "blis.h"
|
||||||
|
|
||||||
|
#define FUNCPTR_T gemmsup_fp
|
||||||
|
|
||||||
|
typedef void (*FUNCPTR_T)
|
||||||
|
(
|
||||||
|
conj_t conja,
|
||||||
|
conj_t conjb,
|
||||||
|
dim_t m,
|
||||||
|
dim_t n,
|
||||||
|
dim_t k,
|
||||||
|
void* restrict alpha,
|
||||||
|
void* restrict a, inc_t rs_a, inc_t cs_a,
|
||||||
|
void* restrict b, inc_t rs_b, inc_t cs_b,
|
||||||
|
void* restrict beta,
|
||||||
|
void* restrict c, inc_t rs_c, inc_t cs_c,
|
||||||
|
stor3_t eff_id,
|
||||||
|
cntx_t* restrict cntx,
|
||||||
|
rntm_t* restrict rntm,
|
||||||
|
cntl_t* restrict cntl,
|
||||||
|
thrinfo_t* restrict thread
|
||||||
|
);
|
||||||
|
|
||||||
|
//
|
||||||
|
// -- var1n --------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
|
||||||
|
static FUNCPTR_T GENARRAY(ftypes_var1n,gemmsup_ref_var1n);
|
||||||
|
|
||||||
|
void bli_gemmsup_ref_var1n
|
||||||
|
(
|
||||||
|
trans_t trans,
|
||||||
|
obj_t* alpha,
|
||||||
|
obj_t* a,
|
||||||
|
obj_t* b,
|
||||||
|
obj_t* beta,
|
||||||
|
obj_t* c,
|
||||||
|
stor3_t eff_id,
|
||||||
|
cntx_t* cntx,
|
||||||
|
rntm_t* rntm,
|
||||||
|
cntl_t* cntl,
|
||||||
|
thrinfo_t* thread
|
||||||
|
)
|
||||||
|
{
|
||||||
|
#if 0
|
||||||
|
obj_t at, bt;
|
||||||
|
|
||||||
|
bli_obj_alias_to( a, &at );
|
||||||
|
bli_obj_alias_to( b, &bt );
|
||||||
|
|
||||||
|
// Induce transpositions on A and/or B if either object is marked for
|
||||||
|
// transposition. We can induce "fast" transpositions since the objects
|
||||||
|
// are guaranteed to not have structure or be packed.
|
||||||
|
if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); }
|
||||||
|
if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); }
|
||||||
|
|
||||||
|
const num_t dt_exec = bli_obj_dt( c );
|
||||||
|
|
||||||
|
const conj_t conja = bli_obj_conj_status( a );
|
||||||
|
const conj_t conjb = bli_obj_conj_status( b );
|
||||||
|
|
||||||
|
const dim_t m = bli_obj_length( c );
|
||||||
|
const dim_t n = bli_obj_width( c );
|
||||||
|
|
||||||
|
const dim_t k = bli_obj_width( &at );
|
||||||
|
|
||||||
|
void* restrict buf_a = bli_obj_buffer_at_off( &at );
|
||||||
|
const inc_t rs_a = bli_obj_row_stride( &at );
|
||||||
|
const inc_t cs_a = bli_obj_col_stride( &at );
|
||||||
|
|
||||||
|
void* restrict buf_b = bli_obj_buffer_at_off( &bt );
|
||||||
|
const inc_t rs_b = bli_obj_row_stride( &bt );
|
||||||
|
const inc_t cs_b = bli_obj_col_stride( &bt );
|
||||||
|
|
||||||
|
void* restrict buf_c = bli_obj_buffer_at_off( c );
|
||||||
|
const inc_t rs_c = bli_obj_row_stride( c );
|
||||||
|
const inc_t cs_c = bli_obj_col_stride( c );
|
||||||
|
|
||||||
|
void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
|
||||||
|
void* restrict buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta );
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
const num_t dt_exec = bli_obj_dt( c );
|
||||||
|
|
||||||
|
const conj_t conja = bli_obj_conj_status( a );
|
||||||
|
const conj_t conjb = bli_obj_conj_status( b );
|
||||||
|
|
||||||
|
const dim_t m = bli_obj_length( c );
|
||||||
|
const dim_t n = bli_obj_width( c );
|
||||||
|
dim_t k;
|
||||||
|
|
||||||
|
void* restrict buf_a = bli_obj_buffer_at_off( a );
|
||||||
|
inc_t rs_a;
|
||||||
|
inc_t cs_a;
|
||||||
|
|
||||||
|
void* restrict buf_b = bli_obj_buffer_at_off( b );
|
||||||
|
inc_t rs_b;
|
||||||
|
inc_t cs_b;
|
||||||
|
|
||||||
|
if ( bli_obj_has_notrans( a ) )
|
||||||
|
{
|
||||||
|
k = bli_obj_width( a );
|
||||||
|
|
||||||
|
rs_a = bli_obj_row_stride( a );
|
||||||
|
cs_a = bli_obj_col_stride( a );
|
||||||
|
}
|
||||||
|
else // if ( bli_obj_has_trans( a ) )
|
||||||
|
{
|
||||||
|
// Assign the variables with an implicit transposition.
|
||||||
|
k = bli_obj_length( a );
|
||||||
|
|
||||||
|
rs_a = bli_obj_col_stride( a );
|
||||||
|
cs_a = bli_obj_row_stride( a );
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( bli_obj_has_notrans( b ) )
|
||||||
|
{
|
||||||
|
rs_b = bli_obj_row_stride( b );
|
||||||
|
cs_b = bli_obj_col_stride( b );
|
||||||
|
}
|
||||||
|
else // if ( bli_obj_has_trans( b ) )
|
||||||
|
{
|
||||||
|
// Assign the variables with an implicit transposition.
|
||||||
|
rs_b = bli_obj_col_stride( b );
|
||||||
|
cs_b = bli_obj_row_stride( b );
|
||||||
|
}
|
||||||
|
|
||||||
|
void* restrict buf_c = bli_obj_buffer_at_off( c );
|
||||||
|
const inc_t rs_c = bli_obj_row_stride( c );
|
||||||
|
const inc_t cs_c = bli_obj_col_stride( c );
|
||||||
|
|
||||||
|
void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
|
||||||
|
void* restrict buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta );
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Index into the type combination array to extract the correct
|
||||||
|
// function pointer.
|
||||||
|
FUNCPTR_T f = ftypes_var1n[dt_exec];
|
||||||
|
|
||||||
|
if ( bli_is_notrans( trans ) )
|
||||||
|
{
|
||||||
|
// Invoke the function.
|
||||||
|
f
|
||||||
|
(
|
||||||
|
conja,
|
||||||
|
conjb,
|
||||||
|
m,
|
||||||
|
n,
|
||||||
|
k,
|
||||||
|
buf_alpha,
|
||||||
|
buf_a, rs_a, cs_a,
|
||||||
|
buf_b, rs_b, cs_b,
|
||||||
|
buf_beta,
|
||||||
|
buf_c, rs_c, cs_c,
|
||||||
|
eff_id,
|
||||||
|
cntx,
|
||||||
|
rntm,
|
||||||
|
cntl,
|
||||||
|
thread
|
||||||
|
);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Invoke the function (transposing the operation).
|
||||||
|
f
|
||||||
|
(
|
||||||
|
conjb, // swap the conj values.
|
||||||
|
conja,
|
||||||
|
n, // swap the m and n dimensions.
|
||||||
|
m,
|
||||||
|
k,
|
||||||
|
buf_alpha,
|
||||||
|
buf_b, cs_b, rs_b, // swap the positions of A and B.
|
||||||
|
buf_a, cs_a, rs_a, // swap the strides of A and B.
|
||||||
|
buf_beta,
|
||||||
|
buf_c, cs_c, rs_c, // swap the strides of C.
|
||||||
|
bli_stor3_trans( eff_id ), // transpose the stor3_t id.
|
||||||
|
cntx,
|
||||||
|
rntm,
|
||||||
|
cntl,
|
||||||
|
thread
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#undef GENTFUNC
|
||||||
|
#define GENTFUNC( ctype, ch, varname ) \
|
||||||
|
\
|
||||||
|
void PASTEMAC(ch,varname) \
|
||||||
|
( \
|
||||||
|
conj_t conja, \
|
||||||
|
conj_t conjb, \
|
||||||
|
dim_t m, \
|
||||||
|
dim_t n, \
|
||||||
|
dim_t k, \
|
||||||
|
void* restrict alpha, \
|
||||||
|
void* restrict a, inc_t rs_a, inc_t cs_a, \
|
||||||
|
void* restrict b, inc_t rs_b, inc_t cs_b, \
|
||||||
|
void* restrict beta, \
|
||||||
|
void* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||||
|
stor3_t stor_id, \
|
||||||
|
cntx_t* restrict cntx, \
|
||||||
|
rntm_t* restrict rntm, \
|
||||||
|
cntl_t* restrict cntl, \
|
||||||
|
thrinfo_t* restrict thread \
|
||||||
|
) \
|
||||||
|
{ \
|
||||||
|
/* If m or n is zero, return immediately. */ \
|
||||||
|
if ( bli_zero_dim2( m, n ) ) return; \
|
||||||
|
\
|
||||||
|
/* If k < 1 or alpha is zero, scale by beta and return. */ \
|
||||||
|
if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
|
||||||
|
{ \
|
||||||
|
PASTEMAC(ch,scalm) \
|
||||||
|
( \
|
||||||
|
BLIS_NO_CONJUGATE, \
|
||||||
|
0, \
|
||||||
|
BLIS_NONUNIT_DIAG, \
|
||||||
|
BLIS_DENSE, \
|
||||||
|
m, n, \
|
||||||
|
beta, \
|
||||||
|
c, rs_c, cs_c \
|
||||||
|
); \
|
||||||
|
return; \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
const num_t dt = PASTEMAC(ch,type); \
|
||||||
|
\
|
||||||
|
/* This transposition of the stor3_t id value is inherent to variant 1.
|
||||||
|
The reason: we assume that variant 2 is the "main" variant. The
|
||||||
|
consequence of this is that we assume that the millikernels that
|
||||||
|
iterate over m are registered to the kernel group associated with
|
||||||
|
the kernel preference. So, regardless of whether the mkernels are
|
||||||
|
row- or column-preferential, millikernels that iterate over n are
|
||||||
|
always placed in the slots for the opposite kernel group. */ \
|
||||||
|
stor_id = bli_stor3_trans( stor_id ); \
|
||||||
|
\
|
||||||
|
/* Query the context for various blocksizes. */ \
|
||||||
|
const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||||
|
const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||||
|
const dim_t NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \
|
||||||
|
const dim_t MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
|
||||||
|
const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
|
||||||
|
\
|
||||||
|
dim_t KC; \
|
||||||
|
if ( FALSE ) KC = KC0; \
|
||||||
|
else if ( stor_id == BLIS_RRC || \
|
||||||
|
stor_id == BLIS_CRC ) KC = KC0; \
|
||||||
|
else if ( m <= MR && n <= NR ) KC = KC0; \
|
||||||
|
else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \
|
||||||
|
else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \
|
||||||
|
else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \
|
||||||
|
else KC = (( KC0 / 5 ) / 4 ) * 4; \
|
||||||
|
\
|
||||||
|
/* Nudge NC up to a multiple of MR and MC up to a multiple of NR. */ \
|
||||||
|
const dim_t NC = bli_align_dim_to_mult( NC0, MR ); \
|
||||||
|
const dim_t MC = bli_align_dim_to_mult( MC0, NR ); \
|
||||||
|
\
|
||||||
|
/* Query the maximum blocksize for MR, which implies a maximum blocksize
|
||||||
|
extension for the final iteration. */ \
|
||||||
|
const dim_t MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx ); \
|
||||||
|
const dim_t MRE = MRM - MR; \
|
||||||
|
\
|
||||||
|
/* Compute partitioning step values for each matrix of each loop. */ \
|
||||||
|
const inc_t jcstep_c = rs_c * NC; \
|
||||||
|
const inc_t jcstep_a = rs_a * NC; \
|
||||||
|
\
|
||||||
|
const inc_t pcstep_a = cs_a * KC; \
|
||||||
|
const inc_t pcstep_b = rs_b * KC; \
|
||||||
|
\
|
||||||
|
const inc_t icstep_c = cs_c * MC; \
|
||||||
|
const inc_t icstep_b = cs_b * MC; \
|
||||||
|
\
|
||||||
|
const inc_t jrstep_c = rs_c * MR; \
|
||||||
|
const inc_t jrstep_a = rs_a * MR; \
|
||||||
|
\
|
||||||
|
/*
|
||||||
|
const inc_t irstep_c = cs_c * NR; \
|
||||||
|
const inc_t irstep_b = cs_b * NR; \
|
||||||
|
*/ \
|
||||||
|
\
|
||||||
|
/* Query the context for the sup microkernel address and cast it to its
|
||||||
|
function pointer type. */ \
|
||||||
|
PASTECH(ch,gemmsup_ker_ft) \
|
||||||
|
gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
|
||||||
|
\
|
||||||
|
ctype* restrict a_00 = a; \
|
||||||
|
ctype* restrict b_00 = b; \
|
||||||
|
ctype* restrict c_00 = c; \
|
||||||
|
ctype* restrict alpha_cast = alpha; \
|
||||||
|
ctype* restrict beta_cast = beta; \
|
||||||
|
\
|
||||||
|
ctype* restrict one = PASTEMAC(ch,1); \
|
||||||
|
\
|
||||||
|
auxinfo_t aux; \
|
||||||
|
\
|
||||||
|
/* Compute number of primary and leftover components of the outer
|
||||||
|
dimensions.
|
||||||
|
NOTE: Functionally speaking, we compute jc_iter as:
|
||||||
|
jc_iter = m / NC; if ( jc_left ) ++jc_iter;
|
||||||
|
However, this is implemented as:
|
||||||
|
jc_iter = ( m + NC - 1 ) / NC;
|
||||||
|
This avoids a branch at the cost of two additional integer instructions.
|
||||||
|
The pc_iter, ic_iter, jr_iter, and ir_iter variables are computed in
|
||||||
|
similar manner. */ \
|
||||||
|
const dim_t jc_iter = ( m + NC - 1 ) / NC; \
|
||||||
|
const dim_t jc_left = m % NC; \
|
||||||
|
\
|
||||||
|
const dim_t pc_iter = ( k + KC - 1 ) / KC; \
|
||||||
|
const dim_t pc_left = k % KC; \
|
||||||
|
\
|
||||||
|
const dim_t ic_iter = ( n + MC - 1 ) / MC; \
|
||||||
|
const dim_t ic_left = n % MC; \
|
||||||
|
\
|
||||||
|
const dim_t jc_inc = 1; \
|
||||||
|
const dim_t pc_inc = 1; \
|
||||||
|
const dim_t ic_inc = 1; \
|
||||||
|
const dim_t jr_inc = 1; \
|
||||||
|
/*
|
||||||
|
const dim_t ir_inc = 1; \
|
||||||
|
*/ \
|
||||||
|
\
|
||||||
|
/* Loop over the m dimension (NC rows/columns at a time). */ \
|
||||||
|
for ( dim_t jj = 0; jj < jc_iter; jj += jc_inc ) \
|
||||||
|
{ \
|
||||||
|
const dim_t nc_cur = ( bli_is_not_edge_f( jj, jc_iter, jc_left ) ? NC : jc_left ); \
|
||||||
|
\
|
||||||
|
ctype* restrict a_jc = a_00 + jj * jcstep_a; \
|
||||||
|
ctype* restrict c_jc = c_00 + jj * jcstep_c; \
|
||||||
|
\
|
||||||
|
dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \
|
||||||
|
dim_t jr_left = nc_cur % MR; \
|
||||||
|
\
|
||||||
|
/* An optimization: allow the last jr iteration to contain up to MRE
|
||||||
|
rows of C and A. (If MRE > MR, the mkernel has agreed to handle
|
||||||
|
these cases.) Note that this prevents us from declaring jr_iter and
|
||||||
|
jr_left as const. */ \
|
||||||
|
if ( 1 ) \
|
||||||
|
if ( MRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= MRE ) \
|
||||||
|
{ \
|
||||||
|
jr_iter--; jr_left += MR; \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
/* Loop over the k dimension (KC rows/columns at a time). */ \
|
||||||
|
for ( dim_t pp = 0; pp < pc_iter; pp += pc_inc ) \
|
||||||
|
{ \
|
||||||
|
const dim_t kc_cur = ( bli_is_not_edge_f( pp, pc_iter, pc_left ) ? KC : pc_left ); \
|
||||||
|
\
|
||||||
|
ctype* restrict a_pc = a_jc + pp * pcstep_a; \
|
||||||
|
ctype* restrict b_pc = b_00 + pp * pcstep_b; \
|
||||||
|
\
|
||||||
|
/* Only apply beta to the first iteration of the pc loop. */ \
|
||||||
|
ctype* restrict beta_use = ( pp == 0 ? beta_cast : one ); \
|
||||||
|
\
|
||||||
|
/* Loop over the n dimension (MC rows at a time). */ \
|
||||||
|
for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \
|
||||||
|
{ \
|
||||||
|
const dim_t mc_cur = ( bli_is_not_edge_f( ii, ic_iter, ic_left ) ? MC : ic_left ); \
|
||||||
|
\
|
||||||
|
ctype* restrict b_ic = b_pc + ii * icstep_b; \
|
||||||
|
ctype* restrict c_ic = c_jc + ii * icstep_c; \
|
||||||
|
\
|
||||||
|
/*
|
||||||
|
const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; \
|
||||||
|
const dim_t ir_left = mc_cur % NR; \
|
||||||
|
*/ \
|
||||||
|
\
|
||||||
|
/* Loop over the m dimension (MR rows at a time). */ \
|
||||||
|
for ( dim_t j = 0; j < jr_iter; j += jr_inc ) \
|
||||||
|
{ \
|
||||||
|
const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? MR : jr_left ); \
|
||||||
|
\
|
||||||
|
ctype* restrict a_jr = a_pc + j * jrstep_a; \
|
||||||
|
ctype* restrict c_jr = c_ic + j * jrstep_c; \
|
||||||
|
\
|
||||||
|
/* Loop over the n dimension (NR columns at a time). */ \
|
||||||
|
{ \
|
||||||
|
/* Invoke the gemmsup millikernel. */ \
|
||||||
|
gemmsup_ker \
|
||||||
|
( \
|
||||||
|
conja, \
|
||||||
|
conjb, \
|
||||||
|
nr_cur, /* Notice: nr_cur <= MR. */ \
|
||||||
|
mc_cur, /* Recall: mc_cur partitions the n dimension! */ \
|
||||||
|
kc_cur, \
|
||||||
|
alpha_cast, \
|
||||||
|
a_jr, rs_a, cs_a, \
|
||||||
|
b_ic, rs_b, cs_b, \
|
||||||
|
beta_use, \
|
||||||
|
c_jr, rs_c, cs_c, \
|
||||||
|
&aux, \
|
||||||
|
cntx \
|
||||||
|
); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
/*
|
||||||
|
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
|
||||||
|
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \
|
||||||
|
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \
|
||||||
|
*/ \
|
||||||
|
}
|
||||||
|
|
||||||
|
INSERT_GENTFUNC_BASIC0( gemmsup_ref_var1n )
|
||||||
|
|
||||||
|
|
||||||
|
//
|
||||||
|
// -- var2m --------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
|
||||||
|
static FUNCPTR_T GENARRAY(ftypes_var2m,gemmsup_ref_var2m);
|
||||||
|
|
||||||
|
void bli_gemmsup_ref_var2m
|
||||||
|
(
|
||||||
|
trans_t trans,
|
||||||
|
obj_t* alpha,
|
||||||
|
obj_t* a,
|
||||||
|
obj_t* b,
|
||||||
|
obj_t* beta,
|
||||||
|
obj_t* c,
|
||||||
|
stor3_t eff_id,
|
||||||
|
cntx_t* cntx,
|
||||||
|
rntm_t* rntm,
|
||||||
|
cntl_t* cntl,
|
||||||
|
thrinfo_t* thread
|
||||||
|
)
|
||||||
|
{
|
||||||
|
#if 0
|
||||||
|
obj_t at, bt;
|
||||||
|
|
||||||
|
bli_obj_alias_to( a, &at );
|
||||||
|
bli_obj_alias_to( b, &bt );
|
||||||
|
|
||||||
|
// Induce transpositions on A and/or B if either object is marked for
|
||||||
|
// transposition. We can induce "fast" transpositions since the objects
|
||||||
|
// are guaranteed to not have structure or be packed.
|
||||||
|
if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); }
|
||||||
|
if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); }
|
||||||
|
|
||||||
|
const num_t dt_exec = bli_obj_dt( c );
|
||||||
|
|
||||||
|
const conj_t conja = bli_obj_conj_status( a );
|
||||||
|
const conj_t conjb = bli_obj_conj_status( b );
|
||||||
|
|
||||||
|
const dim_t m = bli_obj_length( c );
|
||||||
|
const dim_t n = bli_obj_width( c );
|
||||||
|
|
||||||
|
const dim_t k = bli_obj_width( &at );
|
||||||
|
|
||||||
|
void* restrict buf_a = bli_obj_buffer_at_off( &at );
|
||||||
|
const inc_t rs_a = bli_obj_row_stride( &at );
|
||||||
|
const inc_t cs_a = bli_obj_col_stride( &at );
|
||||||
|
|
||||||
|
void* restrict buf_b = bli_obj_buffer_at_off( &bt );
|
||||||
|
const inc_t rs_b = bli_obj_row_stride( &bt );
|
||||||
|
const inc_t cs_b = bli_obj_col_stride( &bt );
|
||||||
|
|
||||||
|
void* restrict buf_c = bli_obj_buffer_at_off( c );
|
||||||
|
const inc_t rs_c = bli_obj_row_stride( c );
|
||||||
|
const inc_t cs_c = bli_obj_col_stride( c );
|
||||||
|
|
||||||
|
void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
|
||||||
|
void* restrict buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta );
|
||||||
|
|
||||||
|
#else
|
||||||
|
const num_t dt_exec = bli_obj_dt( c );
|
||||||
|
|
||||||
|
const conj_t conja = bli_obj_conj_status( a );
|
||||||
|
const conj_t conjb = bli_obj_conj_status( b );
|
||||||
|
|
||||||
|
const dim_t m = bli_obj_length( c );
|
||||||
|
const dim_t n = bli_obj_width( c );
|
||||||
|
dim_t k;
|
||||||
|
|
||||||
|
void* restrict buf_a = bli_obj_buffer_at_off( a );
|
||||||
|
inc_t rs_a;
|
||||||
|
inc_t cs_a;
|
||||||
|
|
||||||
|
void* restrict buf_b = bli_obj_buffer_at_off( b );
|
||||||
|
inc_t rs_b;
|
||||||
|
inc_t cs_b;
|
||||||
|
|
||||||
|
if ( bli_obj_has_notrans( a ) )
|
||||||
|
{
|
||||||
|
k = bli_obj_width( a );
|
||||||
|
|
||||||
|
rs_a = bli_obj_row_stride( a );
|
||||||
|
cs_a = bli_obj_col_stride( a );
|
||||||
|
}
|
||||||
|
else // if ( bli_obj_has_trans( a ) )
|
||||||
|
{
|
||||||
|
// Assign the variables with an implicit transposition.
|
||||||
|
k = bli_obj_length( a );
|
||||||
|
|
||||||
|
rs_a = bli_obj_col_stride( a );
|
||||||
|
cs_a = bli_obj_row_stride( a );
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( bli_obj_has_notrans( b ) )
|
||||||
|
{
|
||||||
|
rs_b = bli_obj_row_stride( b );
|
||||||
|
cs_b = bli_obj_col_stride( b );
|
||||||
|
}
|
||||||
|
else // if ( bli_obj_has_trans( b ) )
|
||||||
|
{
|
||||||
|
// Assign the variables with an implicit transposition.
|
||||||
|
rs_b = bli_obj_col_stride( b );
|
||||||
|
cs_b = bli_obj_row_stride( b );
|
||||||
|
}
|
||||||
|
|
||||||
|
void* restrict buf_c = bli_obj_buffer_at_off( c );
|
||||||
|
const inc_t rs_c = bli_obj_row_stride( c );
|
||||||
|
const inc_t cs_c = bli_obj_col_stride( c );
|
||||||
|
|
||||||
|
void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
|
||||||
|
void* restrict buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta );
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Index into the type combination array to extract the correct
|
||||||
|
// function pointer.
|
||||||
|
FUNCPTR_T f = ftypes_var2m[dt_exec];
|
||||||
|
|
||||||
|
if ( bli_is_notrans( trans ) )
|
||||||
|
{
|
||||||
|
// Invoke the function.
|
||||||
|
f
|
||||||
|
(
|
||||||
|
conja,
|
||||||
|
conjb,
|
||||||
|
m,
|
||||||
|
n,
|
||||||
|
k,
|
||||||
|
buf_alpha,
|
||||||
|
buf_a, rs_a, cs_a,
|
||||||
|
buf_b, rs_b, cs_b,
|
||||||
|
buf_beta,
|
||||||
|
buf_c, rs_c, cs_c,
|
||||||
|
eff_id,
|
||||||
|
cntx,
|
||||||
|
rntm,
|
||||||
|
cntl,
|
||||||
|
thread
|
||||||
|
);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Invoke the function (transposing the operation).
|
||||||
|
f
|
||||||
|
(
|
||||||
|
conjb, // swap the conj values.
|
||||||
|
conja,
|
||||||
|
n, // swap the m and n dimensions.
|
||||||
|
m,
|
||||||
|
k,
|
||||||
|
buf_alpha,
|
||||||
|
buf_b, cs_b, rs_b, // swap the positions of A and B.
|
||||||
|
buf_a, cs_a, rs_a, // swap the strides of A and B.
|
||||||
|
buf_beta,
|
||||||
|
buf_c, cs_c, rs_c, // swap the strides of C.
|
||||||
|
bli_stor3_trans( eff_id ), // transpose the stor3_t id.
|
||||||
|
cntx,
|
||||||
|
rntm,
|
||||||
|
cntl,
|
||||||
|
thread
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#undef GENTFUNC
|
||||||
|
#define GENTFUNC( ctype, ch, varname ) \
|
||||||
|
\
|
||||||
|
void PASTEMAC(ch,varname) \
|
||||||
|
( \
|
||||||
|
conj_t conja, \
|
||||||
|
conj_t conjb, \
|
||||||
|
dim_t m, \
|
||||||
|
dim_t n, \
|
||||||
|
dim_t k, \
|
||||||
|
void* restrict alpha, \
|
||||||
|
void* restrict a, inc_t rs_a, inc_t cs_a, \
|
||||||
|
void* restrict b, inc_t rs_b, inc_t cs_b, \
|
||||||
|
void* restrict beta, \
|
||||||
|
void* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||||
|
stor3_t stor_id, \
|
||||||
|
cntx_t* restrict cntx, \
|
||||||
|
rntm_t* restrict rntm, \
|
||||||
|
cntl_t* restrict cntl, \
|
||||||
|
thrinfo_t* restrict thread \
|
||||||
|
) \
|
||||||
|
{ \
|
||||||
|
/* If m or n is zero, return immediately. */ \
|
||||||
|
if ( bli_zero_dim2( m, n ) ) return; \
|
||||||
|
\
|
||||||
|
/* If k < 1 or alpha is zero, scale by beta and return. */ \
|
||||||
|
if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
|
||||||
|
{ \
|
||||||
|
PASTEMAC(ch,scalm) \
|
||||||
|
( \
|
||||||
|
BLIS_NO_CONJUGATE, \
|
||||||
|
0, \
|
||||||
|
BLIS_NONUNIT_DIAG, \
|
||||||
|
BLIS_DENSE, \
|
||||||
|
m, n, \
|
||||||
|
beta, \
|
||||||
|
c, rs_c, cs_c \
|
||||||
|
); \
|
||||||
|
return; \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
const num_t dt = PASTEMAC(ch,type); \
|
||||||
|
\
|
||||||
|
/* Query the context for various blocksizes. */ \
|
||||||
|
const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
|
||||||
|
const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \
|
||||||
|
const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \
|
||||||
|
const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
|
||||||
|
const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
|
||||||
|
\
|
||||||
|
dim_t KC; \
|
||||||
|
if ( stor_id == BLIS_RRR || \
|
||||||
|
stor_id == BLIS_CCC ) KC = KC0; \
|
||||||
|
else if ( stor_id == BLIS_RRC || \
|
||||||
|
stor_id == BLIS_CRC ) KC = KC0; \
|
||||||
|
else if ( m <= MR && n <= NR ) KC = KC0; \
|
||||||
|
else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \
|
||||||
|
else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \
|
||||||
|
else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \
|
||||||
|
else KC = (( KC0 / 5 ) / 4 ) * 4; \
|
||||||
|
\
|
||||||
|
/* Query the maximum blocksize for NR, which implies a maximum blocksize
|
||||||
|
extension for the final iteration. */ \
|
||||||
|
const dim_t NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \
|
||||||
|
const dim_t NRE = NRM - NR; \
|
||||||
|
\
|
||||||
|
/* Compute partitioning step values for each matrix of each loop. */ \
|
||||||
|
const inc_t jcstep_c = cs_c * NC; \
|
||||||
|
const inc_t jcstep_b = cs_b * NC; \
|
||||||
|
\
|
||||||
|
const inc_t pcstep_a = cs_a * KC; \
|
||||||
|
const inc_t pcstep_b = rs_b * KC; \
|
||||||
|
\
|
||||||
|
const inc_t icstep_c = rs_c * MC; \
|
||||||
|
const inc_t icstep_a = rs_a * MC; \
|
||||||
|
\
|
||||||
|
const inc_t jrstep_c = cs_c * NR; \
|
||||||
|
const inc_t jrstep_b = cs_b * NR; \
|
||||||
|
\
|
||||||
|
/*
|
||||||
|
const inc_t irstep_c = rs_c * MR; \
|
||||||
|
const inc_t irstep_a = rs_a * MR; \
|
||||||
|
*/ \
|
||||||
|
\
|
||||||
|
/* Query the context for the sup microkernel address and cast it to its
|
||||||
|
function pointer type. */ \
|
||||||
|
PASTECH(ch,gemmsup_ker_ft) \
|
||||||
|
gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
|
||||||
|
\
|
||||||
|
ctype* restrict a_00 = a; \
|
||||||
|
ctype* restrict b_00 = b; \
|
||||||
|
ctype* restrict c_00 = c; \
|
||||||
|
ctype* restrict alpha_cast = alpha; \
|
||||||
|
ctype* restrict beta_cast = beta; \
|
||||||
|
\
|
||||||
|
ctype* restrict one = PASTEMAC(ch,1); \
|
||||||
|
\
|
||||||
|
auxinfo_t aux; \
|
||||||
|
\
|
||||||
|
/* Compute number of primary and leftover components of the outer
|
||||||
|
dimensions.
|
||||||
|
NOTE: Functionally speaking, we compute jc_iter as:
|
||||||
|
jc_iter = n / NC; if ( jc_left ) ++jc_iter;
|
||||||
|
However, this is implemented as:
|
||||||
|
jc_iter = ( n + NC - 1 ) / NC;
|
||||||
|
This avoids a branch at the cost of two additional integer instructions.
|
||||||
|
The pc_iter, mc_iter, nr_iter, and mr_iter variables are computed in
|
||||||
|
similar manner. */ \
|
||||||
|
const dim_t jc_iter = ( n + NC - 1 ) / NC; \
|
||||||
|
const dim_t jc_left = n % NC; \
|
||||||
|
\
|
||||||
|
const dim_t pc_iter = ( k + KC - 1 ) / KC; \
|
||||||
|
const dim_t pc_left = k % KC; \
|
||||||
|
\
|
||||||
|
const dim_t ic_iter = ( m + MC - 1 ) / MC; \
|
||||||
|
const dim_t ic_left = m % MC; \
|
||||||
|
\
|
||||||
|
const dim_t jc_inc = 1; \
|
||||||
|
const dim_t pc_inc = 1; \
|
||||||
|
const dim_t ic_inc = 1; \
|
||||||
|
const dim_t jr_inc = 1; \
|
||||||
|
/*
|
||||||
|
const dim_t ir_inc = 1; \
|
||||||
|
*/ \
|
||||||
|
\
|
||||||
|
/* Loop over the n dimension (NC rows/columns at a time). */ \
|
||||||
|
for ( dim_t jj = 0; jj < jc_iter; jj += jc_inc ) \
|
||||||
|
{ \
|
||||||
|
const dim_t nc_cur = ( bli_is_not_edge_f( jj, jc_iter, jc_left ) ? NC : jc_left ); \
|
||||||
|
\
|
||||||
|
ctype* restrict b_jc = b_00 + jj * jcstep_b; \
|
||||||
|
ctype* restrict c_jc = c_00 + jj * jcstep_c; \
|
||||||
|
\
|
||||||
|
dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
|
||||||
|
dim_t jr_left = nc_cur % NR; \
|
||||||
|
\
|
||||||
|
/* An optimization: allow the last jr iteration to contain up to NRE
|
||||||
|
columns of C and B. (If NRE > NR, the mkernel has agreed to handle
|
||||||
|
these cases.) Note that this prevents us from declaring jr_iter and
|
||||||
|
jr_left as const. */ \
|
||||||
|
if ( 1 ) \
|
||||||
|
if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) \
|
||||||
|
{ \
|
||||||
|
jr_iter--; jr_left += NR; \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
/* Loop over the k dimension (KC rows/columns at a time). */ \
|
||||||
|
for ( dim_t pp = 0; pp < pc_iter; pp += pc_inc ) \
|
||||||
|
{ \
|
||||||
|
const dim_t kc_cur = ( bli_is_not_edge_f( pp, pc_iter, pc_left ) ? KC : pc_left ); \
|
||||||
|
\
|
||||||
|
ctype* restrict a_pc = a_00 + pp * pcstep_a; \
|
||||||
|
ctype* restrict b_pc = b_jc + pp * pcstep_b; \
|
||||||
|
\
|
||||||
|
/* Only apply beta to the first iteration of the pc loop. */ \
|
||||||
|
ctype* restrict beta_use = ( pp == 0 ? beta_cast : one ); \
|
||||||
|
\
|
||||||
|
/* Loop over the m dimension (MC rows at a time). */ \
|
||||||
|
for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \
|
||||||
|
{ \
|
||||||
|
const dim_t mc_cur = ( bli_is_not_edge_f( ii, ic_iter, ic_left ) ? MC : ic_left ); \
|
||||||
|
\
|
||||||
|
ctype* restrict a_ic = a_pc + ii * icstep_a; \
|
||||||
|
ctype* restrict c_ic = c_jc + ii * icstep_c; \
|
||||||
|
\
|
||||||
|
/*
|
||||||
|
const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
|
||||||
|
const dim_t ir_left = mc_cur % MR; \
|
||||||
|
*/ \
|
||||||
|
\
|
||||||
|
/* Loop over the n dimension (NR columns at a time). */ \
|
||||||
|
for ( dim_t j = 0; j < jr_iter; j += jr_inc ) \
|
||||||
|
{ \
|
||||||
|
const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
|
||||||
|
\
|
||||||
|
ctype* restrict b_jr = b_pc + j * jrstep_b; \
|
||||||
|
ctype* restrict c_jr = c_ic + j * jrstep_c; \
|
||||||
|
\
|
||||||
|
/* Loop over the m dimension (MR rows at a time). */ \
|
||||||
|
{ \
|
||||||
|
/* Invoke the gemmsup millikernel. */ \
|
||||||
|
gemmsup_ker \
|
||||||
|
( \
|
||||||
|
conja, \
|
||||||
|
conjb, \
|
||||||
|
mc_cur, \
|
||||||
|
nr_cur, \
|
||||||
|
kc_cur, \
|
||||||
|
alpha_cast, \
|
||||||
|
a_ic, rs_a, cs_a, \
|
||||||
|
b_jr, rs_b, cs_b, \
|
||||||
|
beta_use, \
|
||||||
|
c_jr, rs_c, cs_c, \
|
||||||
|
&aux, \
|
||||||
|
cntx \
|
||||||
|
); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
/*
|
||||||
|
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
|
||||||
|
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \
|
||||||
|
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \
|
||||||
|
*/ \
|
||||||
|
}
|
||||||
|
|
||||||
|
INSERT_GENTFUNC_BASIC0( gemmsup_ref_var2m )
|
||||||
|
|
||||||
@@ -65,6 +65,15 @@ static inc_t bli_auxinfo_is_b( auxinfo_t* ai )
|
|||||||
return ai->is_b;
|
return ai->is_b;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Query the ps_a field of an auxinfo_t.
static inc_t bli_auxinfo_ps_a( auxinfo_t* ai )
{
	const inc_t ps = ai->ps_a;

	return ps;
}
|
||||||
|
// Query the ps_b field of an auxinfo_t.
static inc_t bli_auxinfo_ps_b( auxinfo_t* ai )
{
	const inc_t ps = ai->ps_b;

	return ps;
}
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
static inc_t bli_auxinfo_dt_on_output( auxinfo_t* ai )
|
static inc_t bli_auxinfo_dt_on_output( auxinfo_t* ai )
|
||||||
{
|
{
|
||||||
@@ -107,6 +116,15 @@ static void bli_auxinfo_set_is_b( inc_t is, auxinfo_t* ai )
|
|||||||
ai->is_b = is;
|
ai->is_b = is;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Store ps into the ps_a field of an auxinfo_t.
static void bli_auxinfo_set_ps_a( inc_t ps, auxinfo_t* ai )
{
	( *ai ).ps_a = ps;
}
|
||||||
|
// Store ps into the ps_b field of an auxinfo_t.
static void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai )
{
	( *ai ).ps_b = ps;
}
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
static void bli_auxinfo_set_dt_on_output( num_t dt_on_output, auxinfo_t* ai )
|
static void bli_auxinfo_set_dt_on_output( num_t dt_on_output, auxinfo_t* ai )
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -956,8 +956,7 @@ void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... )
|
|||||||
// Process each operation id tuple provided.
|
// Process each operation id tuple provided.
|
||||||
for ( i = 0; i < n_ops; ++i )
|
for ( i = 0; i < n_ops; ++i )
|
||||||
{
|
{
|
||||||
// Read the current ukernel id, ukernel datatype, and ukernel function
|
// Read the current operation id and handler function pointer.
|
||||||
// pointer.
|
|
||||||
const opid_t op_id = op_ids[ i ];
|
const opid_t op_id = op_ids[ i ];
|
||||||
void* op_fp = op_fps[ i ];
|
void* op_fp = op_fps[ i ];
|
||||||
|
|
||||||
|
|||||||
95
frame/base/bli_env.c
Normal file
95
frame/base/bli_env.c
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2014, The University of Texas at Austin
|
||||||
|
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "blis.h"
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Returns the value of the environment variable named by env, parsed as a
// base-10 integer, or fallback if the variable is not set.
//
// NOTE: The string is handed to strtol() without validation, so a variable
// that is set but holds no parsable number yields strtol()'s result for
// that input (0 when no conversion can be performed), not fallback.
dim_t bli_env_get_var( const char* env, dim_t fallback )
{
	const char* str = getenv( env );

	// getenv() returns NULL when the variable is absent from the
	// environment; that is the only case in which fallback is used.
	if ( str == NULL ) return fallback;

	return strtol( str, NULL, 10 );
}
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
void bli_env_set_var( const char* env, dim_t value )
|
||||||
|
{
|
||||||
|
dim_t r_val;
|
||||||
|
char value_str[32];
|
||||||
|
const char* fs_32 = "%u";
|
||||||
|
const char* fs_64 = "%lu";
|
||||||
|
|
||||||
|
// Convert the string to an integer, but vary the format specifier
|
||||||
|
// depending on the integer type size.
|
||||||
|
if ( bli_info_get_int_type_size() == 32 ) sprintf( value_str, fs_32, value );
|
||||||
|
else sprintf( value_str, fs_64, value );
|
||||||
|
|
||||||
|
// Set the environment variable using the string we just wrote to via
|
||||||
|
// sprintf(). (The 'TRUE' argument means we want to overwrite the current
|
||||||
|
// value if the environment variable already exists.)
|
||||||
|
r_val = bli_setenv( env, value_str, TRUE );
|
||||||
|
|
||||||
|
// Check the return value in case something went horribly wrong.
|
||||||
|
if ( r_val == -1 )
|
||||||
|
{
|
||||||
|
char err_str[128];
|
||||||
|
|
||||||
|
// Query the human-readable error string corresponding to errno.
|
||||||
|
strerror_r( errno, err_str, 128 );
|
||||||
|
|
||||||
|
// Print the error message.
|
||||||
|
bli_print_msg( err_str, __FILE__, __LINE__ );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
44
frame/base/bli_env.h
Normal file
44
frame/base/bli_env.h
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2014, The University of Texas at Austin
|
||||||
|
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
|
||||||
|
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef BLIS_ENV_H
|
||||||
|
#define BLIS_ENV_H
|
||||||
|
|
||||||
|
dim_t bli_env_get_var( const char* env, dim_t fallback );
|
||||||
|
//void bli_env_set_var( const char* env, dim_t value );
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
@@ -78,6 +78,7 @@ void bli_init_apis( void )
|
|||||||
bli_gks_init();
|
bli_gks_init();
|
||||||
bli_ind_init();
|
bli_ind_init();
|
||||||
bli_thread_init();
|
bli_thread_init();
|
||||||
|
bli_pack_init();
|
||||||
bli_memsys_init();
|
bli_memsys_init();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -85,9 +86,10 @@ void bli_finalize_apis( void )
|
|||||||
{
|
{
|
||||||
// Finalize various sub-APIs.
|
// Finalize various sub-APIs.
|
||||||
bli_memsys_finalize();
|
bli_memsys_finalize();
|
||||||
|
bli_pack_finalize();
|
||||||
bli_thread_finalize();
|
bli_thread_finalize();
|
||||||
bli_gks_finalize();
|
|
||||||
bli_ind_finalize();
|
bli_ind_finalize();
|
||||||
|
bli_gks_finalize();
|
||||||
}
|
}
|
||||||
|
|
||||||
// -----------------------------------------------------------------------------
|
// -----------------------------------------------------------------------------
|
||||||
|
|||||||
@@ -34,11 +34,32 @@
|
|||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
#ifndef BLIS_MEM_H
|
#ifndef BLIS_MEM_H
|
||||||
#define BLIS_MEM_H
|
#define BLIS_MEM_H
|
||||||
|
|
||||||
|
|
||||||
// Mem entry query
|
// mem_t object type (defined in bli_type_defs.h)
|
||||||
|
|
||||||
|
/*
|
||||||
|
typedef struct mem_s
|
||||||
|
{
|
||||||
|
pblk_t pblk;
|
||||||
|
packbuf_t buf_type;
|
||||||
|
pool_t* pool;
|
||||||
|
siz_t size;
|
||||||
|
} mem_t;
|
||||||
|
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
void* buf;
|
||||||
|
siz_t block_size;
|
||||||
|
} pblk_t;
|
||||||
|
*/
|
||||||
|
|
||||||
|
//
|
||||||
|
// -- mem_t query --------------------------------------------------------------
|
||||||
|
//
|
||||||
|
|
||||||
static pblk_t* bli_mem_pblk( mem_t* mem )
|
static pblk_t* bli_mem_pblk( mem_t* mem )
|
||||||
{
|
{
|
||||||
@@ -78,7 +99,9 @@ static bool_t bli_mem_is_unalloc( mem_t* mem )
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Mem entry modification
|
//
|
||||||
|
// -- mem_t modification -------------------------------------------------------
|
||||||
|
//
|
||||||
|
|
||||||
static void bli_mem_set_pblk( pblk_t* pblk, mem_t* mem )
|
static void bli_mem_set_pblk( pblk_t* pblk, mem_t* mem )
|
||||||
{
|
{
|
||||||
@@ -105,9 +128,26 @@ static void bli_mem_set_size( siz_t size, mem_t* mem )
|
|||||||
mem->size = size;
|
mem->size = size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// -- mem_t initialization -----------------------------------------------------
|
||||||
|
//
|
||||||
|
|
||||||
|
// NOTE: This initializer macro must be updated whenever fields are added or
|
||||||
|
// removed from the mem_t type definition. An alternative to the initializer is
|
||||||
|
// calling bli_mem_clear() at runtime.
|
||||||
|
|
||||||
|
// Static initializer for mem_t. The buf_type, pool and size values match
// the ones bli_mem_clear() assigns at runtime; the embedded pblk_t is
// initialized via BLIS_PBLK_INITIALIZER.
#define BLIS_MEM_INITIALIZER \
        { \
          .pblk     = BLIS_PBLK_INITIALIZER, \
          .buf_type = -1, \
          .pool     = NULL, \
          .size     = 0 \
        }
|
||||||
|
|
||||||
static void bli_mem_clear( mem_t* mem )
|
static void bli_mem_clear( mem_t* mem )
|
||||||
{
|
{
|
||||||
bli_mem_set_buffer( NULL, mem );
|
bli_mem_set_buffer( NULL, mem );
|
||||||
|
bli_mem_set_buf_type( -1, mem );
|
||||||
bli_mem_set_pool( NULL, mem );
|
bli_mem_set_pool( NULL, mem );
|
||||||
bli_mem_set_size( 0, mem );
|
bli_mem_set_size( 0, mem );
|
||||||
}
|
}
|
||||||
|
|||||||
157
frame/base/bli_pack.c
Normal file
157
frame/base/bli_pack.c
Normal file
@@ -0,0 +1,157 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2014, The University of Texas at Austin
|
||||||
|
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "blis.h"
|
||||||
|
|
||||||
|
// The global rntm_t structure. (The definition resides in bli_rntm.c.)
|
||||||
|
extern rntm_t global_rntm;
|
||||||
|
|
||||||
|
// A mutex to allow synchronous access to global_rntm. (The definition
|
||||||
|
// resides in bli_rntm.c.)
|
||||||
|
extern bli_pthread_mutex_t global_rntm_mutex;
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
void bli_pack_init( void )
|
||||||
|
{
|
||||||
|
// Read the environment variables and use them to initialize the
|
||||||
|
// global runtime object.
|
||||||
|
bli_pack_init_rntm_from_env( &global_rntm );
|
||||||
|
}
|
||||||
|
|
||||||
|
// Counterpart to bli_pack_init(). Currently there is nothing to release.
void bli_pack_finalize( void )
{
	return;
}
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
dim_t bli_pack_get_pack_a( void )
|
||||||
|
{
|
||||||
|
// We must ensure that global_rntm has been initialized.
|
||||||
|
bli_init_once();
|
||||||
|
|
||||||
|
return bli_rntm_pack_a( &global_rntm );
|
||||||
|
}
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
dim_t bli_pack_get_pack_b( void )
|
||||||
|
{
|
||||||
|
// We must ensure that global_rntm has been initialized.
|
||||||
|
bli_init_once();
|
||||||
|
|
||||||
|
return bli_rntm_pack_b( &global_rntm );
|
||||||
|
}
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Sets the pack-A flag of the global runtime object.
void bli_pack_set_pack_a( bool_t pack_a )
{
	rntm_t* const              rntm  = &global_rntm;
	bli_pthread_mutex_t* const mutex = &global_rntm_mutex;

	// global_rntm must have been initialized before it is modified.
	bli_init_once();

	// Update the flag while holding the mutex that guards global_rntm.
	bli_pthread_mutex_lock( mutex );
	bli_rntm_set_pack_a( pack_a, rntm );
	bli_pthread_mutex_unlock( mutex );
}
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Sets the pack-B flag of the global runtime object.
//
//   pack_b - the new value of the flag, stored via bli_rntm_set_pack_b().
void bli_pack_set_pack_b( bool_t pack_b )
{
	// We must ensure that global_rntm has been initialized.
	bli_init_once();

	// Acquire the mutex protecting global_rntm.
	bli_pthread_mutex_lock( &global_rntm_mutex );

	// Store into the pack-B field; the pack-A field must be left untouched.
	bli_rntm_set_pack_b( pack_b, &global_rntm );

	// Release the mutex protecting global_rntm.
	bli_pthread_mutex_unlock( &global_rntm_mutex );
}
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
void bli_pack_init_rntm_from_env
|
||||||
|
(
|
||||||
|
rntm_t* rntm
|
||||||
|
)
|
||||||
|
{
|
||||||
|
// NOTE: We don't need to acquire the global_rntm_mutex here because this
|
||||||
|
// function is only called from bli_pack_init(), which is only called
|
||||||
|
// by bli_init_once().
|
||||||
|
|
||||||
|
bool_t pack_a;
|
||||||
|
bool_t pack_b;
|
||||||
|
|
||||||
|
#if 1 //def BLIS_ENABLE_SELECTIVE_PACKING
|
||||||
|
|
||||||
|
// Try to read BLIS_PACK_A and BLIS_PACK_B. For each variable, default to
|
||||||
|
// -1 if it is unset.
|
||||||
|
pack_a = bli_env_get_var( "BLIS_PACK_A", -1 );
|
||||||
|
pack_b = bli_env_get_var( "BLIS_PACK_B", -1 );
|
||||||
|
|
||||||
|
// Enforce the default behavior first, then check for affirmative FALSE, and
|
||||||
|
// finally assume anything else is TRUE.
|
||||||
|
if ( pack_a == -1 ) pack_a = FALSE; // default behavior
|
||||||
|
else if ( pack_a == 0 ) pack_a = FALSE; // zero is FALSE
|
||||||
|
else pack_a = TRUE; // anything else is TRUE
|
||||||
|
|
||||||
|
if ( pack_b == -1 ) pack_b = FALSE; // default behavior
|
||||||
|
else if ( pack_b == 0 ) pack_b = FALSE; // zero is FALSE
|
||||||
|
else pack_b = TRUE; // anything else is TRUE
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
pack_a = TRUE;
|
||||||
|
pack_b = TRUE;
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Save the results back in the runtime object.
|
||||||
|
bli_rntm_set_pack_a( pack_a, rntm );
|
||||||
|
bli_rntm_set_pack_b( pack_b, rntm );
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
printf( "bli_pack_init_rntm_from_env()\n" );
|
||||||
|
bli_rntm_print( rntm );
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
49
frame/base/bli_pack.h
Normal file
49
frame/base/bli_pack.h
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2014, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef BLIS_PACK_H
|
||||||
|
#define BLIS_PACK_H
|
||||||
|
|
||||||
|
void bli_pack_init( void );
|
||||||
|
void bli_pack_finalize( void );
|
||||||
|
|
||||||
|
BLIS_EXPORT_BLIS dim_t bli_pack_get_pack_a( void );
|
||||||
|
BLIS_EXPORT_BLIS dim_t bli_pack_get_pack_b( void );
|
||||||
|
BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool_t pack_a );
|
||||||
|
BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool_t pack_b );
|
||||||
|
|
||||||
|
void bli_pack_init_rntm_from_env( rntm_t* rntm );
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
@@ -92,6 +92,20 @@ static void bli_pblk_set_block_size( siz_t block_size, pblk_t* pblk )
|
|||||||
pblk->block_size = block_size;
|
pblk->block_size = block_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// -- pool block initialization ------------------------------------------------
|
||||||
|
//
|
||||||
|
|
||||||
|
// NOTE: This initializer macro must be updated whenever fields are added or
|
||||||
|
// removed from the pblk_t type definition. An alternative to the initializer is
|
||||||
|
// calling bli_pblk_clear() at runtime.
|
||||||
|
|
||||||
|
// Static initializer for an empty pool block: no buffer and a block size of
// zero.
#define BLIS_PBLK_INITIALIZER \
{ \
	.buf        = NULL, \
	.block_size = 0 \
}
|
||||||
|
|
||||||
static void bli_pblk_clear( pblk_t* pblk )
|
static void bli_pblk_clear( pblk_t* pblk )
|
||||||
{
|
{
|
||||||
bli_pblk_set_buf( NULL, pblk );
|
bli_pblk_set_buf( NULL, pblk );
|
||||||
|
|||||||
@@ -34,6 +34,29 @@
|
|||||||
|
|
||||||
#include "blis.h"
|
#include "blis.h"
|
||||||
|
|
||||||
|
// The global rntm_t structure, which holds the global thread settings
|
||||||
|
// along with a few other key parameters.
|
||||||
|
rntm_t global_rntm;
|
||||||
|
|
||||||
|
// A mutex to allow synchronous access to global_rntm.
|
||||||
|
bli_pthread_mutex_t global_rntm_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Copy the global runtime settings into a caller-provided rntm_t.
//
// rntm: destination object; it is overwritten with the current contents of
//       global_rntm.
void bli_rntm_init_from_global( rntm_t* rntm )
{
	// global_rntm must have been initialized before it is read, so make
	// sure library initialization has run.
	bli_init_once();

	// Perform the structure copy while holding global_rntm_mutex so that it
	// is serialized with any other holder of that mutex.
	bli_pthread_mutex_lock( &global_rntm_mutex );
	*rntm = global_rntm;
	bli_pthread_mutex_unlock( &global_rntm_mutex );
}
|
||||||
|
|
||||||
// -----------------------------------------------------------------------------
|
// -----------------------------------------------------------------------------
|
||||||
|
|
||||||
void bli_rntm_set_ways_for_op
|
void bli_rntm_set_ways_for_op
|
||||||
@@ -152,7 +175,7 @@ void bli_rntm_set_ways_from_rntm
|
|||||||
bool_t ways_set = FALSE;
|
bool_t ways_set = FALSE;
|
||||||
|
|
||||||
// If the rntm was fed in as a copy of the global runtime via
|
// If the rntm was fed in as a copy of the global runtime via
|
||||||
// bli_thread_init_rntm(), we know that either the num_threads
|
// bli_rntm_init_from_global(), we know that either the num_threads
|
||||||
// field will be set and all of the ways unset, or vice versa.
|
// field will be set and all of the ways unset, or vice versa.
|
||||||
// However, we can't be sure that a user-provided rntm_t isn't
|
// However, we can't be sure that a user-provided rntm_t isn't
|
||||||
// initialized uncleanly. So here we have to enforce some rules
|
// initialized uncleanly. So here we have to enforce some rules
|
||||||
|
|||||||
@@ -45,13 +45,13 @@ typedef struct rntm_s
|
|||||||
{
|
{
|
||||||
dim_t num_threads;
|
dim_t num_threads;
|
||||||
dim_t* thrloop;
|
dim_t* thrloop;
|
||||||
|
bool_t pack_a;
|
||||||
|
bool_t pack_b;
|
||||||
|
bool_t l3_sup;
|
||||||
|
|
||||||
pool_t* sba_pool;
|
pool_t* sba_pool;
|
||||||
|
|
||||||
membrk_t* membrk;
|
membrk_t* membrk;
|
||||||
|
|
||||||
bool_t l3_sup;
|
|
||||||
|
|
||||||
} rntm_t;
|
} rntm_t;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@@ -94,6 +94,15 @@ static dim_t bli_rntm_pr_ways( rntm_t* rntm )
|
|||||||
return bli_rntm_ways_for( BLIS_KR, rntm );
|
return bli_rntm_ways_for( BLIS_KR, rntm );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Return the pack_a field of the given rntm_t.
static bool_t bli_rntm_pack_a( rntm_t* rntm )
{
	const bool_t pack_a = rntm->pack_a;

	return pack_a;
}
|
||||||
|
// Return the pack_b field of the given rntm_t.
static bool_t bli_rntm_pack_b( rntm_t* rntm )
{
	const bool_t pack_b = rntm->pack_b;

	return pack_b;
}
|
||||||
|
|
||||||
static bool_t bli_rntm_l3_sup( rntm_t* rntm )
|
static bool_t bli_rntm_l3_sup( rntm_t* rntm )
|
||||||
{
|
{
|
||||||
return rntm->l3_sup;
|
return rntm->l3_sup;
|
||||||
@@ -187,22 +196,6 @@ static void bli_rntm_set_membrk( membrk_t* membrk, rntm_t* rntm )
|
|||||||
rntm->membrk = membrk;
|
rntm->membrk = membrk;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void bli_rntm_set_l3_sup( bool_t l3_sup, rntm_t* rntm )
|
|
||||||
{
|
|
||||||
// Set the bool_t indicating whether level-3 sup handling is enabled.
|
|
||||||
rntm->l3_sup = l3_sup;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void bli_rntm_enable_l3_sup( rntm_t* rntm )
|
|
||||||
{
|
|
||||||
bli_rntm_set_l3_sup( TRUE, rntm );
|
|
||||||
}
|
|
||||||
|
|
||||||
static void bli_rntm_disable_l3_sup( rntm_t* rntm )
|
|
||||||
{
|
|
||||||
bli_rntm_set_l3_sup( FALSE, rntm );
|
|
||||||
}
|
|
||||||
|
|
||||||
static void bli_rntm_clear_num_threads_only( rntm_t* rntm )
|
static void bli_rntm_clear_num_threads_only( rntm_t* rntm )
|
||||||
{
|
{
|
||||||
bli_rntm_set_num_threads_only( -1, rntm );
|
bli_rntm_set_num_threads_only( -1, rntm );
|
||||||
@@ -219,10 +212,6 @@ static void bli_rntm_clear_membrk( rntm_t* rntm )
|
|||||||
{
|
{
|
||||||
bli_rntm_set_membrk( NULL, rntm );
|
bli_rntm_set_membrk( NULL, rntm );
|
||||||
}
|
}
|
||||||
static void bli_rntm_clear_l3_sup( rntm_t* rntm )
|
|
||||||
{
|
|
||||||
bli_rntm_set_l3_sup( 1, rntm );
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// -- rntm_t modification (public API) -----------------------------------------
|
// -- rntm_t modification (public API) -----------------------------------------
|
||||||
@@ -251,6 +240,48 @@ static void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir,
|
|||||||
bli_rntm_clear_num_threads_only( rntm );
|
bli_rntm_clear_num_threads_only( rntm );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Store the packing request for matrix A in the given rntm_t.
//
// pack_a: value written to the pack_a field.
// rntm:   object to modify.
static void bli_rntm_set_pack_a( bool_t pack_a, rntm_t* rntm )
{
	rntm->pack_a = pack_a;
}
|
||||||
|
// Store the packing request for matrix B in the given rntm_t.
//
// pack_b: value written to the pack_b field.
// rntm:   object to modify.
static void bli_rntm_set_pack_b( bool_t pack_b, rntm_t* rntm )
{
	rntm->pack_b = pack_b;
}
|
||||||
|
|
||||||
|
// Store the switch that enables or disables level-3 sup handling in the
// given rntm_t.
//
// l3_sup: value written to the l3_sup field.
// rntm:   object to modify.
static void bli_rntm_set_l3_sup( bool_t l3_sup, rntm_t* rntm )
{
	rntm->l3_sup = l3_sup;
}
|
||||||
|
// Turn level-3 sup handling on for the given rntm_t by setting its l3_sup
// field to TRUE.
static void bli_rntm_enable_l3_sup( rntm_t* rntm )
{
	rntm->l3_sup = TRUE;
}
|
||||||
|
// Turn level-3 sup handling off for the given rntm_t by setting its l3_sup
// field to FALSE.
static void bli_rntm_disable_l3_sup( rntm_t* rntm )
{
	rntm->l3_sup = FALSE;
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// -- rntm_t modification (internal use only) ----------------------------------
|
||||||
|
//
|
||||||
|
|
||||||
|
// Reset the pack_a field of the given rntm_t. "Clearing" here means writing
// TRUE, the same value BLIS_RNTM_INITIALIZER assigns to the field.
static void bli_rntm_clear_pack_a( rntm_t* rntm )
{
	rntm->pack_a = TRUE;
}
|
||||||
|
// Reset the pack_b field of the given rntm_t. "Clearing" here means writing
// TRUE, the same value BLIS_RNTM_INITIALIZER assigns to the field.
static void bli_rntm_clear_pack_b( rntm_t* rntm )
{
	rntm->pack_b = TRUE;
}
|
||||||
|
// Reset the l3_sup field of the given rntm_t. "Clearing" here means writing
// TRUE, the same value BLIS_RNTM_INITIALIZER assigns to the field.
static void bli_rntm_clear_l3_sup( rntm_t* rntm )
{
	rntm->l3_sup = TRUE;
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// -- rntm_t initialization ----------------------------------------------------
|
// -- rntm_t initialization ----------------------------------------------------
|
||||||
//
|
//
|
||||||
@@ -263,26 +294,31 @@ static void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir,
|
|||||||
{ \
|
{ \
|
||||||
.num_threads = -1, \
|
.num_threads = -1, \
|
||||||
.thrloop = { -1, -1, -1, -1, -1, -1 }, \
|
.thrloop = { -1, -1, -1, -1, -1, -1 }, \
|
||||||
|
.pack_a = TRUE, \
|
||||||
|
.pack_b = TRUE, \
|
||||||
|
.l3_sup = TRUE, \
|
||||||
.sba_pool = NULL, \
|
.sba_pool = NULL, \
|
||||||
.membrk = NULL, \
|
.membrk = NULL, \
|
||||||
.l3_sup = 1 \
|
|
||||||
} \
|
} \
|
||||||
|
|
||||||
static void bli_rntm_init( rntm_t* rntm )
|
static void bli_rntm_init( rntm_t* rntm )
|
||||||
{
|
{
|
||||||
bli_rntm_clear_num_threads_only( rntm );
|
bli_rntm_clear_num_threads_only( rntm );
|
||||||
bli_rntm_clear_ways_only( rntm );
|
bli_rntm_clear_ways_only( rntm );
|
||||||
|
bli_rntm_clear_pack_a( rntm );
|
||||||
|
bli_rntm_clear_pack_b( rntm );
|
||||||
|
bli_rntm_clear_l3_sup( rntm );
|
||||||
|
|
||||||
bli_rntm_clear_sba_pool( rntm );
|
bli_rntm_clear_sba_pool( rntm );
|
||||||
bli_rntm_clear_membrk( rntm );
|
bli_rntm_clear_membrk( rntm );
|
||||||
|
|
||||||
bli_rntm_clear_l3_sup( rntm );
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// -----------------------------------------------------------------------------
|
// -----------------------------------------------------------------------------
|
||||||
|
|
||||||
// Function prototypes
|
// Function prototypes
|
||||||
|
|
||||||
|
BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm );
|
||||||
|
|
||||||
BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op
|
BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op
|
||||||
(
|
(
|
||||||
opid_t l3_op,
|
opid_t l3_op,
|
||||||
|
|||||||
@@ -1185,6 +1185,13 @@ typedef struct
|
|||||||
inc_t is_a;
|
inc_t is_a;
|
||||||
inc_t is_b;
|
inc_t is_b;
|
||||||
|
|
||||||
|
// The panel strides of A and B.
|
||||||
|
// NOTE: These are only used in situations where iteration over the
|
||||||
|
// micropanels takes place in part within the kernel code (e.g. sup
|
||||||
|
// millikernels).
|
||||||
|
inc_t ps_a;
|
||||||
|
inc_t ps_b;
|
||||||
|
|
||||||
// The type to convert to on output.
|
// The type to convert to on output.
|
||||||
//num_t dt_on_output;
|
//num_t dt_on_output;
|
||||||
|
|
||||||
@@ -1439,11 +1446,18 @@ typedef struct cntx_s
|
|||||||
|
|
||||||
// -- Runtime type --
|
// -- Runtime type --
|
||||||
|
|
||||||
|
// NOTE: The order of these fields must be kept consistent with the definition
|
||||||
|
// of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h.
|
||||||
|
|
||||||
typedef struct rntm_s
|
typedef struct rntm_s
|
||||||
{
|
{
|
||||||
// "External" fields: these may be queried by the end-user.
|
// "External" fields: these may be queried by the end-user.
|
||||||
|
|
||||||
dim_t num_threads;
|
dim_t num_threads;
|
||||||
dim_t thrloop[ BLIS_NUM_LOOPS ];
|
dim_t thrloop[ BLIS_NUM_LOOPS ];
|
||||||
|
bool_t pack_a; // enable/disable packing of left-hand matrix A.
|
||||||
|
bool_t pack_b; // enable/disable packing of right-hand matrix B.
|
||||||
|
bool_t l3_sup; // enable/disable small matrix handling in level-3 ops.
|
||||||
|
|
||||||
// "Internal" fields: these should not be exposed to the end-user.
|
// "Internal" fields: these should not be exposed to the end-user.
|
||||||
|
|
||||||
@@ -1453,9 +1467,6 @@ typedef struct rntm_s
|
|||||||
// The packing block allocator, which is attached in the l3 thread decorator.
|
// The packing block allocator, which is attached in the l3 thread decorator.
|
||||||
membrk_t* membrk;
|
membrk_t* membrk;
|
||||||
|
|
||||||
// A switch to enable/disable small/unpacked matrix handling in level-3 ops.
|
|
||||||
bool_t l3_sup;
|
|
||||||
|
|
||||||
} rntm_t;
|
} rntm_t;
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -129,6 +129,8 @@ extern "C" {
|
|||||||
#include "bli_getopt.h"
|
#include "bli_getopt.h"
|
||||||
#include "bli_opid.h"
|
#include "bli_opid.h"
|
||||||
#include "bli_cntl.h"
|
#include "bli_cntl.h"
|
||||||
|
#include "bli_env.h"
|
||||||
|
#include "bli_pack.h"
|
||||||
#include "bli_info.h"
|
#include "bli_info.h"
|
||||||
#include "bli_arch.h"
|
#include "bli_arch.h"
|
||||||
#include "bli_cpuid.h"
|
#include "bli_cpuid.h"
|
||||||
|
|||||||
@@ -98,8 +98,8 @@ void PASTEMAC(opname,imeth) \
|
|||||||
/* Initialize a local runtime with global settings if necessary. Note
|
/* Initialize a local runtime with global settings if necessary. Note
|
||||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||||
rntm_t rntm_l; \
|
rntm_t rntm_l; \
|
||||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||||
\
|
\
|
||||||
/* Some induced methods execute in multiple "stages". */ \
|
/* Some induced methods execute in multiple "stages". */ \
|
||||||
for ( i = 0; i < nstage; ++i ) \
|
for ( i = 0; i < nstage; ++i ) \
|
||||||
@@ -191,8 +191,8 @@ void PASTEMAC(opname,imeth) \
|
|||||||
/* Initialize a local runtime with global settings if necessary. Note
|
/* Initialize a local runtime with global settings if necessary. Note
|
||||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||||
rntm_t rntm_l; \
|
rntm_t rntm_l; \
|
||||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||||
\
|
\
|
||||||
/* Some induced methods execute in multiple "stages". */ \
|
/* Some induced methods execute in multiple "stages". */ \
|
||||||
for ( i = 0; i < nstage; ++i ) \
|
for ( i = 0; i < nstage; ++i ) \
|
||||||
@@ -282,8 +282,8 @@ void PASTEMAC(opname,imeth) \
|
|||||||
/* Initialize a local runtime with global settings if necessary. Note
|
/* Initialize a local runtime with global settings if necessary. Note
|
||||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||||
rntm_t rntm_l; \
|
rntm_t rntm_l; \
|
||||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||||
\
|
\
|
||||||
/* Some induced methods execute in multiple "stages". */ \
|
/* Some induced methods execute in multiple "stages". */ \
|
||||||
for ( i = 0; i < nstage; ++i ) \
|
for ( i = 0; i < nstage; ++i ) \
|
||||||
@@ -358,8 +358,8 @@ void PASTEMAC(opname,imeth) \
|
|||||||
/* Initialize a local runtime with global settings if necessary. Note
|
/* Initialize a local runtime with global settings if necessary. Note
|
||||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||||
rntm_t rntm_l; \
|
rntm_t rntm_l; \
|
||||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||||
\
|
\
|
||||||
/* Some induced methods execute in multiple "stages". */ \
|
/* Some induced methods execute in multiple "stages". */ \
|
||||||
for ( i = 0; i < nstage; ++i ) \
|
for ( i = 0; i < nstage; ++i ) \
|
||||||
@@ -420,8 +420,8 @@ void PASTEMAC(opname,imeth) \
|
|||||||
/* Initialize a local runtime with global settings if necessary. Note
|
/* Initialize a local runtime with global settings if necessary. Note
|
||||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||||
rntm_t rntm_l; \
|
rntm_t rntm_l; \
|
||||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||||
\
|
\
|
||||||
{ \
|
{ \
|
||||||
/* NOTE: trsm cannot be implemented via any induced method that
|
/* NOTE: trsm cannot be implemented via any induced method that
|
||||||
|
|||||||
@@ -60,8 +60,8 @@ void PASTEMAC(opname,imeth) \
|
|||||||
/* Initialize a local runtime with global settings if necessary. Note
|
/* Initialize a local runtime with global settings if necessary. Note
|
||||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||||
rntm_t rntm_l; \
|
rntm_t rntm_l; \
|
||||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||||
\
|
\
|
||||||
func( alpha, a, b, beta, c, cntx, rntm ); \
|
func( alpha, a, b, beta, c, cntx, rntm ); \
|
||||||
}
|
}
|
||||||
@@ -96,8 +96,8 @@ void PASTEMAC(opname,imeth) \
|
|||||||
/* Initialize a local runtime with global settings if necessary. Note
|
/* Initialize a local runtime with global settings if necessary. Note
|
||||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||||
rntm_t rntm_l; \
|
rntm_t rntm_l; \
|
||||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||||
\
|
\
|
||||||
func( side, alpha, a, b, beta, c, cntx, rntm ); \
|
func( side, alpha, a, b, beta, c, cntx, rntm ); \
|
||||||
}
|
}
|
||||||
@@ -130,8 +130,8 @@ void PASTEMAC(opname,imeth) \
|
|||||||
/* Initialize a local runtime with global settings if necessary. Note
|
/* Initialize a local runtime with global settings if necessary. Note
|
||||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||||
rntm_t rntm_l; \
|
rntm_t rntm_l; \
|
||||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||||
\
|
\
|
||||||
func( alpha, a, beta, c, cntx, rntm ); \
|
func( alpha, a, beta, c, cntx, rntm ); \
|
||||||
}
|
}
|
||||||
@@ -163,8 +163,8 @@ void PASTEMAC(opname,imeth) \
|
|||||||
/* Initialize a local runtime with global settings if necessary. Note
|
/* Initialize a local runtime with global settings if necessary. Note
|
||||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||||
rntm_t rntm_l; \
|
rntm_t rntm_l; \
|
||||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||||
\
|
\
|
||||||
func( side, alpha, a, b, cntx, rntm ); \
|
func( side, alpha, a, b, cntx, rntm ); \
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -65,8 +65,8 @@ void PASTEMAC(opname,imeth) \
|
|||||||
/* Initialize a local runtime with global settings if necessary. Note
|
/* Initialize a local runtime with global settings if necessary. Note
|
||||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||||
rntm_t rntm_l; \
|
rntm_t rntm_l; \
|
||||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||||
\
|
\
|
||||||
/* Invoke the operation's front end. */ \
|
/* Invoke the operation's front end. */ \
|
||||||
PASTEMAC(opname,_front) \
|
PASTEMAC(opname,_front) \
|
||||||
@@ -109,8 +109,8 @@ void PASTEMAC(opname,imeth) \
|
|||||||
/* Initialize a local runtime with global settings if necessary. Note
|
/* Initialize a local runtime with global settings if necessary. Note
|
||||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||||
rntm_t rntm_l; \
|
rntm_t rntm_l; \
|
||||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||||
\
|
\
|
||||||
/* Invoke the operation's front end. */ \
|
/* Invoke the operation's front end. */ \
|
||||||
PASTEMAC(opname,_front) \
|
PASTEMAC(opname,_front) \
|
||||||
@@ -147,8 +147,8 @@ void PASTEMAC(opname,imeth) \
|
|||||||
/* Initialize a local runtime with global settings if necessary. Note
|
/* Initialize a local runtime with global settings if necessary. Note
|
||||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||||
rntm_t rntm_l; \
|
rntm_t rntm_l; \
|
||||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||||
\
|
\
|
||||||
/* Invoke the operation's front end. */ \
|
/* Invoke the operation's front end. */ \
|
||||||
PASTEMAC(opname,_front) \
|
PASTEMAC(opname,_front) \
|
||||||
@@ -184,8 +184,8 @@ void PASTEMAC(opname,imeth) \
|
|||||||
/* Initialize a local runtime with global settings if necessary. Note
|
/* Initialize a local runtime with global settings if necessary. Note
|
||||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||||
rntm_t rntm_l; \
|
rntm_t rntm_l; \
|
||||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||||
\
|
\
|
||||||
/* Invoke the operation's front end. */ \
|
/* Invoke the operation's front end. */ \
|
||||||
PASTEMAC(opname,_front) \
|
PASTEMAC(opname,_front) \
|
||||||
@@ -220,8 +220,8 @@ void PASTEMAC(opname,imeth) \
|
|||||||
/* Initialize a local runtime with global settings if necessary. Note
|
/* Initialize a local runtime with global settings if necessary. Note
|
||||||
that in the case that a runtime is passed in, we make a local copy. */ \
|
that in the case that a runtime is passed in, we make a local copy. */ \
|
||||||
rntm_t rntm_l; \
|
rntm_t rntm_l; \
|
||||||
if ( rntm == NULL ) { bli_thread_init_rntm( &rntm_l ); rntm = &rntm_l; } \
|
if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } \
|
||||||
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
else { rntm_l = *rntm; rntm = &rntm_l; } \
|
||||||
\
|
\
|
||||||
/* Invoke the operation's front end. */ \
|
/* Invoke the operation's front end. */ \
|
||||||
PASTEMAC(opname,_front) \
|
PASTEMAC(opname,_front) \
|
||||||
|
|||||||
77
frame/thread/bli_l3_decor.h
Normal file
77
frame/thread/bli_l3_decor.h
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2014, The University of Texas at Austin
|
||||||
|
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef BLIS_L3_DECOR_H
|
||||||
|
#define BLIS_L3_DECOR_H
|
||||||
|
|
||||||
|
// -- conventional definitions -------------------------------------------------
|
||||||
|
|
||||||
|
// Level-3 internal function type.
|
||||||
|
typedef void (*l3int_t)
|
||||||
|
(
|
||||||
|
obj_t* alpha,
|
||||||
|
obj_t* a,
|
||||||
|
obj_t* b,
|
||||||
|
obj_t* beta,
|
||||||
|
obj_t* c,
|
||||||
|
cntx_t* cntx,
|
||||||
|
rntm_t* rntm,
|
||||||
|
cntl_t* cntl,
|
||||||
|
thrinfo_t* thread
|
||||||
|
);
|
||||||
|
|
||||||
|
// Level-3 thread decorator prototype.
|
||||||
|
void bli_l3_thread_decorator
|
||||||
|
(
|
||||||
|
l3int_t func,
|
||||||
|
opid_t family,
|
||||||
|
obj_t* alpha,
|
||||||
|
obj_t* a,
|
||||||
|
obj_t* b,
|
||||||
|
obj_t* beta,
|
||||||
|
obj_t* c,
|
||||||
|
cntx_t* cntx,
|
||||||
|
rntm_t* rntm,
|
||||||
|
cntl_t* cntl
|
||||||
|
);
|
||||||
|
|
||||||
|
// Include definitions specific to the method of multithreading for the
|
||||||
|
// conventional code path.
|
||||||
|
#include "bli_l3_decor_single.h"
|
||||||
|
#include "bli_l3_decor_openmp.h"
|
||||||
|
#include "bli_l3_decor_pthreads.h"
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
248
frame/thread/bli_l3_decor_openmp.c
Normal file
248
frame/thread/bli_l3_decor_openmp.c
Normal file
@@ -0,0 +1,248 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2014, The University of Texas at Austin
|
||||||
|
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "blis.h"
|
||||||
|
|
||||||
|
#ifdef BLIS_ENABLE_OPENMP
|
||||||
|
|
||||||
|
// Define a dummy function bli_l3_thread_entry(), which is needed in the
|
||||||
|
// pthreads version, so that when building Windows DLLs (with OpenMP enabled
|
||||||
|
// or no multithreading) we don't risk having an unresolved symbol.
|
||||||
|
void* bli_l3_thread_entry( void* data_void )
{
	// Intentionally a stub: the argument is ignored and NULL is always
	// returned.
	( void )data_void;

	return NULL;
}
|
||||||
|
|
||||||
|
//#define PRINT_THRINFO
|
||||||
|
|
||||||
|
void bli_l3_thread_decorator
|
||||||
|
(
|
||||||
|
l3int_t func,
|
||||||
|
opid_t family,
|
||||||
|
obj_t* alpha,
|
||||||
|
obj_t* a,
|
||||||
|
obj_t* b,
|
||||||
|
obj_t* beta,
|
||||||
|
obj_t* c,
|
||||||
|
cntx_t* cntx,
|
||||||
|
rntm_t* rntm,
|
||||||
|
cntl_t* cntl
|
||||||
|
)
|
||||||
|
{
|
||||||
|
// This is part of a hack to support mixed domain in bli_gemm_front().
|
||||||
|
// Sometimes we need to specify a non-standard schema for A and B, and
|
||||||
|
// we decided to transmit them via the schema field in the obj_t's
|
||||||
|
// rather than pass them in as function parameters. Once the values
|
||||||
|
// have been read, we immediately reset them back to their expected
|
||||||
|
// values for unpacked objects.
|
||||||
|
pack_t schema_a = bli_obj_pack_schema( a );
|
||||||
|
pack_t schema_b = bli_obj_pack_schema( b );
|
||||||
|
bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
|
||||||
|
bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
|
||||||
|
|
||||||
|
// Query the total number of threads from the rntm_t object.
|
||||||
|
const dim_t n_threads = bli_rntm_num_threads( rntm );
|
||||||
|
|
||||||
|
#ifdef PRINT_THRINFO
|
||||||
|
thrinfo_t** threads = bli_malloc_intl( n_threads * sizeof( thrinfo_t* ) );
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// NOTE: The sba was initialized in bli_init().
|
||||||
|
|
||||||
|
// Check out an array_t from the small block allocator. This is done
|
||||||
|
// with an internal lock to ensure only one application thread accesses
|
||||||
|
// the sba at a time. bli_sba_checkout_array() will also automatically
|
||||||
|
// resize the array_t, if necessary.
|
||||||
|
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
||||||
|
|
||||||
|
// Access the pool_t* for thread 0 and embed it into the rntm. We do
|
||||||
|
// this up-front only so that we have the rntm_t.sba_pool field
|
||||||
|
// initialized and ready for the global communicator creation below.
|
||||||
|
bli_sba_rntm_set_pool( 0, array, rntm );
|
||||||
|
|
||||||
|
// Set the packing block allocator field of the rntm. This will be
|
||||||
|
// inherited by all of the child threads when they make local copies of
|
||||||
|
// the rntm below.
|
||||||
|
bli_membrk_rntm_set_membrk( rntm );
|
||||||
|
|
||||||
|
// Allocate a global communicator for the root thrinfo_t structures.
|
||||||
|
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
||||||
|
|
||||||
|
|
||||||
|
_Pragma( "omp parallel num_threads(n_threads)" )
|
||||||
|
{
|
||||||
|
// Create a thread-local copy of the master thread's rntm_t. This is
|
||||||
|
// necessary since we want each thread to be able to track its own
|
||||||
|
// small block pool_t as it executes down the function stack.
|
||||||
|
rntm_t rntm_l = *rntm;
|
||||||
|
rntm_t* restrict rntm_p = &rntm_l;
|
||||||
|
|
||||||
|
// Query the thread's id from OpenMP.
|
||||||
|
const dim_t tid = omp_get_thread_num();
|
||||||
|
|
||||||
|
// Check for a somewhat obscure OpenMP thread-mistmatch issue.
|
||||||
|
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
|
||||||
|
|
||||||
|
// Use the thread id to access the appropriate pool_t* within the
|
||||||
|
// array_t, and use it to set the sba_pool field within the rntm_t.
|
||||||
|
// If the pool_t* element within the array_t is NULL, it will first
|
||||||
|
// be allocated/initialized.
|
||||||
|
bli_sba_rntm_set_pool( tid, array, rntm_p );
|
||||||
|
|
||||||
|
|
||||||
|
obj_t a_t, b_t, c_t;
|
||||||
|
cntl_t* cntl_use;
|
||||||
|
thrinfo_t* thread;
|
||||||
|
|
||||||
|
// Alias thread-local copies of A, B, and C. These will be the objects
|
||||||
|
// we pass down the algorithmic function stack. Making thread-local
|
||||||
|
// alaises is highly recommended in case a thread needs to change any
|
||||||
|
// of the properties of an object without affecting other threads'
|
||||||
|
// objects.
|
||||||
|
bli_obj_alias_to( a, &a_t );
|
||||||
|
bli_obj_alias_to( b, &b_t );
|
||||||
|
bli_obj_alias_to( c, &c_t );
|
||||||
|
|
||||||
|
// Create a default control tree for the operation, if needed.
|
||||||
|
bli_l3_cntl_create_if( family, schema_a, schema_b,
|
||||||
|
&a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use );
|
||||||
|
|
||||||
|
// Create the root node of the current thread's thrinfo_t structure.
|
||||||
|
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
|
||||||
|
|
||||||
|
#if 1
|
||||||
|
func
|
||||||
|
(
|
||||||
|
alpha,
|
||||||
|
&a_t,
|
||||||
|
&b_t,
|
||||||
|
beta,
|
||||||
|
&c_t,
|
||||||
|
cntx,
|
||||||
|
rntm_p,
|
||||||
|
cntl_use,
|
||||||
|
thread
|
||||||
|
);
|
||||||
|
#else
|
||||||
|
bli_thrinfo_grow_tree
|
||||||
|
(
|
||||||
|
rntm_p,
|
||||||
|
cntl_use,
|
||||||
|
thread
|
||||||
|
);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Free the thread's local control tree.
|
||||||
|
bli_l3_cntl_free( rntm_p, cntl_use, thread );
|
||||||
|
|
||||||
|
#ifdef PRINT_THRINFO
|
||||||
|
threads[tid] = thread;
|
||||||
|
#else
|
||||||
|
// Free the current thread's thrinfo_t structure.
|
||||||
|
bli_l3_thrinfo_free( rntm_p, thread );
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// We shouldn't free the global communicator since it was already freed
|
||||||
|
// by the global communicator's chief thread in bli_l3_thrinfo_free()
|
||||||
|
// (called above).
|
||||||
|
|
||||||
|
#ifdef PRINT_THRINFO
|
||||||
|
if ( family != BLIS_TRSM ) bli_l3_thrinfo_print_gemm_paths( threads );
|
||||||
|
else bli_l3_thrinfo_print_trsm_paths( threads );
|
||||||
|
exit(1);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Check the array_t back into the small block allocator. Similar to the
|
||||||
|
// check-out, this is done using a lock embedded within the sba to ensure
|
||||||
|
// mutual exclusion.
|
||||||
|
bli_sba_checkin_array( array );
|
||||||
|
}
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
void bli_l3_thread_decorator_thread_check
|
||||||
|
(
|
||||||
|
dim_t n_threads,
|
||||||
|
dim_t tid,
|
||||||
|
thrcomm_t* gl_comm,
|
||||||
|
rntm_t* rntm
|
||||||
|
)
|
||||||
|
{
|
||||||
|
dim_t n_threads_real = omp_get_num_threads();
|
||||||
|
|
||||||
|
// Check if the number of OpenMP threads created within this parallel
|
||||||
|
// region is different from the number of threads that were requested
|
||||||
|
// of BLIS. This inequality may trigger when, for example, the
|
||||||
|
// following conditions are satisfied:
|
||||||
|
// - an application is executing an OpenMP parallel region in which
|
||||||
|
// BLIS is invoked,
|
||||||
|
// - BLIS is configured for multithreading via OpenMP,
|
||||||
|
// - OMP_NUM_THREADS = t > 1,
|
||||||
|
// - the number of threads requested of BLIS (regardless of method)
|
||||||
|
// is p <= t,
|
||||||
|
// - OpenMP nesting is disabled.
|
||||||
|
// In this situation, the application spawns t threads. Each application
|
||||||
|
// thread calls gemm (for example). Each gemm will attempt to spawn p
|
||||||
|
// threads via OpenMP. However, since nesting is disabled, the OpenMP
|
||||||
|
// implementation finds that t >= p threads are already spawned, and
|
||||||
|
// thus it doesn't spawn *any* additional threads for each gemm.
|
||||||
|
if ( n_threads_real != n_threads )
|
||||||
|
{
|
||||||
|
// If the number of threads active in the current region is not
|
||||||
|
// equal to the number requested of BLIS, we then only continue
|
||||||
|
// if the number of threads in the current region is 1. If, for
|
||||||
|
// example, BLIS requested 4 threads but only got 3, then we
|
||||||
|
// abort().
|
||||||
|
//if ( tid == 0 )
|
||||||
|
//{
|
||||||
|
if ( n_threads_real != 1 )
|
||||||
|
{
|
||||||
|
bli_print_msg( "A different number of threads was "
|
||||||
|
"created than was requested.",
|
||||||
|
__FILE__, __LINE__ );
|
||||||
|
bli_abort();
|
||||||
|
}
|
||||||
|
|
||||||
|
//n_threads = 1; // not needed since it has no effect?
|
||||||
|
bli_thrcomm_init( 1, gl_comm );
|
||||||
|
bli_rntm_set_num_threads_only( 1, rntm );
|
||||||
|
bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm );
|
||||||
|
//}
|
||||||
|
|
||||||
|
// Synchronize all threads and continue.
|
||||||
|
_Pragma( "omp barrier" )
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
53
frame/thread/bli_l3_decor_openmp.h
Normal file
53
frame/thread/bli_l3_decor_openmp.h
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2014, The University of Texas at Austin
|
||||||
|
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef BLIS_L3_DECOR_OPENMP_H
|
||||||
|
#define BLIS_L3_DECOR_OPENMP_H
|
||||||
|
|
||||||
|
// Definitions specific to situations when OpenMP multithreading is enabled.
|
||||||
|
#ifdef BLIS_ENABLE_OPENMP
|
||||||
|
|
||||||
|
void bli_l3_thread_decorator_thread_check
|
||||||
|
(
|
||||||
|
dim_t n_threads,
|
||||||
|
dim_t tid,
|
||||||
|
thrcomm_t* gl_comm,
|
||||||
|
rntm_t* rntm
|
||||||
|
);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
252
frame/thread/bli_l3_decor_pthreads.c
Normal file
252
frame/thread/bli_l3_decor_pthreads.c
Normal file
@@ -0,0 +1,252 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2014, The University of Texas at Austin
|
||||||
|
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "blis.h"
|
||||||
|
|
||||||
|
#ifdef BLIS_ENABLE_PTHREADS
|
||||||
|
|
||||||
|
// A data structure to assist in passing operands to additional threads.
|
||||||
|
typedef struct thread_data
|
||||||
|
{
|
||||||
|
l3int_t func;
|
||||||
|
opid_t family;
|
||||||
|
pack_t schema_a;
|
||||||
|
pack_t schema_b;
|
||||||
|
obj_t* alpha;
|
||||||
|
obj_t* a;
|
||||||
|
obj_t* b;
|
||||||
|
obj_t* beta;
|
||||||
|
obj_t* c;
|
||||||
|
cntx_t* cntx;
|
||||||
|
rntm_t* rntm;
|
||||||
|
cntl_t* cntl;
|
||||||
|
dim_t tid;
|
||||||
|
thrcomm_t* gl_comm;
|
||||||
|
array_t* array;
|
||||||
|
} thread_data_t;
|
||||||
|
|
||||||
|
// Entry point for additional threads
|
||||||
|
void* bli_l3_thread_entry( void* data_void )
|
||||||
|
{
|
||||||
|
thread_data_t* data = data_void;
|
||||||
|
|
||||||
|
l3int_t func = data->func;
|
||||||
|
opid_t family = data->family;
|
||||||
|
pack_t schema_a = data->schema_a;
|
||||||
|
pack_t schema_b = data->schema_b;
|
||||||
|
obj_t* alpha = data->alpha;
|
||||||
|
obj_t* a = data->a;
|
||||||
|
obj_t* b = data->b;
|
||||||
|
obj_t* beta = data->beta;
|
||||||
|
obj_t* c = data->c;
|
||||||
|
cntx_t* cntx = data->cntx;
|
||||||
|
rntm_t* rntm = data->rntm;
|
||||||
|
cntl_t* cntl = data->cntl;
|
||||||
|
dim_t tid = data->tid;
|
||||||
|
array_t* array = data->array;
|
||||||
|
thrcomm_t* gl_comm = data->gl_comm;
|
||||||
|
|
||||||
|
// Create a thread-local copy of the master thread's rntm_t. This is
|
||||||
|
// necessary since we want each thread to be able to track its own
|
||||||
|
// small block pool_t as it executes down the function stack.
|
||||||
|
rntm_t rntm_l = *rntm;
|
||||||
|
rntm_t* restrict rntm_p = &rntm_l;
|
||||||
|
|
||||||
|
// Use the thread id to access the appropriate pool_t* within the
|
||||||
|
// array_t, and use it to set the sba_pool field within the rntm_t.
|
||||||
|
// If the pool_t* element within the array_t is NULL, it will first
|
||||||
|
// be allocated/initialized.
|
||||||
|
bli_sba_rntm_set_pool( tid, array, rntm_p );
|
||||||
|
|
||||||
|
obj_t a_t, b_t, c_t;
|
||||||
|
cntl_t* cntl_use;
|
||||||
|
thrinfo_t* thread;
|
||||||
|
|
||||||
|
// Alias thread-local copies of A, B, and C. These will be the objects
|
||||||
|
// we pass down the algorithmic function stack. Making thread-local
|
||||||
|
// alaises is highly recommended in case a thread needs to change any
|
||||||
|
// of the properties of an object without affecting other threads'
|
||||||
|
// objects.
|
||||||
|
bli_obj_alias_to( a, &a_t );
|
||||||
|
bli_obj_alias_to( b, &b_t );
|
||||||
|
bli_obj_alias_to( c, &c_t );
|
||||||
|
|
||||||
|
// Create a default control tree for the operation, if needed.
|
||||||
|
bli_l3_cntl_create_if( family, schema_a, schema_b,
|
||||||
|
&a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use );
|
||||||
|
|
||||||
|
// Create the root node of the current thread's thrinfo_t structure.
|
||||||
|
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
|
||||||
|
|
||||||
|
func
|
||||||
|
(
|
||||||
|
alpha,
|
||||||
|
&a_t,
|
||||||
|
&b_t,
|
||||||
|
beta,
|
||||||
|
&c_t,
|
||||||
|
cntx,
|
||||||
|
rntm_p,
|
||||||
|
cntl_use,
|
||||||
|
thread
|
||||||
|
);
|
||||||
|
|
||||||
|
// Free the thread's local control tree.
|
||||||
|
bli_l3_cntl_free( rntm_p, cntl_use, thread );
|
||||||
|
|
||||||
|
// Free the current thread's thrinfo_t structure.
|
||||||
|
bli_l3_thrinfo_free( rntm_p, thread );
|
||||||
|
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
void bli_l3_thread_decorator
|
||||||
|
(
|
||||||
|
l3int_t func,
|
||||||
|
opid_t family,
|
||||||
|
obj_t* alpha,
|
||||||
|
obj_t* a,
|
||||||
|
obj_t* b,
|
||||||
|
obj_t* beta,
|
||||||
|
obj_t* c,
|
||||||
|
cntx_t* cntx,
|
||||||
|
rntm_t* rntm,
|
||||||
|
cntl_t* cntl
|
||||||
|
)
|
||||||
|
{
|
||||||
|
// This is part of a hack to support mixed domain in bli_gemm_front().
|
||||||
|
// Sometimes we need to specify a non-standard schema for A and B, and
|
||||||
|
// we decided to transmit them via the schema field in the obj_t's
|
||||||
|
// rather than pass them in as function parameters. Once the values
|
||||||
|
// have been read, we immediately reset them back to their expected
|
||||||
|
// values for unpacked objects.
|
||||||
|
pack_t schema_a = bli_obj_pack_schema( a );
|
||||||
|
pack_t schema_b = bli_obj_pack_schema( b );
|
||||||
|
bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
|
||||||
|
bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
|
||||||
|
|
||||||
|
// Query the total number of threads from the context.
|
||||||
|
const dim_t n_threads = bli_rntm_num_threads( rntm );
|
||||||
|
|
||||||
|
// NOTE: The sba was initialized in bli_init().
|
||||||
|
|
||||||
|
// Check out an array_t from the small block allocator. This is done
|
||||||
|
// with an internal lock to ensure only one application thread accesses
|
||||||
|
// the sba at a time. bli_sba_checkout_array() will also automatically
|
||||||
|
// resize the array_t, if necessary.
|
||||||
|
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
||||||
|
|
||||||
|
// Access the pool_t* for thread 0 and embed it into the rntm. We do
|
||||||
|
// this up-front only so that we have the rntm_t.sba_pool field
|
||||||
|
// initialized and ready for the global communicator creation below.
|
||||||
|
bli_sba_rntm_set_pool( 0, array, rntm );
|
||||||
|
|
||||||
|
// Set the packing block allocator field of the rntm. This will be
|
||||||
|
// inherited by all of the child threads when they make local copies of
|
||||||
|
// the rntm below.
|
||||||
|
bli_membrk_rntm_set_membrk( rntm );
|
||||||
|
|
||||||
|
// Allocate a global communicator for the root thrinfo_t structures.
|
||||||
|
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
||||||
|
|
||||||
|
// Allocate an array of pthread objects and auxiliary data structs to pass
|
||||||
|
// to the thread entry functions.
|
||||||
|
|
||||||
|
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||||
|
printf( "bli_l3_thread_decorator().pth: " );
|
||||||
|
#endif
|
||||||
|
bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads );
|
||||||
|
|
||||||
|
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||||
|
printf( "bli_l3_thread_decorator().pth: " );
|
||||||
|
#endif
|
||||||
|
thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads );
|
||||||
|
|
||||||
|
// NOTE: We must iterate backwards so that the chief thread (thread id 0)
|
||||||
|
// can spawn all other threads before proceeding with its own computation.
|
||||||
|
for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- )
|
||||||
|
{
|
||||||
|
// Set up thread data for additional threads (beyond thread 0).
|
||||||
|
datas[tid].func = func;
|
||||||
|
datas[tid].family = family;
|
||||||
|
datas[tid].schema_a = schema_a;
|
||||||
|
datas[tid].schema_b = schema_b;
|
||||||
|
datas[tid].alpha = alpha;
|
||||||
|
datas[tid].a = a;
|
||||||
|
datas[tid].b = b;
|
||||||
|
datas[tid].beta = beta;
|
||||||
|
datas[tid].c = c;
|
||||||
|
datas[tid].cntx = cntx;
|
||||||
|
datas[tid].rntm = rntm;
|
||||||
|
datas[tid].cntl = cntl;
|
||||||
|
datas[tid].tid = tid;
|
||||||
|
datas[tid].gl_comm = gl_comm;
|
||||||
|
datas[tid].array = array;
|
||||||
|
|
||||||
|
// Spawn additional threads for ids greater than 1.
|
||||||
|
if ( tid != 0 )
|
||||||
|
bli_pthread_create( &pthreads[tid], NULL, &bli_l3_thread_entry, &datas[tid] );
|
||||||
|
else
|
||||||
|
bli_l3_thread_entry( ( void* )(&datas[0]) );
|
||||||
|
}
|
||||||
|
|
||||||
|
// We shouldn't free the global communicator since it was already freed
|
||||||
|
// by the global communicator's chief thread in bli_l3_thrinfo_free()
|
||||||
|
// (called from the thread entry function).
|
||||||
|
|
||||||
|
// Thread 0 waits for additional threads to finish.
|
||||||
|
for ( dim_t tid = 1; tid < n_threads; tid++ )
|
||||||
|
{
|
||||||
|
bli_pthread_join( pthreads[tid], NULL );
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check the array_t back into the small block allocator. Similar to the
|
||||||
|
// check-out, this is done using a lock embedded within the sba to ensure
|
||||||
|
// mutual exclusion.
|
||||||
|
bli_sba_checkin_array( array );
|
||||||
|
|
||||||
|
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||||
|
printf( "bli_l3_thread_decorator().pth: " );
|
||||||
|
#endif
|
||||||
|
bli_free_intl( pthreads );
|
||||||
|
|
||||||
|
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||||
|
printf( "bli_l3_thread_decorator().pth: " );
|
||||||
|
#endif
|
||||||
|
bli_free_intl( datas );
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
47
frame/thread/bli_l3_decor_pthreads.h
Normal file
47
frame/thread/bli_l3_decor_pthreads.h
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2014, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef BLIS_L3_DECOR_PTHREADS_H
|
||||||
|
#define BLIS_L3_DECOR_PTHREADS_H
|
||||||
|
|
||||||
|
// Definitions specific to situations when POSIX multithreading is enabled.
|
||||||
|
#ifdef BLIS_ENABLE_PTHREADS
|
||||||
|
|
||||||
|
// Thread entry point prototype.
|
||||||
|
// data_void is an opaque pointer to the argument bundle for the thread
// (the pthreads implementation interprets it as its per-thread data
// struct); that implementation returns NULL once the thread's work is done.
void* bli_l3_thread_entry( void* data_void );
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
150
frame/thread/bli_l3_decor_single.c
Normal file
150
frame/thread/bli_l3_decor_single.c
Normal file
@@ -0,0 +1,150 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2014, The University of Texas at Austin
|
||||||
|
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "blis.h"
|
||||||
|
|
||||||
|
#ifndef BLIS_ENABLE_MULTITHREADING
|
||||||
|
|
||||||
|
void bli_l3_thread_decorator
|
||||||
|
(
|
||||||
|
l3int_t func,
|
||||||
|
opid_t family,
|
||||||
|
obj_t* alpha,
|
||||||
|
obj_t* a,
|
||||||
|
obj_t* b,
|
||||||
|
obj_t* beta,
|
||||||
|
obj_t* c,
|
||||||
|
cntx_t* cntx,
|
||||||
|
rntm_t* rntm,
|
||||||
|
cntl_t* cntl
|
||||||
|
)
|
||||||
|
{
|
||||||
|
// This is part of a hack to support mixed domain in bli_gemm_front().
|
||||||
|
// Sometimes we need to specify a non-standard schema for A and B, and
|
||||||
|
// we decided to transmit them via the schema field in the obj_t's
|
||||||
|
// rather than pass them in as function parameters. Once the values
|
||||||
|
// have been read, we immediately reset them back to their expected
|
||||||
|
// values for unpacked objects.
|
||||||
|
pack_t schema_a = bli_obj_pack_schema( a );
|
||||||
|
pack_t schema_b = bli_obj_pack_schema( b );
|
||||||
|
bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
|
||||||
|
bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
|
||||||
|
|
||||||
|
// For sequential execution, we use only one thread.
|
||||||
|
const dim_t n_threads = 1;
|
||||||
|
|
||||||
|
// NOTE: The sba was initialized in bli_init().
|
||||||
|
|
||||||
|
// Check out an array_t from the small block allocator. This is done
|
||||||
|
// with an internal lock to ensure only one application thread accesses
|
||||||
|
// the sba at a time. bli_sba_checkout_array() will also automatically
|
||||||
|
// resize the array_t, if necessary.
|
||||||
|
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
||||||
|
|
||||||
|
// Access the pool_t* for thread 0 and embed it into the rntm. We do
|
||||||
|
// this up-front only so that we can create the global comm below.
|
||||||
|
bli_sba_rntm_set_pool( 0, array, rntm );
|
||||||
|
|
||||||
|
// Set the packing block allocator field of the rntm.
|
||||||
|
bli_membrk_rntm_set_membrk( rntm );
|
||||||
|
|
||||||
|
// Allcoate a global communicator for the root thrinfo_t structures.
|
||||||
|
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
||||||
|
|
||||||
|
|
||||||
|
{
|
||||||
|
// NOTE: We don't need to create another copy of the rntm_t since
|
||||||
|
// it was already copied in one of the high-level oapi functions.
|
||||||
|
rntm_t* restrict rntm_p = rntm;
|
||||||
|
|
||||||
|
cntl_t* cntl_use;
|
||||||
|
thrinfo_t* thread;
|
||||||
|
|
||||||
|
const dim_t tid = 0;
|
||||||
|
|
||||||
|
// Use the thread id to access the appropriate pool_t* within the
|
||||||
|
// array_t, and use it to set the sba_pool field within the rntm_t.
|
||||||
|
// If the pool_t* element within the array_t is NULL, it will first
|
||||||
|
// be allocated/initialized.
|
||||||
|
// NOTE: This is commented out because, in the single-threaded case,
|
||||||
|
// this is redundant since it's already been done above.
|
||||||
|
//bli_sba_rntm_set_pool( tid, array, rntm_p );
|
||||||
|
|
||||||
|
// NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't
|
||||||
|
// need to alias objects for A, B, and C since they were already aliased
|
||||||
|
// in bli_*_front(). However, we may add aliasing here in the future so
|
||||||
|
// that, with all three (_single.c, _openmp.c, _pthreads.c) implementations
|
||||||
|
// consistently providing local aliases, we can then eliminate aliasing
|
||||||
|
// elsewhere.
|
||||||
|
|
||||||
|
// Create a default control tree for the operation, if needed.
|
||||||
|
bli_l3_cntl_create_if( family, schema_a, schema_b,
|
||||||
|
a, b, c, rntm_p, cntl, &cntl_use );
|
||||||
|
|
||||||
|
// Create the root node of the thread's thrinfo_t structure.
|
||||||
|
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
|
||||||
|
|
||||||
|
func
|
||||||
|
(
|
||||||
|
alpha,
|
||||||
|
a,
|
||||||
|
b,
|
||||||
|
beta,
|
||||||
|
c,
|
||||||
|
cntx,
|
||||||
|
rntm_p,
|
||||||
|
cntl_use,
|
||||||
|
thread
|
||||||
|
);
|
||||||
|
|
||||||
|
// Free the thread's local control tree.
|
||||||
|
bli_l3_cntl_free( rntm_p, cntl_use, thread );
|
||||||
|
|
||||||
|
// Free the current thread's thrinfo_t structure.
|
||||||
|
bli_l3_thrinfo_free( rntm_p, thread );
|
||||||
|
}
|
||||||
|
|
||||||
|
// We shouldn't free the global communicator since it was already freed
|
||||||
|
// by the global communicator's chief thread in bli_l3_thrinfo_free()
|
||||||
|
// (called above).
|
||||||
|
|
||||||
|
// Check the array_t back into the small block allocator. Similar to the
|
||||||
|
// check-out, this is done using a lock embedded within the sba to ensure
|
||||||
|
// mutual exclusion.
|
||||||
|
bli_sba_checkin_array( array );
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
44
frame/thread/bli_l3_decor_single.h
Normal file
44
frame/thread/bli_l3_decor_single.h
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2014, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef BLIS_L3_DECOR_SINGLE_H
|
||||||
|
#define BLIS_L3_DECOR_SINGLE_H
|
||||||
|
|
||||||
|
// Definitions specific to situations when multithreading is disabled.
|
||||||
|
#ifndef BLIS_ENABLE_MULTITHREADING
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
78
frame/thread/bli_l3_sup_decor.h
Normal file
78
frame/thread/bli_l3_sup_decor.h
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2014, The University of Texas at Austin
|
||||||
|
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef BLIS_L3_SUP_DECOR_H
|
||||||
|
#define BLIS_L3_SUP_DECOR_H
|
||||||
|
|
||||||
|
// -- sup definitions ----------------------------------------------------------
|
||||||
|
|
||||||
|
// Level-3 sup internal function type.
|
||||||
|
typedef err_t (*l3supint_t)
|
||||||
|
(
|
||||||
|
obj_t* alpha,
|
||||||
|
obj_t* a,
|
||||||
|
obj_t* b,
|
||||||
|
obj_t* beta,
|
||||||
|
obj_t* c,
|
||||||
|
cntx_t* cntx,
|
||||||
|
rntm_t* rntm,
|
||||||
|
cntl_t* cntl,
|
||||||
|
thrinfo_t* thread
|
||||||
|
);
|
||||||
|
|
||||||
|
// Level-3 sup thread decorator prototype.
|
||||||
|
err_t bli_l3_sup_thread_decorator
|
||||||
|
(
|
||||||
|
l3supint_t func,
|
||||||
|
opid_t family,
|
||||||
|
//pack_t schema_a,
|
||||||
|
//pack_t schema_b,
|
||||||
|
obj_t* alpha,
|
||||||
|
obj_t* a,
|
||||||
|
obj_t* b,
|
||||||
|
obj_t* beta,
|
||||||
|
obj_t* c,
|
||||||
|
cntx_t* cntx,
|
||||||
|
rntm_t* rntm
|
||||||
|
);
|
||||||
|
|
||||||
|
// Include definitions specific to the method of multithreading for the
|
||||||
|
// sup code path.
|
||||||
|
#include "bli_l3_sup_decor_single.h"
|
||||||
|
//#include "bli_l3_sup_decor_openmp.h"
|
||||||
|
//#include "bli_l3_sup_decor_pthreads.h"
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
183
frame/thread/bli_l3_sup_decor_single.c
Normal file
183
frame/thread/bli_l3_sup_decor_single.c
Normal file
@@ -0,0 +1,183 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2014, The University of Texas at Austin
|
||||||
|
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "blis.h"
|
||||||
|
|
||||||
|
#ifndef BLIS_ENABLE_MULTITHREADING
|
||||||
|
|
||||||
|
err_t bli_l3_sup_thread_decorator
|
||||||
|
(
|
||||||
|
l3supint_t func,
|
||||||
|
opid_t family,
|
||||||
|
//pack_t schema_a,
|
||||||
|
//pack_t schema_b,
|
||||||
|
obj_t* alpha,
|
||||||
|
obj_t* a,
|
||||||
|
obj_t* b,
|
||||||
|
obj_t* beta,
|
||||||
|
obj_t* c,
|
||||||
|
cntx_t* cntx,
|
||||||
|
rntm_t* rntm
|
||||||
|
)
|
||||||
|
{
|
||||||
|
#if 0
|
||||||
|
|
||||||
|
return
|
||||||
|
bli_gemmsup_int
|
||||||
|
(
|
||||||
|
alpha,
|
||||||
|
a,
|
||||||
|
b,
|
||||||
|
beta,
|
||||||
|
c,
|
||||||
|
cntx,
|
||||||
|
rntm,
|
||||||
|
0
|
||||||
|
);
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
// This is part of a hack to support mixed domain in bli_gemm_front().
|
||||||
|
// Sometimes we need to specify a non-standard schema for A and B, and
|
||||||
|
// we decided to transmit them via the schema field in the obj_t's
|
||||||
|
// rather than pass them in as function parameters. Once the values
|
||||||
|
// have been read, we immediately reset them back to their expected
|
||||||
|
// values for unpacked objects.
|
||||||
|
//pack_t schema_a = bli_obj_pack_schema( a );
|
||||||
|
//pack_t schema_b = bli_obj_pack_schema( b );
|
||||||
|
//bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
|
||||||
|
//bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
|
||||||
|
|
||||||
|
// For sequential execution, we use only one thread.
|
||||||
|
const dim_t n_threads = 1;
|
||||||
|
|
||||||
|
// NOTE: The sba was initialized in bli_init().
|
||||||
|
|
||||||
|
// Check out an array_t from the small block allocator. This is done
|
||||||
|
// with an internal lock to ensure only one application thread accesses
|
||||||
|
// the sba at a time. bli_sba_checkout_array() will also automatically
|
||||||
|
// resize the array_t, if necessary.
|
||||||
|
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
||||||
|
|
||||||
|
// Access the pool_t* for thread 0 and embed it into the rntm. We do
|
||||||
|
// this up-front only so that we can create the global comm below.
|
||||||
|
bli_sba_rntm_set_pool( 0, array, rntm );
|
||||||
|
|
||||||
|
// Set the packing block allocator field of the rntm.
|
||||||
|
bli_membrk_rntm_set_membrk( rntm );
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
// Allocate a global communicator for the root thrinfo_t structures.
|
||||||
|
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
{
|
||||||
|
// NOTE: We don't need to create another copy of the rntm_t since
|
||||||
|
// it was already copied in one of the high-level oapi functions.
|
||||||
|
rntm_t* restrict rntm_p = rntm;
|
||||||
|
|
||||||
|
cntl_t* cntl_use = NULL;
|
||||||
|
//thrinfo_t* thread = NULL;
|
||||||
|
thrinfo_t* thread = &BLIS_PACKM_SINGLE_THREADED;
|
||||||
|
|
||||||
|
const dim_t tid = 0;
|
||||||
|
|
||||||
|
// Use the thread id to access the appropriate pool_t* within the
|
||||||
|
// array_t, and use it to set the sba_pool field within the rntm_t.
|
||||||
|
// If the pool_t* element within the array_t is NULL, it will first
|
||||||
|
// be allocated/initialized.
|
||||||
|
// NOTE: This is commented out because, in the single-threaded case,
|
||||||
|
// this is redundant since it's already been done above.
|
||||||
|
//bli_sba_rntm_set_pool( tid, array, rntm_p );
|
||||||
|
|
||||||
|
// NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't
|
||||||
|
// need to alias objects for A, B, and C since they were already aliased
|
||||||
|
// in bli_*_front(). However, we may add aliasing here in the future so
|
||||||
|
// that, with all three (_single.c, _openmp.c, _pthreads.c) implementations
|
||||||
|
// consistently providing local aliases, we can then eliminate aliasing
|
||||||
|
// elsewhere.
|
||||||
|
|
||||||
|
// Create a default control tree for the operation, if needed.
|
||||||
|
//bli_l3_cntl_create_if( family, schema_a, schema_b,
|
||||||
|
// a, b, c, rntm_p, cntl, &cntl_use );
|
||||||
|
#if 0
|
||||||
|
cntl_use = bli_gemm_cntl_create( rntm_p, family, schema_a, schema_b );
|
||||||
|
|
||||||
|
// Create the root node of the thread's thrinfo_t structure.
|
||||||
|
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
|
||||||
|
#endif
|
||||||
|
|
||||||
|
( void )tid;
|
||||||
|
|
||||||
|
func
|
||||||
|
(
|
||||||
|
alpha,
|
||||||
|
a,
|
||||||
|
b,
|
||||||
|
beta,
|
||||||
|
c,
|
||||||
|
cntx,
|
||||||
|
rntm_p,
|
||||||
|
cntl_use,
|
||||||
|
thread
|
||||||
|
);
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
// Free the thread's local control tree.
|
||||||
|
//bli_l3_cntl_free( rntm_p, cntl_use, thread );
|
||||||
|
bli_gemm_cntl_free( rntm_p, cntl_use, thread );
|
||||||
|
|
||||||
|
// Free the current thread's thrinfo_t structure.
|
||||||
|
bli_l3_thrinfo_free( rntm_p, thread );
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// We shouldn't free the global communicator since it was already freed
|
||||||
|
// by the global communicator's chief thread in bli_l3_thrinfo_free()
|
||||||
|
// (called above).
|
||||||
|
|
||||||
|
// Check the array_t back into the small block allocator. Similar to the
|
||||||
|
// check-out, this is done using a lock embedded within the sba to ensure
|
||||||
|
// mutual exclusion.
|
||||||
|
bli_sba_checkin_array( array );
|
||||||
|
|
||||||
|
return BLIS_SUCCESS;
|
||||||
|
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
44
frame/thread/bli_l3_sup_decor_single.h
Normal file
44
frame/thread/bli_l3_sup_decor_single.h
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
/*
|
||||||
|
|
||||||
|
BLIS
|
||||||
|
An object-based framework for developing high-performance BLAS-like
|
||||||
|
libraries.
|
||||||
|
|
||||||
|
Copyright (C) 2014, The University of Texas at Austin
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
- Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
- Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef BLIS_L3_SUP_DECOR_SINGLE_H
|
||||||
|
#define BLIS_L3_SUP_DECOR_SINGLE_H
|
||||||
|
|
||||||
|
// Definitions specific to situations when multithreading is disabled.
|
||||||
|
#ifndef BLIS_ENABLE_MULTITHREADING
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
@@ -43,10 +43,6 @@
|
|||||||
#include "bli_thrcomm_pthreads.h"
|
#include "bli_thrcomm_pthreads.h"
|
||||||
|
|
||||||
|
|
||||||
// thread entry point prototype.
|
|
||||||
void* bli_l3_thread_entry( void* data_void );
|
|
||||||
|
|
||||||
|
|
||||||
// thrcomm_t query (field only)
|
// thrcomm_t query (field only)
|
||||||
|
|
||||||
static dim_t bli_thrcomm_num_threads( thrcomm_t* comm )
|
static dim_t bli_thrcomm_num_threads( thrcomm_t* comm )
|
||||||
|
|||||||
@@ -214,212 +214,5 @@ void bli_thrcomm_tree_barrier( barrier_t* barack )
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
// Define a dummy function bli_l3_thread_entry(), which is needed in the
|
|
||||||
// pthreads version, so that when building Windows DLLs (with OpenMP enabled
|
|
||||||
// or no multithreading) we don't risk having an unresolved symbol.
|
|
||||||
void* bli_l3_thread_entry( void* data_void ) { return NULL; }
|
|
||||||
|
|
||||||
//#define PRINT_THRINFO
|
|
||||||
|
|
||||||
void bli_l3_thread_decorator
|
|
||||||
(
|
|
||||||
l3int_t func,
|
|
||||||
opid_t family,
|
|
||||||
obj_t* alpha,
|
|
||||||
obj_t* a,
|
|
||||||
obj_t* b,
|
|
||||||
obj_t* beta,
|
|
||||||
obj_t* c,
|
|
||||||
cntx_t* cntx,
|
|
||||||
rntm_t* rntm,
|
|
||||||
cntl_t* cntl
|
|
||||||
)
|
|
||||||
{
|
|
||||||
// This is part of a hack to support mixed domain in bli_gemm_front().
|
|
||||||
// Sometimes we need to specify a non-standard schema for A and B, and
|
|
||||||
// we decided to transmit them via the schema field in the obj_t's
|
|
||||||
// rather than pass them in as function parameters. Once the values
|
|
||||||
// have been read, we immediately reset them back to their expected
|
|
||||||
// values for unpacked objects.
|
|
||||||
pack_t schema_a = bli_obj_pack_schema( a );
|
|
||||||
pack_t schema_b = bli_obj_pack_schema( b );
|
|
||||||
bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
|
|
||||||
bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
|
|
||||||
|
|
||||||
// Query the total number of threads from the rntm_t object.
|
|
||||||
const dim_t n_threads = bli_rntm_num_threads( rntm );
|
|
||||||
|
|
||||||
#ifdef PRINT_THRINFO
|
|
||||||
thrinfo_t** threads = bli_malloc_intl( n_threads * sizeof( thrinfo_t* ) );
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// NOTE: The sba was initialized in bli_init().
|
|
||||||
|
|
||||||
// Check out an array_t from the small block allocator. This is done
|
|
||||||
// with an internal lock to ensure only one application thread accesses
|
|
||||||
// the sba at a time. bli_sba_checkout_array() will also automatically
|
|
||||||
// resize the array_t, if necessary.
|
|
||||||
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
|
||||||
|
|
||||||
// Access the pool_t* for thread 0 and embed it into the rntm. We do
|
|
||||||
// this up-front only so that we have the rntm_t.sba_pool field
|
|
||||||
// initialized and ready for the global communicator creation below.
|
|
||||||
bli_sba_rntm_set_pool( 0, array, rntm );
|
|
||||||
|
|
||||||
// Set the packing block allocator field of the rntm. This will be
|
|
||||||
// inherited by all of the child threads when they make local copies of
|
|
||||||
// the rntm below.
|
|
||||||
bli_membrk_rntm_set_membrk( rntm );
|
|
||||||
|
|
||||||
// Allocate a global communicator for the root thrinfo_t structures.
|
|
||||||
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
|
||||||
|
|
||||||
|
|
||||||
_Pragma( "omp parallel num_threads(n_threads)" )
|
|
||||||
{
|
|
||||||
// Create a thread-local copy of the master thread's rntm_t. This is
|
|
||||||
// necessary since we want each thread to be able to track its own
|
|
||||||
// small block pool_t as it executes down the function stack.
|
|
||||||
rntm_t rntm_l = *rntm;
|
|
||||||
rntm_t* restrict rntm_p = &rntm_l;
|
|
||||||
|
|
||||||
// Query the thread's id from OpenMP.
|
|
||||||
const dim_t tid = omp_get_thread_num();
|
|
||||||
|
|
||||||
// Check for a somewhat obscure OpenMP thread-mismatch issue.
|
|
||||||
bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );
|
|
||||||
|
|
||||||
// Use the thread id to access the appropriate pool_t* within the
|
|
||||||
// array_t, and use it to set the sba_pool field within the rntm_t.
|
|
||||||
// If the pool_t* element within the array_t is NULL, it will first
|
|
||||||
// be allocated/initialized.
|
|
||||||
bli_sba_rntm_set_pool( tid, array, rntm_p );
|
|
||||||
|
|
||||||
|
|
||||||
obj_t a_t, b_t, c_t;
|
|
||||||
cntl_t* cntl_use;
|
|
||||||
thrinfo_t* thread;
|
|
||||||
|
|
||||||
// Alias thread-local copies of A, B, and C. These will be the objects
|
|
||||||
// we pass down the algorithmic function stack. Making thread-local
|
|
||||||
// aliases is highly recommended in case a thread needs to change any
|
|
||||||
// of the properties of an object without affecting other threads'
|
|
||||||
// objects.
|
|
||||||
bli_obj_alias_to( a, &a_t );
|
|
||||||
bli_obj_alias_to( b, &b_t );
|
|
||||||
bli_obj_alias_to( c, &c_t );
|
|
||||||
|
|
||||||
// Create a default control tree for the operation, if needed.
|
|
||||||
bli_l3_cntl_create_if( family, schema_a, schema_b,
|
|
||||||
&a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use );
|
|
||||||
|
|
||||||
// Create the root node of the current thread's thrinfo_t structure.
|
|
||||||
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
|
|
||||||
|
|
||||||
#if 1
|
|
||||||
func
|
|
||||||
(
|
|
||||||
alpha,
|
|
||||||
&a_t,
|
|
||||||
&b_t,
|
|
||||||
beta,
|
|
||||||
&c_t,
|
|
||||||
cntx,
|
|
||||||
rntm_p,
|
|
||||||
cntl_use,
|
|
||||||
thread
|
|
||||||
);
|
|
||||||
#else
|
|
||||||
bli_thrinfo_grow_tree
|
|
||||||
(
|
|
||||||
rntm_p,
|
|
||||||
cntl_use,
|
|
||||||
thread
|
|
||||||
);
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Free the thread's local control tree.
|
|
||||||
bli_l3_cntl_free( rntm_p, cntl_use, thread );
|
|
||||||
|
|
||||||
#ifdef PRINT_THRINFO
|
|
||||||
threads[tid] = thread;
|
|
||||||
#else
|
|
||||||
// Free the current thread's thrinfo_t structure.
|
|
||||||
bli_l3_thrinfo_free( rntm_p, thread );
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
// We shouldn't free the global communicator since it was already freed
|
|
||||||
// by the global communicator's chief thread in bli_l3_thrinfo_free()
|
|
||||||
// (called above).
|
|
||||||
|
|
||||||
#ifdef PRINT_THRINFO
|
|
||||||
if ( family != BLIS_TRSM ) bli_l3_thrinfo_print_gemm_paths( threads );
|
|
||||||
else bli_l3_thrinfo_print_trsm_paths( threads );
|
|
||||||
exit(1);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Check the array_t back into the small block allocator. Similar to the
|
|
||||||
// check-out, this is done using a lock embedded within the sba to ensure
|
|
||||||
// mutual exclusion.
|
|
||||||
bli_sba_checkin_array( array );
|
|
||||||
}
|
|
||||||
|
|
||||||
// -----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
void bli_l3_thread_decorator_thread_check
|
|
||||||
(
|
|
||||||
dim_t n_threads,
|
|
||||||
dim_t tid,
|
|
||||||
thrcomm_t* gl_comm,
|
|
||||||
rntm_t* rntm
|
|
||||||
)
|
|
||||||
{
|
|
||||||
dim_t n_threads_real = omp_get_num_threads();
|
|
||||||
|
|
||||||
// Check if the number of OpenMP threads created within this parallel
|
|
||||||
// region is different from the number of threads that were requested
|
|
||||||
// of BLIS. This inequality may trigger when, for example, the
|
|
||||||
// following conditions are satisfied:
|
|
||||||
// - an application is executing an OpenMP parallel region in which
|
|
||||||
// BLIS is invoked,
|
|
||||||
// - BLIS is configured for multithreading via OpenMP,
|
|
||||||
// - OMP_NUM_THREADS = t > 1,
|
|
||||||
// - the number of threads requested of BLIS (regardless of method)
|
|
||||||
// is p <= t,
|
|
||||||
// - OpenMP nesting is disabled.
|
|
||||||
// In this situation, the application spawns t threads. Each application
|
|
||||||
// thread calls gemm (for example). Each gemm will attempt to spawn p
|
|
||||||
// threads via OpenMP. However, since nesting is disabled, the OpenMP
|
|
||||||
// implementation finds that t >= p threads are already spawned, and
|
|
||||||
// thus it doesn't spawn *any* additional threads for each gemm.
|
|
||||||
if ( n_threads_real != n_threads )
|
|
||||||
{
|
|
||||||
// If the number of threads active in the current region is not
|
|
||||||
// equal to the number requested of BLIS, we then only continue
|
|
||||||
// if the number of threads in the current region is 1. If, for
|
|
||||||
// example, BLIS requested 4 threads but only got 3, then we
|
|
||||||
// abort().
|
|
||||||
//if ( tid == 0 )
|
|
||||||
//{
|
|
||||||
if ( n_threads_real != 1 )
|
|
||||||
{
|
|
||||||
bli_print_msg( "A different number of threads was "
|
|
||||||
"created than was requested.",
|
|
||||||
__FILE__, __LINE__ );
|
|
||||||
bli_abort();
|
|
||||||
}
|
|
||||||
|
|
||||||
//n_threads = 1; // not needed since it has no effect?
|
|
||||||
bli_thrcomm_init( 1, gl_comm );
|
|
||||||
bli_rntm_set_num_threads_only( 1, rntm );
|
|
||||||
bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm );
|
|
||||||
//}
|
|
||||||
|
|
||||||
// Synchronize all threads and continue.
|
|
||||||
_Pragma( "omp barrier" )
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|||||||
@@ -79,14 +79,6 @@ void bli_thrcomm_tree_barrier_free( barrier_t* barrier );
|
|||||||
void bli_thrcomm_tree_barrier( barrier_t* barack );
|
void bli_thrcomm_tree_barrier( barrier_t* barack );
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void bli_l3_thread_decorator_thread_check
|
|
||||||
(
|
|
||||||
dim_t n_threads,
|
|
||||||
dim_t tid,
|
|
||||||
thrcomm_t* gl_comm,
|
|
||||||
rntm_t* rntm
|
|
||||||
);
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -138,217 +138,5 @@ void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm )
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
// A data structure to assist in passing operands to additional threads.
|
|
||||||
typedef struct thread_data
|
|
||||||
{
|
|
||||||
l3int_t func;
|
|
||||||
opid_t family;
|
|
||||||
pack_t schema_a;
|
|
||||||
pack_t schema_b;
|
|
||||||
obj_t* alpha;
|
|
||||||
obj_t* a;
|
|
||||||
obj_t* b;
|
|
||||||
obj_t* beta;
|
|
||||||
obj_t* c;
|
|
||||||
cntx_t* cntx;
|
|
||||||
rntm_t* rntm;
|
|
||||||
cntl_t* cntl;
|
|
||||||
dim_t tid;
|
|
||||||
thrcomm_t* gl_comm;
|
|
||||||
array_t* array;
|
|
||||||
} thread_data_t;
|
|
||||||
|
|
||||||
// Entry point for additional threads
|
|
||||||
void* bli_l3_thread_entry( void* data_void )
|
|
||||||
{
|
|
||||||
thread_data_t* data = data_void;
|
|
||||||
|
|
||||||
l3int_t func = data->func;
|
|
||||||
opid_t family = data->family;
|
|
||||||
pack_t schema_a = data->schema_a;
|
|
||||||
pack_t schema_b = data->schema_b;
|
|
||||||
obj_t* alpha = data->alpha;
|
|
||||||
obj_t* a = data->a;
|
|
||||||
obj_t* b = data->b;
|
|
||||||
obj_t* beta = data->beta;
|
|
||||||
obj_t* c = data->c;
|
|
||||||
cntx_t* cntx = data->cntx;
|
|
||||||
rntm_t* rntm = data->rntm;
|
|
||||||
cntl_t* cntl = data->cntl;
|
|
||||||
dim_t tid = data->tid;
|
|
||||||
array_t* array = data->array;
|
|
||||||
thrcomm_t* gl_comm = data->gl_comm;
|
|
||||||
|
|
||||||
// Create a thread-local copy of the master thread's rntm_t. This is
|
|
||||||
// necessary since we want each thread to be able to track its own
|
|
||||||
// small block pool_t as it executes down the function stack.
|
|
||||||
rntm_t rntm_l = *rntm;
|
|
||||||
rntm_t* restrict rntm_p = &rntm_l;
|
|
||||||
|
|
||||||
// Use the thread id to access the appropriate pool_t* within the
|
|
||||||
// array_t, and use it to set the sba_pool field within the rntm_t.
|
|
||||||
// If the pool_t* element within the array_t is NULL, it will first
|
|
||||||
// be allocated/initialized.
|
|
||||||
bli_sba_rntm_set_pool( tid, array, rntm_p );
|
|
||||||
|
|
||||||
obj_t a_t, b_t, c_t;
|
|
||||||
cntl_t* cntl_use;
|
|
||||||
thrinfo_t* thread;
|
|
||||||
|
|
||||||
// Alias thread-local copies of A, B, and C. These will be the objects
|
|
||||||
// we pass down the algorithmic function stack. Making thread-local
|
|
||||||
// aliases is highly recommended in case a thread needs to change any
|
|
||||||
// of the properties of an object without affecting other threads'
|
|
||||||
// objects.
|
|
||||||
bli_obj_alias_to( a, &a_t );
|
|
||||||
bli_obj_alias_to( b, &b_t );
|
|
||||||
bli_obj_alias_to( c, &c_t );
|
|
||||||
|
|
||||||
// Create a default control tree for the operation, if needed.
|
|
||||||
bli_l3_cntl_create_if( family, schema_a, schema_b,
|
|
||||||
&a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use );
|
|
||||||
|
|
||||||
// Create the root node of the current thread's thrinfo_t structure.
|
|
||||||
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
|
|
||||||
|
|
||||||
func
|
|
||||||
(
|
|
||||||
alpha,
|
|
||||||
&a_t,
|
|
||||||
&b_t,
|
|
||||||
beta,
|
|
||||||
&c_t,
|
|
||||||
cntx,
|
|
||||||
rntm_p,
|
|
||||||
cntl_use,
|
|
||||||
thread
|
|
||||||
);
|
|
||||||
|
|
||||||
// Free the thread's local control tree.
|
|
||||||
bli_l3_cntl_free( rntm_p, cntl_use, thread );
|
|
||||||
|
|
||||||
// Free the current thread's thrinfo_t structure.
|
|
||||||
bli_l3_thrinfo_free( rntm_p, thread );
|
|
||||||
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
void bli_l3_thread_decorator
|
|
||||||
(
|
|
||||||
l3int_t func,
|
|
||||||
opid_t family,
|
|
||||||
obj_t* alpha,
|
|
||||||
obj_t* a,
|
|
||||||
obj_t* b,
|
|
||||||
obj_t* beta,
|
|
||||||
obj_t* c,
|
|
||||||
cntx_t* cntx,
|
|
||||||
rntm_t* rntm,
|
|
||||||
cntl_t* cntl
|
|
||||||
)
|
|
||||||
{
|
|
||||||
// This is part of a hack to support mixed domain in bli_gemm_front().
|
|
||||||
// Sometimes we need to specify a non-standard schema for A and B, and
|
|
||||||
// we decided to transmit them via the schema field in the obj_t's
|
|
||||||
// rather than pass them in as function parameters. Once the values
|
|
||||||
// have been read, we immediately reset them back to their expected
|
|
||||||
// values for unpacked objects.
|
|
||||||
pack_t schema_a = bli_obj_pack_schema( a );
|
|
||||||
pack_t schema_b = bli_obj_pack_schema( b );
|
|
||||||
bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
|
|
||||||
bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
|
|
||||||
|
|
||||||
// Query the total number of threads from the rntm_t object.
|
|
||||||
const dim_t n_threads = bli_rntm_num_threads( rntm );
|
|
||||||
|
|
||||||
// NOTE: The sba was initialized in bli_init().
|
|
||||||
|
|
||||||
// Check out an array_t from the small block allocator. This is done
|
|
||||||
// with an internal lock to ensure only one application thread accesses
|
|
||||||
// the sba at a time. bli_sba_checkout_array() will also automatically
|
|
||||||
// resize the array_t, if necessary.
|
|
||||||
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
|
||||||
|
|
||||||
// Access the pool_t* for thread 0 and embed it into the rntm. We do
|
|
||||||
// this up-front only so that we have the rntm_t.sba_pool field
|
|
||||||
// initialized and ready for the global communicator creation below.
|
|
||||||
bli_sba_rntm_set_pool( 0, array, rntm );
|
|
||||||
|
|
||||||
// Set the packing block allocator field of the rntm. This will be
|
|
||||||
// inherited by all of the child threads when they make local copies of
|
|
||||||
// the rntm below.
|
|
||||||
bli_membrk_rntm_set_membrk( rntm );
|
|
||||||
|
|
||||||
// Allocate a global communicator for the root thrinfo_t structures.
|
|
||||||
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
|
||||||
|
|
||||||
// Allocate an array of pthread objects and auxiliary data structs to pass
|
|
||||||
// to the thread entry functions.
|
|
||||||
|
|
||||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
|
||||||
printf( "bli_l3_thread_decorator().pth: " );
|
|
||||||
#endif
|
|
||||||
bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads );
|
|
||||||
|
|
||||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
|
||||||
printf( "bli_l3_thread_decorator().pth: " );
|
|
||||||
#endif
|
|
||||||
thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads );
|
|
||||||
|
|
||||||
// NOTE: We must iterate backwards so that the chief thread (thread id 0)
|
|
||||||
// can spawn all other threads before proceeding with its own computation.
|
|
||||||
for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- )
|
|
||||||
{
|
|
||||||
// Set up thread data for additional threads (beyond thread 0).
|
|
||||||
datas[tid].func = func;
|
|
||||||
datas[tid].family = family;
|
|
||||||
datas[tid].schema_a = schema_a;
|
|
||||||
datas[tid].schema_b = schema_b;
|
|
||||||
datas[tid].alpha = alpha;
|
|
||||||
datas[tid].a = a;
|
|
||||||
datas[tid].b = b;
|
|
||||||
datas[tid].beta = beta;
|
|
||||||
datas[tid].c = c;
|
|
||||||
datas[tid].cntx = cntx;
|
|
||||||
datas[tid].rntm = rntm;
|
|
||||||
datas[tid].cntl = cntl;
|
|
||||||
datas[tid].tid = tid;
|
|
||||||
datas[tid].gl_comm = gl_comm;
|
|
||||||
datas[tid].array = array;
|
|
||||||
|
|
||||||
// Spawn additional threads for all ids other than 0 (the chief thread).
|
|
||||||
if ( tid != 0 )
|
|
||||||
bli_pthread_create( &pthreads[tid], NULL, &bli_l3_thread_entry, &datas[tid] );
|
|
||||||
else
|
|
||||||
bli_l3_thread_entry( ( void* )(&datas[0]) );
|
|
||||||
}
|
|
||||||
|
|
||||||
// We shouldn't free the global communicator since it was already freed
|
|
||||||
// by the global communicator's chief thread in bli_l3_thrinfo_free()
|
|
||||||
// (called from the thread entry function).
|
|
||||||
|
|
||||||
// Thread 0 waits for additional threads to finish.
|
|
||||||
for ( dim_t tid = 1; tid < n_threads; tid++ )
|
|
||||||
{
|
|
||||||
bli_pthread_join( pthreads[tid], NULL );
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check the array_t back into the small block allocator. Similar to the
|
|
||||||
// check-out, this is done using a lock embedded within the sba to ensure
|
|
||||||
// mutual exclusion.
|
|
||||||
bli_sba_checkin_array( array );
|
|
||||||
|
|
||||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
|
||||||
printf( "bli_l3_thread_decorator().pth: " );
|
|
||||||
#endif
|
|
||||||
bli_free_intl( pthreads );
|
|
||||||
|
|
||||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
|
||||||
printf( "bli_l3_thread_decorator().pth: " );
|
|
||||||
#endif
|
|
||||||
bli_free_intl( datas );
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
@@ -84,119 +84,5 @@ void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm )
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Define a dummy function bli_l3_thread_entry(), which is needed in the
|
|
||||||
// pthreads version, so that when building Windows DLLs (with OpenMP enabled
|
|
||||||
// or no multithreading) we don't risk having an unresolved symbol.
|
|
||||||
void* bli_l3_thread_entry( void* data_void ) { return NULL; }
|
|
||||||
|
|
||||||
void bli_l3_thread_decorator
|
|
||||||
(
|
|
||||||
l3int_t func,
|
|
||||||
opid_t family,
|
|
||||||
obj_t* alpha,
|
|
||||||
obj_t* a,
|
|
||||||
obj_t* b,
|
|
||||||
obj_t* beta,
|
|
||||||
obj_t* c,
|
|
||||||
cntx_t* cntx,
|
|
||||||
rntm_t* rntm,
|
|
||||||
cntl_t* cntl
|
|
||||||
)
|
|
||||||
{
|
|
||||||
// This is part of a hack to support mixed domain in bli_gemm_front().
|
|
||||||
// Sometimes we need to specify a non-standard schema for A and B, and
|
|
||||||
// we decided to transmit them via the schema field in the obj_t's
|
|
||||||
// rather than pass them in as function parameters. Once the values
|
|
||||||
// have been read, we immediately reset them back to their expected
|
|
||||||
// values for unpacked objects.
|
|
||||||
pack_t schema_a = bli_obj_pack_schema( a );
|
|
||||||
pack_t schema_b = bli_obj_pack_schema( b );
|
|
||||||
bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
|
|
||||||
bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );
|
|
||||||
|
|
||||||
// For sequential execution, we use only one thread.
|
|
||||||
const dim_t n_threads = 1;
|
|
||||||
|
|
||||||
// NOTE: The sba was initialized in bli_init().
|
|
||||||
|
|
||||||
// Check out an array_t from the small block allocator. This is done
|
|
||||||
// with an internal lock to ensure only one application thread accesses
|
|
||||||
// the sba at a time. bli_sba_checkout_array() will also automatically
|
|
||||||
// resize the array_t, if necessary.
|
|
||||||
array_t* restrict array = bli_sba_checkout_array( n_threads );
|
|
||||||
|
|
||||||
// Access the pool_t* for thread 0 and embed it into the rntm. We do
|
|
||||||
// this up-front only so that we can create the global comm below.
|
|
||||||
bli_sba_rntm_set_pool( 0, array, rntm );
|
|
||||||
|
|
||||||
// Set the packing block allocator field of the rntm.
|
|
||||||
bli_membrk_rntm_set_membrk( rntm );
|
|
||||||
|
|
||||||
// Allocate a global communicator for the root thrinfo_t structures.
|
|
||||||
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );
|
|
||||||
|
|
||||||
|
|
||||||
{
|
|
||||||
// NOTE: We don't need to create another copy of the rntm_t since
|
|
||||||
// it was already copied in one of the high-level oapi functions.
|
|
||||||
rntm_t* restrict rntm_p = rntm;
|
|
||||||
|
|
||||||
cntl_t* cntl_use;
|
|
||||||
thrinfo_t* thread;
|
|
||||||
|
|
||||||
const dim_t tid = 0;
|
|
||||||
|
|
||||||
// Use the thread id to access the appropriate pool_t* within the
|
|
||||||
// array_t, and use it to set the sba_pool field within the rntm_t.
|
|
||||||
// If the pool_t* element within the array_t is NULL, it will first
|
|
||||||
// be allocated/initialized.
|
|
||||||
// NOTE: This is commented out because, in the single-threaded case,
|
|
||||||
// this is redundant since it's already been done above.
|
|
||||||
//bli_sba_rntm_set_pool( tid, array, rntm_p );
|
|
||||||
|
|
||||||
// NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't
|
|
||||||
// need to alias objects for A, B, and C since they were already aliased
|
|
||||||
// in bli_*_front(). However, we may add aliasing here in the future so
|
|
||||||
// that, with all three (_single.c, _openmp.c, _pthreads.c) implementations
|
|
||||||
// consistently providing local aliases, we can then eliminate aliasing
|
|
||||||
// elsewhere.
|
|
||||||
|
|
||||||
// Create a default control tree for the operation, if needed.
|
|
||||||
bli_l3_cntl_create_if( family, schema_a, schema_b,
|
|
||||||
a, b, c, rntm_p, cntl, &cntl_use );
|
|
||||||
|
|
||||||
// Create the root node of the thread's thrinfo_t structure.
|
|
||||||
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );
|
|
||||||
|
|
||||||
func
|
|
||||||
(
|
|
||||||
alpha,
|
|
||||||
a,
|
|
||||||
b,
|
|
||||||
beta,
|
|
||||||
c,
|
|
||||||
cntx,
|
|
||||||
rntm_p,
|
|
||||||
cntl_use,
|
|
||||||
thread
|
|
||||||
);
|
|
||||||
|
|
||||||
// Free the thread's local control tree.
|
|
||||||
bli_l3_cntl_free( rntm_p, cntl_use, thread );
|
|
||||||
|
|
||||||
// Free the current thread's thrinfo_t structure.
|
|
||||||
bli_l3_thrinfo_free( rntm_p, thread );
|
|
||||||
}
|
|
||||||
|
|
||||||
// We shouldn't free the global communicator since it was already freed
|
|
||||||
// by the global communicator's chief thread in bli_l3_thrinfo_free()
|
|
||||||
// (called above).
|
|
||||||
|
|
||||||
// Check the array_t back into the small block allocator. Similar to the
|
|
||||||
// check-out, this is done using a lock embedded within the sba to ensure
|
|
||||||
// mutual exclusion.
|
|
||||||
bli_sba_checkin_array( array );
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
@@ -39,8 +39,12 @@ thrinfo_t BLIS_PACKM_SINGLE_THREADED = {};
|
|||||||
thrinfo_t BLIS_GEMM_SINGLE_THREADED = {};
|
thrinfo_t BLIS_GEMM_SINGLE_THREADED = {};
|
||||||
thrcomm_t BLIS_SINGLE_COMM = {};
|
thrcomm_t BLIS_SINGLE_COMM = {};
|
||||||
|
|
||||||
// The global rntm_t structure, which holds the global thread settings.
|
// The global rntm_t structure. (The definition resides in bli_rntm.c.)
|
||||||
static rntm_t global_rntm;
|
extern rntm_t global_rntm;
|
||||||
|
|
||||||
|
// A mutex to allow synchronous access to global_rntm. (The definition
|
||||||
|
// resides in bli_rntm.c.)
|
||||||
|
extern bli_pthread_mutex_t global_rntm_mutex;
|
||||||
|
|
||||||
// -----------------------------------------------------------------------------
|
// -----------------------------------------------------------------------------
|
||||||
|
|
||||||
@@ -1188,63 +1192,6 @@ dim_t bli_ipow( dim_t base, dim_t power )
|
|||||||
|
|
||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
// -----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
dim_t bli_thread_get_env( const char* env, dim_t fallback )
|
|
||||||
{
|
|
||||||
dim_t r_val;
|
|
||||||
char* str;
|
|
||||||
|
|
||||||
// Query the environment variable and store the result in str.
|
|
||||||
str = getenv( env );
|
|
||||||
|
|
||||||
// Set the return value based on the string obtained from getenv().
|
|
||||||
if ( str != NULL )
|
|
||||||
{
|
|
||||||
// If the variable is set, convert the string to an integer and
|
|
||||||
// prepare to return that integer.
|
|
||||||
r_val = strtol( str, NULL, 10 );
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// If the variable is not set, use the "fallback" as the return value.
|
|
||||||
r_val = fallback;
|
|
||||||
}
|
|
||||||
|
|
||||||
return r_val;
|
|
||||||
}
|
|
||||||
|
|
||||||
#if 0
|
|
||||||
void bli_thread_set_env( const char* env, dim_t value )
|
|
||||||
{
|
|
||||||
dim_t r_val;
|
|
||||||
char value_str[32];
|
|
||||||
const char* fs_32 = "%u";
|
|
||||||
const char* fs_64 = "%lu";
|
|
||||||
|
|
||||||
// Convert the integer to a string, but vary the format specifier
|
|
||||||
// depending on the integer type size.
|
|
||||||
if ( bli_info_get_int_type_size() == 32 ) sprintf( value_str, fs_32, value );
|
|
||||||
else sprintf( value_str, fs_64, value );
|
|
||||||
|
|
||||||
// Set the environment variable using the string we just wrote to via
|
|
||||||
// sprintf(). (The 'TRUE' argument means we want to overwrite the current
|
|
||||||
// value if the environment variable already exists.)
|
|
||||||
r_val = bli_setenv( env, value_str, TRUE );
|
|
||||||
|
|
||||||
// Check the return value in case something went horribly wrong.
|
|
||||||
if ( r_val == -1 )
|
|
||||||
{
|
|
||||||
char err_str[128];
|
|
||||||
|
|
||||||
// Query the human-readable error string corresponding to errno.
|
|
||||||
strerror_r( errno, err_str, 128 );
|
|
||||||
|
|
||||||
// Print the error message.
|
|
||||||
bli_print_msg( err_str, __FILE__, __LINE__ );
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// -----------------------------------------------------------------------------
|
// -----------------------------------------------------------------------------
|
||||||
|
|
||||||
@@ -1298,9 +1245,6 @@ dim_t bli_thread_get_num_threads( void )
|
|||||||
|
|
||||||
// ----------------------------------------------------------------------------
|
// ----------------------------------------------------------------------------
|
||||||
|
|
||||||
// A mutex to allow synchronous access to global_rntm.
|
|
||||||
static bli_pthread_mutex_t global_rntm_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;
|
|
||||||
|
|
||||||
void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir )
|
void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir )
|
||||||
{
|
{
|
||||||
// We must ensure that global_rntm has been initialized.
|
// We must ensure that global_rntm has been initialized.
|
||||||
@@ -1331,22 +1275,6 @@ void bli_thread_set_num_threads( dim_t n_threads )
|
|||||||
|
|
||||||
// ----------------------------------------------------------------------------
|
// ----------------------------------------------------------------------------
|
||||||
|
|
||||||
void bli_thread_init_rntm( rntm_t* rntm )
|
|
||||||
{
|
|
||||||
// We must ensure that global_rntm has been initialized.
|
|
||||||
bli_init_once();
|
|
||||||
|
|
||||||
// Acquire the mutex protecting global_rntm.
|
|
||||||
bli_pthread_mutex_lock( &global_rntm_mutex );
|
|
||||||
|
|
||||||
*rntm = global_rntm;
|
|
||||||
|
|
||||||
// Release the mutex protecting global_rntm.
|
|
||||||
bli_pthread_mutex_unlock( &global_rntm_mutex );
|
|
||||||
}
|
|
||||||
|
|
||||||
// ----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
void bli_thread_init_rntm_from_env
|
void bli_thread_init_rntm_from_env
|
||||||
(
|
(
|
||||||
rntm_t* rntm
|
rntm_t* rntm
|
||||||
@@ -1362,19 +1290,19 @@ void bli_thread_init_rntm_from_env
|
|||||||
#ifdef BLIS_ENABLE_MULTITHREADING
|
#ifdef BLIS_ENABLE_MULTITHREADING
|
||||||
|
|
||||||
// Try to read BLIS_NUM_THREADS first.
|
// Try to read BLIS_NUM_THREADS first.
|
||||||
nt = bli_thread_get_env( "BLIS_NUM_THREADS", -1 );
|
nt = bli_env_get_var( "BLIS_NUM_THREADS", -1 );
|
||||||
|
|
||||||
// If BLIS_NUM_THREADS was not set, try to read OMP_NUM_THREADS.
|
// If BLIS_NUM_THREADS was not set, try to read OMP_NUM_THREADS.
|
||||||
if ( nt == -1 )
|
if ( nt == -1 )
|
||||||
nt = bli_thread_get_env( "OMP_NUM_THREADS", -1 );
|
nt = bli_env_get_var( "OMP_NUM_THREADS", -1 );
|
||||||
|
|
||||||
// Read the environment variables for the number of threads (ways
|
// Read the environment variables for the number of threads (ways
|
||||||
// of parallelism) for each individual loop.
|
// of parallelism) for each individual loop.
|
||||||
jc = bli_thread_get_env( "BLIS_JC_NT", -1 );
|
jc = bli_env_get_var( "BLIS_JC_NT", -1 );
|
||||||
pc = bli_thread_get_env( "BLIS_PC_NT", -1 );
|
pc = bli_env_get_var( "BLIS_PC_NT", -1 );
|
||||||
ic = bli_thread_get_env( "BLIS_IC_NT", -1 );
|
ic = bli_env_get_var( "BLIS_IC_NT", -1 );
|
||||||
jr = bli_thread_get_env( "BLIS_JR_NT", -1 );
|
jr = bli_env_get_var( "BLIS_JR_NT", -1 );
|
||||||
ir = bli_thread_get_env( "BLIS_IR_NT", -1 );
|
ir = bli_env_get_var( "BLIS_IR_NT", -1 );
|
||||||
|
|
||||||
// If any BLIS_*_NT environment variable was set, then we ignore the
|
// If any BLIS_*_NT environment variable was set, then we ignore the
|
||||||
// value of BLIS_NUM_THREADS or OMP_NUM_THREADS and use the
|
// value of BLIS_NUM_THREADS or OMP_NUM_THREADS and use the
|
||||||
|
|||||||
@@ -48,6 +48,14 @@
|
|||||||
#include "bli_packm_thrinfo.h"
|
#include "bli_packm_thrinfo.h"
|
||||||
#include "bli_l3_thrinfo.h"
|
#include "bli_l3_thrinfo.h"
|
||||||
|
|
||||||
|
// Include the level-3 thread decorator and related definitions and prototypes
|
||||||
|
// for the conventional code path.
|
||||||
|
#include "bli_l3_decor.h"
|
||||||
|
|
||||||
|
// Include the level-3 thread decorator and related definitions and prototypes
|
||||||
|
// for the sup code path.
|
||||||
|
#include "bli_l3_sup_decor.h"
|
||||||
|
|
||||||
// Initialization-related prototypes.
|
// Initialization-related prototypes.
|
||||||
void bli_thread_init( void );
|
void bli_thread_init( void );
|
||||||
void bli_thread_finalize( void );
|
void bli_thread_finalize( void );
|
||||||
@@ -141,37 +149,6 @@ siz_t bli_thread_range_weighted_sub
|
|||||||
dim_t* restrict j_end_thr
|
dim_t* restrict j_end_thr
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// Level-3 internal function type
|
|
||||||
typedef void (*l3int_t)
|
|
||||||
(
|
|
||||||
obj_t* alpha,
|
|
||||||
obj_t* a,
|
|
||||||
obj_t* b,
|
|
||||||
obj_t* beta,
|
|
||||||
obj_t* c,
|
|
||||||
cntx_t* cntx,
|
|
||||||
rntm_t* rntm,
|
|
||||||
cntl_t* cntl,
|
|
||||||
thrinfo_t* thread
|
|
||||||
);
|
|
||||||
|
|
||||||
// Level-3 thread decorator prototype
|
|
||||||
void bli_l3_thread_decorator
|
|
||||||
(
|
|
||||||
l3int_t func,
|
|
||||||
opid_t family,
|
|
||||||
obj_t* alpha,
|
|
||||||
obj_t* a,
|
|
||||||
obj_t* b,
|
|
||||||
obj_t* beta,
|
|
||||||
obj_t* c,
|
|
||||||
cntx_t* cntx,
|
|
||||||
rntm_t* rntm,
|
|
||||||
cntl_t* cntl
|
|
||||||
);
|
|
||||||
|
|
||||||
// -----------------------------------------------------------------------------
|
// -----------------------------------------------------------------------------
|
||||||
|
|
||||||
// Factorization and partitioning prototypes
|
// Factorization and partitioning prototypes
|
||||||
@@ -196,9 +173,6 @@ dim_t bli_ipow( dim_t base, dim_t power );
|
|||||||
|
|
||||||
// -----------------------------------------------------------------------------
|
// -----------------------------------------------------------------------------
|
||||||
|
|
||||||
BLIS_EXPORT_BLIS dim_t bli_thread_get_env( const char* env, dim_t fallback );
|
|
||||||
//void bli_thread_set_env( const char* env, dim_t value );
|
|
||||||
|
|
||||||
BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void );
|
BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void );
|
||||||
BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void );
|
BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void );
|
||||||
BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void );
|
BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void );
|
||||||
@@ -209,8 +183,6 @@ BLIS_EXPORT_BLIS dim_t bli_thread_get_num_threads( void );
|
|||||||
BLIS_EXPORT_BLIS void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir );
|
BLIS_EXPORT_BLIS void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir );
|
||||||
BLIS_EXPORT_BLIS void bli_thread_set_num_threads( dim_t value );
|
BLIS_EXPORT_BLIS void bli_thread_set_num_threads( dim_t value );
|
||||||
|
|
||||||
BLIS_EXPORT_BLIS void bli_thread_init_rntm( rntm_t* rntm );
|
|
||||||
|
|
||||||
void bli_thread_init_rntm_from_env( rntm_t* rntm );
|
void bli_thread_init_rntm_from_env( rntm_t* rntm );
|
||||||
|
|
||||||
// -----------------------------------------------------------------------------
|
// -----------------------------------------------------------------------------
|
||||||
|
|||||||
@@ -54,6 +54,12 @@
|
|||||||
Therefore, this (r)ow-preferential microkernel is well-suited for
|
Therefore, this (r)ow-preferential microkernel is well-suited for
|
||||||
a dot-product-based accumulation that performs vector loads from
|
a dot-product-based accumulation that performs vector loads from
|
||||||
both A and B.
|
both A and B.
|
||||||
|
|
||||||
|
NOTE: These kernels implicitly support column-oriented IO, implemented
|
||||||
|
via a high-level transposition of the entire operation. A and B will
|
||||||
|
effectively remain row- and column-stored, respectively, but C will then
|
||||||
|
effectively appear column-stored. Thus, this kernel may be used for both
|
||||||
|
rrc and crc cases.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
// Prototype reference microkernels.
|
// Prototype reference microkernels.
|
||||||
|
|||||||
@@ -54,6 +54,12 @@
|
|||||||
Therefore, this (r)ow-preferential microkernel is well-suited for
|
Therefore, this (r)ow-preferential microkernel is well-suited for
|
||||||
a dot-product-based accumulation that performs vector loads from
|
a dot-product-based accumulation that performs vector loads from
|
||||||
both A and B.
|
both A and B.
|
||||||
|
|
||||||
|
NOTE: These kernels implicitly support column-oriented IO, implemented
|
||||||
|
via a high-level transposition of the entire operation. A and B will
|
||||||
|
effectively remain row- and column-stored, respectively, but C will then
|
||||||
|
effectively appear column-stored. Thus, this kernel may be used for both
|
||||||
|
rrc and crc cases.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
// Prototype reference microkernels.
|
// Prototype reference microkernels.
|
||||||
|
|||||||
@@ -172,12 +172,44 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
|
|||||||
beta, cij, rs_c0, cs_c0, data, cntx
|
beta, cij, rs_c0, cs_c0, data, cntx
|
||||||
);
|
);
|
||||||
#else
|
#else
|
||||||
bli_dgemv_ex
|
dim_t ps_a0 = bli_auxinfo_ps_a( data );
|
||||||
(
|
|
||||||
BLIS_NO_TRANSPOSE, conjb, m0, k0,
|
if ( ps_a0 == 6 * rs_a0 )
|
||||||
alpha, ai, rs_a0, cs_a0, bj, rs_b0,
|
{
|
||||||
beta, cij, rs_c0, cntx, NULL
|
// Since A is not packed, we can use one gemv.
|
||||||
);
|
bli_dgemv_ex
|
||||||
|
(
|
||||||
|
BLIS_NO_TRANSPOSE, conjb, m0, k0,
|
||||||
|
alpha, ai, rs_a0, cs_a0, bj, rs_b0,
|
||||||
|
beta, cij, rs_c0, cntx, NULL
|
||||||
|
);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const dim_t mr = 6;
|
||||||
|
|
||||||
|
// Since A is packed into row panels, we must use a loop over
|
||||||
|
// gemv.
|
||||||
|
dim_t m_iter = ( m0 + mr - 1 ) / mr;
|
||||||
|
dim_t m_left = m0 % mr;
|
||||||
|
|
||||||
|
double* restrict ai_ii = ai;
|
||||||
|
double* restrict cij_ii = cij;
|
||||||
|
|
||||||
|
for ( dim_t ii = 0; ii < m_iter; ii += 1 )
|
||||||
|
{
|
||||||
|
dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left )
|
||||||
|
? mr : m_left );
|
||||||
|
|
||||||
|
bli_dgemv_ex
|
||||||
|
(
|
||||||
|
BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
|
||||||
|
alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0,
|
||||||
|
beta, cij_ii, rs_c0, cntx, NULL
|
||||||
|
);
|
||||||
|
cij_ii += mr*rs_c0; ai_ii += ps_a0;
|
||||||
|
}
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
@@ -201,6 +233,10 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
|
|||||||
uint64_t rs_c = rs_c0;
|
uint64_t rs_c = rs_c0;
|
||||||
uint64_t cs_c = cs_c0;
|
uint64_t cs_c = cs_c0;
|
||||||
|
|
||||||
|
// Query the panel stride of A and convert it to units of bytes.
|
||||||
|
uint64_t ps_a = bli_auxinfo_ps_a( data );
|
||||||
|
uint64_t ps_a8 = ps_a * sizeof( double );
|
||||||
|
|
||||||
if ( m_iter == 0 ) goto consider_edge_cases;
|
if ( m_iter == 0 ) goto consider_edge_cases;
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
@@ -836,8 +872,10 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
|
|||||||
lea(mem(r12, rdi, 4), r12) //
|
lea(mem(r12, rdi, 4), r12) //
|
||||||
lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c
|
lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c
|
||||||
|
|
||||||
lea(mem(r14, r8, 4), r14) //
|
//lea(mem(r14, r8, 4), r14) //
|
||||||
lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a
|
//lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a
|
||||||
|
mov(var(ps_a8), rax) // load ps_a8
|
||||||
|
lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a8
|
||||||
|
|
||||||
dec(r11) // ii -= 1;
|
dec(r11) // ii -= 1;
|
||||||
jne(.DLOOP6X8I) // iterate again if ii != 0.
|
jne(.DLOOP6X8I) // iterate again if ii != 0.
|
||||||
@@ -858,6 +896,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
|
|||||||
[a] "m" (a),
|
[a] "m" (a),
|
||||||
[rs_a] "m" (rs_a),
|
[rs_a] "m" (rs_a),
|
||||||
[cs_a] "m" (cs_a),
|
[cs_a] "m" (cs_a),
|
||||||
|
[ps_a8] "m" (ps_a8),
|
||||||
[b] "m" (b),
|
[b] "m" (b),
|
||||||
[rs_b] "m" (rs_b),
|
[rs_b] "m" (rs_b),
|
||||||
[cs_b] "m" (cs_b),
|
[cs_b] "m" (cs_b),
|
||||||
@@ -887,7 +926,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
|
|||||||
const dim_t i_edge = m0 - ( dim_t )m_left;
|
const dim_t i_edge = m0 - ( dim_t )m_left;
|
||||||
|
|
||||||
double* restrict cij = c + i_edge*rs_c;
|
double* restrict cij = c + i_edge*rs_c;
|
||||||
double* restrict ai = a + i_edge*rs_a;
|
//double* restrict ai = a + i_edge*rs_a;
|
||||||
|
//double* restrict ai = a + ( i_edge / 6 ) * ps_a;
|
||||||
|
double* restrict ai = a + m_iter * ps_a;
|
||||||
double* restrict bj = b;
|
double* restrict bj = b;
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
@@ -1056,6 +1097,10 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
|
|||||||
uint64_t rs_c = rs_c0;
|
uint64_t rs_c = rs_c0;
|
||||||
uint64_t cs_c = cs_c0;
|
uint64_t cs_c = cs_c0;
|
||||||
|
|
||||||
|
// Query the panel stride of A and convert it to units of bytes.
|
||||||
|
uint64_t ps_a = bli_auxinfo_ps_a( data );
|
||||||
|
uint64_t ps_a8 = ps_a * sizeof( double );
|
||||||
|
|
||||||
if ( m_iter == 0 ) goto consider_edge_cases;
|
if ( m_iter == 0 ) goto consider_edge_cases;
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
@@ -1689,8 +1734,10 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
|
|||||||
lea(mem(r12, rdi, 4), r12) //
|
lea(mem(r12, rdi, 4), r12) //
|
||||||
lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c
|
lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c
|
||||||
|
|
||||||
lea(mem(r14, r8, 4), r14) //
|
//lea(mem(r14, r8, 4), r14) //
|
||||||
lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a
|
//lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a
|
||||||
|
mov(var(ps_a8), rax) // load ps_a8
|
||||||
|
lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a8
|
||||||
|
|
||||||
dec(r11) // ii -= 1;
|
dec(r11) // ii -= 1;
|
||||||
jne(.DLOOP6X8I) // iterate again if ii != 0.
|
jne(.DLOOP6X8I) // iterate again if ii != 0.
|
||||||
@@ -1711,6 +1758,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
|
|||||||
[a] "m" (a),
|
[a] "m" (a),
|
||||||
[rs_a] "m" (rs_a),
|
[rs_a] "m" (rs_a),
|
||||||
[cs_a] "m" (cs_a),
|
[cs_a] "m" (cs_a),
|
||||||
|
[ps_a8] "m" (ps_a8),
|
||||||
[b] "m" (b),
|
[b] "m" (b),
|
||||||
[rs_b] "m" (rs_b),
|
[rs_b] "m" (rs_b),
|
||||||
[cs_b] "m" (cs_b),
|
[cs_b] "m" (cs_b),
|
||||||
@@ -1740,7 +1788,9 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
|
|||||||
const dim_t i_edge = m0 - ( dim_t )m_left;
|
const dim_t i_edge = m0 - ( dim_t )m_left;
|
||||||
|
|
||||||
double* restrict cij = c + i_edge*rs_c;
|
double* restrict cij = c + i_edge*rs_c;
|
||||||
double* restrict ai = a + i_edge*rs_a;
|
//double* restrict ai = a + i_edge*rs_a;
|
||||||
|
//double* restrict ai = a + ( i_edge / 6 ) * ps_a;
|
||||||
|
double* restrict ai = a + m_iter * ps_a;
|
||||||
double* restrict bj = b;
|
double* restrict bj = b;
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
@@ -1909,6 +1959,10 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
|
|||||||
uint64_t rs_c = rs_c0;
|
uint64_t rs_c = rs_c0;
|
||||||
uint64_t cs_c = cs_c0;
|
uint64_t cs_c = cs_c0;
|
||||||
|
|
||||||
|
// Query the panel stride of A and convert it to units of bytes.
|
||||||
|
uint64_t ps_a = bli_auxinfo_ps_a( data );
|
||||||
|
uint64_t ps_a8 = ps_a * sizeof( double );
|
||||||
|
|
||||||
if ( m_iter == 0 ) goto consider_edge_cases;
|
if ( m_iter == 0 ) goto consider_edge_cases;
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
@@ -2396,8 +2450,10 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
|
|||||||
lea(mem(r12, rdi, 4), r12) //
|
lea(mem(r12, rdi, 4), r12) //
|
||||||
lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c
|
lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c
|
||||||
|
|
||||||
lea(mem(r14, r8, 4), r14) //
|
//lea(mem(r14, r8, 4), r14) //
|
||||||
lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a
|
//lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a
|
||||||
|
mov(var(ps_a8), rax) // load ps_a8
|
||||||
|
lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a8
|
||||||
|
|
||||||
dec(r11) // ii -= 1;
|
dec(r11) // ii -= 1;
|
||||||
jne(.DLOOP6X4I) // iterate again if ii != 0.
|
jne(.DLOOP6X4I) // iterate again if ii != 0.
|
||||||
@@ -2418,6 +2474,7 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
|
|||||||
[a] "m" (a),
|
[a] "m" (a),
|
||||||
[rs_a] "m" (rs_a),
|
[rs_a] "m" (rs_a),
|
||||||
[cs_a] "m" (cs_a),
|
[cs_a] "m" (cs_a),
|
||||||
|
[ps_a8] "m" (ps_a8),
|
||||||
[b] "m" (b),
|
[b] "m" (b),
|
||||||
[rs_b] "m" (rs_b),
|
[rs_b] "m" (rs_b),
|
||||||
[cs_b] "m" (cs_b),
|
[cs_b] "m" (cs_b),
|
||||||
@@ -2447,7 +2504,9 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
|
|||||||
const dim_t i_edge = m0 - ( dim_t )m_left;
|
const dim_t i_edge = m0 - ( dim_t )m_left;
|
||||||
|
|
||||||
double* restrict cij = c + i_edge*rs_c;
|
double* restrict cij = c + i_edge*rs_c;
|
||||||
double* restrict ai = a + i_edge*rs_a;
|
//double* restrict ai = a + i_edge*rs_a;
|
||||||
|
//double* restrict ai = a + ( i_edge / 6 ) * ps_a;
|
||||||
|
double* restrict ai = a + m_iter * ps_a;
|
||||||
double* restrict bj = b;
|
double* restrict bj = b;
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
@@ -2616,6 +2675,10 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
|
|||||||
uint64_t rs_c = rs_c0;
|
uint64_t rs_c = rs_c0;
|
||||||
uint64_t cs_c = cs_c0;
|
uint64_t cs_c = cs_c0;
|
||||||
|
|
||||||
|
// Query the panel stride of A and convert it to units of bytes.
|
||||||
|
uint64_t ps_a = bli_auxinfo_ps_a( data );
|
||||||
|
uint64_t ps_a8 = ps_a * sizeof( double );
|
||||||
|
|
||||||
if ( m_iter == 0 ) goto consider_edge_cases;
|
if ( m_iter == 0 ) goto consider_edge_cases;
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
@@ -3077,8 +3140,10 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
|
|||||||
lea(mem(r12, rdi, 4), r12) //
|
lea(mem(r12, rdi, 4), r12) //
|
||||||
lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c
|
lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c
|
||||||
|
|
||||||
lea(mem(r14, r8, 4), r14) //
|
//lea(mem(r14, r8, 4), r14) //
|
||||||
lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a
|
//lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a
|
||||||
|
mov(var(ps_a8), rax) // load ps_a8
|
||||||
|
lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a8
|
||||||
|
|
||||||
dec(r11) // ii -= 1;
|
dec(r11) // ii -= 1;
|
||||||
jne(.DLOOP6X2I) // iterate again if ii != 0.
|
jne(.DLOOP6X2I) // iterate again if ii != 0.
|
||||||
@@ -3099,6 +3164,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
|
|||||||
[a] "m" (a),
|
[a] "m" (a),
|
||||||
[rs_a] "m" (rs_a),
|
[rs_a] "m" (rs_a),
|
||||||
[cs_a] "m" (cs_a),
|
[cs_a] "m" (cs_a),
|
||||||
|
[ps_a8] "m" (ps_a8),
|
||||||
[b] "m" (b),
|
[b] "m" (b),
|
||||||
[rs_b] "m" (rs_b),
|
[rs_b] "m" (rs_b),
|
||||||
[cs_b] "m" (cs_b),
|
[cs_b] "m" (cs_b),
|
||||||
@@ -3128,7 +3194,9 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
|
|||||||
const dim_t i_edge = m0 - ( dim_t )m_left;
|
const dim_t i_edge = m0 - ( dim_t )m_left;
|
||||||
|
|
||||||
double* restrict cij = c + i_edge*rs_c;
|
double* restrict cij = c + i_edge*rs_c;
|
||||||
double* restrict ai = a + i_edge*rs_a;
|
//double* restrict ai = a + i_edge*rs_a;
|
||||||
|
//double* restrict ai = a + ( i_edge / 6 ) * ps_a;
|
||||||
|
double* restrict ai = a + m_iter * ps_a;
|
||||||
double* restrict bj = b;
|
double* restrict bj = b;
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
|
|||||||
@@ -293,6 +293,10 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
|
|||||||
uint64_t rs_c = rs_c0;
|
uint64_t rs_c = rs_c0;
|
||||||
uint64_t cs_c = cs_c0;
|
uint64_t cs_c = cs_c0;
|
||||||
|
|
||||||
|
// Query the panel stride of B and convert it to units of bytes.
|
||||||
|
uint64_t ps_b = bli_auxinfo_ps_b( data );
|
||||||
|
uint64_t ps_b8 = ps_b * sizeof( double );
|
||||||
|
|
||||||
if ( n_iter == 0 ) goto consider_edge_cases;
|
if ( n_iter == 0 ) goto consider_edge_cases;
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
@@ -923,7 +927,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
|
|||||||
|
|
||||||
lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 8*cs_c
|
lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 8*cs_c
|
||||||
|
|
||||||
add(imm(8*8), r14) // b_jj = r14 += 8*cs_b
|
//add(imm(8*8), r14) // b_jj = r14 += 8*cs_b
|
||||||
|
mov(var(ps_b8), rbx) // load ps_b8
|
||||||
|
lea(mem(r14, rbx, 1), r14) // b_jj = r14 += ps_b8
|
||||||
|
|
||||||
dec(r11) // jj -= 1;
|
dec(r11) // jj -= 1;
|
||||||
jne(.DLOOP6X8J) // iterate again if jj != 0.
|
jne(.DLOOP6X8J) // iterate again if jj != 0.
|
||||||
@@ -947,6 +953,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
|
|||||||
[b] "m" (b),
|
[b] "m" (b),
|
||||||
[rs_b] "m" (rs_b),
|
[rs_b] "m" (rs_b),
|
||||||
[cs_b] "m" (cs_b),
|
[cs_b] "m" (cs_b),
|
||||||
|
[ps_b8] "m" (ps_b8),
|
||||||
[alpha] "m" (alpha),
|
[alpha] "m" (alpha),
|
||||||
[beta] "m" (beta),
|
[beta] "m" (beta),
|
||||||
[c] "m" (c),
|
[c] "m" (c),
|
||||||
@@ -974,7 +981,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
|
|||||||
|
|
||||||
double* restrict cij = c + j_edge*cs_c;
|
double* restrict cij = c + j_edge*cs_c;
|
||||||
double* restrict ai = a;
|
double* restrict ai = a;
|
||||||
double* restrict bj = b + j_edge*cs_b;
|
//double* restrict bj = b + j_edge*cs_b;
|
||||||
|
//double* restrict bj = b + ( j_edge / 8 ) * ps_b;
|
||||||
|
double* restrict bj = b + n_iter * ps_b;
|
||||||
|
|
||||||
if ( 4 <= n_left )
|
if ( 4 <= n_left )
|
||||||
{
|
{
|
||||||
@@ -1057,6 +1066,10 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
|
|||||||
uint64_t rs_c = rs_c0;
|
uint64_t rs_c = rs_c0;
|
||||||
uint64_t cs_c = cs_c0;
|
uint64_t cs_c = cs_c0;
|
||||||
|
|
||||||
|
// Query the panel stride of B and convert it to units of bytes.
|
||||||
|
uint64_t ps_b = bli_auxinfo_ps_b( data );
|
||||||
|
uint64_t ps_b8 = ps_b * sizeof( double );
|
||||||
|
|
||||||
if ( n_iter == 0 ) goto consider_edge_cases;
|
if ( n_iter == 0 ) goto consider_edge_cases;
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
@@ -1717,7 +1730,9 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
|
|||||||
|
|
||||||
lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 8*cs_c
|
lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 8*cs_c
|
||||||
|
|
||||||
add(imm(8*8), r14) // b_jj = r14 += 8*cs_b
|
//add(imm(8*8), r14) // b_jj = r14 += 8*cs_b
|
||||||
|
mov(var(ps_b8), rbx) // load ps_b8
|
||||||
|
lea(mem(r14, rbx, 1), r14) // b_jj = r14 += ps_b8
|
||||||
|
|
||||||
dec(r11) // jj -= 1;
|
dec(r11) // jj -= 1;
|
||||||
jne(.DLOOP6X8J) // iterate again if jj != 0.
|
jne(.DLOOP6X8J) // iterate again if jj != 0.
|
||||||
@@ -1741,6 +1756,7 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
|
|||||||
[b] "m" (b),
|
[b] "m" (b),
|
||||||
[rs_b] "m" (rs_b),
|
[rs_b] "m" (rs_b),
|
||||||
[cs_b] "m" (cs_b),
|
[cs_b] "m" (cs_b),
|
||||||
|
[ps_b8] "m" (ps_b8),
|
||||||
[alpha] "m" (alpha),
|
[alpha] "m" (alpha),
|
||||||
[beta] "m" (beta),
|
[beta] "m" (beta),
|
||||||
[c] "m" (c),
|
[c] "m" (c),
|
||||||
@@ -1768,7 +1784,9 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
|
|||||||
|
|
||||||
double* restrict cij = c + j_edge*cs_c;
|
double* restrict cij = c + j_edge*cs_c;
|
||||||
double* restrict ai = a;
|
double* restrict ai = a;
|
||||||
double* restrict bj = b + j_edge*cs_b;
|
//double* restrict bj = b + j_edge*cs_b;
|
||||||
|
//double* restrict bj = b + ( j_edge / 8 ) * ps_b;
|
||||||
|
double* restrict bj = b + n_iter * ps_b;
|
||||||
|
|
||||||
if ( 4 <= n_left )
|
if ( 4 <= n_left )
|
||||||
{
|
{
|
||||||
@@ -1851,6 +1869,10 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
|
|||||||
uint64_t rs_c = rs_c0;
|
uint64_t rs_c = rs_c0;
|
||||||
uint64_t cs_c = cs_c0;
|
uint64_t cs_c = cs_c0;
|
||||||
|
|
||||||
|
// Query the panel stride of B and convert it to units of bytes.
|
||||||
|
uint64_t ps_b = bli_auxinfo_ps_b( data );
|
||||||
|
uint64_t ps_b8 = ps_b * sizeof( double );
|
||||||
|
|
||||||
if ( n_iter == 0 ) goto consider_edge_cases;
|
if ( n_iter == 0 ) goto consider_edge_cases;
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
@@ -2344,7 +2366,9 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
|
|||||||
|
|
||||||
lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 8*cs_c
|
lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 8*cs_c
|
||||||
|
|
||||||
add(imm(8*8), r14) // b_jj = r14 += 8*cs_b
|
//add(imm(8*8), r14) // b_jj = r14 += 8*cs_b
|
||||||
|
mov(var(ps_b8), rbx) // load ps_b8
|
||||||
|
lea(mem(r14, rbx, 1), r14) // b_jj = r14 += ps_b8
|
||||||
|
|
||||||
dec(r11) // jj -= 1;
|
dec(r11) // jj -= 1;
|
||||||
jne(.DLOOP4X8J) // iterate again if jj != 0.
|
jne(.DLOOP4X8J) // iterate again if jj != 0.
|
||||||
@@ -2368,6 +2392,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
|
|||||||
[b] "m" (b),
|
[b] "m" (b),
|
||||||
[rs_b] "m" (rs_b),
|
[rs_b] "m" (rs_b),
|
||||||
[cs_b] "m" (cs_b),
|
[cs_b] "m" (cs_b),
|
||||||
|
[ps_b8] "m" (ps_b8),
|
||||||
[alpha] "m" (alpha),
|
[alpha] "m" (alpha),
|
||||||
[beta] "m" (beta),
|
[beta] "m" (beta),
|
||||||
[c] "m" (c),
|
[c] "m" (c),
|
||||||
@@ -2395,7 +2420,9 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
|
|||||||
|
|
||||||
double* restrict cij = c + j_edge*cs_c;
|
double* restrict cij = c + j_edge*cs_c;
|
||||||
double* restrict ai = a;
|
double* restrict ai = a;
|
||||||
double* restrict bj = b + j_edge*cs_b;
|
//double* restrict bj = b + j_edge*cs_b;
|
||||||
|
//double* restrict bj = b + ( j_edge / 8 ) * ps_b;
|
||||||
|
double* restrict bj = b + n_iter * ps_b;
|
||||||
|
|
||||||
if ( 4 <= n_left )
|
if ( 4 <= n_left )
|
||||||
{
|
{
|
||||||
@@ -2469,6 +2496,10 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
|
|||||||
uint64_t rs_c = rs_c0;
|
uint64_t rs_c = rs_c0;
|
||||||
uint64_t cs_c = cs_c0;
|
uint64_t cs_c = cs_c0;
|
||||||
|
|
||||||
|
// Query the panel stride of B and convert it to units of bytes.
|
||||||
|
uint64_t ps_b = bli_auxinfo_ps_b( data );
|
||||||
|
uint64_t ps_b8 = ps_b * sizeof( double );
|
||||||
|
|
||||||
if ( n_iter == 0 ) goto consider_edge_cases;
|
if ( n_iter == 0 ) goto consider_edge_cases;
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
@@ -2993,7 +3024,9 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
|
|||||||
|
|
||||||
lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 8*cs_c
|
lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 8*cs_c
|
||||||
|
|
||||||
add(imm(8*8), r14) // b_jj = r14 += 8*cs_b
|
//add(imm(8*8), r14) // b_jj = r14 += 8*cs_b
|
||||||
|
mov(var(ps_b8), rbx) // load ps_b8
|
||||||
|
lea(mem(r14, rbx, 1), r14) // b_jj = r14 += ps_b8
|
||||||
|
|
||||||
dec(r11) // jj -= 1;
|
dec(r11) // jj -= 1;
|
||||||
jne(.DLOOP4X8J) // iterate again if jj != 0.
|
jne(.DLOOP4X8J) // iterate again if jj != 0.
|
||||||
@@ -3017,6 +3050,7 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
|
|||||||
[b] "m" (b),
|
[b] "m" (b),
|
||||||
[rs_b] "m" (rs_b),
|
[rs_b] "m" (rs_b),
|
||||||
[cs_b] "m" (cs_b),
|
[cs_b] "m" (cs_b),
|
||||||
|
[ps_b8] "m" (ps_b8),
|
||||||
[alpha] "m" (alpha),
|
[alpha] "m" (alpha),
|
||||||
[beta] "m" (beta),
|
[beta] "m" (beta),
|
||||||
[c] "m" (c),
|
[c] "m" (c),
|
||||||
@@ -3044,7 +3078,9 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
|
|||||||
|
|
||||||
double* restrict cij = c + j_edge*cs_c;
|
double* restrict cij = c + j_edge*cs_c;
|
||||||
double* restrict ai = a;
|
double* restrict ai = a;
|
||||||
double* restrict bj = b + j_edge*cs_b;
|
//double* restrict bj = b + j_edge*cs_b;
|
||||||
|
//double* restrict bj = b + ( j_edge / 8 ) * ps_b;
|
||||||
|
double* restrict bj = b + n_iter * ps_b;
|
||||||
|
|
||||||
if ( 4 <= n_left )
|
if ( 4 <= n_left )
|
||||||
{
|
{
|
||||||
@@ -3118,6 +3154,10 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
|
|||||||
uint64_t rs_c = rs_c0;
|
uint64_t rs_c = rs_c0;
|
||||||
uint64_t cs_c = cs_c0;
|
uint64_t cs_c = cs_c0;
|
||||||
|
|
||||||
|
// Query the panel stride of B and convert it to units of bytes.
|
||||||
|
uint64_t ps_b = bli_auxinfo_ps_b( data );
|
||||||
|
uint64_t ps_b8 = ps_b * sizeof( double );
|
||||||
|
|
||||||
if ( n_iter == 0 ) goto consider_edge_cases;
|
if ( n_iter == 0 ) goto consider_edge_cases;
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
@@ -3522,7 +3562,9 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
|
|||||||
|
|
||||||
lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 8*cs_c
|
lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 8*cs_c
|
||||||
|
|
||||||
add(imm(8*8), r14) // b_jj = r14 += 8*cs_b
|
//add(imm(8*8), r14) // b_jj = r14 += 8*cs_b
|
||||||
|
mov(var(ps_b8), rbx) // load ps_b8
|
||||||
|
lea(mem(r14, rbx, 1), r14) // b_jj = r14 += ps_b8
|
||||||
|
|
||||||
dec(r11) // jj -= 1;
|
dec(r11) // jj -= 1;
|
||||||
jne(.DLOOP2X8J) // iterate again if jj != 0.
|
jne(.DLOOP2X8J) // iterate again if jj != 0.
|
||||||
@@ -3546,6 +3588,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
|
|||||||
[b] "m" (b),
|
[b] "m" (b),
|
||||||
[rs_b] "m" (rs_b),
|
[rs_b] "m" (rs_b),
|
||||||
[cs_b] "m" (cs_b),
|
[cs_b] "m" (cs_b),
|
||||||
|
[ps_b8] "m" (ps_b8),
|
||||||
[alpha] "m" (alpha),
|
[alpha] "m" (alpha),
|
||||||
[beta] "m" (beta),
|
[beta] "m" (beta),
|
||||||
[c] "m" (c),
|
[c] "m" (c),
|
||||||
@@ -3573,7 +3616,9 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
|
|||||||
|
|
||||||
double* restrict cij = c + j_edge*cs_c;
|
double* restrict cij = c + j_edge*cs_c;
|
||||||
double* restrict ai = a;
|
double* restrict ai = a;
|
||||||
double* restrict bj = b + j_edge*cs_b;
|
//double* restrict bj = b + j_edge*cs_b;
|
||||||
|
//double* restrict bj = b + ( j_edge / 8 ) * ps_b;
|
||||||
|
double* restrict bj = b + n_iter * ps_b;
|
||||||
|
|
||||||
if ( 4 <= n_left )
|
if ( 4 <= n_left )
|
||||||
{
|
{
|
||||||
@@ -3647,6 +3692,10 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
|
|||||||
uint64_t rs_c = rs_c0;
|
uint64_t rs_c = rs_c0;
|
||||||
uint64_t cs_c = cs_c0;
|
uint64_t cs_c = cs_c0;
|
||||||
|
|
||||||
|
// Query the panel stride of B and convert it to units of bytes.
|
||||||
|
uint64_t ps_b = bli_auxinfo_ps_b( data );
|
||||||
|
uint64_t ps_b8 = ps_b * sizeof( double );
|
||||||
|
|
||||||
if ( n_iter == 0 ) goto consider_edge_cases;
|
if ( n_iter == 0 ) goto consider_edge_cases;
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
@@ -4015,7 +4064,9 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
|
|||||||
|
|
||||||
lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 8*cs_c
|
lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 8*cs_c
|
||||||
|
|
||||||
add(imm(8*8), r14) // b_jj = r14 += 8*cs_b
|
//add(imm(8*8), r14) // b_jj = r14 += 8*cs_b
|
||||||
|
mov(var(ps_b8), rbx) // load ps_b8
|
||||||
|
lea(mem(r14, rbx, 1), r14) // b_jj = r14 += ps_b8
|
||||||
|
|
||||||
dec(r11) // jj -= 1;
|
dec(r11) // jj -= 1;
|
||||||
jne(.DLOOP1X8J) // iterate again if jj != 0.
|
jne(.DLOOP1X8J) // iterate again if jj != 0.
|
||||||
@@ -4039,6 +4090,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
|
|||||||
[b] "m" (b),
|
[b] "m" (b),
|
||||||
[rs_b] "m" (rs_b),
|
[rs_b] "m" (rs_b),
|
||||||
[cs_b] "m" (cs_b),
|
[cs_b] "m" (cs_b),
|
||||||
|
[ps_b8] "m" (ps_b8),
|
||||||
[alpha] "m" (alpha),
|
[alpha] "m" (alpha),
|
||||||
[beta] "m" (beta),
|
[beta] "m" (beta),
|
||||||
[c] "m" (c),
|
[c] "m" (c),
|
||||||
@@ -4066,7 +4118,9 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
|
|||||||
|
|
||||||
double* restrict cij = c + j_edge*cs_c;
|
double* restrict cij = c + j_edge*cs_c;
|
||||||
double* restrict ai = a;
|
double* restrict ai = a;
|
||||||
double* restrict bj = b + j_edge*cs_b;
|
//double* restrict bj = b + j_edge*cs_b;
|
||||||
|
//double* restrict bj = b + ( j_edge / 8 ) * ps_b;
|
||||||
|
double* restrict bj = b + n_iter * ps_b;
|
||||||
|
|
||||||
if ( 4 <= n_left )
|
if ( 4 <= n_left )
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -829,12 +829,12 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params )
|
|||||||
rntm_t gemm, herk, trmm_l, trmm_r, trsm_l, trsm_r;
|
rntm_t gemm, herk, trmm_l, trmm_r, trsm_l, trsm_r;
|
||||||
dim_t m = 1000, n = 1000, k = 1000;
|
dim_t m = 1000, n = 1000, k = 1000;
|
||||||
|
|
||||||
bli_thread_init_rntm( &gemm );
|
bli_rntm_init_from_global( &gemm );
|
||||||
bli_thread_init_rntm( &herk );
|
bli_rntm_init_from_global( &herk );
|
||||||
bli_thread_init_rntm( &trmm_l );
|
bli_rntm_init_from_global( &trmm_l );
|
||||||
bli_thread_init_rntm( &trmm_r );
|
bli_rntm_init_from_global( &trmm_r );
|
||||||
bli_thread_init_rntm( &trsm_l );
|
bli_rntm_init_from_global( &trsm_l );
|
||||||
bli_thread_init_rntm( &trsm_r );
|
bli_rntm_init_from_global( &trsm_r );
|
||||||
|
|
||||||
bli_rntm_set_ways_for_op( BLIS_GEMM, BLIS_LEFT, m, n, k, &gemm );
|
bli_rntm_set_ways_for_op( BLIS_GEMM, BLIS_LEFT, m, n, k, &gemm );
|
||||||
bli_rntm_set_ways_for_op( BLIS_HERK, BLIS_LEFT, m, n, k, &herk );
|
bli_rntm_set_ways_for_op( BLIS_HERK, BLIS_LEFT, m, n, k, &herk );
|
||||||
|
|||||||
Reference in New Issue
Block a user