mirror of
https://github.com/amd/blis.git
synced 2026-04-24 17:48:50 +00:00
Added option of slab or rr partitioning in jr/ir.
Details:
- Updated existing macrokernel function names and definitions to
explicitly use slab assignment of micropanels to threads, then created
duplicate versions of macrokernels that explicitly use round-robin
assignment instead of slab. NOTE: As in ac18949, trsm_r macrokernels
were not substantially updated in this commit because they are
currently disabled in bli_trsm_front.c.
- Updated existing packing function (in bli_packm_blk_var1.c) to
explicitly use slab partitioning, and then duplicated for round-robin.
- Updated control tree initialization to use the appropriate macrokernel
and packm function pointers depending on which method (slab or rr) was
enabled at configure-time.
- Updated configure script to accept new --thread-part-jrir=[slab|rr]
option (-m [slab|rr] for short), which allows the user to explicitly
request either slab or round-robin assignment (partitioning) of
micropanels to threads.
- Updated sandbox/ref99 according to above changes.
- Minor updates to build/add-copyright.py.
This commit is contained in:
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -54,7 +55,28 @@ cntl_t* blx_gemmbp_cntl_create
|
||||
pack_t schema_b
|
||||
)
|
||||
{
|
||||
void* macro_kernel_p = blx_gemm_ker_var2;
|
||||
void* macro_kernel_fp;
|
||||
void* packa_fp;
|
||||
void* packb_fp;
|
||||
|
||||
#ifdef BLIS_ENABLE_JRIR_SLAB
|
||||
|
||||
// Use the function pointers to the macrokernels that use slab
|
||||
// assignment of micropanels to threads in the jr and ir loops.
|
||||
macro_kernel_fp = blx_gemm_ker_var2sl;
|
||||
|
||||
packa_fp = bli_packm_blk_var1sl;
|
||||
packb_fp = bli_packm_blk_var1sl;
|
||||
|
||||
#else // BLIS_ENABLE_JRIR_RR
|
||||
|
||||
// Use the function pointers to the macrokernels that use round-robin
|
||||
// assignment of micropanels to threads in the jr and ir loops.
|
||||
macro_kernel_fp = bli_gemm_ker_var2rr;
|
||||
|
||||
packa_fp = bli_packm_blk_var1rr;
|
||||
packb_fp = bli_packm_blk_var1rr;
|
||||
#endif
|
||||
|
||||
// Create two nodes for the macro-kernel.
|
||||
cntl_t* gemm_cntl_bu_ke = blx_gemm_cntl_create_node
|
||||
@@ -69,7 +91,7 @@ cntl_t* blx_gemmbp_cntl_create
|
||||
(
|
||||
family,
|
||||
BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
|
||||
macro_kernel_p,
|
||||
macro_kernel_fp,
|
||||
gemm_cntl_bu_ke
|
||||
);
|
||||
|
||||
@@ -77,7 +99,7 @@ cntl_t* blx_gemmbp_cntl_create
|
||||
cntl_t* gemm_cntl_packa = blx_packm_cntl_create_node
|
||||
(
|
||||
blx_gemm_packa, // pack the left-hand operand
|
||||
bli_packm_blk_var1,
|
||||
packa_fp,
|
||||
BLIS_MR,
|
||||
BLIS_KR,
|
||||
FALSE, // do NOT invert diagonal
|
||||
@@ -101,7 +123,7 @@ cntl_t* blx_gemmbp_cntl_create
|
||||
cntl_t* gemm_cntl_packb = blx_packm_cntl_create_node
|
||||
(
|
||||
blx_gemm_packb, // pack the right-hand operand
|
||||
bli_packm_blk_var1,
|
||||
packb_fp,
|
||||
BLIS_KR,
|
||||
BLIS_NR,
|
||||
FALSE, // do NOT invert diagonal
|
||||
|
||||
@@ -59,14 +59,14 @@ typedef void (*gemm_fp)
|
||||
// Function pointer array for datatype-specific functions.
|
||||
static gemm_fp ftypes[BLIS_NUM_FP_TYPES] =
|
||||
{
|
||||
PASTECH2(blx_,s,gemm_ker_var2),
|
||||
PASTECH2(blx_,c,gemm_ker_var2),
|
||||
PASTECH2(blx_,d,gemm_ker_var2),
|
||||
PASTECH2(blx_,z,gemm_ker_var2)
|
||||
PASTECH2(blx_,s,gemm_ker_var2rr),
|
||||
PASTECH2(blx_,c,gemm_ker_var2rr),
|
||||
PASTECH2(blx_,d,gemm_ker_var2rr),
|
||||
PASTECH2(blx_,z,gemm_ker_var2rr)
|
||||
};
|
||||
|
||||
|
||||
void blx_gemm_ker_var2
|
||||
void blx_gemm_ker_var2rr
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
@@ -272,8 +272,8 @@ void PASTECH2(blx_,ch,varname) \
|
||||
dim_t jr_inc, ir_inc; \
|
||||
\
|
||||
/* Determine the thread range and increment for each thrinfo_t node. */ \
|
||||
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
|
||||
bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
@@ -302,11 +302,11 @@ void PASTECH2(blx_,ch,varname) \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
|
||||
if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
|
||||
if ( bli_is_last_iter_rr( i, ir_end, ir_tid, ir_nt ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
|
||||
if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
|
||||
if ( bli_is_last_iter_rr( j, jr_end, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
@@ -363,11 +363,11 @@ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c,
|
||||
}
|
||||
|
||||
#if 0
|
||||
GENTFUNC( float, s, gemm_ker_var2 )
|
||||
GENTFUNC( double, d, gemm_ker_var2 )
|
||||
GENTFUNC( scomplex, c, gemm_ker_var2 )
|
||||
GENTFUNC( dcomplex, z, gemm_ker_var2 )
|
||||
GENTFUNC( float, s, gemm_ker_var2rr )
|
||||
GENTFUNC( double, d, gemm_ker_var2rr )
|
||||
GENTFUNC( scomplex, c, gemm_ker_var2rr )
|
||||
GENTFUNC( dcomplex, z, gemm_ker_var2rr )
|
||||
#else
|
||||
INSERT_GENTFUNC_BASIC0( gemm_ker_var2 )
|
||||
INSERT_GENTFUNC_BASIC0( gemm_ker_var2rr )
|
||||
#endif
|
||||
|
||||
373
sandbox/ref99/vars/blx_gemm_ker_var2sl.c
Normal file
373
sandbox/ref99/vars/blx_gemm_ker_var2sl.c
Normal file
@@ -0,0 +1,373 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include "blix.h"
|
||||
|
||||
// Function pointer type for datatype-specific functions.
|
||||
typedef void (*gemm_fp)
|
||||
(
|
||||
pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha,
|
||||
void* a, inc_t cs_a, inc_t is_a,
|
||||
dim_t pd_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, inc_t is_b,
|
||||
dim_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
// Function pointer array for datatype-specific functions.
|
||||
static gemm_fp ftypes[BLIS_NUM_FP_TYPES] =
|
||||
{
|
||||
PASTECH2(blx_,s,gemm_ker_var2sl),
|
||||
PASTECH2(blx_,c,gemm_ker_var2sl),
|
||||
PASTECH2(blx_,d,gemm_ker_var2sl),
|
||||
PASTECH2(blx_,z,gemm_ker_var2sl)
|
||||
};
|
||||
|
||||
|
||||
void blx_gemm_ker_var2sl
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_exec = bli_obj_exec_dt( c );
|
||||
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t cs_a = bli_obj_col_stride( a );
|
||||
inc_t is_a = bli_obj_imag_stride( a );
|
||||
dim_t pd_a = bli_obj_panel_dim( a );
|
||||
inc_t ps_a = bli_obj_panel_stride( a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b = bli_obj_row_stride( b );
|
||||
inc_t is_b = bli_obj_imag_stride( b );
|
||||
dim_t pd_b = bli_obj_panel_dim( b );
|
||||
inc_t ps_b = bli_obj_panel_stride( b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
obj_t scalar_a;
|
||||
obj_t scalar_b;
|
||||
|
||||
void* buf_alpha;
|
||||
void* buf_beta;
|
||||
|
||||
gemm_fp f;
|
||||
|
||||
// Detach and multiply the scalars attached to A and B.
|
||||
bli_obj_scalar_detach( a, &scalar_a );
|
||||
bli_obj_scalar_detach( b, &scalar_b );
|
||||
bli_mulsc( &scalar_a, &scalar_b );
|
||||
|
||||
// Grab the addresses of the internal scalar buffers for the scalar
|
||||
// merged above and the scalar attached to C.
|
||||
buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
|
||||
buf_beta = bli_obj_internal_scalar_buffer( c );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( schema_a,
|
||||
schema_b,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, cs_a, is_a,
|
||||
pd_a, ps_a,
|
||||
buf_b, rs_b, is_b,
|
||||
pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
/* Datatype-specific gemm macrokernel using slab assignment of micropanels \
   to threads. Iterates over the jr/ir ranges reported for the calling \
   thread by bli_thread_range_jrir_sl() and invokes the gemm microkernel \
   on each MR x NR block of C, routing edge blocks through a temporary \
   buffer. */ \
void PASTECH2(blx_,ch,varname) \
     ( \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, inc_t is_a, \
                  dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, inc_t is_b, \
                  dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread  \
     ) \
{ \
	const num_t     dt         = PASTEMAC(ch,type); \
\
	/* MR and NR are taken from the panel dimensions of A and B. */ \
	const dim_t     MR         = pd_a; \
	const dim_t     NR         = pd_b; \
	/*const dim_t     PACKMR     = cs_a;*/ \
	/*const dim_t     PACKNR     = rs_b;*/ \
\
	/* Fetch the gemm microkernel for this datatype from the context. */ \
	PASTECH(ch,gemm_ukr_ft) \
	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Scratch MR x NR tile used for edge cases. Its strides follow the \
	   storage preference reported for the microkernel: unit row stride \
	   when columns are preferred, unit column stride otherwise. */ \
	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
	                    / sizeof( ctype ) ] \
	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool_t    col_pref   = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t     rs_ct      = ( col_pref ? 1 : NR ); \
	const inc_t     cs_ct      = ( col_pref ? MR : 1 ); \
\
	/* Typed views of the untyped operand buffers. */ \
	ctype* restrict zero       = PASTEMAC(ch,0); \
	ctype* restrict a_cast     = a; \
	ctype* restrict b_cast     = b; \
	ctype* restrict c_cast     = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast  = beta; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	auxinfo_t       aux; \
\
	/* \
	   Assumptions/assertions: \
	     rs_a == 1 \
	     cs_a == PACKMR \
	     pd_a == MR \
	     ps_a == stride to next micro-panel of A \
	     rs_b == PACKNR \
	     cs_b == 1 \
	     pd_b == NR \
	     ps_b == stride to next micro-panel of B \
	     rs_c == (no assumptions) \
	     cs_c == (no assumptions) \
	*/ \
\
	/* Nothing to do when any of the three dimensions is zero. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Zero the scratch tile so it cannot carry stale infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Count the micropanels along n and m; a partial (edge) micropanel \
	   counts as one additional iteration. */ \
	const dim_t     n_left     = n % NR; \
	const dim_t     m_left     = m % MR; \
	const dim_t     n_iter     = n / NR + ( n_left ? 1 : 0 ); \
	const dim_t     m_iter     = m / MR + ( m_left ? 1 : 0 ); \
\
	/* Strides between consecutive micropanels of A and B, and between \
	   consecutive MR-row / NR-column blocks of C. */ \
	const inc_t     rstep_a    = ps_a; \
	const inc_t     cstep_b    = ps_b; \
	const inc_t     rstep_c    = rs_c * MR; \
	const inc_t     cstep_c    = cs_c * NR; \
\
	/* Record the pack schemas and imaginary strides of A and B in the \
	   auxinfo_t object handed to the microkernel. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
	bli_auxinfo_set_is_a( is_a, &aux ); \
	bli_auxinfo_set_is_b( is_b, &aux ); \
\
	/* 'thread' is the thrinfo_t node of the 2nd (jr) loop around the \
	   microkernel; its sub-node belongs to the 1st (ir) loop. */ \
	thrinfo_t*      caucus     = bli_thrinfo_sub_node( thread ); \
\
	/* Thread counts and work ids of both loops. */ \
	const dim_t     jr_nt      = bli_thread_n_way( thread ); \
	const dim_t     jr_tid     = bli_thread_work_id( thread ); \
	const dim_t     ir_nt      = bli_thread_n_way( caucus ); \
	const dim_t     ir_tid     = bli_thread_work_id( caucus ); \
\
	dim_t           jr_start, jr_end, jr_inc; \
	dim_t           ir_start, ir_end, ir_inc; \
\
	/* Obtain this thread's slab range and increment for each loop. */ \
	bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
	bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
	/* jr loop: one micropanel of B (NR columns of C) per iteration. */ \
	for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		b1 = b_cast + j * cstep_b; \
		c1 = c_cast + j * cstep_c; \
\
		/* Width of the current column block (n_left on the edge). */ \
		const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Until the last ir iteration, the next panel of B is the \
		   current one. */ \
		b2 = b1; \
\
		/* ir loop: one micropanel of A (MR rows of C) per iteration. */ \
		for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) \
		{ \
			ctype* restrict a2; \
\
			a1  = a_cast + i * rstep_a; \
			c11 = c1     + i * rstep_c; \
\
			/* Height of the current row block (m_left on the edge). */ \
			const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
			/* Work out which panels of A and B come next. On the last \
			   ir iteration A wraps around to its first panel and B \
			   advances; on the last jr iteration B wraps around too. */ \
			a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
			if ( bli_is_last_iter_sl( i, ir_end, ir_tid, ir_nt ) ) \
			{ \
				a2 = a_cast; \
				b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
				if ( bli_is_last_iter_sl( j, jr_end, jr_tid, jr_nt ) ) \
					b2 = b_cast; \
			} \
\
			/* Pass the next-panel addresses to the microkernel via the \
			   auxinfo_t object. */ \
			bli_auxinfo_set_next_a( a2, &aux ); \
			bli_auxinfo_set_next_b( b2, &aux ); \
\
			if ( m_cur != MR || n_cur != NR ) \
			{ \
				/* Edge case: compute the product into the scratch tile \
				   (with a zero beta) ... */ \
				gemm_ukr \
				( \
				  k, \
				  alpha_cast, \
				  a1, \
				  b1, \
				  zero, \
				  ct, rs_ct, cs_ct, \
				  &aux, \
				  cntx  \
				); \
\
				/* ... then scale the m_cur x n_cur edge of C by beta and \
				   add the scratch result to it. */ \
				PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
				                        ct,  rs_ct, cs_ct, \
				                        beta_cast, \
				                        c11, rs_c,  cs_c ); \
			} \
			else \
			{ \
				/* Interior case: the microkernel updates C directly. */ \
				gemm_ukr \
				( \
				  k, \
				  alpha_cast, \
				  a1, \
				  b1, \
				  beta_cast, \
				  c11, rs_c, cs_c, \
				  &aux, \
				  cntx  \
				); \
			} \
		} \
	} \
\
/* \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}
|
||||
|
||||
#if 0
|
||||
GENTFUNC( float, s, gemm_ker_var2sl )
|
||||
GENTFUNC( double, d, gemm_ker_var2sl )
|
||||
GENTFUNC( scomplex, c, gemm_ker_var2sl )
|
||||
GENTFUNC( dcomplex, z, gemm_ker_var2sl )
|
||||
#else
|
||||
INSERT_GENTFUNC_BASIC0( gemm_ker_var2sl )
|
||||
#endif
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -57,7 +58,8 @@ GENPROT( gemm_blk_var3 )
|
||||
GENPROT( gemm_packa )
|
||||
GENPROT( gemm_packb )
|
||||
|
||||
GENPROT( gemm_ker_var2 )
|
||||
GENPROT( gemm_ker_var2sl )
|
||||
GENPROT( gemm_ker_var2rr )
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces with void pointer operands.
|
||||
@@ -85,5 +87,6 @@ void PASTECH2(blx_,ch,varname) \
|
||||
thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC0( gemm_ker_var2 )
|
||||
INSERT_GENTPROT_BASIC0( gemm_ker_var2sl )
|
||||
INSERT_GENTPROT_BASIC0( gemm_ker_var2rr )
|
||||
|
||||
|
||||
Reference in New Issue
Block a user