mirror of
https://github.com/amd/blis.git
synced 2026-03-15 23:07:22 +00:00
Multithreading optimizations for l3 macrokernels.
Details:
- Adjusted the method by which micropanels are assigned to threads in
the 2nd (jr) and 1st (ir) loops around the microkernel to (mostly)
employ contiguous "slab" partitioning rather than interleaved (round
robin) partitioning. The new partitioning schemes and related details
for specific families of operations are listed below:
- gemm: slab partitioning.
- herk: slab partitioning for region corresponding to non-triangular
region of C; round robin partitioning for triangular region.
- trmm: slab partitioning for region corresponding to non-triangular
region of B; round robin partitioning for triangular region.
(NOTE: This affects both left- and right-side macrokernels:
trmm_ll, trmm_lu, trmm_rl, trmm_ru.)
- trsm: slab partitioning.
(NOTE: This affects only left-side macrokernels trsm_ll,
trsm_lu; right-side macrokernels were not touched.)
Also note that the previous macrokernels were preserved inside of
the 'other' directory of each operation family directory (e.g.
frame/3/gemm/other, frame/3/herk/other, etc).
- Updated gemm macrokernel in sandbox/ref99 in light of above changes
and fixed a stale function pointer type in blx_gemm_int.c
(gemm_voft -> gemm_var_oft).
- Added standalone test drivers in test/3m4m for herk, trmm, and trsm
and minor changes to test/3m4m/Makefile.
- Updated the arguments and definitions of bli_*_get_next_[ab]_upanel()
and bli_trmm_?_?r_my_iter() macros defined in bli_l3_thrinfo.h.
- Renamed bli_thread_get_range*() APIs to bli_thread_range*().
This commit is contained in:
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -38,24 +39,28 @@
|
||||
|
||||
// gemm
|
||||
|
||||
#define bli_gemm_get_next_a_upanel( thread, a1, step ) ( a1 + step * thread->n_way )
|
||||
#define bli_gemm_get_next_b_upanel( thread, b1, step ) ( b1 + step * thread->n_way )
|
||||
#define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
|
||||
#define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
|
||||
|
||||
// herk
|
||||
|
||||
#define bli_herk_get_next_a_upanel( thread, a1, step ) ( a1 + step * thread->n_way )
|
||||
#define bli_herk_get_next_b_upanel( thread, b1, step ) ( b1 + step * thread->n_way )
|
||||
#define bli_herk_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
|
||||
#define bli_herk_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
|
||||
|
||||
// trmm
|
||||
|
||||
#define bli_trmm_r_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
|
||||
#define bli_trmm_r_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
|
||||
#define bli_trmm_l_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
|
||||
#define bli_trmm_l_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
|
||||
#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
|
||||
#define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
|
||||
|
||||
#define bli_trmm_my_iter( index, thread ) \
|
||||
\
|
||||
( index % thread->n_way == thread->work_id % thread->n_way )
|
||||
|
||||
// trsm
|
||||
|
||||
#define bli_trsm_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
|
||||
#define bli_trsm_my_iter( index, thread ) \
|
||||
\
|
||||
( index % thread->n_way == thread->work_id % thread->n_way )
|
||||
|
||||
//
|
||||
// thrinfo_t APIs specific to level-3 operations.
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -60,7 +61,7 @@ void bli_gemm_blk_var1
|
||||
bli_l3_prune_unref_mparts_m( a, b, c, cntl );
|
||||
|
||||
// Determine the current thread's subpartition range.
|
||||
bli_thread_get_range_mdim
|
||||
bli_thread_range_mdim
|
||||
(
|
||||
direct, thread, a, b, c, cntl, cntx,
|
||||
&my_start, &my_end
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -60,7 +61,7 @@ void bli_gemm_blk_var2
|
||||
bli_l3_prune_unref_mparts_n( a, b, c, cntl );
|
||||
|
||||
// Determine the current thread's subpartition range.
|
||||
bli_thread_get_range_ndim
|
||||
bli_thread_range_ndim
|
||||
(
|
||||
direct, thread, a, b, c, cntl, cntx,
|
||||
&my_start, &my_end
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -268,14 +269,27 @@ void PASTEMAC(ch,varname) \
|
||||
bli_auxinfo_set_is_a( is_a, &aux ); \
|
||||
bli_auxinfo_set_is_b( is_b, &aux ); \
|
||||
\
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
|
||||
dim_t jr_num_threads = bli_thread_n_way( thread ); \
|
||||
dim_t jr_thread_id = bli_thread_work_id( thread ); \
|
||||
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
|
||||
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
|
||||
loop around the microkernel. Here we query the thrinfo_t node for the
|
||||
1st (ir) loop around the microkernel. */ \
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
|
||||
\
|
||||
/* Query the number of threads and thread ids for each loop. */ \
|
||||
dim_t jr_nt = bli_thread_n_way( thread ); \
|
||||
dim_t jr_tid = bli_thread_work_id( thread ); \
|
||||
dim_t ir_nt = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_tid = bli_thread_work_id( caucus ); \
|
||||
\
|
||||
dim_t jr_start, jr_end; \
|
||||
dim_t ir_start, ir_end; \
|
||||
dim_t jr_inc, ir_inc; \
|
||||
\
|
||||
/* Determine the thread range and increment for each thrinfo_t node. */ \
|
||||
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
@@ -290,7 +304,7 @@ void PASTEMAC(ch,varname) \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
|
||||
for ( i = ir_start; i < ir_end; i += ir_inc ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
@@ -300,12 +314,12 @@ void PASTEMAC(ch,varname) \
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \
|
||||
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
|
||||
a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
|
||||
if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
|
||||
b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
|
||||
if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -251,6 +252,9 @@ void PASTEMAC(ch,varname) \
|
||||
dim_t jr_thread_id = bli_thread_work_id( thread ); \
|
||||
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
|
||||
\
|
||||
dim_t jr_inc = jr_num_threads; \
|
||||
dim_t ir_inc = ir_num_threads; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
|
||||
@@ -295,11 +299,11 @@ void PASTEMAC(ch,varname) \
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \
|
||||
a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
|
||||
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \
|
||||
b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
|
||||
366
frame/3/gemm/other/bli_gemm_ker_var2.c
Normal file
366
frame/3/gemm/other/bli_gemm_ker_var2.c
Normal file
@@ -0,0 +1,366 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha,
|
||||
void* a, inc_t cs_a, inc_t is_a,
|
||||
dim_t pd_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, inc_t is_b,
|
||||
dim_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2);
|
||||
|
||||
|
||||
void bli_gemm_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_exec = bli_obj_exec_dt( c );
|
||||
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t cs_a = bli_obj_col_stride( a );
|
||||
inc_t is_a = bli_obj_imag_stride( a );
|
||||
dim_t pd_a = bli_obj_panel_dim( a );
|
||||
inc_t ps_a = bli_obj_panel_stride( a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b = bli_obj_row_stride( b );
|
||||
inc_t is_b = bli_obj_imag_stride( b );
|
||||
dim_t pd_b = bli_obj_panel_dim( b );
|
||||
inc_t ps_b = bli_obj_panel_stride( b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
obj_t scalar_a;
|
||||
obj_t scalar_b;
|
||||
|
||||
void* buf_alpha;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Detach and multiply the scalars attached to A and B.
|
||||
bli_obj_scalar_detach( a, &scalar_a );
|
||||
bli_obj_scalar_detach( b, &scalar_b );
|
||||
bli_mulsc( &scalar_a, &scalar_b );
|
||||
|
||||
// Grab the addresses of the internal scalar buffers for the scalar
|
||||
// merged above and the scalar attached to C.
|
||||
buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
|
||||
buf_beta = bli_obj_internal_scalar_buffer( c );
|
||||
|
||||
// If 1m is being employed on a column- or row-stored matrix with a
|
||||
// real-valued beta, we can use the real domain macro-kernel, which
|
||||
// eliminates a little overhead associated with the 1m virtual
|
||||
// micro-kernel.
|
||||
#if 1
|
||||
if ( bli_is_1m_packed( schema_a ) )
|
||||
{
|
||||
bli_l3_ind_recast_1m_params
|
||||
(
|
||||
dt_exec,
|
||||
schema_a,
|
||||
c,
|
||||
m, n, k,
|
||||
pd_a, ps_a,
|
||||
pd_b, ps_b,
|
||||
rs_c, cs_c
|
||||
);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( schema_a,
|
||||
schema_b,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, cs_a, is_a,
|
||||
pd_a, ps_a,
|
||||
buf_b, rs_b, is_b,
|
||||
pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
pack_t schema_a, \
|
||||
pack_t schema_b, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a, inc_t cs_a, inc_t is_a, \
|
||||
dim_t pd_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, inc_t is_b, \
|
||||
dim_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Alias some constants to simpler names. */ \
|
||||
const dim_t MR = pd_a; \
|
||||
const dim_t NR = pd_b; \
|
||||
/*const dim_t PACKMR = cs_a;*/ \
|
||||
/*const dim_t PACKNR = rs_b;*/ \
|
||||
\
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict alpha_cast = alpha; \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
\
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t i, j; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == PACKMR
|
||||
pd_a == MR
|
||||
ps_a == stride to next micro-panel of A
|
||||
rs_b == PACKNR
|
||||
cs_b == 1
|
||||
pd_b == NR
|
||||
ps_b == stride to next micro-panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_schema_a( schema_a, &aux ); \
|
||||
bli_auxinfo_set_schema_b( schema_b, &aux ); \
|
||||
\
|
||||
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_a( is_a, &aux ); \
|
||||
bli_auxinfo_set_is_b( is_b, &aux ); \
|
||||
\
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
|
||||
dim_t jr_num_threads = bli_thread_n_way( thread ); \
|
||||
dim_t jr_thread_id = bli_thread_work_id( thread ); \
|
||||
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
a1 = a_cast + i * rstep_a; \
|
||||
c11 = c1 + i * rstep_c; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \
|
||||
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Scale the bottom edge of C and add the result from above. */ \
|
||||
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
|
||||
*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( gemm_ker_var2 )
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -279,17 +280,57 @@ void PASTEMAC(ch,varname) \
|
||||
bli_auxinfo_set_is_a( is_a, &aux ); \
|
||||
bli_auxinfo_set_is_b( is_b, &aux ); \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
|
||||
loop around the microkernel. Here we query the thrinfo_t node for the
|
||||
1st (ir) loop around the microkernel. */ \
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
|
||||
\
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
|
||||
dim_t jr_num_threads = bli_thread_n_way( thread ); \
|
||||
dim_t jr_thread_id = bli_thread_work_id( thread ); \
|
||||
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
|
||||
/* Query the number of threads and thread ids for each loop. */ \
|
||||
dim_t jr_nt = bli_thread_n_way( thread ); \
|
||||
dim_t jr_tid = bli_thread_work_id( thread ); \
|
||||
dim_t ir_nt = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_tid = bli_thread_work_id( caucus ); \
|
||||
\
|
||||
dim_t jr_start, jr_end; \
|
||||
dim_t ir_start, ir_end; \
|
||||
dim_t jr_inc, ir_inc; \
|
||||
\
|
||||
/* Note that we partition the 2nd loop into two regions: the rectangular
|
||||
part of C, and the triangular portion. */ \
|
||||
dim_t n_iter_rct; \
|
||||
dim_t n_iter_tri; \
|
||||
\
|
||||
if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \
|
||||
{ \
|
||||
/* If the entire panel of C does not intersect the diagonal, there is
|
||||
no triangular region, and therefore we can skip the second set of
|
||||
loops. */ \
|
||||
n_iter_rct = n_iter; \
|
||||
n_iter_tri = 0; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* If the panel of C does intersect the diagonal, compute the number of
|
||||
iterations in the rectangular region by dividing NR into the diagonal
|
||||
offset. Any remainder from this integer division is discarded, which
|
||||
is what we want. That is, we want the rectangular region to contain
|
||||
as many columns of whole microtiles as possible without including any
|
||||
microtiles that intersect the diagonal. The number of iterations in
|
||||
the triangular (or trapezoidal) region is computed as the remaining
|
||||
number of iterations in the n dimension. */ \
|
||||
n_iter_rct = diagoffc / NR; \
|
||||
n_iter_tri = n_iter - n_iter_rct; \
|
||||
} \
|
||||
\
|
||||
/* Use contiguous assignment of micropanels to threads in the 2nd loop for
|
||||
the initial rectangular region of C (if it exists). For both the
|
||||
rectangular and triangular regions, use contiguous assignment for the
|
||||
1st loop as well. */ \
|
||||
bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
@@ -304,7 +345,112 @@ void PASTEMAC(ch,varname) \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Interior loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
|
||||
for ( i = ir_start; i < ir_end; i += ir_inc ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
a1 = a_cast + i * rstep_a; \
|
||||
c11 = c1 + i * rstep_c; \
|
||||
\
|
||||
/* No need to compute the diagonal offset for the rectangular
|
||||
region. */ \
|
||||
/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
|
||||
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* If the diagonal intersects the current MR x NR submatrix, we
|
||||
compute it in the temporary buffer and then add in the elements
|
||||
on or below the diagonal.
|
||||
Otherwise, if the submatrix is strictly below the diagonal,
|
||||
we compute and store as we normally would.
|
||||
And if we're strictly above the diagonal, we do nothing and
|
||||
continue. */ \
|
||||
{ \
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Scale the edge of C and add the result. */ \
|
||||
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* If there is no triangular region, then we're done. */ \
|
||||
if ( n_iter_tri == 0 ) return; \
|
||||
\
|
||||
/* Use interleaved (round robin) assignment of micropanels to threads in
|
||||
the 2nd loop for the remaining triangular region of C. */ \
|
||||
bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
\
|
||||
/* Advance the start and end iteration offsets for the triangular region
|
||||
by the number of iterations used for the rectangular region. */ \
|
||||
jr_start += n_iter_rct; \
|
||||
jr_end += n_iter_rct; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Interior loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = ir_start; i < ir_end; i += ir_inc ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
@@ -317,12 +463,12 @@ void PASTEMAC(ch,varname) \
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \
|
||||
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
|
||||
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
|
||||
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
|
||||
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -229,7 +230,9 @@ void PASTEMAC(ch,varname) \
|
||||
\
|
||||
/* If there is a zero region to the left of where the diagonal of C
|
||||
intersects the top edge of the panel, adjust the pointer to C and B
|
||||
and treat this case as if the diagonal offset were zero. */ \
|
||||
and treat this case as if the diagonal offset were zero.
|
||||
NOTE: It's possible that after this pruning the diagonal offset
|
||||
is still positive (though it is guaranteed to be less than NR). */ \
|
||||
if ( diagoffc > 0 ) \
|
||||
{ \
|
||||
jp = diagoffc / NR; \
|
||||
@@ -279,17 +282,57 @@ void PASTEMAC(ch,varname) \
|
||||
bli_auxinfo_set_is_a( is_a, &aux ); \
|
||||
bli_auxinfo_set_is_b( is_b, &aux ); \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
|
||||
loop around the microkernel. Here we query the thrinfo_t node for the
|
||||
1st (ir) loop around the microkernel. */ \
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
|
||||
\
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
|
||||
dim_t jr_num_threads = bli_thread_n_way( thread ); \
|
||||
dim_t jr_thread_id = bli_thread_work_id( thread ); \
|
||||
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
|
||||
/* Query the number of threads and thread ids for each loop. */ \
|
||||
dim_t jr_nt = bli_thread_n_way( thread ); \
|
||||
dim_t jr_tid = bli_thread_work_id( thread ); \
|
||||
dim_t ir_nt = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_tid = bli_thread_work_id( caucus ); \
|
||||
\
|
||||
dim_t jr_start, jr_end; \
|
||||
dim_t ir_start, ir_end; \
|
||||
dim_t jr_inc, ir_inc; \
|
||||
\
|
||||
/* Note that we partition the 2nd loop into two regions: the triangular
|
||||
part of C, and the rectangular portion. */ \
|
||||
dim_t n_iter_tri; \
|
||||
dim_t n_iter_rct; \
|
||||
\
|
||||
if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \
|
||||
{ \
|
||||
/* If the entire panel of C does not intersect the diagonal, there is
|
||||
no triangular region, and therefore we can skip the first set of
|
||||
loops. */ \
|
||||
n_iter_tri = 0; \
|
||||
n_iter_rct = n_iter; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* If the panel of C does intersect the diagonal, compute the number of
|
||||
iterations in the triangular (or trapezoidal) region by dividing NR
|
||||
into the number of rows in C. A non-zero remainder means we need to
|
||||
add one additional iteration. That is, we want the triangular region
|
||||
to contain as few columns of whole microtiles as possible while still
|
||||
including all microtiles that intersect the diagonal. The number of
|
||||
iterations in the rectangular region is computed as the remaining
|
||||
number of iterations in the n dimension. */ \
|
||||
n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \
|
||||
n_iter_rct = n_iter - n_iter_tri; \
|
||||
} \
|
||||
\
|
||||
/* Use interleaved (round robin) assignment of micropanels to threads in the
|
||||
2nd loop for the initial triangular region of C (if it exists). For both
|
||||
the rectangular and triangular regions, use contiguous assignment for the
|
||||
1st loop. */ \
|
||||
bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
@@ -304,7 +347,7 @@ void PASTEMAC(ch,varname) \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Interior loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
|
||||
for ( i = ir_start; i < ir_end; i += ir_inc ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
@@ -317,12 +360,12 @@ void PASTEMAC(ch,varname) \
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \
|
||||
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
|
||||
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
|
||||
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
|
||||
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
@@ -402,6 +445,111 @@ void PASTEMAC(ch,varname) \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* If there is no rectangular region, then we're done. */ \
|
||||
if ( n_iter_rct == 0 ) return; \
|
||||
\
|
||||
/* Use contiguous assignment of micropanels to threads in the 2nd loop for
|
||||
the remaining rectangular region of C. */ \
|
||||
bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
\
|
||||
/* Advance the start and end iteration offsets for the rectangular region
|
||||
by the number of iterations used for the triangular region. */ \
|
||||
jr_start += n_iter_tri; \
|
||||
jr_end += n_iter_tri; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Interior loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = ir_start; i < ir_end; i += ir_inc ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
a1 = a_cast + i * rstep_a; \
|
||||
c11 = c1 + i * rstep_c; \
|
||||
\
|
||||
/* No need to compute the diagonal offset for the rectangular
|
||||
region. */ \
|
||||
/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
|
||||
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* If the diagonal intersects the current MR x NR submatrix, we
|
||||
compute it in the temporary buffer and then add in the elements
|
||||
on or below the diagonal.
|
||||
Otherwise, if the submatrix is strictly above the diagonal,
|
||||
we compute and store as we normally would.
|
||||
And if we're strictly below the diagonal, we do nothing and
|
||||
continue. */ \
|
||||
{ \
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Scale the edge of C and add the result. */ \
|
||||
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 )
|
||||
|
||||
420
frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c
Normal file
420
frame/3/herk/other/bli_herk_l_ker_var2.1looprr.c
Normal file
@@ -0,0 +1,420 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T herk_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
doff_t diagoffc,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha,
|
||||
void* a, inc_t cs_a, inc_t is_a,
|
||||
dim_t pd_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, inc_t is_b,
|
||||
dim_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2);
|
||||
|
||||
|
||||
void bli_herk_l_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_exec = bli_obj_exec_dt( c );
|
||||
|
||||
doff_t diagoffc = bli_obj_diag_offset( c );
|
||||
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t cs_a = bli_obj_col_stride( a );
|
||||
inc_t is_a = bli_obj_imag_stride( a );
|
||||
dim_t pd_a = bli_obj_panel_dim( a );
|
||||
inc_t ps_a = bli_obj_panel_stride( a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b = bli_obj_row_stride( b );
|
||||
inc_t is_b = bli_obj_imag_stride( b );
|
||||
dim_t pd_b = bli_obj_panel_dim( b );
|
||||
inc_t ps_b = bli_obj_panel_stride( b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
obj_t scalar_a;
|
||||
obj_t scalar_b;
|
||||
|
||||
void* buf_alpha;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Detach and multiply the scalars attached to A and B.
|
||||
bli_obj_scalar_detach( a, &scalar_a );
|
||||
bli_obj_scalar_detach( b, &scalar_b );
|
||||
bli_mulsc( &scalar_a, &scalar_b );
|
||||
|
||||
// Grab the addresses of the internal scalar buffers for the scalar
|
||||
// merged above and the scalar attached to C.
|
||||
buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
|
||||
buf_beta = bli_obj_internal_scalar_buffer( c );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffc,
|
||||
schema_a,
|
||||
schema_b,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, cs_a, is_a,
|
||||
pd_a, ps_a,
|
||||
buf_b, rs_b, is_b,
|
||||
pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
doff_t diagoffc, \
|
||||
pack_t schema_a, \
|
||||
pack_t schema_b, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a, inc_t cs_a, inc_t is_a, \
|
||||
dim_t pd_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, inc_t is_b, \
|
||||
dim_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Alias some constants to simpler names. */ \
|
||||
const dim_t MR = pd_a; \
|
||||
const dim_t NR = pd_b; \
|
||||
/*const dim_t PACKMR = cs_a;*/ \
|
||||
/*const dim_t PACKNR = rs_b;*/ \
|
||||
\
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict alpha_cast = alpha; \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
\
|
||||
doff_t diagoffc_ij; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t i, j, ip; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == PACKMR
|
||||
pd_a == MR
|
||||
ps_a == stride to next micro-panel of A
|
||||
rs_b == PACKNR
|
||||
cs_b == 1
|
||||
pd_b == NR
|
||||
ps_b == stride to next micro-panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Safeguard: If the current panel of C is entirely above the diagonal,
|
||||
it is not stored. So we do nothing. */ \
|
||||
if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \
|
||||
\
|
||||
/* If there is a zero region above where the diagonal of C intersects
|
||||
the left edge of the panel, adjust the pointer to C and A and treat
|
||||
this case as if the diagonal offset were zero. */ \
|
||||
if ( diagoffc < 0 ) \
|
||||
{ \
|
||||
ip = -diagoffc / MR; \
|
||||
i = ip * MR; \
|
||||
m = m - i; \
|
||||
diagoffc = -diagoffc % MR; \
|
||||
c_cast = c_cast + (i )*rs_c; \
|
||||
a_cast = a_cast + (ip )*ps_a; \
|
||||
} \
|
||||
\
|
||||
/* If there is a zero region to the right of where the diagonal
|
||||
of C intersects the bottom of the panel, shrink it to prevent
|
||||
"no-op" iterations from executing. */ \
|
||||
if ( diagoffc + m < n ) \
|
||||
{ \
|
||||
n = diagoffc + m; \
|
||||
} \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_schema_a( schema_a, &aux ); \
|
||||
bli_auxinfo_set_schema_b( schema_b, &aux ); \
|
||||
\
|
||||
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_a( is_a, &aux ); \
|
||||
bli_auxinfo_set_is_b( is_b, &aux ); \
|
||||
\
|
||||
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
|
||||
loop around the microkernel. Here we query the thrinfo_t node for the
|
||||
1st (ir) loop around the microkernel. */ \
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
|
||||
\
|
||||
/* Query the number of threads and thread ids for each loop. */ \
|
||||
dim_t jr_nt = bli_thread_n_way( thread ); \
|
||||
dim_t jr_tid = bli_thread_work_id( thread ); \
|
||||
dim_t ir_nt = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_tid = bli_thread_work_id( caucus ); \
|
||||
\
|
||||
dim_t jr_start, jr_end; \
|
||||
dim_t ir_start, ir_end; \
|
||||
dim_t jr_inc, ir_inc; \
|
||||
\
|
||||
/* Use interleaved (round robin) assignment of micropanels to threads in
|
||||
the 2nd and 1st loops. */ \
|
||||
bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Interior loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = ir_start; i < ir_end; i += ir_inc ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
a1 = a_cast + i * rstep_a; \
|
||||
c11 = c1 + i * rstep_c; \
|
||||
\
|
||||
/* Compute the diagonal offset for the submatrix at (i,j). */ \
|
||||
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
|
||||
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* If the diagonal intersects the current MR x NR submatrix, we
|
||||
compute it the temporary buffer and then add in the elements
|
||||
on or below the diagonal.
|
||||
Otherwise, if the submatrix is strictly below the diagonal,
|
||||
we compute and store as we normally would.
|
||||
And if we're strictly above the diagonal, we do nothing and
|
||||
continue. */ \
|
||||
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Scale C and add the result to only the stored part. */ \
|
||||
PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \
|
||||
m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
|
||||
{ \
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Scale the edge of C and add the result. */ \
|
||||
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 )
|
||||
|
||||
409
frame/3/herk/other/bli_herk_l_ker_var2.c
Normal file
409
frame/3/herk/other/bli_herk_l_ker_var2.c
Normal file
@@ -0,0 +1,409 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T herk_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
doff_t diagoffc,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha,
|
||||
void* a, inc_t cs_a, inc_t is_a,
|
||||
dim_t pd_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, inc_t is_b,
|
||||
dim_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2);
|
||||
|
||||
|
||||
void bli_herk_l_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_exec = bli_obj_exec_dt( c );
|
||||
|
||||
doff_t diagoffc = bli_obj_diag_offset( c );
|
||||
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t cs_a = bli_obj_col_stride( a );
|
||||
inc_t is_a = bli_obj_imag_stride( a );
|
||||
dim_t pd_a = bli_obj_panel_dim( a );
|
||||
inc_t ps_a = bli_obj_panel_stride( a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b = bli_obj_row_stride( b );
|
||||
inc_t is_b = bli_obj_imag_stride( b );
|
||||
dim_t pd_b = bli_obj_panel_dim( b );
|
||||
inc_t ps_b = bli_obj_panel_stride( b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
obj_t scalar_a;
|
||||
obj_t scalar_b;
|
||||
|
||||
void* buf_alpha;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Detach and multiply the scalars attached to A and B.
|
||||
bli_obj_scalar_detach( a, &scalar_a );
|
||||
bli_obj_scalar_detach( b, &scalar_b );
|
||||
bli_mulsc( &scalar_a, &scalar_b );
|
||||
|
||||
// Grab the addresses of the internal scalar buffers for the scalar
|
||||
// merged above and the scalar attached to C.
|
||||
buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
|
||||
buf_beta = bli_obj_internal_scalar_buffer( c );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffc,
|
||||
schema_a,
|
||||
schema_b,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, cs_a, is_a,
|
||||
pd_a, ps_a,
|
||||
buf_b, rs_b, is_b,
|
||||
pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
doff_t diagoffc, \
|
||||
pack_t schema_a, \
|
||||
pack_t schema_b, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a, inc_t cs_a, inc_t is_a, \
|
||||
dim_t pd_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, inc_t is_b, \
|
||||
dim_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Alias some constants to simpler names. */ \
|
||||
const dim_t MR = pd_a; \
|
||||
const dim_t NR = pd_b; \
|
||||
/*const dim_t PACKMR = cs_a;*/ \
|
||||
/*const dim_t PACKNR = rs_b;*/ \
|
||||
\
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict alpha_cast = alpha; \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
\
|
||||
doff_t diagoffc_ij; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t i, j, ip; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == PACKMR
|
||||
pd_a == MR
|
||||
ps_a == stride to next micro-panel of A
|
||||
rs_b == PACKNR
|
||||
cs_b == 1
|
||||
pd_b == NR
|
||||
ps_b == stride to next micro-panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Safeguard: If the current panel of C is entirely above the diagonal,
|
||||
it is not stored. So we do nothing. */ \
|
||||
if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \
|
||||
\
|
||||
/* If there is a zero region above where the diagonal of C intersects
|
||||
the left edge of the panel, adjust the pointer to C and A and treat
|
||||
this case as if the diagonal offset were zero. */ \
|
||||
if ( diagoffc < 0 ) \
|
||||
{ \
|
||||
ip = -diagoffc / MR; \
|
||||
i = ip * MR; \
|
||||
m = m - i; \
|
||||
diagoffc = -diagoffc % MR; \
|
||||
c_cast = c_cast + (i )*rs_c; \
|
||||
a_cast = a_cast + (ip )*ps_a; \
|
||||
} \
|
||||
\
|
||||
/* If there is a zero region to the right of where the diagonal
|
||||
of C intersects the bottom of the panel, shrink it to prevent
|
||||
"no-op" iterations from executing. */ \
|
||||
if ( diagoffc + m < n ) \
|
||||
{ \
|
||||
n = diagoffc + m; \
|
||||
} \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_schema_a( schema_a, &aux ); \
|
||||
bli_auxinfo_set_schema_b( schema_b, &aux ); \
|
||||
\
|
||||
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_a( is_a, &aux ); \
|
||||
bli_auxinfo_set_is_b( is_b, &aux ); \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
|
||||
dim_t jr_num_threads = bli_thread_n_way( thread ); \
|
||||
dim_t jr_thread_id = bli_thread_work_id( thread ); \
|
||||
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Interior loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
a1 = a_cast + i * rstep_a; \
|
||||
c11 = c1 + i * rstep_c; \
|
||||
\
|
||||
/* Compute the diagonal offset for the submatrix at (i,j). */ \
|
||||
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \
|
||||
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* If the diagonal intersects the current MR x NR submatrix, we
|
||||
compute it the temporary buffer and then add in the elements
|
||||
on or below the diagonal.
|
||||
Otherwise, if the submatrix is strictly below the diagonal,
|
||||
we compute and store as we normally would.
|
||||
And if we're strictly above the diagonal, we do nothing and
|
||||
continue. */ \
|
||||
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Scale C and add the result to only the stored part. */ \
|
||||
PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \
|
||||
m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
|
||||
{ \
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Scale the edge of C and add the result. */ \
|
||||
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 )
|
||||
|
||||
420
frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c
Normal file
420
frame/3/herk/other/bli_herk_u_ker_var2.1looprr.c
Normal file
@@ -0,0 +1,420 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T herk_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
doff_t diagoffc,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha,
|
||||
void* a, inc_t cs_a, inc_t is_a,
|
||||
dim_t pd_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, inc_t is_b,
|
||||
dim_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2);
|
||||
|
||||
|
||||
void bli_herk_u_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_exec = bli_obj_exec_dt( c );
|
||||
|
||||
doff_t diagoffc = bli_obj_diag_offset( c );
|
||||
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t cs_a = bli_obj_col_stride( a );
|
||||
inc_t is_a = bli_obj_imag_stride( a );
|
||||
dim_t pd_a = bli_obj_panel_dim( a );
|
||||
inc_t ps_a = bli_obj_panel_stride( a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b = bli_obj_row_stride( b );
|
||||
inc_t is_b = bli_obj_imag_stride( b );
|
||||
dim_t pd_b = bli_obj_panel_dim( b );
|
||||
inc_t ps_b = bli_obj_panel_stride( b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
obj_t scalar_a;
|
||||
obj_t scalar_b;
|
||||
|
||||
void* buf_alpha;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Detach and multiply the scalars attached to A and B.
|
||||
bli_obj_scalar_detach( a, &scalar_a );
|
||||
bli_obj_scalar_detach( b, &scalar_b );
|
||||
bli_mulsc( &scalar_a, &scalar_b );
|
||||
|
||||
// Grab the addresses of the internal scalar buffers for the scalar
|
||||
// merged above and the scalar attached to C.
|
||||
buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
|
||||
buf_beta = bli_obj_internal_scalar_buffer( c );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffc,
|
||||
schema_a,
|
||||
schema_b,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, cs_a, is_a,
|
||||
pd_a, ps_a,
|
||||
buf_b, rs_b, is_b,
|
||||
pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
doff_t diagoffc, \
|
||||
pack_t schema_a, \
|
||||
pack_t schema_b, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a, inc_t cs_a, inc_t is_a, \
|
||||
dim_t pd_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, inc_t is_b, \
|
||||
dim_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Alias some constants to simpler names. */ \
|
||||
const dim_t MR = pd_a; \
|
||||
const dim_t NR = pd_b; \
|
||||
/*const dim_t PACKMR = cs_a;*/ \
|
||||
/*const dim_t PACKNR = rs_b;*/ \
|
||||
\
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict alpha_cast = alpha; \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
\
|
||||
doff_t diagoffc_ij; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t i, j, jp; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == PACKMR
|
||||
pd_a == MR
|
||||
ps_a == stride to next micro-panel of A
|
||||
rs_b == PACKNR
|
||||
cs_b == 1
|
||||
pd_b == NR
|
||||
ps_b == stride to next micro-panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Safeguard: If the current panel of C is entirely below the diagonal,
|
||||
it is not stored. So we do nothing. */ \
|
||||
if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
|
||||
\
|
||||
/* If there is a zero region to the left of where the diagonal of C
|
||||
intersects the top edge of the panel, adjust the pointer to C and B
|
||||
and treat this case as if the diagonal offset were zero. */ \
|
||||
if ( diagoffc > 0 ) \
|
||||
{ \
|
||||
jp = diagoffc / NR; \
|
||||
j = jp * NR; \
|
||||
n = n - j; \
|
||||
diagoffc = diagoffc % NR; \
|
||||
c_cast = c_cast + (j )*cs_c; \
|
||||
b_cast = b_cast + (jp )*ps_b; \
|
||||
} \
|
||||
\
|
||||
/* If there is a zero region below where the diagonal of C intersects
|
||||
the right edge of the panel, shrink it to prevent "no-op" iterations
|
||||
from executing. */ \
|
||||
if ( -diagoffc + n < m ) \
|
||||
{ \
|
||||
m = -diagoffc + n; \
|
||||
} \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_schema_a( schema_a, &aux ); \
|
||||
bli_auxinfo_set_schema_b( schema_b, &aux ); \
|
||||
\
|
||||
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_a( is_a, &aux ); \
|
||||
bli_auxinfo_set_is_b( is_b, &aux ); \
|
||||
\
|
||||
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
|
||||
loop around the microkernel. Here we query the thrinfo_t node for the
|
||||
1st (ir) loop around the microkernel. */ \
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
|
||||
\
|
||||
/* Query the number of threads and thread ids for each loop. */ \
|
||||
dim_t jr_nt = bli_thread_n_way( thread ); \
|
||||
dim_t jr_tid = bli_thread_work_id( thread ); \
|
||||
dim_t ir_nt = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_tid = bli_thread_work_id( caucus ); \
|
||||
\
|
||||
dim_t jr_start, jr_end; \
|
||||
dim_t ir_start, ir_end; \
|
||||
dim_t jr_inc, ir_inc; \
|
||||
\
|
||||
/* Use interleaved (round robin) assignment of micropanels to threads in
|
||||
the 2nd and 1st loops. */ \
|
||||
bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Interior loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = ir_start; i < ir_end; i += ir_inc ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
a1 = a_cast + i * rstep_a; \
|
||||
c11 = c1 + i * rstep_c; \
|
||||
\
|
||||
/* Compute the diagonal offset for the submatrix at (i,j). */ \
|
||||
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
|
||||
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* If the diagonal intersects the current MR x NR submatrix, we
|
||||
compute it the temporary buffer and then add in the elements
|
||||
on or below the diagonal.
|
||||
Otherwise, if the submatrix is strictly above the diagonal,
|
||||
we compute and store as we normally would.
|
||||
And if we're strictly below the diagonal, we do nothing and
|
||||
continue. */ \
|
||||
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Scale C and add the result to only the stored part. */ \
|
||||
PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
|
||||
m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
|
||||
{ \
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Scale the edge of C and add the result. */ \
|
||||
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 )
|
||||
|
||||
409
frame/3/herk/other/bli_herk_u_ker_var2.c
Normal file
409
frame/3/herk/other/bli_herk_u_ker_var2.c
Normal file
@@ -0,0 +1,409 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T herk_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
doff_t diagoffc,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha,
|
||||
void* a, inc_t cs_a, inc_t is_a,
|
||||
dim_t pd_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, inc_t is_b,
|
||||
dim_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2);
|
||||
|
||||
|
||||
void bli_herk_u_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_exec = bli_obj_exec_dt( c );
|
||||
|
||||
doff_t diagoffc = bli_obj_diag_offset( c );
|
||||
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t cs_a = bli_obj_col_stride( a );
|
||||
inc_t is_a = bli_obj_imag_stride( a );
|
||||
dim_t pd_a = bli_obj_panel_dim( a );
|
||||
inc_t ps_a = bli_obj_panel_stride( a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b = bli_obj_row_stride( b );
|
||||
inc_t is_b = bli_obj_imag_stride( b );
|
||||
dim_t pd_b = bli_obj_panel_dim( b );
|
||||
inc_t ps_b = bli_obj_panel_stride( b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
obj_t scalar_a;
|
||||
obj_t scalar_b;
|
||||
|
||||
void* buf_alpha;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Detach and multiply the scalars attached to A and B.
|
||||
bli_obj_scalar_detach( a, &scalar_a );
|
||||
bli_obj_scalar_detach( b, &scalar_b );
|
||||
bli_mulsc( &scalar_a, &scalar_b );
|
||||
|
||||
// Grab the addresses of the internal scalar buffers for the scalar
|
||||
// merged above and the scalar attached to C.
|
||||
buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
|
||||
buf_beta = bli_obj_internal_scalar_buffer( c );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffc,
|
||||
schema_a,
|
||||
schema_b,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, cs_a, is_a,
|
||||
pd_a, ps_a,
|
||||
buf_b, rs_b, is_b,
|
||||
pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
doff_t diagoffc, \
|
||||
pack_t schema_a, \
|
||||
pack_t schema_b, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a, inc_t cs_a, inc_t is_a, \
|
||||
dim_t pd_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, inc_t is_b, \
|
||||
dim_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Alias some constants to simpler names. */ \
|
||||
const dim_t MR = pd_a; \
|
||||
const dim_t NR = pd_b; \
|
||||
/*const dim_t PACKMR = cs_a;*/ \
|
||||
/*const dim_t PACKNR = rs_b;*/ \
|
||||
\
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict alpha_cast = alpha; \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
\
|
||||
doff_t diagoffc_ij; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t i, j, jp; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == PACKMR
|
||||
pd_a == MR
|
||||
ps_a == stride to next micro-panel of A
|
||||
rs_b == PACKNR
|
||||
cs_b == 1
|
||||
pd_b == NR
|
||||
ps_b == stride to next micro-panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Safeguard: If the current panel of C is entirely below the diagonal,
|
||||
it is not stored. So we do nothing. */ \
|
||||
if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
|
||||
\
|
||||
/* If there is a zero region to the left of where the diagonal of C
|
||||
intersects the top edge of the panel, adjust the pointer to C and B
|
||||
and treat this case as if the diagonal offset were zero. */ \
|
||||
if ( diagoffc > 0 ) \
|
||||
{ \
|
||||
jp = diagoffc / NR; \
|
||||
j = jp * NR; \
|
||||
n = n - j; \
|
||||
diagoffc = diagoffc % NR; \
|
||||
c_cast = c_cast + (j )*cs_c; \
|
||||
b_cast = b_cast + (jp )*ps_b; \
|
||||
} \
|
||||
\
|
||||
/* If there is a zero region below where the diagonal of C intersects
|
||||
the right edge of the panel, shrink it to prevent "no-op" iterations
|
||||
from executing. */ \
|
||||
if ( -diagoffc + n < m ) \
|
||||
{ \
|
||||
m = -diagoffc + n; \
|
||||
} \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_schema_a( schema_a, &aux ); \
|
||||
bli_auxinfo_set_schema_b( schema_b, &aux ); \
|
||||
\
|
||||
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_a( is_a, &aux ); \
|
||||
bli_auxinfo_set_is_b( is_b, &aux ); \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
|
||||
dim_t jr_num_threads = bli_thread_n_way( thread ); \
|
||||
dim_t jr_thread_id = bli_thread_work_id( thread ); \
|
||||
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Interior loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
a1 = a_cast + i * rstep_a; \
|
||||
c11 = c1 + i * rstep_c; \
|
||||
\
|
||||
/* Compute the diagonal offset for the submatrix at (i,j). */ \
|
||||
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \
|
||||
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* If the diagonal intersects the current MR x NR submatrix, we
|
||||
compute it the temporary buffer and then add in the elements
|
||||
on or below the diagonal.
|
||||
Otherwise, if the submatrix is strictly above the diagonal,
|
||||
we compute and store as we normally would.
|
||||
And if we're strictly below the diagonal, we do nothing and
|
||||
continue. */ \
|
||||
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Scale C and add the result to only the stored part. */ \
|
||||
PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
|
||||
m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
|
||||
{ \
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Scale the edge of C and add the result. */ \
|
||||
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 )
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -85,6 +86,10 @@ void bli_trmm_front
|
||||
}
|
||||
|
||||
#if 0
|
||||
// NOTE: This case casts right-side trmm in terms of left side. This
|
||||
// reduces the number of macrokernels exercised to two (trmm_ll and
|
||||
// trmm_lu) but can lead to the microkernel being executed with an
|
||||
// output matrix that is stored counter to its output preference.
|
||||
|
||||
// If A is being multiplied from the right, transpose all operands
|
||||
// so that we can perform the computation as if A were being multiplied
|
||||
@@ -98,6 +103,11 @@ void bli_trmm_front
|
||||
}
|
||||
|
||||
#else
|
||||
// NOTE: This case computes right-side trmm natively with trmm_rl and
|
||||
// trmm_ru macrokernels. This code path always gives us the opportunity
|
||||
// to transpose the entire operation so that the effective storage format
|
||||
// of the output matrix matches the microkernel's output preference.
|
||||
// Thus, from a performance perspective, this case is preferred.
|
||||
|
||||
// An optimization: If C is stored by rows and the micro-kernel prefers
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -151,7 +152,7 @@ void PASTEMAC(ch,varname) \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* jr_thread \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
@@ -317,29 +318,45 @@ void PASTEMAC(ch,varname) \
|
||||
/* Save the imaginary stride of B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_b( istep_b, &aux ); \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
|
||||
loop around the microkernel. Here we query the thrinfo_t node for the
|
||||
1st (ir) loop around the microkernel. */ \
|
||||
/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
|
||||
\
|
||||
thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \
|
||||
dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \
|
||||
dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \
|
||||
/* Query the number of threads and thread ids for each loop. */ \
|
||||
dim_t jr_nt = bli_thread_n_way( thread ); \
|
||||
dim_t jr_tid = bli_thread_work_id( thread ); \
|
||||
/*dim_t ir_nt = bli_thread_n_way( ir_thread ); \
|
||||
dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
|
||||
\
|
||||
dim_t jr_start, jr_end; \
|
||||
/*dim_t ir_start, ir_end;*/ \
|
||||
dim_t jr_inc; \
|
||||
\
|
||||
/* Use contiguous assignment of micropanels to threads in the 2nd loop for
|
||||
the initial rectangular region of C (if it exists). For both the
|
||||
rectangular and triangular regions, use contiguous assignment for the
|
||||
1st loop as well. */ \
|
||||
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
if ( bli_trmm_l_jr_my_iter( j, jr_thread ) ) { \
|
||||
\
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
@@ -369,7 +386,8 @@ void PASTEMAC(ch,varname) \
|
||||
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
|
||||
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
|
||||
\
|
||||
if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \
|
||||
/* NOTE: ir loop parallelism disabled for now. */ \
|
||||
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
|
||||
\
|
||||
b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \
|
||||
\
|
||||
@@ -379,7 +397,7 @@ void PASTEMAC(ch,varname) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
@@ -433,13 +451,13 @@ void PASTEMAC(ch,varname) \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
/*}*/ \
|
||||
\
|
||||
a1 += ps_a_cur; \
|
||||
} \
|
||||
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \
|
||||
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
|
||||
\
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
@@ -449,7 +467,7 @@ void PASTEMAC(ch,varname) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
@@ -498,17 +516,13 @@ void PASTEMAC(ch,varname) \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
/*}*/ \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
} \
|
||||
\
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
b1 += cstep_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -151,7 +152,7 @@ void PASTEMAC(ch,varname) \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* jr_thread \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
@@ -324,29 +325,45 @@ void PASTEMAC(ch,varname) \
|
||||
/* Save the imaginary stride of B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_b( istep_b, &aux ); \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
|
||||
loop around the microkernel. Here we query the thrinfo_t node for the
|
||||
1st (ir) loop around the microkernel. */ \
|
||||
/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
|
||||
\
|
||||
thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \
|
||||
dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \
|
||||
dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \
|
||||
/* Query the number of threads and thread ids for each loop. */ \
|
||||
dim_t jr_nt = bli_thread_n_way( thread ); \
|
||||
dim_t jr_tid = bli_thread_work_id( thread ); \
|
||||
/*dim_t ir_nt = bli_thread_n_way( ir_thread ); \
|
||||
dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
|
||||
\
|
||||
dim_t jr_start, jr_end; \
|
||||
/*dim_t ir_start, ir_end;*/ \
|
||||
dim_t jr_inc; \
|
||||
\
|
||||
/* Use contiguous (slab) assignment of micropanels to threads in the 2nd
|
||||
(jr) loop, which here spans all n_iter iterations. NOTE: The 1st (ir)
|
||||
loop is not partitioned among threads; the corresponding range query
|
||||
below is commented out. */ \
|
||||
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
if ( bli_trmm_l_jr_my_iter( j, jr_thread ) ) { \
|
||||
\
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
@@ -376,7 +393,7 @@ void PASTEMAC(ch,varname) \
|
||||
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
|
||||
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
|
||||
\
|
||||
if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \
|
||||
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
|
||||
\
|
||||
b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \
|
||||
\
|
||||
@@ -386,7 +403,7 @@ void PASTEMAC(ch,varname) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
@@ -440,13 +457,13 @@ void PASTEMAC(ch,varname) \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
/*}*/ \
|
||||
\
|
||||
a1 += ps_a_cur; \
|
||||
} \
|
||||
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \
|
||||
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
|
||||
\
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
@@ -456,7 +473,7 @@ void PASTEMAC(ch,varname) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
@@ -505,17 +522,13 @@ void PASTEMAC(ch,varname) \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
/*}*/ \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
} \
|
||||
\
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
b1 += cstep_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
\
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -151,7 +152,7 @@ void PASTEMAC(ch,varname) \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* jr_thread \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
@@ -324,15 +325,151 @@ void PASTEMAC(ch,varname) \
|
||||
/* Save the imaginary stride of A to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_a( istep_a, &aux ); \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
|
||||
\
|
||||
thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \
|
||||
dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \
|
||||
dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \
|
||||
dim_t jr_nt = bli_thread_n_way( thread ); \
|
||||
dim_t jr_tid = bli_thread_work_id( thread ); \
|
||||
dim_t ir_nt = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_tid = bli_thread_work_id( caucus ); \
|
||||
\
|
||||
dim_t jr_start, jr_end; \
|
||||
dim_t ir_start, ir_end; \
|
||||
dim_t jr_inc, ir_inc; \
|
||||
\
|
||||
/* Note that we partition the 2nd loop into two regions: the rectangular
|
||||
part of B, and the triangular portion. */ \
|
||||
dim_t n_iter_rct; \
|
||||
dim_t n_iter_tri; \
|
||||
\
|
||||
if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) \
|
||||
{ \
|
||||
/* If the entire panel of B does not intersect the diagonal, there is
|
||||
no triangular region, and therefore we can skip the second set of
|
||||
loops. */ \
|
||||
n_iter_rct = n_iter; \
|
||||
n_iter_tri = 0; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* If the panel of B does intersect the diagonal, compute the number of
|
||||
iterations in the rectangular region by dividing NR into the diagonal
|
||||
offset. (There should never be any remainder in this division.) The
|
||||
number of iterations in the triangular (or trapezoidal) region is
|
||||
computed as the remaining number of iterations in the n dimension. */ \
|
||||
n_iter_rct = diagoffb / NR; \
|
||||
n_iter_tri = n_iter - n_iter_rct; \
|
||||
} \
|
||||
\
|
||||
/* Use contiguous assignment of micropanels to threads in the 2nd loop for
|
||||
the initial rectangular region of B (if it exists). For both the
|
||||
rectangular and triangular regions, use contiguous assignment for the
|
||||
1st loop as well. */ \
|
||||
bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
{ \
|
||||
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_is_b( istep_b, &aux ); \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = ir_start; i < ir_end; i += ir_inc ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
a1 = a_cast + i * rstep_a; \
|
||||
c11 = c1 + i * rstep_c; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
|
||||
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
one, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* If there is no triangular region, then we're done. */ \
|
||||
if ( n_iter_tri == 0 ) return; \
|
||||
\
|
||||
/* Use interleaved (round robin) assignment of micropanels to threads in
|
||||
the 2nd loop for the remaining triangular region of B (if it exists).
|
||||
NOTE: We don't need to call bli_thread_range_jrir*() here since we
|
||||
employ a hack that calls for each thread to execute every iteration
|
||||
of the jr loop but skip all but the pointer increment for iterations
|
||||
that are not assigned to it. */ \
|
||||
\
|
||||
/* Advance the starting b1 and c1 pointers to the positions corresponding
|
||||
to the start of the triangular region of B. */ \
|
||||
jr_start = n_iter_rct; \
|
||||
b1 = b_cast + jr_start * cstep_b; \
|
||||
c1 = c_cast + jr_start * cstep_c; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_start; j < n_iter; ++j ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
@@ -358,7 +495,6 @@ void PASTEMAC(ch,varname) \
|
||||
by beta. If it is strictly below the diagonal, scale by one.
|
||||
This allows the current macro-kernel to work for both trmm
|
||||
and trmm3. */ \
|
||||
if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
|
||||
{ \
|
||||
/* Compute the panel stride for the current diagonal-
|
||||
intersecting micro-panel. */ \
|
||||
@@ -366,7 +502,7 @@ void PASTEMAC(ch,varname) \
|
||||
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
|
||||
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
|
||||
\
|
||||
if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \
|
||||
if ( bli_trmm_my_iter( j, thread ) ) { \
|
||||
\
|
||||
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
|
||||
object. */ \
|
||||
@@ -375,7 +511,7 @@ void PASTEMAC(ch,varname) \
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \
|
||||
if ( bli_trmm_my_iter( i, caucus ) ) { \
|
||||
\
|
||||
ctype* restrict a1_i; \
|
||||
ctype* restrict a2; \
|
||||
@@ -390,7 +526,7 @@ void PASTEMAC(ch,varname) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
@@ -449,83 +585,6 @@ void PASTEMAC(ch,varname) \
|
||||
\
|
||||
b1 += ps_b_cur; \
|
||||
} \
|
||||
else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \
|
||||
{ \
|
||||
if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \
|
||||
\
|
||||
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_is_b( istep_b, &aux ); \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \
|
||||
\
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
one, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
b1 += cstep_b; \
|
||||
} \
|
||||
\
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -151,7 +152,7 @@ void PASTEMAC(ch,varname) \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* jr_thread \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
@@ -196,7 +197,7 @@ void PASTEMAC(ch,varname) \
|
||||
dim_t n_cur; \
|
||||
dim_t k_b0111; \
|
||||
dim_t off_b0111; \
|
||||
dim_t i, j; \
|
||||
dim_t i, j, jb0; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
@@ -324,16 +325,58 @@ void PASTEMAC(ch,varname) \
|
||||
\
|
||||
/* Save the imaginary stride of A to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_a( istep_a, &aux ); \
|
||||
\
|
||||
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
|
||||
loop around the microkernel. Here we query the thrinfo_t node for the
|
||||
1st (ir) loop around the microkernel. */ \
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
|
||||
\
|
||||
/* Query the number of threads and thread ids for each loop. */ \
|
||||
dim_t jr_nt = bli_thread_n_way( thread ); \
|
||||
dim_t jr_tid = bli_thread_work_id( thread ); \
|
||||
dim_t ir_nt = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_tid = bli_thread_work_id( caucus ); \
|
||||
\
|
||||
dim_t jr_start, jr_end; \
|
||||
dim_t ir_start, ir_end; \
|
||||
dim_t jr_inc, ir_inc; \
|
||||
\
|
||||
/* Note that we partition the 2nd loop into two regions: the triangular
|
||||
part of B, and the rectangular portion. */ \
|
||||
dim_t n_iter_tri; \
|
||||
dim_t n_iter_rct; \
|
||||
\
|
||||
if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) \
|
||||
{ \
|
||||
/* If the entire panel of B does not intersect the diagonal, there is
|
||||
no triangular region, and therefore we can skip the first set of
|
||||
loops. */ \
|
||||
n_iter_tri = 0; \
|
||||
n_iter_rct = n_iter; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* If the panel of B does intersect the diagonal, compute the number of
|
||||
iterations in the triangular (or trapezoidal) region by dividing NR
|
||||
into the number of rows in B. (There should never be any remainder
|
||||
in this division.) The number of iterations in the rectangular region
|
||||
is computed as the remaining number of iterations in the n dimension. */ \
|
||||
n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 ); \
|
||||
n_iter_rct = n_iter - n_iter_tri; \
|
||||
} \
|
||||
\
|
||||
/* Use interleaved (round robin) assignment of micropanels to threads in
|
||||
the 2nd loop for the initial triangular region of B (if it exists).
|
||||
NOTE: We don't need to call bli_thread_range_jrir*() here since we
|
||||
employ a hack that calls for each thread to execute every iteration
|
||||
of the jr loop but skip all but the pointer increment for iterations
|
||||
that are not assigned to it. */ \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \
|
||||
dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \
|
||||
dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( j = 0; j < n_iter_tri; ++j ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
@@ -358,7 +401,6 @@ void PASTEMAC(ch,varname) \
|
||||
by beta. If it is strictly below the diagonal, scale by one.
|
||||
This allows the current macro-kernel to work for both trmm
|
||||
and trmm3. */ \
|
||||
if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
|
||||
{ \
|
||||
/* Compute the panel stride for the current diagonal-
|
||||
intersecting micro-panel. */ \
|
||||
@@ -366,7 +408,7 @@ void PASTEMAC(ch,varname) \
|
||||
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
|
||||
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
|
||||
\
|
||||
if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \
|
||||
if ( bli_trmm_my_iter( j, thread ) ) { \
|
||||
\
|
||||
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
|
||||
object. */ \
|
||||
@@ -375,7 +417,7 @@ void PASTEMAC(ch,varname) \
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \
|
||||
if ( bli_trmm_my_iter( i, caucus ) ) { \
|
||||
\
|
||||
ctype* restrict a1_i; \
|
||||
ctype* restrict a2; \
|
||||
@@ -390,7 +432,7 @@ void PASTEMAC(ch,varname) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
@@ -449,30 +491,72 @@ void PASTEMAC(ch,varname) \
|
||||
\
|
||||
b1 += ps_b_cur; \
|
||||
} \
|
||||
else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \
|
||||
{ \
|
||||
if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \
|
||||
\
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
\
|
||||
/* If there is no rectangular region, then we're done. */ \
|
||||
if ( n_iter_rct == 0 ) return; \
|
||||
\
|
||||
/* Use contiguous assignment of micropanels to threads in both the 2nd and
|
||||
1st loops for the remaining rectangular region of B. */ \
|
||||
bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
|
||||
\
|
||||
/* Advance the start and end iteration offsets for the rectangular region
|
||||
by the number of iterations used for the triangular region. */ \
|
||||
jr_start += n_iter_tri; \
|
||||
jr_end += n_iter_tri; \
|
||||
jb0 = n_iter_tri; \
|
||||
\
|
||||
/* Save the resulting value of b1 from the previous loop since it represents
|
||||
the starting point for the rectangular region. */ \
|
||||
b_cast = b1; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
/* NOTE: We must index through b_cast differently since it contains
|
||||
the starting address of the rectangular region (which is already
|
||||
n_iter_tri logical iterations through B). */ \
|
||||
b1 = b_cast + (j-jb0) * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* If the current panel of B intersects the diagonal, scale C
|
||||
by beta. If it is strictly below the diagonal, scale by one.
|
||||
This allows the current macro-kernel to work for both trmm
|
||||
and trmm3. */ \
|
||||
{ \
|
||||
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_is_b( istep_b, &aux ); \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
for ( i = ir_start; i < ir_end; i += ir_inc ) \
|
||||
{ \
|
||||
if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \
|
||||
\
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
a1 = a_cast + i * rstep_a; \
|
||||
c11 = c1 + i * rstep_c; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
|
||||
a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
|
||||
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
|
||||
b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
@@ -517,19 +601,12 @@ void PASTEMAC(ch,varname) \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
b1 += cstep_b; \
|
||||
} \
|
||||
\
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
\
|
||||
\
|
||||
\
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
|
||||
}
|
||||
|
||||
519
frame/3/trmm/other/bli_trmm_ll_ker_var2.c
Normal file
519
frame/3/trmm/other/bli_trmm_ll_ker_var2.c
Normal file
@@ -0,0 +1,519 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// FUNCPTR_T is a file-local alias for the type name gemm_fp. The typedef
// below declares it as a pointer to a typed (non-object) trmm_ll
// macrokernel. Its parameter list lines up, argument for argument, with
// the PASTEMAC(ch,varname) definition generated by GENTFUNC later in this
// file, and with the call made through it in bli_trmm_ll_ker_var2().
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
doff_t diagoffa,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha,
|
||||
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
// Table of macrokernel function pointers; bli_trmm_ll_ker_var2() indexes it
// with the execution datatype of C (ftypes[dt_exec]).
// NOTE(review): GENARRAY is defined outside this file; presumably it expands
// to an initializer naming one trmm_ll_ker_var2 instantiation per datatype --
// confirm against its definition.
static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2);
|
||||
|
||||
|
||||
void bli_trmm_ll_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_exec = bli_obj_exec_dt( c );
|
||||
|
||||
doff_t diagoffa = bli_obj_diag_offset( a );
|
||||
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t cs_a = bli_obj_col_stride( a );
|
||||
dim_t pd_a = bli_obj_panel_dim( a );
|
||||
inc_t ps_a = bli_obj_panel_stride( a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b = bli_obj_row_stride( b );
|
||||
dim_t pd_b = bli_obj_panel_dim( b );
|
||||
inc_t ps_b = bli_obj_panel_stride( b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
obj_t scalar_a;
|
||||
obj_t scalar_b;
|
||||
|
||||
void* buf_alpha;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Detach and multiply the scalars attached to A and B.
|
||||
bli_obj_scalar_detach( a, &scalar_a );
|
||||
bli_obj_scalar_detach( b, &scalar_b );
|
||||
bli_mulsc( &scalar_a, &scalar_b );
|
||||
|
||||
// Grab the addresses of the internal scalar buffers for the scalar
|
||||
// merged above and the scalar attached to C.
|
||||
buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
|
||||
buf_beta = bli_obj_internal_scalar_buffer( c );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffa,
|
||||
schema_a,
|
||||
schema_b,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, cs_a, pd_a, ps_a,
|
||||
buf_b, rs_b, pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
doff_t diagoffa, \
|
||||
pack_t schema_a, \
|
||||
pack_t schema_b, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* jr_thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Alias some constants to simpler names. */ \
|
||||
const dim_t MR = pd_a; \
|
||||
const dim_t NR = pd_b; \
|
||||
const dim_t PACKMR = cs_a; \
|
||||
const dim_t PACKNR = rs_b; \
|
||||
\
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict alpha_cast = alpha; \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
\
|
||||
doff_t diagoffa_i; \
|
||||
dim_t k_full; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t k_a1011; \
|
||||
dim_t off_a1011; \
|
||||
dim_t i, j; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
inc_t istep_a; \
|
||||
inc_t istep_b; \
|
||||
inc_t off_scl; \
|
||||
inc_t ss_a_num; \
|
||||
inc_t ss_a_den; \
|
||||
inc_t ps_a_cur; \
|
||||
inc_t is_a_cur; \
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == PACKMR
|
||||
pd_a == MR
|
||||
ps_a == stride to next micro-panel of A
|
||||
rs_b == PACKNR
|
||||
cs_b == 1
|
||||
pd_b == NR
|
||||
ps_b == stride to next micro-panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* Safety trap: Certain indexing within this macro-kernel does not
|
||||
work as intended if both MR and NR are odd. */ \
|
||||
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
|
||||
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Safeguard: If the current block of A is entirely above the diagonal,
|
||||
it is implicitly zero. So we do nothing. */ \
|
||||
if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
|
||||
\
|
||||
/* Compute k_full. For all trmm, k_full is simply k. This is
|
||||
needed because some parameter combinations of trmm reduce k
|
||||
to advance past zero regions in the triangular matrix, and
|
||||
when computing the imaginary stride of B (the non-triangular
|
||||
matrix), which is used by 4m1/3m1 implementations, we need
|
||||
this unreduced value of k. */ \
|
||||
k_full = k; \
|
||||
\
|
||||
/* Compute indexing scaling factor for for 4m or 3m. This is
|
||||
needed because one of the packing register blocksizes (PACKMR
|
||||
or PACKNR) is used to index into the micro-panels of the non-
|
||||
triangular matrix when computing with a diagonal-intersecting
|
||||
micro-panel of the triangular matrix. In the case of 4m or 3m,
|
||||
real values are stored in both sub-panels, and so the indexing
|
||||
needs to occur in units of real values. The value computed
|
||||
here is divided into the complex pointer offset to cause the
|
||||
pointer to be advanced by the correct value. */ \
|
||||
if ( bli_is_4mi_packed( schema_a ) || \
|
||||
bli_is_3mi_packed( schema_a ) || \
|
||||
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
|
||||
else off_scl = 1; \
|
||||
\
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. And if we are packing real-only, imag-only, or
|
||||
summed-only, we need to scale the computed panel sizes by 1/2
|
||||
to compensate for the fact that the pointer arithmetic occurs
|
||||
in terms of complex elements rather than real elements. */ \
|
||||
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
|
||||
else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
|
||||
else { ss_a_num = 1; ss_a_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region above where the diagonal of A intersects the
|
||||
left edge of the block, adjust the pointer to C and treat this case as
|
||||
if the diagonal offset were zero. This skips over the region that was
|
||||
not packed. (Note we assume the diagonal offset is a multiple of MR;
|
||||
this assumption will hold as long as the cache blocksizes are each a
|
||||
multiple of MR and NR.) */ \
|
||||
if ( diagoffa < 0 ) \
|
||||
{ \
|
||||
i = -diagoffa; \
|
||||
m = m - i; \
|
||||
diagoffa = 0; \
|
||||
c_cast = c_cast + (i )*rs_c; \
|
||||
} \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
istep_a = PACKMR * k; \
|
||||
istep_b = PACKNR * k_full; \
|
||||
\
|
||||
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
|
||||
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
|
||||
\
|
||||
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_schema_a( schema_a, &aux ); \
|
||||
bli_auxinfo_set_schema_b( schema_b, &aux ); \
|
||||
\
|
||||
/* Save the imaginary stride of B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_b( istep_b, &aux ); \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \
|
||||
dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \
|
||||
dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
{ \
|
||||
if ( bli_trmm_my_iter( j, jr_thread ) ) { \
|
||||
\
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
diagoffa_i = diagoffa + ( doff_t )i*MR; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* If the current panel of A intersects the diagonal, scale C
|
||||
by beta. If it is strictly below the diagonal, scale by one.
|
||||
This allows the current macro-kernel to work for both trmm
|
||||
and trmm3. */ \
|
||||
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
ctype* restrict b1_i; \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
/* Determine the offset to and length of the panel that was
|
||||
packed so we can index into the corresponding location in
|
||||
b1. */ \
|
||||
off_a1011 = 0; \
|
||||
k_a1011 = bli_min( diagoffa_i + MR, k ); \
|
||||
\
|
||||
/* Compute the panel stride for the current diagonal-
|
||||
intersecting micro-panel. */ \
|
||||
is_a_cur = k_a1011 * PACKMR; \
|
||||
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
|
||||
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
|
||||
\
|
||||
if ( bli_trmm_my_iter( i, ir_thread ) ) { \
|
||||
\
|
||||
b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k_a1011, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1_i, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k_a1011, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1_i, \
|
||||
beta_cast, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Copy the result to the edge of C. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
a1 += ps_a_cur; \
|
||||
} \
|
||||
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
if ( bli_trmm_my_iter( i, ir_thread ) ) { \
|
||||
\
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_is_a( istep_a, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
one, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
} \
|
||||
\
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
b1 += cstep_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2 )
|
||||
|
||||
527
frame/3/trmm/other/bli_trmm_lu_ker_var2.c
Normal file
527
frame/3/trmm/other/bli_trmm_lu_ker_var2.c
Normal file
@@ -0,0 +1,527 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
doff_t diagoffa,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha,
|
||||
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2);
|
||||
|
||||
|
||||
void bli_trmm_lu_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_exec = bli_obj_exec_dt( c );
|
||||
|
||||
doff_t diagoffa = bli_obj_diag_offset( a );
|
||||
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t cs_a = bli_obj_col_stride( a );
|
||||
dim_t pd_a = bli_obj_panel_dim( a );
|
||||
inc_t ps_a = bli_obj_panel_stride( a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b = bli_obj_row_stride( b );
|
||||
dim_t pd_b = bli_obj_panel_dim( b );
|
||||
inc_t ps_b = bli_obj_panel_stride( b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
obj_t scalar_a;
|
||||
obj_t scalar_b;
|
||||
|
||||
void* buf_alpha;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Detach and multiply the scalars attached to A and B.
|
||||
bli_obj_scalar_detach( a, &scalar_a );
|
||||
bli_obj_scalar_detach( b, &scalar_b );
|
||||
bli_mulsc( &scalar_a, &scalar_b );
|
||||
|
||||
// Grab the addresses of the internal scalar buffers for the scalar
|
||||
// merged above and the scalar attached to C.
|
||||
buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
|
||||
buf_beta = bli_obj_internal_scalar_buffer( c );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffa,
|
||||
schema_a,
|
||||
schema_b,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, cs_a, pd_a, ps_a,
|
||||
buf_b, rs_b, pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
doff_t diagoffa, \
|
||||
pack_t schema_a, \
|
||||
pack_t schema_b, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* jr_thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Alias some constants to simpler names. */ \
|
||||
const dim_t MR = pd_a; \
|
||||
const dim_t NR = pd_b; \
|
||||
const dim_t PACKMR = cs_a; \
|
||||
const dim_t PACKNR = rs_b; \
|
||||
\
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict alpha_cast = alpha; \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
\
|
||||
doff_t diagoffa_i; \
|
||||
dim_t k_full; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t k_a1112; \
|
||||
dim_t off_a1112; \
|
||||
dim_t i, j; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
inc_t istep_a; \
|
||||
inc_t istep_b; \
|
||||
inc_t off_scl; \
|
||||
inc_t ss_a_num; \
|
||||
inc_t ss_a_den; \
|
||||
inc_t ps_a_cur; \
|
||||
inc_t is_a_cur; \
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == PACKMR
|
||||
pd_a == MR
|
||||
ps_a == stride to next micro-panel of A
|
||||
rs_b == PACKNR
|
||||
cs_b == 1
|
||||
pd_b == NR
|
||||
ps_b == stride to next micro-panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* Safety trap: Certain indexing within this macro-kernel does not
|
||||
work as intended if both MR and NR are odd. */ \
|
||||
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
|
||||
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Safeguard: If the current block of A is entirely below the diagonal,
|
||||
it is implicitly zero. So we do nothing. */ \
|
||||
if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
|
||||
\
|
||||
/* Compute k_full. For all trmm, k_full is simply k. This is
|
||||
needed because some parameter combinations of trmm reduce k
|
||||
to advance past zero regions in the triangular matrix, and
|
||||
when computing the imaginary stride of B (the non-triangular
|
||||
matrix), which is used by 4m1/3m1 implementations, we need
|
||||
this unreduced value of k. */ \
|
||||
k_full = k; \
|
||||
\
|
||||
/* Compute indexing scaling factor for for 4m or 3m. This is
|
||||
needed because one of the packing register blocksizes (PACKMR
|
||||
or PACKNR) is used to index into the micro-panels of the non-
|
||||
triangular matrix when computing with a diagonal-intersecting
|
||||
micro-panel of the triangular matrix. In the case of 4m or 3m,
|
||||
real values are stored in both sub-panels, and so the indexing
|
||||
needs to occur in units of real values. The value computed
|
||||
here is divided into the complex pointer offset to cause the
|
||||
pointer to be advanced by the correct value. */ \
|
||||
if ( bli_is_4mi_packed( schema_a ) || \
|
||||
bli_is_3mi_packed( schema_a ) || \
|
||||
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
|
||||
else off_scl = 1; \
|
||||
\
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. And if we are packing real-only, imag-only, or
|
||||
summed-only, we need to scale the computed panel sizes by 1/2
|
||||
to compensate for the fact that the pointer arithmetic occurs
|
||||
in terms of complex elements rather than real elements. */ \
|
||||
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
|
||||
else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
|
||||
else { ss_a_num = 1; ss_a_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region to the left of where the diagonal of A
|
||||
intersects the top edge of the block, adjust the pointer to B and
|
||||
treat this case as if the diagonal offset were zero. Note that we
|
||||
don't need to adjust the pointer to A since packm would have simply
|
||||
skipped over the region that was not stored. */ \
|
||||
if ( diagoffa > 0 ) \
|
||||
{ \
|
||||
i = diagoffa; \
|
||||
k = k - i; \
|
||||
diagoffa = 0; \
|
||||
b_cast = b_cast + ( i * PACKNR ) / off_scl; \
|
||||
} \
|
||||
\
|
||||
/* If there is a zero region below where the diagonal of A intersects the
|
||||
right side of the block, shrink it to prevent "no-op" iterations from
|
||||
executing. */ \
|
||||
if ( -diagoffa + k < m ) \
|
||||
{ \
|
||||
m = -diagoffa + k; \
|
||||
} \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
istep_a = PACKMR * k; \
|
||||
istep_b = PACKNR * k_full; \
|
||||
\
|
||||
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
|
||||
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
|
||||
\
|
||||
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_schema_a( schema_a, &aux ); \
|
||||
bli_auxinfo_set_schema_b( schema_b, &aux ); \
|
||||
\
|
||||
/* Save the imaginary stride of B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_b( istep_b, &aux ); \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \
|
||||
dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \
|
||||
dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
{ \
|
||||
if ( bli_trmm_my_iter( j, jr_thread ) ) { \
|
||||
\
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
diagoffa_i = diagoffa + ( doff_t )i*MR; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* If the current panel of A intersects the diagonal, scale C
|
||||
by beta. If it is strictly above the diagonal, scale by one.
|
||||
This allows the current macro-kernel to work for both trmm
|
||||
and trmm3. */ \
|
||||
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
ctype* restrict b1_i; \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
/* Determine the offset to and length of the panel that was
|
||||
packed so we can index into the corresponding location in
|
||||
b1. */ \
|
||||
off_a1112 = diagoffa_i; \
|
||||
k_a1112 = k - off_a1112; \
|
||||
\
|
||||
/* Compute the panel stride for the current diagonal-
|
||||
intersecting micro-panel. */ \
|
||||
is_a_cur = k_a1112 * PACKMR; \
|
||||
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
|
||||
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
|
||||
\
|
||||
if ( bli_trmm_my_iter( i, ir_thread ) ) { \
|
||||
\
|
||||
b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k_a1112, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1_i, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k_a1112, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1_i, \
|
||||
beta_cast, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Copy the result to the edge of C. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
a1 += ps_a_cur; \
|
||||
} \
|
||||
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
if ( bli_trmm_my_iter( i, ir_thread ) ) { \
|
||||
\
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_is_a( istep_a, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
one, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
} \
|
||||
\
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
b1 += cstep_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
\
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2 )
|
||||
|
||||
539
frame/3/trmm/other/bli_trmm_rl_ker_var2.c
Normal file
539
frame/3/trmm/other/bli_trmm_rl_ker_var2.c
Normal file
@@ -0,0 +1,539 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
doff_t diagoffb,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha,
|
||||
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2);
|
||||
|
||||
|
||||
void bli_trmm_rl_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_exec = bli_obj_exec_dt( c );
|
||||
|
||||
doff_t diagoffb = bli_obj_diag_offset( b );
|
||||
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t cs_a = bli_obj_col_stride( a );
|
||||
dim_t pd_a = bli_obj_panel_dim( a );
|
||||
inc_t ps_a = bli_obj_panel_stride( a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b = bli_obj_row_stride( b );
|
||||
dim_t pd_b = bli_obj_panel_dim( b );
|
||||
inc_t ps_b = bli_obj_panel_stride( b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
obj_t scalar_a;
|
||||
obj_t scalar_b;
|
||||
|
||||
void* buf_alpha;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Detach and multiply the scalars attached to A and B.
|
||||
bli_obj_scalar_detach( a, &scalar_a );
|
||||
bli_obj_scalar_detach( b, &scalar_b );
|
||||
bli_mulsc( &scalar_a, &scalar_b );
|
||||
|
||||
// Grab the addresses of the internal scalar buffers for the scalar
|
||||
// merged above and the scalar attached to C.
|
||||
buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
|
||||
buf_beta = bli_obj_internal_scalar_buffer( c );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffb,
|
||||
schema_a,
|
||||
schema_b,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, cs_a, pd_a, ps_a,
|
||||
buf_b, rs_b, pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
doff_t diagoffb, \
|
||||
pack_t schema_a, \
|
||||
pack_t schema_b, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* jr_thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Alias some constants to simpler names. */ \
|
||||
const dim_t MR = pd_a; \
|
||||
const dim_t NR = pd_b; \
|
||||
const dim_t PACKMR = cs_a; \
|
||||
const dim_t PACKNR = rs_b; \
|
||||
\
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict alpha_cast = alpha; \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
\
|
||||
doff_t diagoffb_j; \
|
||||
dim_t k_full; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t k_b1121; \
|
||||
dim_t off_b1121; \
|
||||
dim_t i, j; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
inc_t istep_a; \
|
||||
inc_t istep_b; \
|
||||
inc_t off_scl; \
|
||||
inc_t ss_b_num; \
|
||||
inc_t ss_b_den; \
|
||||
inc_t ps_b_cur; \
|
||||
inc_t is_b_cur; \
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == PACKMR
|
||||
pd_a == MR
|
||||
ps_a == stride to next micro-panel of A
|
||||
rs_b == PACKNR
|
||||
cs_b == 1
|
||||
pd_b == NR
|
||||
ps_b == stride to next micro-panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* Safety trap: Certain indexing within this macro-kernel does not
|
||||
work as intended if both MR and NR are odd. */ \
|
||||
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
|
||||
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Safeguard: If the current panel of B is entirely above the diagonal,
|
||||
it is implicitly zero. So we do nothing. */ \
|
||||
if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
|
||||
\
|
||||
/* Compute k_full. For all trmm, k_full is simply k. This is
|
||||
needed because some parameter combinations of trmm reduce k
|
||||
to advance past zero regions in the triangular matrix, and
|
||||
when computing the imaginary stride of A (the non-triangular
|
||||
matrix), which is used by 4m1/3m1 implementations, we need
|
||||
this unreduced value of k. */ \
|
||||
k_full = k; \
|
||||
\
|
||||
/* Compute indexing scaling factor for 4m or 3m. This is
|
||||
needed because one of the packing register blocksizes (PACKMR
|
||||
or PACKNR) is used to index into the micro-panels of the non-
|
||||
triangular matrix when computing with a diagonal-intersecting
|
||||
micro-panel of the triangular matrix. In the case of 4m or 3m,
|
||||
real values are stored in both sub-panels, and so the indexing
|
||||
needs to occur in units of real values. The value computed
|
||||
here is divided into the complex pointer offset to cause the
|
||||
pointer to be advanced by the correct value. */ \
|
||||
if ( bli_is_4mi_packed( schema_b ) || \
|
||||
bli_is_3mi_packed( schema_b ) || \
|
||||
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
|
||||
else off_scl = 1; \
|
||||
\
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. And if we are packing real-only, imag-only, or
|
||||
summed-only, we need to scale the computed panel sizes by 1/2
|
||||
to compensate for the fact that the pointer arithmetic occurs
|
||||
in terms of complex elements rather than real elements. */ \
|
||||
if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
|
||||
else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
|
||||
else { ss_b_num = 1; ss_b_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region above where the diagonal of B intersects
|
||||
the left edge of the panel, adjust the pointer to A and treat this
|
||||
case as if the diagonal offset were zero. Note that we don't need to
|
||||
adjust the pointer to B since packm would have simply skipped over
|
||||
the region that was not stored. */ \
|
||||
if ( diagoffb < 0 ) \
|
||||
{ \
|
||||
j = -diagoffb; \
|
||||
k = k - j; \
|
||||
diagoffb = 0; \
|
||||
a_cast = a_cast + ( j * PACKMR ) / off_scl; \
|
||||
} \
|
||||
\
|
||||
/* If there is a zero region to the right of where the diagonal
|
||||
of B intersects the bottom of the panel, shrink it to prevent
|
||||
"no-op" iterations from executing. */ \
|
||||
if ( diagoffb + k < n ) \
|
||||
{ \
|
||||
n = diagoffb + k; \
|
||||
} \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
istep_a = PACKMR * k_full; \
|
||||
istep_b = PACKNR * k; \
|
||||
\
|
||||
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
|
||||
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
|
||||
\
|
||||
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_schema_a( schema_a, &aux ); \
|
||||
bli_auxinfo_set_schema_b( schema_b, &aux ); \
|
||||
\
|
||||
/* Save the imaginary stride of A to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_a( istep_a, &aux ); \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \
|
||||
dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \
|
||||
dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
diagoffb_j = diagoffb - ( doff_t )j*NR; \
|
||||
\
|
||||
/* Determine the offset to the beginning of the panel that
|
||||
was packed so we can index into the corresponding location
|
||||
in A. Then compute the length of that panel. */ \
|
||||
off_b1121 = bli_max( -diagoffb_j, 0 ); \
|
||||
k_b1121 = k - off_b1121; \
|
||||
\
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* If the current panel of B intersects the diagonal, scale C
|
||||
by beta. If it is strictly below the diagonal, scale by one.
|
||||
This allows the current macro-kernel to work for both trmm
|
||||
and trmm3. */ \
|
||||
if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
|
||||
{ \
|
||||
/* Compute the panel stride for the current diagonal-
|
||||
intersecting micro-panel. */ \
|
||||
is_b_cur = k_b1121 * PACKNR; \
|
||||
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
|
||||
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
|
||||
\
|
||||
if ( bli_trmm_my_iter( j, jr_thread ) ) { \
|
||||
\
|
||||
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_is_b( is_b_cur, &aux ); \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
if ( bli_trmm_my_iter( i, ir_thread ) ) { \
|
||||
\
|
||||
ctype* restrict a1_i; \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
a1_i = a1 + ( off_b1121 * PACKMR ) / off_scl; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k_b1121, \
|
||||
alpha_cast, \
|
||||
a1_i, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k_b1121, \
|
||||
alpha_cast, \
|
||||
a1_i, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Copy the result to the edge of C. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
b1 += ps_b_cur; \
|
||||
} \
|
||||
else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \
|
||||
{ \
|
||||
if ( bli_trmm_my_iter( j, jr_thread ) ) { \
|
||||
\
|
||||
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_is_b( istep_b, &aux ); \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
if ( bli_trmm_my_iter( i, ir_thread ) ) { \
|
||||
\
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
one, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
b1 += cstep_b; \
|
||||
} \
|
||||
\
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
\
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2 )
|
||||
|
||||
539
frame/3/trmm/other/bli_trmm_ru_ker_var2.c
Normal file
539
frame/3/trmm/other/bli_trmm_ru_ker_var2.c
Normal file
@@ -0,0 +1,539 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/* FUNCPTR_T names the function-pointer type used by this file's dispatch
   table. Its parameter list mirrors the typed macro-kernel generated by the
   GENTFUNC template later in this file: the diagonal offset of B, the pack
   schemas of A and B, the m/n/k dimensions, alpha, the A and B buffers
   (each with a stride, a panel dimension, and a panel stride), beta, the C
   buffer with its row and column strides, and the context, runtime, and
   thread-info pointers. */
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
doff_t diagoffb,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha,
|
||||
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
/* Table of typed macro-kernel entry points; the dispatcher in this file
   selects an entry with ftypes[dt_exec].
   NOTE(review): GENARRAY is defined outside this file -- presumably it
   expands to one initializer entry per datatype; confirm. */
static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2);
|
||||
|
||||
|
||||
void bli_trmm_ru_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_exec = bli_obj_exec_dt( c );
|
||||
|
||||
doff_t diagoffb = bli_obj_diag_offset( b );
|
||||
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t cs_a = bli_obj_col_stride( a );
|
||||
dim_t pd_a = bli_obj_panel_dim( a );
|
||||
inc_t ps_a = bli_obj_panel_stride( a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b = bli_obj_row_stride( b );
|
||||
dim_t pd_b = bli_obj_panel_dim( b );
|
||||
inc_t ps_b = bli_obj_panel_stride( b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
obj_t scalar_a;
|
||||
obj_t scalar_b;
|
||||
|
||||
void* buf_alpha;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Detach and multiply the scalars attached to A and B.
|
||||
bli_obj_scalar_detach( a, &scalar_a );
|
||||
bli_obj_scalar_detach( b, &scalar_b );
|
||||
bli_mulsc( &scalar_a, &scalar_b );
|
||||
|
||||
// Grab the addresses of the internal scalar buffers for the scalar
|
||||
// merged above and the scalar attached to C.
|
||||
buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
|
||||
buf_beta = bli_obj_internal_scalar_buffer( c );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffb,
|
||||
schema_a,
|
||||
schema_b,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, cs_a, pd_a, ps_a,
|
||||
buf_b, rs_b, pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
doff_t diagoffb, \
|
||||
pack_t schema_a, \
|
||||
pack_t schema_b, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* jr_thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Alias some constants to simpler names. */ \
|
||||
const dim_t MR = pd_a; \
|
||||
const dim_t NR = pd_b; \
|
||||
const dim_t PACKMR = cs_a; \
|
||||
const dim_t PACKNR = rs_b; \
|
||||
\
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict one = PASTEMAC(ch,1); \
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict alpha_cast = alpha; \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
\
|
||||
doff_t diagoffb_j; \
|
||||
dim_t k_full; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t k_b0111; \
|
||||
dim_t off_b0111; \
|
||||
dim_t i, j; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
inc_t istep_a; \
|
||||
inc_t istep_b; \
|
||||
inc_t off_scl; \
|
||||
inc_t ss_b_num; \
|
||||
inc_t ss_b_den; \
|
||||
inc_t ps_b_cur; \
|
||||
inc_t is_b_cur; \
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == PACKMR
|
||||
pd_a == MR
|
||||
ps_a == stride to next micro-panel of A
|
||||
rs_b == PACKNR
|
||||
cs_b == 1
|
||||
pd_b == NR
|
||||
ps_b == stride to next micro-panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* Safety trap: Certain indexing within this macro-kernel does not
|
||||
work as intended if both MR and NR are odd. */ \
|
||||
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
|
||||
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Safeguard: If the current panel of B is entirely below its diagonal,
|
||||
it is implicitly zero. So we do nothing. */ \
|
||||
if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \
|
||||
\
|
||||
/* Compute k_full. For all trmm, k_full is simply k. This is
|
||||
needed because some parameter combinations of trmm reduce k
|
||||
to advance past zero regions in the triangular matrix, and
|
||||
when computing the imaginary stride of A (the non-triangular
|
||||
matrix), which is used by 4m1/3m1 implementations, we need
|
||||
this unreduced value of k. */ \
|
||||
k_full = k; \
|
||||
\
|
||||
/* Compute indexing scaling factor for 4m or 3m. This is
|
||||
needed because one of the packing register blocksizes (PACKMR
|
||||
or PACKNR) is used to index into the micro-panels of the non-
|
||||
triangular matrix when computing with a diagonal-intersecting
|
||||
micro-panel of the triangular matrix. In the case of 4m or 3m,
|
||||
real values are stored in both sub-panels, and so the indexing
|
||||
needs to occur in units of real values. The value computed
|
||||
here is divided into the complex pointer offset to cause the
|
||||
pointer to be advanced by the correct value. */ \
|
||||
if ( bli_is_4mi_packed( schema_b ) || \
|
||||
bli_is_3mi_packed( schema_b ) || \
|
||||
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
|
||||
else off_scl = 1; \
|
||||
\
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. And if we are packing real-only, imag-only, or
|
||||
summed-only, we need to scale the computed panel sizes by 1/2
|
||||
to compensate for the fact that the pointer arithmetic occurs
|
||||
in terms of complex elements rather than real elements. */ \
|
||||
if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
|
||||
else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
|
||||
else { ss_b_num = 1; ss_b_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region to the left of where the diagonal of B
|
||||
intersects the top edge of the panel, adjust the pointer to C and
|
||||
treat this case as if the diagonal offset were zero. This skips over
|
||||
the region that was not packed. (Note we assume the diagonal offset
|
||||
is a multiple of MR; this assumption will hold as long as the cache
|
||||
blocksizes are each a multiple of MR and NR.) */ \
|
||||
if ( diagoffb > 0 ) \
|
||||
{ \
|
||||
j = diagoffb; \
|
||||
n = n - j; \
|
||||
diagoffb = 0; \
|
||||
c_cast = c_cast + (j )*cs_c; \
|
||||
} \
|
||||
\
|
||||
/* If there is a zero region below where the diagonal of B intersects the
|
||||
right side of the block, shrink it to prevent "no-op" iterations from
|
||||
executing. */ \
|
||||
if ( -diagoffb + n < k ) \
|
||||
{ \
|
||||
k = -diagoffb + n; \
|
||||
} \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
istep_a = PACKMR * k_full; \
|
||||
istep_b = PACKNR * k; \
|
||||
\
|
||||
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
|
||||
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
|
||||
\
|
||||
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_schema_a( schema_a, &aux ); \
|
||||
bli_auxinfo_set_schema_b( schema_b, &aux ); \
|
||||
\
|
||||
/* Save the imaginary stride of A to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_a( istep_a, &aux ); \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \
|
||||
dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \
|
||||
dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
diagoffb_j = diagoffb - ( doff_t )j*NR; \
|
||||
\
|
||||
/* Determine the offset to and length of the panel that was packed
|
||||
so we can index into the corresponding location in A. */ \
|
||||
off_b0111 = 0; \
|
||||
k_b0111 = bli_min( k, -diagoffb_j + NR ); \
|
||||
\
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* If the current panel of B intersects the diagonal, scale C
|
||||
by beta. If it is strictly above the diagonal, scale by one.
|
||||
This allows the current macro-kernel to work for both trmm
|
||||
and trmm3. */ \
|
||||
if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
|
||||
{ \
|
||||
/* Compute the panel stride for the current diagonal-
|
||||
intersecting micro-panel. */ \
|
||||
is_b_cur = k_b0111 * PACKNR; \
|
||||
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
|
||||
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
|
||||
\
|
||||
if ( bli_trmm_my_iter( j, jr_thread ) ) { \
|
||||
\
|
||||
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_is_b( is_b_cur, &aux ); \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
if ( bli_trmm_my_iter( i, ir_thread ) ) { \
|
||||
\
|
||||
ctype* restrict a1_i; \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
a1_i = a1 + ( off_b0111 * PACKMR ) / off_scl; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k_b0111, \
|
||||
alpha_cast, \
|
||||
a1_i, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Copy edge elements of C to the temporary buffer. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
c11, rs_c, cs_c, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k_b0111, \
|
||||
alpha_cast, \
|
||||
a1_i, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Copy the result to the edge of C. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
b1 += ps_b_cur; \
|
||||
} \
|
||||
else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \
|
||||
{ \
|
||||
if ( bli_trmm_my_iter( j, jr_thread ) ) { \
|
||||
\
|
||||
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_is_b( istep_b, &aux ); \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
if ( bli_trmm_my_iter( i, ir_thread ) ) { \
|
||||
\
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1; \
|
||||
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
one, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
b1 += cstep_b; \
|
||||
} \
|
||||
\
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
\
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \
|
||||
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2 )
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -60,7 +61,7 @@ void bli_trsm_blk_var1
|
||||
bli_l3_prune_unref_mparts_m( a, b, c, cntl );
|
||||
|
||||
// Determine the current thread's subpartition range.
|
||||
bli_thread_get_range_mdim
|
||||
bli_thread_range_mdim
|
||||
(
|
||||
direct, thread, a, b, c, cntl, cntx,
|
||||
&my_start, &my_end
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -60,7 +61,7 @@ void bli_trsm_blk_var2
|
||||
bli_l3_prune_unref_mparts_n( a, b, c, cntl );
|
||||
|
||||
// Determine the current thread's subpartition range.
|
||||
bli_thread_get_range_ndim
|
||||
bli_thread_range_ndim
|
||||
(
|
||||
direct, thread, a, b, c, cntl, cntx,
|
||||
&my_start, &my_end
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -339,25 +340,38 @@ void PASTEMAC(ch,varname) \
|
||||
/* Save the imaginary stride of B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_b( istep_b, &aux ); \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
/* We don't bother querying the thrinfo_t node for the 1st loop because
|
||||
we can't parallelize that loop in trsm due to the inter-iteration
|
||||
dependencies that exist. */ \
|
||||
/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
|
||||
\
|
||||
/* Query the number of threads and thread ids for each loop. */ \
|
||||
dim_t jr_nt = bli_thread_n_way( thread ); \
|
||||
dim_t jr_tid = bli_thread_work_id( thread ); \
|
||||
\
|
||||
dim_t jr_start, jr_end; \
|
||||
dim_t jr_inc; \
|
||||
\
|
||||
/* Use contiguous assignment of micropanels to threads in the 2nd loop. */ \
|
||||
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
if( bli_trsm_my_iter( j, thread ) ) { \
|
||||
\
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
a1 = a_cast; \
|
||||
c11 = c1 + (0 )*rstep_c; \
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
a1 = a_cast; \
|
||||
c11 = c1 + (0 )*rstep_c; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
@@ -409,8 +423,7 @@ void PASTEMAC(ch,varname) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
|
||||
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
@@ -474,8 +487,7 @@ void PASTEMAC(ch,varname) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
|
||||
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
@@ -531,10 +543,6 @@ void PASTEMAC(ch,varname) \
|
||||
\
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
b1 += cstep_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
\
|
||||
/*
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -347,25 +348,38 @@ void PASTEMAC(ch,varname) \
|
||||
/* Save the imaginary stride of B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_b( istep_b, &aux ); \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
/* We don't bother querying the thrinfo_t node for the 1st loop because
|
||||
we can't parallelize that loop in trsm due to the inter-iteration
|
||||
dependencies that exist. */ \
|
||||
/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
|
||||
\
|
||||
/* Query the number of threads and thread ids for each loop. */ \
|
||||
dim_t jr_nt = bli_thread_n_way( thread ); \
|
||||
dim_t jr_tid = bli_thread_work_id( thread ); \
|
||||
\
|
||||
dim_t jr_start, jr_end; \
|
||||
dim_t jr_inc; \
|
||||
\
|
||||
/* Use contiguous assignment of micropanels to threads in the 2nd loop. */ \
|
||||
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
if( bli_trsm_my_iter( j, thread ) ) { \
|
||||
\
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
a1 = a_cast; \
|
||||
c11 = c1 + (m_iter-1)*rstep_c; \
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
a1 = a_cast; \
|
||||
c11 = c1 + (m_iter-1)*rstep_c; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( ib = 0; ib < m_iter; ++ib ) \
|
||||
@@ -419,8 +433,7 @@ void PASTEMAC(ch,varname) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
|
||||
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
@@ -484,8 +497,7 @@ void PASTEMAC(ch,varname) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
|
||||
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
@@ -541,10 +553,6 @@ void PASTEMAC(ch,varname) \
|
||||
\
|
||||
c11 -= rstep_c; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
b1 += cstep_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
\
|
||||
/*
|
||||
|
||||
593
frame/3/trsm/other/bli_trsm_ll_ker_var2.c
Normal file
593
frame/3/trsm/other/bli_trsm_ll_ker_var2.c
Normal file
@@ -0,0 +1,593 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
doff_t diagoffa,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha1,
|
||||
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
|
||||
void* alpha2,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2);
|
||||
|
||||
|
||||
void bli_trsm_ll_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_exec = bli_obj_exec_dt( c );
|
||||
|
||||
doff_t diagoffa = bli_obj_diag_offset( a );
|
||||
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t cs_a = bli_obj_col_stride( a );
|
||||
dim_t pd_a = bli_obj_panel_dim( a );
|
||||
inc_t ps_a = bli_obj_panel_stride( a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b = bli_obj_row_stride( b );
|
||||
dim_t pd_b = bli_obj_panel_dim( b );
|
||||
inc_t ps_b = bli_obj_panel_stride( b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
void* buf_alpha1;
|
||||
void* buf_alpha2;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Grab the address of the internal scalar buffer for the scalar
|
||||
// attached to B (the non-triangular matrix). This will be the alpha
|
||||
// scalar used in the gemmtrsm subproblems (ie: the scalar that would
|
||||
// be applied to the packed copy of B prior to it being updated by
|
||||
// the trsm subproblem). This scalar may be unit, if for example it
|
||||
// was applied during packing.
|
||||
buf_alpha1 = bli_obj_internal_scalar_buffer( b );
|
||||
|
||||
// Grab the address of the internal scalar buffer for the scalar
|
||||
// attached to C. This will be the "beta" scalar used in the gemm-only
|
||||
// subproblems that correspond to micro-panels that do not intersect
|
||||
// the diagonal. We need this separate scalar because it's possible
|
||||
// that the alpha attached to B was reset, if it was applied during
|
||||
// packing.
|
||||
buf_alpha2 = bli_obj_internal_scalar_buffer( c );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffa,
|
||||
schema_a,
|
||||
schema_b,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha1,
|
||||
buf_a, cs_a, pd_a, ps_a,
|
||||
buf_b, rs_b, pd_b, ps_b,
|
||||
buf_alpha2,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
/* Typed trsm macro-kernel for a lower-triangular A applied from the left.
   For every NR-wide column panel j for which bli_trsm_my_iter( j, thread )
   holds, the MR-row micro-panels of packed A are visited in increasing
   row order. A micro-panel that intersects the diagonal is handled by the
   fused gemmtrsm micro-kernel; one strictly below the diagonal by the
   gemm micro-kernel; anything else (above the diagonal) was not packed,
   is implicitly zero, and is skipped.

   Layout assumptions inherited from packing:
     rs_a == 1,      cs_a == PACKMR, pd_a == MR, ps_a == stride to the
                                                  next micro-panel of A
     rs_b == PACKNR, cs_b == 1,      pd_b == NR, ps_b == stride to the
                                                  next micro-panel of B
     rs_c, cs_c: no assumptions. */ \
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffa, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha1, \
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
       void*   alpha2, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread  \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Shorter names for the register blocksizes and the packed leading
	   dimensions. */ \
	const dim_t MR     = pd_a; \
	const dim_t NR     = pd_b; \
	const dim_t PACKMR = cs_a; \
	const dim_t PACKNR = rs_b; \
\
	/* Fetch the two micro-kernels from the context, already typed. */ \
	PASTECH(ch,gemmtrsm_ukr_ft) \
	gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
	PASTECH(ch,gemm_ukr_ft) \
	gemm_ukr     = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Scratch micro-tile used in place of C for edge cases. Its strides
	   follow the storage preference the context reports for the gemm
	   micro-kernel. */ \
	ctype ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( ctype ) ] \
	      __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t  rs_ct    = ( col_pref ? 1 : NR ); \
	const inc_t  cs_ct    = ( col_pref ? MR : 1 ); \
\
	ctype* restrict zero        = PASTEMAC(ch,0); \
	ctype* restrict minus_one   = PASTEMAC(ch,m1); \
	ctype* restrict a_cast      = a; \
	ctype* restrict b_cast      = b; \
	ctype* restrict c_cast      = c; \
	ctype* restrict alpha1_cast = alpha1; \
	ctype* restrict alpha2_cast = alpha2; \
\
	auxinfo_t aux; \
\
	/* Safety trap: some of the indexing below does not work as intended
	   for these odd/odd combinations of packing and register
	   blocksizes. */ \
	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
	/* Nothing to do if any dimension is zero. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Nothing to do if A lies entirely above the diagonal, since that
	   region is implicitly zero. */ \
	if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
	/* k rounded up to a multiple of MR, captured before k itself is
	   modified. It feeds the imaginary stride of B (used by the 4m1/3m1
	   implementations). */ \
	const dim_t k_full = ( k % MR == 0 ? k : k + MR - ( k % MR ) ); \
\
	/* Index scaling for 4m/3m/rih schemas: those store real values in
	   both sub-panels, so offsets into the micro-panels of B (and to A11)
	   must be counted in units of real values. The complex pointer offset
	   is divided by this factor. */ \
	const inc_t off_scl = ( bli_is_4mi_packed( schema_a ) || \
	                        bli_is_3mi_packed( schema_a ) || \
	                        bli_is_rih_packed( schema_a ) ) ? 2 : 1; \
\
	/* Storage stride scaling: 3/2 for interleaved 3m, otherwise 1/1. */ \
	const bool_t is_3mi   = bli_is_3mi_packed( schema_a ); \
	const inc_t  ss_a_num = ( is_3mi ? 3 : 1 ); \
	const inc_t  ss_a_den = ( is_3mi ? 2 : 1 ); \
\
	/* A negative diagonal offset means there is a zero region above the
	   point where the diagonal meets the left edge. That region was not
	   packed, so skip the corresponding rows of C and continue as if the
	   offset were zero. (The offset is assumed to be a multiple of MR.) */ \
	if ( diagoffa < 0 ) \
	{ \
		const dim_t m_skip = -diagoffa; \
\
		m        -= m_skip; \
		c_cast   += m_skip * rs_c; \
		diagoffa  = 0; \
	} \
\
	/* Round k up to a multiple of MR. The gemmtrsm micro-kernel expects
	   an MR x MR triangular solve, and packing already zero-padded the
	   bottom/right edges of A and extended its diagonal into the padding
	   as identity, so the extra work produces no infs or NaNs. */ \
	{ \
		const dim_t k_rem = k % MR; \
		if ( k_rem != 0 ) k += MR - k_rem; \
	} \
\
	/* Zero the scratch tile so stale infs/NaNs cannot leak out of it. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Number of micro-panels in each dimension; a partial edge panel
	   counts as one more iteration. */ \
	const dim_t n_left = n % NR; \
	const dim_t m_left = m % MR; \
	const dim_t n_iter = n / NR + ( n_left ? 1 : 0 ); \
	const dim_t m_iter = m / MR + ( m_left ? 1 : 0 ); \
\
	/* Increments used to step through A, B, and C. */ \
	const inc_t rstep_a = ps_a; \
	const inc_t cstep_b = ps_b; \
	const inc_t rstep_c = rs_c * MR; \
	const inc_t cstep_c = cs_c * NR; \
\
	/* Imaginary strides, bumped to the next even value when odd. */ \
	inc_t istep_a = PACKMR * k; \
	inc_t istep_b = PACKNR * k_full; \
	istep_a += ( bli_is_odd( istep_a ) ? 1 : 0 ); \
	istep_b += ( bli_is_odd( istep_b ) ? 1 : 0 ); \
\
	/* Record the loop-invariant fields of the auxinfo_t object: both
	   pack schemas and the imaginary stride of B. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
	bli_auxinfo_set_is_b( istep_b, &aux ); \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( dim_t j = 0; j < n_iter; ++j ) \
	{ \
		/* Only process the column panels assigned to this thread. */ \
		if ( bli_trsm_my_iter( j, thread ) ) \
		{ \
			ctype* restrict b1 = b_cast + j * cstep_b; \
			ctype* restrict c1 = c_cast + j * cstep_c; \
			ctype* restrict a1 = a_cast; \
\
			/* The "next" panel of B defaults to the current one. */ \
			ctype* restrict b_next = b1; \
\
			const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
			/* Loop over the m dimension (MR rows at a time). */ \
			for ( dim_t i = 0; i < m_iter; ++i ) \
			{ \
				const doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; \
				const dim_t  m_cur      = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
				ctype* restrict c11 = c1 + i * rstep_c; \
\
				/* Full MR x NR tiles are written straight to C; edge
				   tiles are computed into ct and then merged into C. */ \
				const bool_t interior = ( m_cur == MR && n_cur == NR ); \
				ctype*       c_dst    = ( interior ? c11  : ct    ); \
				const inc_t  rs_dst   = ( interior ? rs_c : rs_ct ); \
				const inc_t  cs_dst   = ( interior ? cs_c : cs_ct ); \
\
				if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
				{ \
					/* Diagonal-intersecting micro-panel: it holds the
					   rectangular part A10 (k_a10 columns) followed by
					   the MR x MR triangular block A11. */ \
					const dim_t k_a1011 = diagoffa_i + MR; \
					const dim_t k_a10   = k_a1011 - MR; \
\
					/* Imaginary stride and panel stride of this
					   (shorter) micro-panel. */ \
					inc_t is_a_cur = k_a1011 * PACKMR; \
					if ( bli_is_odd( is_a_cur ) ) is_a_cur += 1; \
					const inc_t ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
					/* A10 starts at the panel; A11 starts k_a10 columns
					   in, scaled by off_scl. */ \
					ctype* restrict a10 = a1; \
					ctype* restrict a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, off_scl ); \
\
					/* B01 starts at offset zero of the current panel of
					   B; B11 starts k_a10 rows in, scaled by off_scl. */ \
					ctype* restrict b01 = b1; \
					ctype* restrict b11 = b1 + ( k_a10 * PACKNR ) / off_scl; \
\
					/* Addresses of the next panels of A and B. On the
					   last row iteration A wraps to its start; B wraps
					   to its start only when j is within the last
					   bli_thread_num_threads( thread ) column panels. */ \
					ctype* restrict a_next = a1 + ps_a_cur; \
					if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
					{ \
						a_next = a_cast; \
						b_next = ( j + bli_thread_num_threads(thread) >= n_iter ) \
						         ? b_cast : b1; \
					} \
\
					bli_auxinfo_set_next_a( a_next, &aux ); \
					bli_auxinfo_set_next_b( b_next, &aux ); \
\
					/* 4m1/3m1 imaginary stride of A for this panel. */ \
					bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
					/* Invoke the fused gemm/trsm micro-kernel. */ \
					gemmtrsm_ukr \
					( \
					  k_a10, \
					  alpha1_cast, \
					  a10, \
					  a11, \
					  b01, \
					  b11, \
					  c_dst, rs_dst, cs_dst, \
					  &aux, \
					  cntx  \
					); \
\
					if ( !interior ) \
					{ \
						/* Copy the result to the edge of C. */ \
						PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
						                        ct,  rs_ct, cs_ct, \
						                        c11, rs_c,  cs_c ); \
					} \
\
					a1 += ps_a_cur; \
				} \
				else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
				{ \
					/* Micro-panel entirely below the diagonal: a plain
					   gemm update over the full k. */ \
					ctype* restrict a_next = a1 + rstep_a; \
					if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
					{ \
						a_next = a_cast; \
						b_next = ( j + bli_thread_num_threads(thread) >= n_iter ) \
						         ? b_cast : b1; \
					} \
\
					bli_auxinfo_set_next_a( a_next, &aux ); \
					bli_auxinfo_set_next_b( b_next, &aux ); \
\
					/* 4m1/3m1 imaginary stride of a full-length panel. */ \
					bli_auxinfo_set_is_a( istep_a, &aux ); \
\
					/* Invoke the gemm micro-kernel. Interior tiles
					   scale C by alpha2 inside the micro-kernel; edge
					   tiles overwrite ct (beta of zero) and apply
					   alpha2 in the merge below. */ \
					gemm_ukr \
					( \
					  k, \
					  minus_one, \
					  a1, \
					  b1, \
					  ( interior ? alpha2_cast : zero ), \
					  c_dst, rs_dst, cs_dst, \
					  &aux, \
					  cntx  \
					); \
\
					if ( !interior ) \
					{ \
						/* Merge ct into the edge of C with alpha2 as the
						   scalar. */ \
						PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
						                        ct,  rs_ct, cs_ct, \
						                        alpha2_cast, \
						                        c11, rs_c,  cs_c ); \
					} \
\
					a1 += rstep_a; \
				} \
			} \
		} \
	} \
}
|
||||
|
||||
/* Expand the GENTFUNC template defined above under the name
   trsm_ll_ker_var2. NOTE(review): presumably once per supported datatype;
   INSERT_GENTFUNC_BASIC0 is defined outside this file -- confirm there. */
INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2 )
|
||||
|
||||
574
frame/3/trsm/other/bli_trsm_lu_ker_var2.c
Normal file
574
frame/3/trsm/other/bli_trsm_lu_ker_var2.c
Normal file
@@ -0,0 +1,574 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
doff_t diagoffa,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha1,
|
||||
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
|
||||
void* alpha2,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2);
|
||||
|
||||
|
||||
void bli_trsm_lu_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_exec = bli_obj_exec_dt( c );
|
||||
|
||||
doff_t diagoffa = bli_obj_diag_offset( a );
|
||||
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t cs_a = bli_obj_col_stride( a );
|
||||
dim_t pd_a = bli_obj_panel_dim( a );
|
||||
inc_t ps_a = bli_obj_panel_stride( a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b = bli_obj_row_stride( b );
|
||||
dim_t pd_b = bli_obj_panel_dim( b );
|
||||
inc_t ps_b = bli_obj_panel_stride( b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
void* buf_alpha1;
|
||||
void* buf_alpha2;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Grab the address of the internal scalar buffer for the scalar
|
||||
// attached to B (the non-triangular matrix). This will be the alpha
|
||||
// scalar used in the gemmtrsm subproblems (ie: the scalar that would
|
||||
// be applied to the packed copy of B prior to it being updated by
|
||||
// the trsm subproblem). This scalar may be unit, if for example it
|
||||
// was applied during packing.
|
||||
buf_alpha1 = bli_obj_internal_scalar_buffer( b );
|
||||
|
||||
// Grab the address of the internal scalar buffer for the scalar
|
||||
// attached to C. This will be the "beta" scalar used in the gemm-only
|
||||
// subproblems that correspond to micro-panels that do not intersect
|
||||
// the diagonal. We need this separate scalar because it's possible
|
||||
// that the alpha attached to B was reset, if it was applied during
|
||||
// packing.
|
||||
buf_alpha2 = bli_obj_internal_scalar_buffer( c );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffa,
|
||||
schema_a,
|
||||
schema_b,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha1,
|
||||
buf_a, cs_a, pd_a, ps_a,
|
||||
buf_b, rs_b, pd_b, ps_b,
|
||||
buf_alpha2,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
/*
   GENTFUNC template for the typed trsm_lu_ker_var2 macro-kernel.

   For each NR-wide micro-panel of B (the j loop) that bli_trsm_my_iter()
   accepts for the calling thread, the body visits the MR-tall row blocks
   of C from the bottom up (ib loop, with i = m_iter - 1 - ib and c11
   stepping backwards by rstep_c) while advancing forward through the
   packed micro-panels of A. A micro-panel of A that intersects the
   diagonal is processed with the fused gemmtrsm micro-kernel; one that is
   strictly above the diagonal is processed with the gemm micro-kernel
   using alpha = -1 and beta = alpha2. Partial (edge) tiles are computed
   into the stack buffer ct and then copied (gemmtrsm) or accumulated
   (gemm) into C.

   NOTE(review): the rntm parameter is accepted but not referenced in this
   body.
*/
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
doff_t diagoffa, \
|
||||
pack_t schema_a, \
|
||||
pack_t schema_b, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* alpha1, \
|
||||
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
|
||||
void* alpha2, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Alias some constants to simpler names. */ \
|
||||
const dim_t MR = pd_a; \
|
||||
const dim_t NR = pd_b; \
|
||||
const dim_t PACKMR = cs_a; \
|
||||
const dim_t PACKNR = rs_b; \
|
||||
\
|
||||
/* Cast the micro-kernel address to its function pointer type. */ \
|
||||
PASTECH(ch,gemmtrsm_ukr_ft) \
|
||||
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict minus_one = PASTEMAC(ch,m1); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict alpha1_cast = alpha1; \
|
||||
ctype* restrict alpha2_cast = alpha2; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
\
|
||||
doff_t diagoffa_i; \
|
||||
dim_t k_full; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t k_a1112; \
|
||||
dim_t k_a11; \
|
||||
dim_t k_a12; \
|
||||
dim_t off_a11; \
|
||||
dim_t off_a12; \
|
||||
dim_t i, j, ib; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
inc_t istep_a; \
|
||||
inc_t istep_b; \
|
||||
inc_t off_scl; \
|
||||
inc_t ss_a_num; \
|
||||
inc_t ss_a_den; \
|
||||
inc_t ps_a_cur; \
|
||||
inc_t is_a_cur; \
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == PACKMR
|
||||
pd_a == MR
|
||||
ps_a == stride to next micro-panel of A
|
||||
rs_b == PACKNR
|
||||
cs_b == 1
|
||||
pd_b == NR
|
||||
ps_b == stride to next micro-panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* Safety trap: Certain indexing within this macro-kernel does not
|
||||
work as intended if both MR and NR are odd. */ \
|
||||
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
|
||||
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Safeguard: If matrix A is below the diagonal, it is implicitly zero.
|
||||
So we do nothing. */ \
|
||||
if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
|
||||
\
|
||||
/* Compute k_full as k inflated up to a multiple of MR. This is
|
||||
needed because some parameter combinations of trsm reduce k
|
||||
to advance past zero regions in the triangular matrix, and
|
||||
when computing the imaginary stride of B (the non-triangular
|
||||
matrix), which is used by 4m1/3m1 implementations, we need
|
||||
this unreduced value of k. */ \
|
||||
k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
|
||||
\
|
||||
/* Compute indexing scaling factor for 4m or 3m. This is
|
||||
needed because one of the packing register blocksizes (PACKMR
|
||||
or PACKNR) is used to index into the micro-panels of the non-
|
||||
triangular matrix when computing with a diagonal-intersecting
|
||||
micro-panel of the triangular matrix. In the case of 4m or 3m,
|
||||
real values are stored in both sub-panels, and so the indexing
|
||||
needs to occur in units of real values. The value computed
|
||||
here is divided into the complex pointer offset to cause the
|
||||
pointer to be advanced by the correct value. */ \
|
||||
if ( bli_is_4mi_packed( schema_a ) || \
|
||||
bli_is_3mi_packed( schema_a ) || \
|
||||
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
|
||||
else off_scl = 1; \
|
||||
\
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. Note that real-only, imag-only, and summed-only
|
||||
packing formats are not applicable here since trsm is a two-
|
||||
operand operation only (unlike trmm, which is capable of three-
|
||||
operand). */ \
|
||||
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
|
||||
else { ss_a_num = 1; ss_a_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region to the left of where the diagonal of A
|
||||
intersects the top edge of the block, adjust the pointer to B and
|
||||
treat this case as if the diagonal offset were zero. Note that we
|
||||
don't need to adjust the pointer to A since packm would have simply
|
||||
skipped over the region that was not stored. */ \
|
||||
if ( diagoffa > 0 ) \
|
||||
{ \
|
||||
i = diagoffa; \
|
||||
k = k - i; \
|
||||
diagoffa = 0; \
|
||||
b_cast = b_cast + ( i * PACKNR ) / off_scl; \
|
||||
} \
|
||||
\
|
||||
/* If there is a zero region below where the diagonal of A intersects the
|
||||
right side of the block, shrink it to prevent "no-op" iterations from
|
||||
executing. */ \
|
||||
if ( -diagoffa + k < m ) \
|
||||
{ \
|
||||
m = -diagoffa + k; \
|
||||
} \
|
||||
\
|
||||
/* Check the k dimension, which needs to be a multiple of MR. If k
|
||||
isn't a multiple of MR, we adjust it higher to satisfy the micro-
|
||||
kernel, which is expecting to perform an MR x MR triangular solve.
|
||||
This adjustment of k is consistent with what happened when A was
|
||||
packed: all of its bottom/right edges were zero-padded, and
|
||||
furthermore, the panel that stores the bottom-right corner of the
|
||||
matrix has its diagonal extended into the zero-padded region (as
|
||||
identity). This allows the trsm of that bottom-right panel to
|
||||
proceed without producing any infs or NaNs that would infect the
|
||||
"good" values of the corresponding block of B. */ \
|
||||
if ( k % MR != 0 ) k += MR - ( k % MR ); \
|
||||
\
|
||||
/* NOTE: We don't need to check that m is a multiple of PACKMR since we
|
||||
know that the underlying buffer was already allocated to have an m
|
||||
dimension that is a multiple of PACKMR, with the region between the
|
||||
last row and the next multiple of MR zero-padded accordingly. */ \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
istep_a = PACKMR * k; \
|
||||
istep_b = PACKNR * k_full; \
|
||||
\
|
||||
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
|
||||
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
|
||||
\
|
||||
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_schema_a( schema_a, &aux ); \
|
||||
bli_auxinfo_set_schema_b( schema_b, &aux ); \
|
||||
\
|
||||
/* Save the imaginary stride of B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_b( istep_b, &aux ); \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
{ \
|
||||
/* Skip iterations for which bli_trsm_my_iter() is false for this thread. */ \
if( bli_trsm_my_iter( j, thread ) ) { \
|
||||
\
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
a1 = a_cast; \
|
||||
c11 = c1 + (m_iter-1)*rstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( ib = 0; ib < m_iter; ++ib ) \
|
||||
{ \
|
||||
i = m_iter - 1 - ib; \
|
||||
diagoffa_i = diagoffa + ( doff_t )i*MR; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* If the current panel of A intersects the diagonal, use a
|
||||
special micro-kernel that performs a fused gemm and trsm.
|
||||
If the current panel of A resides above the diagonal, use a
|
||||
regular gemm micro-kernel. Otherwise, if it is below the
|
||||
diagonal, it was not packed (because it is implicitly zero)
|
||||
and so we do nothing. */ \
|
||||
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
ctype* restrict a11; \
|
||||
ctype* restrict a12; \
|
||||
ctype* restrict b11; \
|
||||
ctype* restrict b21; \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
/* Compute various offsets into and lengths of parts of A. */ \
|
||||
off_a11 = diagoffa_i; \
|
||||
k_a1112 = k - off_a11;; \
|
||||
k_a11 = MR; \
|
||||
k_a12 = k_a1112 - MR; \
|
||||
off_a12 = off_a11 + k_a11; \
|
||||
\
|
||||
/* Compute the panel stride for the current diagonal-
|
||||
intersecting micro-panel. */ \
|
||||
is_a_cur = k_a1112 * PACKMR; \
|
||||
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
|
||||
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
|
||||
\
|
||||
/* Compute the addresses of the triangular block A11 and the
|
||||
panel A12. */ \
|
||||
a11 = a1; \
|
||||
/* a12 = a1 + ( k_a11 * PACKMR ) / off_scl; */ \
|
||||
a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, off_scl ); \
|
||||
\
|
||||
/* Compute the addresses of the block B11 and the panel
|
||||
B21. */ \
|
||||
b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
|
||||
b21 = b1 + ( off_a12 * PACKNR ) / off_scl; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + ps_a_cur; \
|
||||
if ( bli_is_last_iter( ib, m_iter, 0, 1 ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
|
||||
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the fused gemm/trsm micro-kernel. */ \
|
||||
gemmtrsm_ukr \
|
||||
( \
|
||||
k_a12, \
|
||||
alpha1_cast, \
|
||||
a12, \
|
||||
a11, \
|
||||
b21, \
|
||||
b11, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Edge case: invoke the fused gemm/trsm micro-kernel on the temporary buffer ct. */ \
|
||||
gemmtrsm_ukr \
|
||||
( \
|
||||
k_a12, \
|
||||
alpha1_cast, \
|
||||
a12, \
|
||||
a11, \
|
||||
b21, \
|
||||
b11, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Copy the result to the bottom edge of C. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
a1 += ps_a_cur; \
|
||||
} \
|
||||
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1 + rstep_a; \
|
||||
if ( bli_is_last_iter( ib, m_iter, 0, 1 ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1; \
|
||||
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
|
||||
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_is_a( istep_a, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
minus_one, \
|
||||
a1, \
|
||||
b1, \
|
||||
alpha2_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Edge case: invoke the gemm micro-kernel on ct with beta = zero. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
minus_one, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
alpha2_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
} \
|
||||
\
|
||||
c11 -= rstep_c; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
b1 += cstep_b; \
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \
|
||||
printf( "m_iter = %lu\n", m_iter ); \
|
||||
printf( "m_cur = %lu\n", m_cur ); \
|
||||
printf( "k = %lu\n", k ); \
|
||||
printf( "diagoffa_i = %lu\n", diagoffa_i ); \
|
||||
printf( "off_a1112 = %lu\n", off_a1112 ); \
|
||||
printf( "k_a1112 = %lu\n", k_a1112 ); \
|
||||
printf( "k_a12 = %lu\n", k_a12 ); \
|
||||
printf( "k_a11 = %lu\n", k_a11 ); \
|
||||
printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); \
|
||||
printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \
|
||||
*/ \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \
|
||||
*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2 )
|
||||
|
||||
591
frame/3/trsm/other/bli_trsm_rl_ker_var2.c
Normal file
591
frame/3/trsm/other/bli_trsm_rl_ker_var2.c
Normal file
@@ -0,0 +1,591 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/* FUNCPTR_T is the file-local name of the function pointer type declared
   below; it expands to gemm_fp. */
#define FUNCPTR_T gemm_fp
|
||||
|
||||
/* Signature of the datatype-specific trsm_rl_ker_var2 implementations.
   The arguments are, in order: the diagonal offset of B, the pack schemas
   of A and B, the m/n/k dimensions, the alpha1 scalar, the buffer of A
   with its column stride, panel dimension and panel stride, the buffer of
   B with its row stride, panel dimension and panel stride, the alpha2
   scalar, the buffer of C with its row and column strides, and finally
   the context, runtime and thread-info objects. */
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
doff_t diagoffb,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha1,
|
||||
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
|
||||
void* alpha2,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
/* Table of typed implementations, indexed by the execution datatype in
   bli_trsm_rl_ker_var2() (ftypes[dt_exec]). Presumably GENARRAY fills it
   with the per-datatype instances of trsm_rl_ker_var2 -- confirm against
   the GENARRAY definition. */
static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2);
|
||||
|
||||
|
||||
void bli_trsm_rl_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_exec = bli_obj_exec_dt( c );
|
||||
|
||||
doff_t diagoffb = bli_obj_diag_offset( b );
|
||||
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t cs_a = bli_obj_col_stride( a );
|
||||
dim_t pd_a = bli_obj_panel_dim( a );
|
||||
inc_t ps_a = bli_obj_panel_stride( a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b = bli_obj_row_stride( b );
|
||||
dim_t pd_b = bli_obj_panel_dim( b );
|
||||
inc_t ps_b = bli_obj_panel_stride( b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
void* buf_alpha1;
|
||||
void* buf_alpha2;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Grab the address of the internal scalar buffer for the scalar
|
||||
// attached to A (the non-triangular matrix). This will be the alpha
|
||||
// scalar used in the gemmtrsm subproblems (ie: the scalar that would
|
||||
// be applied to the packed copy of A prior to it being updated by
|
||||
// the trsm subproblem). This scalar may be unit, if for example it
|
||||
// was applied during packing.
|
||||
buf_alpha1 = bli_obj_internal_scalar_buffer( a );
|
||||
|
||||
// Grab the address of the internal scalar buffer for the scalar
|
||||
// attached to C. This will be the "beta" scalar used in the gemm-only
|
||||
// subproblems that correspond to micro-panels that do not intersect
|
||||
// the diagonal. We need this separate scalar because it's possible
|
||||
// that the alpha attached to B was reset, if it was applied during
|
||||
// packing.
|
||||
buf_alpha2 = bli_obj_internal_scalar_buffer( c );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffb,
|
||||
schema_a,
|
||||
schema_b,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha1,
|
||||
buf_a, cs_a, pd_a, ps_a,
|
||||
buf_b, rs_b, pd_b, ps_b,
|
||||
buf_alpha2,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
doff_t diagoffb, \
|
||||
pack_t schema_a, \
|
||||
pack_t schema_b, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* alpha1, \
|
||||
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
|
||||
void* alpha2, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Alias some constants to simpler names. */ \
|
||||
const dim_t MR = pd_a; \
|
||||
const dim_t NR = pd_b; \
|
||||
const dim_t PACKMR = cs_a; \
|
||||
const dim_t PACKNR = rs_b; \
|
||||
\
|
||||
/* Cast the micro-kernel address to its function pointer type. */ \
|
||||
/* NOTE: We use the upper-triangular gemmtrsm ukernel because, while
|
||||
the current macro-kernel targets the "rl" case (right-side/lower-
|
||||
triangular), it becomes upper-triangular after the kernel operation
|
||||
is transposed so that all kernel instances are of the "left"
|
||||
variety (since those are the only trsm ukernels that exist). */ \
|
||||
PASTECH(ch,gemmtrsm_ukr_ft) \
|
||||
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict minus_one = PASTEMAC(ch,m1); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict alpha1_cast = alpha1; \
|
||||
ctype* restrict alpha2_cast = alpha2; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
\
|
||||
doff_t diagoffb_j; \
|
||||
dim_t k_full; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t k_b1121; \
|
||||
dim_t k_b11; \
|
||||
dim_t k_b21; \
|
||||
dim_t off_b11; \
|
||||
dim_t off_b21; \
|
||||
dim_t i, j, jb; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
inc_t istep_a; \
|
||||
inc_t istep_b; \
|
||||
inc_t off_scl; \
|
||||
inc_t ss_b_num; \
|
||||
inc_t ss_b_den; \
|
||||
inc_t ps_b_cur; \
|
||||
inc_t is_b_cur; \
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == PACKNR
|
||||
pd_a == NR
|
||||
ps_a == stride to next micro-panel of A
|
||||
rs_b == PACKMR
|
||||
cs_b == 1
|
||||
pd_b == MR
|
||||
ps_b == stride to next micro-panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
|
||||
Note that MR/NR and PACKMR/PACKNR have been swapped to reflect the
|
||||
swapping of values in the control tree (ie: those values used when
|
||||
packing). This swapping is needed since we cast right-hand trsm in
|
||||
terms of transposed left-hand trsm. So, if we're going to be
|
||||
transposing the operation, then A needs to be packed with NR and B
|
||||
needs to be packed with MR (remember: B is the triangular matrix in
|
||||
the right-hand side parameter case).
|
||||
*/ \
|
||||
\
|
||||
/* Safety trap: Certain indexing within this macro-kernel does not
|
||||
work as intended if both MR and NR are odd. */ \
|
||||
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
|
||||
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Safeguard: If the current panel of B is entirely above its diagonal,
|
||||
it is implicitly zero. So we do nothing. */ \
|
||||
if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
|
||||
\
|
||||
/* Compute k_full as k inflated up to a multiple of NR. This is
|
||||
needed because some parameter combinations of trsm reduce k
|
||||
to advance past zero regions in the triangular matrix, and
|
||||
when computing the imaginary stride of A (the non-triangular
|
||||
matrix), which is used by 4m1/3m1 implementations, we need
|
||||
this unreduced value of k. */ \
|
||||
k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \
|
||||
\
|
||||
/* Compute indexing scaling factor for 4m or 3m. This is
|
||||
needed because one of the packing register blocksizes (PACKMR
|
||||
or PACKNR) is used to index into the micro-panels of the non-
|
||||
triangular matrix when computing with a diagonal-intersecting
|
||||
micro-panel of the triangular matrix. In the case of 4m or 3m,
|
||||
real values are stored in both sub-panels, and so the indexing
|
||||
needs to occur in units of real values. The value computed
|
||||
here is divided into the complex pointer offset to cause the
|
||||
pointer to be advanced by the correct value. */ \
|
||||
if ( bli_is_4mi_packed( schema_b ) || \
|
||||
bli_is_3mi_packed( schema_b ) || \
|
||||
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
|
||||
else off_scl = 1; \
|
||||
\
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. Note that real-only, imag-only, and summed-only
|
||||
packing formats are not applicable here since trsm is a two-
|
||||
operand operation only (unlike trmm, which is capable of three-
|
||||
operand). */ \
|
||||
if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
|
||||
else { ss_b_num = 1; ss_b_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region above where the diagonal of B intersects
|
||||
the left edge of the panel, adjust the pointer to A and treat this
|
||||
case as if the diagonal offset were zero. Note that we don't need to
|
||||
adjust the pointer to B since packm would have simply skipped over
|
||||
the region that was not stored. */ \
|
||||
if ( diagoffb < 0 ) \
|
||||
{ \
|
||||
j = -diagoffb; \
|
||||
k = k - j; \
|
||||
diagoffb = 0; \
|
||||
a_cast = a_cast + ( j * PACKMR ) / off_scl; \
|
||||
} \
|
||||
\
|
||||
/* If there is a zero region to the right of where the diagonal
|
||||
of B intersects the bottom of the panel, shrink it so that
|
||||
we can index to the correct place in C (corresponding to the
|
||||
part of the panel of B that was packed).
|
||||
NOTE: This is NOT being done to skip over "no-op" iterations,
|
||||
as with the trsm_lu macro-kernel. This MUST be done for correct
|
||||
execution because we use n (via n_iter) to compute diagonal and
|
||||
index offsets for backwards movement through B. */ \
|
||||
if ( diagoffb + k < n ) \
|
||||
{ \
|
||||
n = diagoffb + k; \
|
||||
} \
|
||||
\
|
||||
/* Check the k dimension, which needs to be a multiple of NR. If k
|
||||
isn't a multiple of NR, we adjust it higher to satisfy the micro-
|
||||
kernel, which is expecting to perform an NR x NR triangular solve.
|
||||
This adjustment of k is consistent with what happened when B was
|
||||
packed: all of its bottom/right edges were zero-padded, and
|
||||
furthermore, the panel that stores the bottom-right corner of the
|
||||
matrix has its diagonal extended into the zero-padded region (as
|
||||
identity). This allows the trsm of that bottom-right panel to
|
||||
proceed without producing any infs or NaNs that would infect the
|
||||
"good" values of the corresponding block of A. */ \
|
||||
if ( k % NR != 0 ) k += NR - ( k % NR ); \
|
||||
\
|
||||
/* NOTE: We don't need to check that n is a multiple of PACKNR since we
|
||||
know that the underlying buffer was already allocated to have an n
|
||||
dimension that is a multiple of PACKNR, with the region between the
|
||||
last column and the next multiple of NR zero-padded accordingly. */ \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
istep_a = PACKMR * k_full; \
|
||||
istep_b = PACKNR * k; \
|
||||
\
|
||||
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
|
||||
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
|
||||
\
|
||||
/* Save the pack schemas of A and B to the auxinfo_t object.
|
||||
NOTE: We swap the values for A and B since the triangular
|
||||
"A" matrix is actually contained within B. */ \
|
||||
bli_auxinfo_set_schema_a( schema_b, &aux ); \
|
||||
bli_auxinfo_set_schema_b( schema_a, &aux ); \
|
||||
\
|
||||
/* Save the imaginary stride of A to the auxinfo_t object.
|
||||
NOTE: We swap the values for A and B since the triangular
|
||||
"A" matrix is actually contained within B. */ \
|
||||
bli_auxinfo_set_is_b( istep_a, &aux ); \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( jb = 0; jb < n_iter; ++jb ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b11; \
|
||||
ctype* restrict b21; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
j = n_iter - 1 - jb; \
|
||||
diagoffb_j = diagoffb - ( doff_t )j*NR; \
|
||||
a1 = a_cast; \
|
||||
c11 = c1 + (n_iter-1)*cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_b( jb, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* If the current panel of B intersects the diagonal, use a
|
||||
special micro-kernel that performs a fused gemm and trsm.
|
||||
If the current panel of B resides below the diagonal, use a
|
||||
regular gemm micro-kernel. Otherwise, if it is above the
|
||||
diagonal, it was not packed (because it is implicitly zero)
|
||||
and so we do nothing. */ \
|
||||
if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
|
||||
{ \
|
||||
/* Determine the offset to and length of the panel that was packed
|
||||
so we can index into the corresponding location in A. */ \
|
||||
off_b11 = bli_max( -diagoffb_j, 0 ); \
|
||||
k_b1121 = k - off_b11; \
|
||||
k_b11 = NR; \
|
||||
k_b21 = k_b1121 - NR; \
|
||||
off_b21 = off_b11 + k_b11; \
|
||||
\
|
||||
/* Compute the addresses of the triangular block B11 and the
|
||||
panel B21. */ \
|
||||
b11 = b1; \
|
||||
/* b21 = b1 + ( k_b11 * PACKNR ) / off_scl; */ \
|
||||
b21 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b11 * PACKNR, off_scl ); \
|
||||
\
|
||||
/* Compute the panel stride for the current micro-panel. */ \
|
||||
is_b_cur = k_b1121 * PACKNR; \
|
||||
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
|
||||
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
|
||||
\
|
||||
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
|
||||
object.
|
||||
NOTE: We swap the values for A and B since the triangular
|
||||
"A" matrix is actually contained within B. */ \
|
||||
bli_auxinfo_set_is_a( is_b_cur, &aux ); \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
if( bli_trsm_my_iter( i, thread ) ){ \
|
||||
\
|
||||
ctype* restrict a11; \
|
||||
ctype* restrict a12; \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the A11 block and A12 panel. */ \
|
||||
a11 = a1 + ( off_b11 * PACKMR ) / off_scl; \
|
||||
a12 = a1 + ( off_b21 * PACKMR ) / off_scl; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1; \
|
||||
/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
|
||||
if ( i + bli_thread_num_threads(thread) >= m_iter ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + ps_b_cur; \
|
||||
if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. NOTE: We swap the values for A and B since the
|
||||
triangular "A" matrix is actually contained within B. */ \
|
||||
bli_auxinfo_set_next_a( b2, &aux ); \
|
||||
bli_auxinfo_set_next_b( a2, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the fused gemm/trsm micro-kernel. */ \
|
||||
gemmtrsm_ukr \
|
||||
( \
|
||||
k_b21, \
|
||||
alpha1_cast, \
|
||||
b21, \
|
||||
b11, \
|
||||
a12, \
|
||||
a11, \
|
||||
c11, cs_c, rs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the fused gemm/trsm micro-kernel. */ \
|
||||
gemmtrsm_ukr \
|
||||
( \
|
||||
k_b21, \
|
||||
alpha1_cast, \
|
||||
b21, \
|
||||
b11, \
|
||||
a12, \
|
||||
a11, \
|
||||
ct, cs_ct, rs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Copy the result to the bottom edge of C. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
\
|
||||
b1 += ps_b_cur; \
|
||||
} \
|
||||
else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \
|
||||
{ \
|
||||
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
|
||||
object.
|
||||
NOTE: We swap the values for A and B since the triangular
|
||||
"A" matrix is actually contained within B. */ \
|
||||
bli_auxinfo_set_is_a( istep_b, &aux ); \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
if( bli_trsm_my_iter( i, thread ) ){ \
|
||||
\
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1; \
|
||||
/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
|
||||
if ( i + bli_thread_num_threads(thread) >= m_iter ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. NOTE: We swap the values for A and B since the
|
||||
triangular "A" matrix is actually contained within B. */ \
|
||||
bli_auxinfo_set_next_a( b2, &aux ); \
|
||||
bli_auxinfo_set_next_b( a2, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
minus_one, \
|
||||
b1, \
|
||||
a1, \
|
||||
alpha2_cast, \
|
||||
c11, cs_c, rs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
minus_one, \
|
||||
b1, \
|
||||
a1, \
|
||||
zero, \
|
||||
ct, cs_ct, rs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
alpha2_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
\
|
||||
b1 += cstep_b; \
|
||||
} \
|
||||
\
|
||||
c1 -= cstep_c; \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( trsm_rl_ker_var2 )
|
||||
|
||||
584
frame/3/trsm/other/bli_trsm_ru_ker_var2.c
Normal file
584
frame/3/trsm/other/bli_trsm_ru_ker_var2.c
Normal file
@@ -0,0 +1,584 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
/* FUNCPTR_T is this file's alias for gemm_fp, the function-pointer type of
   the datatype-specific macro-kernel instances. Its parameter list matches
   the functions produced by the GENTFUNC macro defined later in this file. */
#define FUNCPTR_T gemm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
doff_t diagoffb,
|
||||
pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha1,
|
||||
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
|
||||
void* alpha2,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
/* Table of macro-kernel function pointers; bli_trsm_ru_ker_var2() indexes it
   with the execution datatype of C. NOTE(review): GENARRAY presumably expands
   to an initializer naming one trsm_ru_ker_var2 instance per datatype --
   confirm against its definition. */
static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2);
|
||||
|
||||
|
||||
void bli_trsm_ru_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
num_t dt_exec = bli_obj_exec_dt( c );
|
||||
|
||||
doff_t diagoffb = bli_obj_diag_offset( b );
|
||||
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t cs_a = bli_obj_col_stride( a );
|
||||
dim_t pd_a = bli_obj_panel_dim( a );
|
||||
inc_t ps_a = bli_obj_panel_stride( a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b = bli_obj_row_stride( b );
|
||||
dim_t pd_b = bli_obj_panel_dim( b );
|
||||
inc_t ps_b = bli_obj_panel_stride( b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
void* buf_alpha1;
|
||||
void* buf_alpha2;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
// Grab the address of the internal scalar buffer for the scalar
|
||||
// attached to A (the non-triangular matrix). This will be the alpha
|
||||
// scalar used in the gemmtrsm subproblems (ie: the scalar that would
|
||||
// be applied to the packed copy of A prior to it being updated by
|
||||
// the trsm subproblem). This scalar may be unit, if for example it
|
||||
// was applied during packing.
|
||||
buf_alpha1 = bli_obj_internal_scalar_buffer( a );
|
||||
|
||||
// Grab the address of the internal scalar buffer for the scalar
|
||||
// attached to C. This will be the "beta" scalar used in the gemm-only
|
||||
// subproblems that correspond to micro-panels that do not intersect
|
||||
// the diagonal. We need this separate scalar because it's possible
|
||||
// that the alpha attached to B was reset, if it was applied during
|
||||
// packing.
|
||||
buf_alpha2 = bli_obj_internal_scalar_buffer( c );
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec];
|
||||
|
||||
// Invoke the function.
|
||||
f( diagoffb,
|
||||
schema_a,
|
||||
schema_b,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha1,
|
||||
buf_a, cs_a, pd_a, ps_a,
|
||||
buf_b, rs_b, pd_b, ps_b,
|
||||
buf_alpha2,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname) \
|
||||
( \
|
||||
doff_t diagoffb, \
|
||||
pack_t schema_a, \
|
||||
pack_t schema_b, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* alpha1, \
|
||||
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
|
||||
void* alpha2, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
/* Alias some constants to simpler names. */ \
|
||||
const dim_t MR = pd_a; \
|
||||
const dim_t NR = pd_b; \
|
||||
const dim_t PACKMR = cs_a; \
|
||||
const dim_t PACKNR = rs_b; \
|
||||
\
|
||||
/* Cast the micro-kernel address to its function pointer type. */ \
|
||||
/* NOTE: We use the lower-triangular gemmtrsm ukernel because, while
|
||||
the current macro-kernel targets the "ru" case (right-side/upper-
|
||||
triangular), it becomes lower-triangular after the kernel operation
|
||||
is transposed so that all kernel instances are of the "left"
|
||||
variety (since those are the only trsm ukernels that exist). */ \
|
||||
PASTECH(ch,gemmtrsm_ukr_ft) \
|
||||
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict minus_one = PASTEMAC(ch,m1); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict alpha1_cast = alpha1; \
|
||||
ctype* restrict alpha2_cast = alpha2; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
\
|
||||
doff_t diagoffb_j; \
|
||||
dim_t k_full; \
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
dim_t k_b0111; \
|
||||
dim_t k_b01; \
|
||||
dim_t off_b01; \
|
||||
dim_t off_b11; \
|
||||
dim_t i, j; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
inc_t istep_a; \
|
||||
inc_t istep_b; \
|
||||
inc_t off_scl; \
|
||||
inc_t ss_b_num; \
|
||||
inc_t ss_b_den; \
|
||||
inc_t ps_b_cur; \
|
||||
inc_t is_b_cur; \
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == PACKNR
|
||||
pd_a == NR
|
||||
ps_a == stride to next micro-panel of A
|
||||
rs_b == PACKMR
|
||||
cs_b == 1
|
||||
pd_b == MR
|
||||
ps_b == stride to next micro-panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
|
||||
Note that MR/NR and PACKMR/PACKNR have been swapped to reflect the
|
||||
swapping of values in the control tree (ie: those values used when
|
||||
packing). This swapping is needed since we cast right-hand trsm in
|
||||
terms of transposed left-hand trsm. So, if we're going to be
|
||||
transposing the operation, then A needs to be packed with NR and B
|
||||
needs to be packed with MR (remember: B is the triangular matrix in
|
||||
the right-hand side parameter case).
|
||||
*/ \
|
||||
\
|
||||
/* Safety trap: Certain indexing within this macro-kernel does not
|
||||
work as intended if both MR and NR are odd. */ \
|
||||
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
|
||||
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Safeguard: If the current panel of B is entirely below its diagonal,
|
||||
it is implicitly zero. So we do nothing. */ \
|
||||
if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \
|
||||
\
|
||||
/* Compute k_full as k inflated up to a multiple of NR. This is
|
||||
needed because some parameter combinations of trsm reduce k
|
||||
to advance past zero regions in the triangular matrix, and
|
||||
when computing the imaginary stride of B (the non-triangular
|
||||
matrix), which is used by 4m1/3m1 implementations, we need
|
||||
this unreduced value of k. */ \
|
||||
k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \
|
||||
\
|
||||
/* Compute indexing scaling factor for 4m or 3m. This is
|
||||
needed because one of the packing register blocksizes (PACKMR
|
||||
or PACKNR) is used to index into the micro-panels of the non-
|
||||
triangular matrix when computing with a diagonal-intersecting
|
||||
micro-panel of the triangular matrix. In the case of 4m or 3m,
|
||||
real values are stored in both sub-panels, and so the indexing
|
||||
needs to occur in units of real values. The value computed
|
||||
here is divided into the complex pointer offset to cause the
|
||||
pointer to be advanced by the correct value. */ \
|
||||
if ( bli_is_4mi_packed( schema_b ) || \
|
||||
bli_is_3mi_packed( schema_b ) || \
|
||||
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
|
||||
else off_scl = 1; \
|
||||
\
|
||||
/* Compute the storage stride scaling. Usually this is just 1.
|
||||
However, in the case of interleaved 3m, we need to scale the
|
||||
offset by 3/2. Note that real-only, imag-only, and summed-only
|
||||
packing formats are not applicable here since trsm is a two-
|
||||
operand operation only (unlike trmm, which is capable of three-
|
||||
operand). */ \
|
||||
if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
|
||||
else { ss_b_num = 1; ss_b_den = 1; } \
|
||||
\
|
||||
/* If there is a zero region to the left of where the diagonal of B
|
||||
intersects the top edge of the panel, adjust the pointer to C and
|
||||
treat this case as if the diagonal offset were zero. This skips over
|
||||
the region that was not packed. (Note we assume the diagonal offset
|
||||
is a multiple of MR; this assumption will hold as long as the cache
|
||||
blocksizes are each a multiple of MR and NR.) */ \
|
||||
if ( diagoffb > 0 ) \
|
||||
{ \
|
||||
j = diagoffb; \
|
||||
n = n - j; \
|
||||
diagoffb = 0; \
|
||||
c_cast = c_cast + (j )*cs_c; \
|
||||
} \
|
||||
\
|
||||
/* If there is a zero region below where the diagonal of B intersects the
|
||||
right side of the block, shrink it to prevent "no-op" iterations from
|
||||
executing. */ \
|
||||
if ( -diagoffb + n < k ) \
|
||||
{ \
|
||||
k = -diagoffb + n; \
|
||||
} \
|
||||
\
|
||||
/* Check the k dimension, which needs to be a multiple of NR. If k
|
||||
isn't a multiple of NR, we adjust it higher to satisfy the micro-
|
||||
kernel, which is expecting to perform an NR x NR triangular solve.
|
||||
This adjustment of k is consistent with what happened when B was
|
||||
packed: all of its bottom/right edges were zero-padded, and
|
||||
furthermore, the panel that stores the bottom-right corner of the
|
||||
matrix has its diagonal extended into the zero-padded region (as
|
||||
identity). This allows the trsm of that bottom-right panel to
|
||||
proceed without producing any infs or NaNs that would infect the
|
||||
"good" values of the corresponding block of A. */ \
|
||||
if ( k % NR != 0 ) k += NR - ( k % NR ); \
|
||||
\
|
||||
/* NOTE: We don't need to check that n is a multiple of PACKNR since we
|
||||
know that the underlying buffer was already allocated to have an n
|
||||
dimension that is a multiple of PACKNR, with the region between the
|
||||
last column and the next multiple of NR zero-padded accordingly. */ \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
istep_a = PACKMR * k_full; \
|
||||
istep_b = PACKNR * k; \
|
||||
\
|
||||
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
|
||||
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
|
||||
\
|
||||
/* Save the pack schemas of A and B to the auxinfo_t object.
|
||||
NOTE: We swap the values for A and B since the triangular
|
||||
"A" matrix is actually contained within B. */ \
|
||||
bli_auxinfo_set_schema_a( schema_b, &aux ); \
|
||||
bli_auxinfo_set_schema_b( schema_a, &aux ); \
|
||||
\
|
||||
/* Save the imaginary stride of A to the auxinfo_t object.
|
||||
NOTE: We swap the values for A and B since the triangular
|
||||
"A" matrix is actually contained within B. */ \
|
||||
bli_auxinfo_set_is_b( istep_a, &aux ); \
|
||||
\
|
||||
b1 = b_cast; \
|
||||
c1 = c_cast; \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = 0; j < n_iter; ++j ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b01; \
|
||||
ctype* restrict b11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
diagoffb_j = diagoffb - ( doff_t )j*NR; \
|
||||
a1 = a_cast; \
|
||||
c11 = c1; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* If the current panel of B intersects the diagonal, use a
|
||||
special micro-kernel that performs a fused gemm and trsm.
|
||||
If the current panel of B resides above the diagonal, use a
|
||||
regular gemm micro-kernel. Otherwise, if it is below the
|
||||
diagonal, it was not packed (because it is implicitly zero)
|
||||
and so we do nothing. */ \
|
||||
if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
|
||||
{ \
|
||||
/* Determine the offset to and length of the panel that was packed
|
||||
so we can index into the corresponding location in A. */ \
|
||||
off_b01 = 0; \
|
||||
k_b0111 = bli_min( k, -diagoffb_j + NR ); \
|
||||
k_b01 = k_b0111 - NR; \
|
||||
off_b11 = k_b01; \
|
||||
\
|
||||
/* Compute the addresses of the panel B01 and the triangular
|
||||
block B11. */ \
|
||||
b01 = b1; \
|
||||
/* b11 = b1 + ( k_b01 * PACKNR ) / off_scl; */ \
|
||||
b11 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b01 * PACKNR, off_scl ); \
|
||||
\
|
||||
/* Compute the panel stride for the current micro-panel. */ \
|
||||
is_b_cur = k_b0111 * PACKNR; \
|
||||
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
|
||||
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
|
||||
\
|
||||
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
|
||||
object.
|
||||
NOTE: We swap the values for A and B since the triangular
|
||||
"A" matrix is actually contained within B. */ \
|
||||
bli_auxinfo_set_is_a( is_b_cur, &aux ); \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
if( bli_trsm_my_iter( i, thread ) ){ \
|
||||
\
|
||||
ctype* restrict a10; \
|
||||
ctype* restrict a11; \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the A10 panel and A11 block. */ \
|
||||
a10 = a1 + ( off_b01 * PACKMR ) / off_scl; \
|
||||
a11 = a1 + ( off_b11 * PACKMR ) / off_scl; \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1; \
|
||||
/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
|
||||
if ( i + bli_thread_num_threads(thread) >= m_iter ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + ps_b_cur; \
|
||||
if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. NOTE: We swap the values for A and B since the
|
||||
triangular "A" matrix is actually contained within B. */ \
|
||||
bli_auxinfo_set_next_a( b2, &aux ); \
|
||||
bli_auxinfo_set_next_b( a2, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the fused gemm/trsm micro-kernel. */ \
|
||||
gemmtrsm_ukr \
|
||||
( \
|
||||
k_b01, \
|
||||
alpha1_cast, \
|
||||
b01, \
|
||||
b11, \
|
||||
a10, \
|
||||
a11, \
|
||||
c11, cs_c, rs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the fused gemm/trsm micro-kernel. */ \
|
||||
gemmtrsm_ukr \
|
||||
( \
|
||||
k_b01, \
|
||||
alpha1_cast, \
|
||||
b01, \
|
||||
b11, \
|
||||
a10, \
|
||||
a11, \
|
||||
ct, cs_ct, rs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Copy the result to the bottom edge of C. */ \
|
||||
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
\
|
||||
b1 += ps_b_cur; \
|
||||
} \
|
||||
else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \
|
||||
{ \
|
||||
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
|
||||
object.
|
||||
NOTE: We swap the values for A and B since the triangular
|
||||
"A" matrix is actually contained within B. */ \
|
||||
bli_auxinfo_set_is_a( istep_b, &aux ); \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = 0; i < m_iter; ++i ) \
|
||||
{ \
|
||||
if( bli_trsm_my_iter( i, thread ) ){ \
|
||||
\
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = a1; \
|
||||
/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
|
||||
if ( i + bli_thread_num_threads(thread) >= m_iter ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = b1 + cstep_b; \
|
||||
if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. NOTE: We swap the values for A and B since the
|
||||
triangular "A" matrix is actually contained within B. */ \
|
||||
bli_auxinfo_set_next_a( b2, &aux ); \
|
||||
bli_auxinfo_set_next_b( a2, &aux ); \
|
||||
\
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
minus_one, \
|
||||
b1, \
|
||||
a1, \
|
||||
alpha2_cast, \
|
||||
c11, cs_c, rs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
minus_one, \
|
||||
b1, \
|
||||
a1, \
|
||||
zero, \
|
||||
ct, cs_ct, rs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Add the result to the edge of C. */ \
|
||||
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
alpha2_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
a1 += rstep_a; \
|
||||
c11 += rstep_c; \
|
||||
} \
|
||||
\
|
||||
b1 += cstep_b; \
|
||||
} \
|
||||
\
|
||||
c1 += cstep_c; \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( trsm_ru_ker_var2 )
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -45,7 +46,7 @@ void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p,
|
||||
// matrix is empty. This is not strictly needed but rather a minor
|
||||
// optimization, as it would prevent threads that would otherwise get
|
||||
// subproblems on BLIS_ZEROS operands from calling the macro-kernel,
|
||||
// because bli_thread_get_range*() would return empty ranges, which would
|
||||
// because bli_thread_range*() would return empty ranges, which would
|
||||
// prevent the variant's for loop from executing any iterations.
|
||||
// NOTE: this should only ever execute if the primary object is
|
||||
// triangular because that is the only structure type with subpartitions
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -638,6 +639,13 @@ static bool_t bli_intersects_diag_n( doff_t diagoff, dim_t m, dim_t n )
|
||||
!bli_is_strictly_below_diag_n( diagoff, m, n ) );
|
||||
}
|
||||
|
||||
// Returns a true value when bli_is_strictly_above_diag_n() or
// bli_is_strictly_below_diag_n() holds for the given diagonal offset and
// m x n dimensions, and a false value otherwise.
static bool_t bli_is_outside_diag_n( doff_t diagoff, dim_t m, dim_t n )
{
	// If the first test succeeds, the second one is never evaluated.
	if ( bli_is_strictly_above_diag_n( diagoff, m, n ) )
		return ( bool_t )1;

	return ( bool_t )( bli_is_strictly_below_diag_n( diagoff, m, n ) ? 1 : 0 );
}
|
||||
|
||||
static bool_t bli_is_stored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n )
|
||||
{
|
||||
return ( bool_t )
|
||||
@@ -784,10 +792,14 @@ static bool_t bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left )
|
||||
( i != 0 || n_left == 0 );
|
||||
}
|
||||
|
||||
static bool_t bli_is_last_iter( dim_t i, dim_t n_iter, dim_t tid, dim_t nth )
|
||||
static bool_t bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
|
||||
{
|
||||
return ( bool_t )
|
||||
( i == n_iter - 1 - ( ( n_iter - tid - 1 ) % nth ) );
|
||||
#ifdef BLIS_JRIR_INTERLEAVE
|
||||
( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) );
|
||||
#else
|
||||
( i == end_iter - 1 );
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -59,9 +59,35 @@ void bli_thread_finalize( void )
|
||||
{
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
#if 0
|
||||
void bli_thread_range_jrir
|
||||
(
|
||||
thrinfo_t* thread,
|
||||
dim_t n,
|
||||
dim_t bf,
|
||||
bool_t handle_edge_low,
|
||||
dim_t* start,
|
||||
dim_t* end,
|
||||
dim_t* inc
|
||||
)
|
||||
{
|
||||
//#ifdef BLIS_JRIR_INTERLEAVE
|
||||
#if 1
|
||||
// Use interleaved partitioning of jr/ir loops.
|
||||
*start = bli_thread_work_id( thread );
|
||||
*inc = bli_thread_n_way( thread );
|
||||
*end = n;
|
||||
#else
|
||||
// Use contiguous slab partitioning for jr/ir loops.
|
||||
bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end );
|
||||
*inc = 1;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_thread_get_range_sub
|
||||
void bli_thread_range_sub
|
||||
(
|
||||
thrinfo_t* thread,
|
||||
dim_t n,
|
||||
@@ -72,6 +98,9 @@ void bli_thread_get_range_sub
|
||||
)
|
||||
{
|
||||
dim_t n_way = bli_thread_n_way( thread );
|
||||
|
||||
if ( n_way == 1 ) { *start = 0; *end = n; return; }
|
||||
|
||||
dim_t work_id = bli_thread_work_id( thread );
|
||||
|
||||
dim_t all_start = 0;
|
||||
@@ -202,7 +231,7 @@ void bli_thread_get_range_sub
|
||||
}
|
||||
}
|
||||
|
||||
siz_t bli_thread_get_range_l2r
|
||||
siz_t bli_thread_range_l2r
|
||||
(
|
||||
thrinfo_t* thr,
|
||||
obj_t* a,
|
||||
@@ -216,13 +245,13 @@ siz_t bli_thread_get_range_l2r
|
||||
dim_t n = bli_obj_width_after_trans( a );
|
||||
dim_t bf = bli_blksz_get_def( dt, bmult );
|
||||
|
||||
bli_thread_get_range_sub( thr, n, bf,
|
||||
FALSE, start, end );
|
||||
bli_thread_range_sub( thr, n, bf,
|
||||
FALSE, start, end );
|
||||
|
||||
return m * ( *end - *start );
|
||||
}
|
||||
|
||||
siz_t bli_thread_get_range_r2l
|
||||
siz_t bli_thread_range_r2l
|
||||
(
|
||||
thrinfo_t* thr,
|
||||
obj_t* a,
|
||||
@@ -236,13 +265,13 @@ siz_t bli_thread_get_range_r2l
|
||||
dim_t n = bli_obj_width_after_trans( a );
|
||||
dim_t bf = bli_blksz_get_def( dt, bmult );
|
||||
|
||||
bli_thread_get_range_sub( thr, n, bf,
|
||||
TRUE, start, end );
|
||||
bli_thread_range_sub( thr, n, bf,
|
||||
TRUE, start, end );
|
||||
|
||||
return m * ( *end - *start );
|
||||
}
|
||||
|
||||
siz_t bli_thread_get_range_t2b
|
||||
siz_t bli_thread_range_t2b
|
||||
(
|
||||
thrinfo_t* thr,
|
||||
obj_t* a,
|
||||
@@ -256,13 +285,13 @@ siz_t bli_thread_get_range_t2b
|
||||
dim_t n = bli_obj_width_after_trans( a );
|
||||
dim_t bf = bli_blksz_get_def( dt, bmult );
|
||||
|
||||
bli_thread_get_range_sub( thr, m, bf,
|
||||
FALSE, start, end );
|
||||
bli_thread_range_sub( thr, m, bf,
|
||||
FALSE, start, end );
|
||||
|
||||
return n * ( *end - *start );
|
||||
}
|
||||
|
||||
siz_t bli_thread_get_range_b2t
|
||||
siz_t bli_thread_range_b2t
|
||||
(
|
||||
thrinfo_t* thr,
|
||||
obj_t* a,
|
||||
@@ -276,15 +305,15 @@ siz_t bli_thread_get_range_b2t
|
||||
dim_t n = bli_obj_width_after_trans( a );
|
||||
dim_t bf = bli_blksz_get_def( dt, bmult );
|
||||
|
||||
bli_thread_get_range_sub( thr, m, bf,
|
||||
TRUE, start, end );
|
||||
bli_thread_range_sub( thr, m, bf,
|
||||
TRUE, start, end );
|
||||
|
||||
return n * ( *end - *start );
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
dim_t bli_thread_get_range_width_l
|
||||
dim_t bli_thread_range_width_l
|
||||
(
|
||||
doff_t diagoff_j,
|
||||
dim_t m,
|
||||
@@ -495,17 +524,17 @@ siz_t bli_find_area_trap_l
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
siz_t bli_thread_get_range_weighted_sub
|
||||
siz_t bli_thread_range_weighted_sub
|
||||
(
|
||||
thrinfo_t* thread,
|
||||
doff_t diagoff,
|
||||
uplo_t uplo,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t bf,
|
||||
bool_t handle_edge_low,
|
||||
dim_t* j_start_thr,
|
||||
dim_t* j_end_thr
|
||||
thrinfo_t* restrict thread,
|
||||
doff_t diagoff,
|
||||
uplo_t uplo,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t bf,
|
||||
bool_t handle_edge_low,
|
||||
dim_t* restrict j_start_thr,
|
||||
dim_t* restrict j_end_thr
|
||||
)
|
||||
{
|
||||
dim_t n_way = bli_thread_n_way( thread );
|
||||
@@ -570,7 +599,7 @@ siz_t bli_thread_get_range_weighted_sub
|
||||
// Compute the width of the jth subpartition, taking the
|
||||
// current diagonal offset into account, if needed.
|
||||
width_j =
|
||||
bli_thread_get_range_width_l
|
||||
bli_thread_range_width_l
|
||||
(
|
||||
diagoff_j, m, n_left,
|
||||
j, n_way,
|
||||
@@ -614,7 +643,7 @@ siz_t bli_thread_get_range_weighted_sub
|
||||
bli_toggle_bool( &handle_edge_low );
|
||||
|
||||
// Compute the appropriate range for the rotated trapezoid.
|
||||
area = bli_thread_get_range_weighted_sub
|
||||
area = bli_thread_range_weighted_sub
|
||||
(
|
||||
thread, diagoff, uplo, m, n, bf,
|
||||
handle_edge_low,
|
||||
@@ -632,7 +661,7 @@ siz_t bli_thread_get_range_weighted_sub
|
||||
return area;
|
||||
}
|
||||
|
||||
siz_t bli_thread_get_range_mdim
|
||||
siz_t bli_thread_range_mdim
|
||||
(
|
||||
dir_t direct,
|
||||
thrinfo_t* thr,
|
||||
@@ -678,20 +707,20 @@ siz_t bli_thread_get_range_mdim
|
||||
if ( use_weighted )
|
||||
{
|
||||
if ( direct == BLIS_FWD )
|
||||
return bli_thread_get_range_weighted_t2b( thr, x, bmult, start, end );
|
||||
return bli_thread_range_weighted_t2b( thr, x, bmult, start, end );
|
||||
else
|
||||
return bli_thread_get_range_weighted_b2t( thr, x, bmult, start, end );
|
||||
return bli_thread_range_weighted_b2t( thr, x, bmult, start, end );
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( direct == BLIS_FWD )
|
||||
return bli_thread_get_range_t2b( thr, x, bmult, start, end );
|
||||
return bli_thread_range_t2b( thr, x, bmult, start, end );
|
||||
else
|
||||
return bli_thread_get_range_b2t( thr, x, bmult, start, end );
|
||||
return bli_thread_range_b2t( thr, x, bmult, start, end );
|
||||
}
|
||||
}
|
||||
|
||||
siz_t bli_thread_get_range_ndim
|
||||
siz_t bli_thread_range_ndim
|
||||
(
|
||||
dir_t direct,
|
||||
thrinfo_t* thr,
|
||||
@@ -737,20 +766,20 @@ siz_t bli_thread_get_range_ndim
|
||||
if ( use_weighted )
|
||||
{
|
||||
if ( direct == BLIS_FWD )
|
||||
return bli_thread_get_range_weighted_l2r( thr, x, bmult, start, end );
|
||||
return bli_thread_range_weighted_l2r( thr, x, bmult, start, end );
|
||||
else
|
||||
return bli_thread_get_range_weighted_r2l( thr, x, bmult, start, end );
|
||||
return bli_thread_range_weighted_r2l( thr, x, bmult, start, end );
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( direct == BLIS_FWD )
|
||||
return bli_thread_get_range_l2r( thr, x, bmult, start, end );
|
||||
return bli_thread_range_l2r( thr, x, bmult, start, end );
|
||||
else
|
||||
return bli_thread_get_range_r2l( thr, x, bmult, start, end );
|
||||
return bli_thread_range_r2l( thr, x, bmult, start, end );
|
||||
}
|
||||
}
|
||||
|
||||
siz_t bli_thread_get_range_weighted_l2r
|
||||
siz_t bli_thread_range_weighted_l2r
|
||||
(
|
||||
thrinfo_t* thr,
|
||||
obj_t* a,
|
||||
@@ -782,7 +811,7 @@ siz_t bli_thread_get_range_weighted_l2r
|
||||
}
|
||||
|
||||
area =
|
||||
bli_thread_get_range_weighted_sub
|
||||
bli_thread_range_weighted_sub
|
||||
(
|
||||
thr, diagoff, uplo, m, n, bf,
|
||||
FALSE, start, end
|
||||
@@ -790,7 +819,7 @@ siz_t bli_thread_get_range_weighted_l2r
|
||||
}
|
||||
else // if dense or zeros
|
||||
{
|
||||
area = bli_thread_get_range_l2r
|
||||
area = bli_thread_range_l2r
|
||||
(
|
||||
thr, a, bmult,
|
||||
start, end
|
||||
@@ -800,7 +829,7 @@ siz_t bli_thread_get_range_weighted_l2r
|
||||
return area;
|
||||
}
|
||||
|
||||
siz_t bli_thread_get_range_weighted_r2l
|
||||
siz_t bli_thread_range_weighted_r2l
|
||||
(
|
||||
thrinfo_t* thr,
|
||||
obj_t* a,
|
||||
@@ -834,7 +863,7 @@ siz_t bli_thread_get_range_weighted_r2l
|
||||
bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n );
|
||||
|
||||
area =
|
||||
bli_thread_get_range_weighted_sub
|
||||
bli_thread_range_weighted_sub
|
||||
(
|
||||
thr, diagoff, uplo, m, n, bf,
|
||||
TRUE, start, end
|
||||
@@ -842,7 +871,7 @@ siz_t bli_thread_get_range_weighted_r2l
|
||||
}
|
||||
else // if dense or zeros
|
||||
{
|
||||
area = bli_thread_get_range_r2l
|
||||
area = bli_thread_range_r2l
|
||||
(
|
||||
thr, a, bmult,
|
||||
start, end
|
||||
@@ -852,7 +881,7 @@ siz_t bli_thread_get_range_weighted_r2l
|
||||
return area;
|
||||
}
|
||||
|
||||
siz_t bli_thread_get_range_weighted_t2b
|
||||
siz_t bli_thread_range_weighted_t2b
|
||||
(
|
||||
thrinfo_t* thr,
|
||||
obj_t* a,
|
||||
@@ -886,7 +915,7 @@ siz_t bli_thread_get_range_weighted_t2b
|
||||
bli_reflect_about_diag( &diagoff, &uplo, &m, &n );
|
||||
|
||||
area =
|
||||
bli_thread_get_range_weighted_sub
|
||||
bli_thread_range_weighted_sub
|
||||
(
|
||||
thr, diagoff, uplo, m, n, bf,
|
||||
FALSE, start, end
|
||||
@@ -894,7 +923,7 @@ siz_t bli_thread_get_range_weighted_t2b
|
||||
}
|
||||
else // if dense or zeros
|
||||
{
|
||||
area = bli_thread_get_range_t2b
|
||||
area = bli_thread_range_t2b
|
||||
(
|
||||
thr, a, bmult,
|
||||
start, end
|
||||
@@ -904,7 +933,7 @@ siz_t bli_thread_get_range_weighted_t2b
|
||||
return area;
|
||||
}
|
||||
|
||||
siz_t bli_thread_get_range_weighted_b2t
|
||||
siz_t bli_thread_range_weighted_b2t
|
||||
(
|
||||
thrinfo_t* thr,
|
||||
obj_t* a,
|
||||
@@ -939,7 +968,7 @@ siz_t bli_thread_get_range_weighted_b2t
|
||||
|
||||
bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n );
|
||||
|
||||
area = bli_thread_get_range_weighted_sub
|
||||
area = bli_thread_range_weighted_sub
|
||||
(
|
||||
thr, diagoff, uplo, m, n, bf,
|
||||
TRUE, start, end
|
||||
@@ -947,7 +976,7 @@ siz_t bli_thread_get_range_weighted_b2t
|
||||
}
|
||||
else // if dense or zeros
|
||||
{
|
||||
area = bli_thread_get_range_b2t
|
||||
area = bli_thread_range_b2t
|
||||
(
|
||||
thr, a, bmult,
|
||||
start, end
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -56,7 +57,21 @@ void bli_thread_finalize( void );
|
||||
#endif
|
||||
|
||||
// Thread range-related prototypes.
|
||||
void bli_thread_get_range_sub
|
||||
#if 0
|
||||
void bli_thread_range_jrir
|
||||
(
|
||||
thrinfo_t* thread,
|
||||
dim_t n,
|
||||
dim_t bf,
|
||||
bool_t handle_edge_low,
|
||||
dim_t* start,
|
||||
dim_t* end,
|
||||
dim_t* inc
|
||||
);
|
||||
#endif
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void bli_thread_range_sub
|
||||
(
|
||||
thrinfo_t* thread,
|
||||
dim_t n,
|
||||
@@ -82,8 +97,8 @@ siz_t PASTEMAC0( opname ) \
|
||||
dim_t* end \
|
||||
);
|
||||
|
||||
GENPROT( thread_get_range_mdim )
|
||||
GENPROT( thread_get_range_ndim )
|
||||
GENPROT( thread_range_mdim )
|
||||
GENPROT( thread_range_ndim )
|
||||
|
||||
#undef GENPROT
|
||||
#define GENPROT( opname ) \
|
||||
@@ -97,18 +112,18 @@ siz_t PASTEMAC0( opname ) \
|
||||
dim_t* end \
|
||||
);
|
||||
|
||||
GENPROT( thread_get_range_l2r )
|
||||
GENPROT( thread_get_range_r2l )
|
||||
GENPROT( thread_get_range_t2b )
|
||||
GENPROT( thread_get_range_b2t )
|
||||
GENPROT( thread_range_l2r )
|
||||
GENPROT( thread_range_r2l )
|
||||
GENPROT( thread_range_t2b )
|
||||
GENPROT( thread_range_b2t )
|
||||
|
||||
GENPROT( thread_get_range_weighted_l2r )
|
||||
GENPROT( thread_get_range_weighted_r2l )
|
||||
GENPROT( thread_get_range_weighted_t2b )
|
||||
GENPROT( thread_get_range_weighted_b2t )
|
||||
GENPROT( thread_range_weighted_l2r )
|
||||
GENPROT( thread_range_weighted_r2l )
|
||||
GENPROT( thread_range_weighted_t2b )
|
||||
GENPROT( thread_range_weighted_b2t )
|
||||
|
||||
|
||||
dim_t bli_thread_get_range_width_l
|
||||
dim_t bli_thread_range_width_l
|
||||
(
|
||||
doff_t diagoff_j,
|
||||
dim_t m,
|
||||
@@ -126,17 +141,17 @@ siz_t bli_find_area_trap_l
|
||||
dim_t n,
|
||||
doff_t diagoff
|
||||
);
|
||||
siz_t bli_thread_get_range_weighted_sub
|
||||
siz_t bli_thread_range_weighted_sub
|
||||
(
|
||||
thrinfo_t* thread,
|
||||
doff_t diagoff,
|
||||
uplo_t uplo,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t bf,
|
||||
bool_t handle_edge_low,
|
||||
dim_t* j_start_thr,
|
||||
dim_t* j_end_thr
|
||||
thrinfo_t* restrict thread,
|
||||
doff_t diagoff,
|
||||
uplo_t uplo,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t bf,
|
||||
bool_t handle_edge_low,
|
||||
dim_t* restrict j_start_thr,
|
||||
dim_t* restrict j_end_thr
|
||||
);
|
||||
|
||||
|
||||
@@ -215,5 +230,112 @@ void bli_thread_init_rntm( rntm_t* rntm );
|
||||
|
||||
void bli_thread_init_rntm_from_env( rntm_t* rntm );
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
//printf( "bli_thread_range_jrir: inlv: th%d: start end inc: %d %d %d\n", (int)bli_thread_work_id( thread ), (int)*start, (int)*end, (int)*inc );
|
||||
|
||||
static void bli_thread_range_jrir_rr
|
||||
(
|
||||
thrinfo_t* thread,
|
||||
dim_t n,
|
||||
dim_t bf,
|
||||
bool_t handle_edge_low,
|
||||
dim_t* start,
|
||||
dim_t* end,
|
||||
dim_t* inc
|
||||
)
|
||||
{
|
||||
// Use interleaved partitioning of jr/ir loops.
|
||||
*start = bli_thread_work_id( thread );
|
||||
*inc = bli_thread_n_way( thread );
|
||||
*end = n;
|
||||
}
|
||||
|
||||
static void bli_thread_range_jrir_sl
|
||||
(
|
||||
thrinfo_t* thread,
|
||||
dim_t n,
|
||||
dim_t bf,
|
||||
bool_t handle_edge_low,
|
||||
dim_t* start,
|
||||
dim_t* end,
|
||||
dim_t* inc
|
||||
)
|
||||
{
|
||||
// Use contiguous slab partitioning of jr/ir loops.
|
||||
bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end );
|
||||
*inc = 1;
|
||||
}
|
||||
|
||||
static void bli_thread_range_jrir
|
||||
(
|
||||
thrinfo_t* thread,
|
||||
dim_t n,
|
||||
dim_t bf,
|
||||
bool_t handle_edge_low,
|
||||
dim_t* start,
|
||||
dim_t* end,
|
||||
dim_t* inc
|
||||
)
|
||||
{
|
||||
//#ifdef BLIS_JRIR_INTERLEAVE
|
||||
#if 0
|
||||
bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc );
|
||||
#else
|
||||
bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc );
|
||||
#endif
|
||||
}
|
||||
|
||||
static void bli_thread_range_weighted_jrir
|
||||
(
|
||||
thrinfo_t* thread,
|
||||
doff_t diagoff,
|
||||
uplo_t uplo,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t bf,
|
||||
bool_t handle_edge_low,
|
||||
dim_t* start,
|
||||
dim_t* end,
|
||||
dim_t* inc
|
||||
)
|
||||
{
|
||||
#ifdef BLIS_JRIR_INTERLEAVE
|
||||
// Use interleaved partitioning of jr/ir loops.
|
||||
*start = bli_thread_work_id( thread );
|
||||
*inc = bli_thread_n_way( thread );
|
||||
*end = n;
|
||||
#else
|
||||
// Use contiguous slab partitioning for jr/ir loops.
|
||||
bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf,
|
||||
handle_edge_low, start, end );
|
||||
|
||||
*start = *start / bf; *inc = 1;
|
||||
|
||||
if ( *end % bf ) *end = *end / bf + 1;
|
||||
else *end = *end / bf;
|
||||
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
const dim_t n_way = bli_thread_n_way( thread );
|
||||
|
||||
if ( m * n / n_way > 25000 )
|
||||
{
|
||||
// Use contiguous slab partitioning for jr/ir loops.
|
||||
bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf,
|
||||
handle_edge_low, start, end );
|
||||
*inc = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Use interleaved partitioning of jr/ir loops.
|
||||
*start = bli_thread_work_id( thread );
|
||||
*inc = n_way; //bli_thread_n_way( thread );
|
||||
*end = n;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -46,10 +47,10 @@ void blx_gemm_int
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
obj_t a_local;
|
||||
obj_t b_local;
|
||||
obj_t c_local;
|
||||
gemm_voft f;
|
||||
obj_t a_local;
|
||||
obj_t b_local;
|
||||
obj_t c_local;
|
||||
gemm_var_oft f;
|
||||
|
||||
// Alias A, B, and C in case we need to update attached scalars.
|
||||
bli_obj_alias_to( a, &a_local );
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -52,7 +53,7 @@ void blx_gemm_blk_var1
|
||||
dim_t my_start, my_end;
|
||||
|
||||
// Determine the current thread's subpartition range.
|
||||
bli_thread_get_range_mdim
|
||||
bli_thread_range_mdim
|
||||
(
|
||||
BLIS_FWD, thread, a, b, c, cntl, cntx,
|
||||
&my_start, &my_end
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -52,7 +53,7 @@ void blx_gemm_blk_var2
|
||||
dim_t my_start, my_end;
|
||||
|
||||
// Determine the current thread's subpartition range.
|
||||
bli_thread_get_range_ndim
|
||||
bli_thread_range_ndim
|
||||
(
|
||||
BLIS_FWD, thread, a, b, c, cntl, cntx,
|
||||
&my_start, &my_end
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -255,14 +256,27 @@ void PASTECH2(blx_,ch,varname) \
|
||||
bli_auxinfo_set_is_a( is_a, &aux ); \
|
||||
bli_auxinfo_set_is_b( is_b, &aux ); \
|
||||
\
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
|
||||
dim_t jr_num_threads = bli_thread_n_way( thread ); \
|
||||
dim_t jr_thread_id = bli_thread_work_id( thread ); \
|
||||
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
|
||||
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
|
||||
loop around the microkernel. Here we query the thrinfo_t node for the
|
||||
1st (ir) loop around the microkernel. */ \
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
|
||||
\
|
||||
/* Query the number of threads and thread ids for each loop. */ \
|
||||
dim_t jr_nt = bli_thread_n_way( thread ); \
|
||||
dim_t jr_tid = bli_thread_work_id( thread ); \
|
||||
dim_t ir_nt = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_tid = bli_thread_work_id( caucus ); \
|
||||
\
|
||||
dim_t jr_start, jr_end; \
|
||||
dim_t ir_start, ir_end; \
|
||||
dim_t jr_inc, ir_inc; \
|
||||
\
|
||||
/* Determine the thread range and increment for each thrinfo_t node. */ \
|
||||
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
@@ -277,7 +291,7 @@ void PASTECH2(blx_,ch,varname) \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
|
||||
for ( i = ir_start; i < ir_end; i += ir_inc ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
@@ -287,12 +301,12 @@ void PASTECH2(blx_,ch,varname) \
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \
|
||||
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
|
||||
a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
|
||||
if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \
|
||||
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
|
||||
b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
|
||||
if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
# Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
@@ -200,13 +201,13 @@ STR_ST := -DTHR_STR=\"st\"
|
||||
STR_MT := -DTHR_STR=\"mt\"
|
||||
|
||||
# Problem size specification
|
||||
PDEF_ST := -DP_BEGIN=40 \
|
||||
PDEF_ST := -DP_BEGIN=96 \
|
||||
-DP_END=2000 \
|
||||
-DP_INC=40
|
||||
-DP_INC=96
|
||||
|
||||
PDEF_MT := -DP_BEGIN=200 \
|
||||
-DP_END=10000 \
|
||||
-DP_INC=200
|
||||
PDEF_MT := -DP_BEGIN=192 \
|
||||
-DP_END=3000 \
|
||||
-DP_INC=192
|
||||
|
||||
|
||||
|
||||
@@ -226,9 +227,6 @@ all-mt: blis-mt openblas-mt mkl-mt
|
||||
blis-st: blis-gemm-st
|
||||
blis-mt: blis-gemm-mt
|
||||
|
||||
blis-nat-st: blis-gemm-nat-st
|
||||
blis-nat-mt: blis-gemm-nat-mt
|
||||
|
||||
openblas-st: openblas-gemm-st
|
||||
openblas-mt: openblas-gemm-mt
|
||||
|
||||
@@ -240,6 +238,42 @@ blis-gemm-st: blis-gemm-nat-st \
|
||||
blis-gemm-mt: blis-gemm-nat-mt \
|
||||
blis-gemm-ind-mt
|
||||
|
||||
blis-nat-st: \
|
||||
test_sgemm_asm_blis_st.x \
|
||||
test_dgemm_asm_blis_st.x \
|
||||
test_cgemm_asm_blis_st.x \
|
||||
test_zgemm_asm_blis_st.x \
|
||||
test_sherk_asm_blis_st.x \
|
||||
test_dherk_asm_blis_st.x \
|
||||
test_cherk_asm_blis_st.x \
|
||||
test_zherk_asm_blis_st.x \
|
||||
test_strmm_asm_blis_st.x \
|
||||
test_dtrmm_asm_blis_st.x \
|
||||
test_ctrmm_asm_blis_st.x \
|
||||
test_ztrmm_asm_blis_st.x \
|
||||
test_strsm_asm_blis_st.x \
|
||||
test_dtrsm_asm_blis_st.x \
|
||||
test_ctrsm_asm_blis_st.x \
|
||||
test_ztrsm_asm_blis_st.x
|
||||
|
||||
blis-nat-mt: \
|
||||
test_sgemm_asm_blis_mt.x \
|
||||
test_dgemm_asm_blis_mt.x \
|
||||
test_cgemm_asm_blis_mt.x \
|
||||
test_zgemm_asm_blis_mt.x \
|
||||
test_sherk_asm_blis_mt.x \
|
||||
test_dherk_asm_blis_mt.x \
|
||||
test_cherk_asm_blis_mt.x \
|
||||
test_zherk_asm_blis_mt.x \
|
||||
test_strmm_asm_blis_mt.x \
|
||||
test_dtrmm_asm_blis_mt.x \
|
||||
test_ctrmm_asm_blis_mt.x \
|
||||
test_ztrmm_asm_blis_mt.x \
|
||||
test_strsm_asm_blis_mt.x \
|
||||
test_dtrsm_asm_blis_mt.x \
|
||||
test_ctrsm_asm_blis_mt.x \
|
||||
test_ztrsm_asm_blis_mt.x
|
||||
|
||||
blis-gemm-nat-st: \
|
||||
test_sgemm_asm_blis_st.x \
|
||||
test_dgemm_asm_blis_st.x \
|
||||
@@ -390,28 +424,28 @@ test_c%_1m_blis_mt.o: test_%.c
|
||||
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLI_DEF) $(D1M) $(STR_1M) $(STR_MT) -c $< -o $@
|
||||
|
||||
# blis asm
|
||||
test_d%_asm_blis_st.o: test_%.c
|
||||
test_d%_asm_blis_st.o: test_%.c Makefile
|
||||
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_D) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@
|
||||
|
||||
test_s%_asm_blis_st.o: test_%.c
|
||||
test_s%_asm_blis_st.o: test_%.c Makefile
|
||||
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_S) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@
|
||||
|
||||
test_z%_asm_blis_st.o: test_%.c
|
||||
test_z%_asm_blis_st.o: test_%.c Makefile
|
||||
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_Z) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@
|
||||
|
||||
test_c%_asm_blis_st.o: test_%.c
|
||||
test_c%_asm_blis_st.o: test_%.c Makefile
|
||||
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@
|
||||
|
||||
test_d%_asm_blis_mt.o: test_%.c
|
||||
test_d%_asm_blis_mt.o: test_%.c Makefile
|
||||
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@
|
||||
|
||||
test_s%_asm_blis_mt.o: test_%.c
|
||||
test_s%_asm_blis_mt.o: test_%.c Makefile
|
||||
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_S) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@
|
||||
|
||||
test_z%_asm_blis_mt.o: test_%.c
|
||||
test_z%_asm_blis_mt.o: test_%.c Makefile
|
||||
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@
|
||||
|
||||
test_c%_asm_blis_mt.o: test_%.c
|
||||
test_c%_asm_blis_mt.o: test_%.c Makefile
|
||||
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@
|
||||
|
||||
# openblas
|
||||
|
||||
314
test/3m4m/test_herk.c
Normal file
314
test/3m4m/test_herk.c
Normal file
@@ -0,0 +1,314 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include <unistd.h>
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
//#define PRINT
|
||||
|
||||
int main( int argc, char** argv )
|
||||
{
|
||||
obj_t a, c;
|
||||
obj_t c_save;
|
||||
obj_t alpha, beta;
|
||||
dim_t m, k;
|
||||
dim_t p;
|
||||
dim_t p_begin, p_end, p_inc;
|
||||
int m_input, k_input;
|
||||
ind_t ind;
|
||||
num_t dt, dt_real;
|
||||
char dt_ch;
|
||||
int r, n_repeats;
|
||||
uplo_t uploc;
|
||||
trans_t transa;
|
||||
f77_char f77_uploc;
|
||||
f77_char f77_transa;
|
||||
|
||||
double dtime;
|
||||
double dtime_save;
|
||||
double gflops;
|
||||
|
||||
//bli_init();
|
||||
|
||||
//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
|
||||
|
||||
n_repeats = 3;
|
||||
|
||||
dt = DT;
|
||||
dt_real = bli_dt_proj_to_real( DT );
|
||||
|
||||
ind = IND;
|
||||
|
||||
p_begin = P_BEGIN;
|
||||
p_end = P_END;
|
||||
p_inc = P_INC;
|
||||
|
||||
m_input = -1;
|
||||
k_input = -1;
|
||||
|
||||
|
||||
// Supress compiler warnings about unused variable 'ind'.
|
||||
( void )ind;
|
||||
|
||||
#if 0
|
||||
|
||||
cntx_t* cntx;
|
||||
|
||||
ind_t ind_mod = ind;
|
||||
|
||||
// A hack to use 3m1 as 1mpb (with 1m as 1mbp).
|
||||
if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;
|
||||
|
||||
// Initialize a context for the current induced method and datatype.
|
||||
cntx = bli_gks_query_ind_cntx( ind_mod, dt );
|
||||
|
||||
// Set k to the kc blocksize for the current datatype.
|
||||
k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
|
||||
|
||||
#elif 1
|
||||
|
||||
//k_input = 256;
|
||||
|
||||
#endif
|
||||
|
||||
// Choose the char corresponding to the requested datatype.
|
||||
if ( bli_is_float( dt ) ) dt_ch = 's';
|
||||
else if ( bli_is_double( dt ) ) dt_ch = 'd';
|
||||
else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
|
||||
else dt_ch = 'z';
|
||||
|
||||
uploc = BLIS_LOWER;
|
||||
transa = BLIS_NO_TRANSPOSE;
|
||||
|
||||
bli_param_map_blis_to_netlib_uplo( uploc, &f77_uploc );
|
||||
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
|
||||
|
||||
// Begin with initializing the last entry to zero so that
|
||||
// matlab allocates space for the entire array once up-front.
|
||||
for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ;
|
||||
#ifdef BLIS
|
||||
printf( "data_%s_%cherk_%s_blis", THR_STR, dt_ch, STR );
|
||||
#else
|
||||
printf( "data_%s_%cherk_%s", THR_STR, dt_ch, STR );
|
||||
#endif
|
||||
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n",
|
||||
( unsigned long )(p - p_begin + 1)/p_inc + 1,
|
||||
( unsigned long )0,
|
||||
( unsigned long )0, 0.0 );
|
||||
|
||||
|
||||
for ( p = p_begin; p <= p_end; p += p_inc )
|
||||
{
|
||||
|
||||
if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
|
||||
else m = ( dim_t ) m_input;
|
||||
if ( k_input < 0 ) k = p / ( dim_t )abs(k_input);
|
||||
else k = ( dim_t ) k_input;
|
||||
|
||||
bli_obj_create( dt_real, 1, 1, 0, 0, &alpha );
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &beta );
|
||||
|
||||
if ( bli_does_trans( transa ) )
|
||||
bli_obj_create( dt, k, m, 0, 0, &a );
|
||||
else
|
||||
bli_obj_create( dt, m, k, 0, 0, &a );
|
||||
bli_obj_create( dt, m, m, 0, 0, &c );
|
||||
//bli_obj_create( dt, m, k, 2, 2*m, &a );
|
||||
//bli_obj_create( dt, k, n, 2, 2*k, &b );
|
||||
//bli_obj_create( dt, m, n, 2, 2*m, &c );
|
||||
bli_obj_create( dt, m, m, 0, 0, &c_save );
|
||||
|
||||
bli_randm( &a );
|
||||
bli_randm( &c );
|
||||
|
||||
bli_obj_set_struc( BLIS_HERMITIAN, &c );
|
||||
bli_obj_set_uplo( uploc, &c );
|
||||
|
||||
bli_obj_set_conjtrans( transa, &a );
|
||||
|
||||
bli_setsc( (2.0/1.0), 0.0, &alpha );
|
||||
bli_setsc( (1.0/1.0), 0.0, &beta );
|
||||
|
||||
|
||||
bli_copym( &c, &c_save );
|
||||
|
||||
#ifdef BLIS
|
||||
bli_ind_disable_all_dt( dt );
|
||||
bli_ind_enable_dt( ind, dt );
|
||||
#endif
|
||||
|
||||
dtime_save = DBL_MAX;
|
||||
|
||||
for ( r = 0; r < n_repeats; ++r )
|
||||
{
|
||||
bli_copym( &c_save, &c );
|
||||
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "a", &a, "%4.1f", "" );
|
||||
bli_printm( "c", &c, "%4.1f", "" );
|
||||
#endif
|
||||
|
||||
#ifdef BLIS
|
||||
|
||||
bli_herk( &alpha,
|
||||
&a,
|
||||
&beta,
|
||||
&c );
|
||||
|
||||
#else
|
||||
|
||||
if ( bli_is_float( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
float* alphap = bli_obj_buffer( &alpha );
|
||||
float* ap = bli_obj_buffer( &a );
|
||||
float* betap = bli_obj_buffer( &beta );
|
||||
float* cp = bli_obj_buffer( &c );
|
||||
|
||||
ssyrk_( &f77_uploc,
|
||||
&f77_transa,
|
||||
&mm,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_double( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
double* alphap = bli_obj_buffer( &alpha );
|
||||
double* ap = bli_obj_buffer( &a );
|
||||
double* betap = bli_obj_buffer( &beta );
|
||||
double* cp = bli_obj_buffer( &c );
|
||||
|
||||
dsyrk_( &f77_uploc,
|
||||
&f77_transa,
|
||||
&mm,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_scomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
float* alphap = bli_obj_buffer( &alpha );
|
||||
scomplex* ap = bli_obj_buffer( &a );
|
||||
scomplex* betap = bli_obj_buffer( &beta );
|
||||
scomplex* cp = bli_obj_buffer( &c );
|
||||
|
||||
cherk_( &f77_uploc,
|
||||
&f77_transa,
|
||||
&mm,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_dcomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
double* alphap = bli_obj_buffer( &alpha );
|
||||
dcomplex* ap = bli_obj_buffer( &a );
|
||||
dcomplex* betap = bli_obj_buffer( &beta );
|
||||
dcomplex* cp = bli_obj_buffer( &c );
|
||||
|
||||
zherk_( &f77_uploc,
|
||||
&f77_transa,
|
||||
&mm,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "c after", &c, "%4.1f", "" );
|
||||
exit(1);
|
||||
#endif
|
||||
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
}
|
||||
|
||||
gflops = ( 1.0 * m * k * m ) / ( dtime_save * 1.0e9 );
|
||||
|
||||
if ( bli_is_complex( dt ) ) gflops *= 4.0;
|
||||
|
||||
#ifdef BLIS
|
||||
printf( "data_%s_%cherk_%s_blis", THR_STR, dt_ch, STR );
|
||||
#else
|
||||
printf( "data_%s_%cherk_%s", THR_STR, dt_ch, STR );
|
||||
#endif
|
||||
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n",
|
||||
( unsigned long )(p - p_begin + 1)/p_inc + 1,
|
||||
( unsigned long )m,
|
||||
( unsigned long )k, gflops );
|
||||
|
||||
bli_obj_free( &alpha );
|
||||
bli_obj_free( &beta );
|
||||
|
||||
bli_obj_free( &a );
|
||||
bli_obj_free( &c );
|
||||
bli_obj_free( &c_save );
|
||||
}
|
||||
|
||||
//bli_finalize();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
328
test/3m4m/test_trmm.c
Normal file
328
test/3m4m/test_trmm.c
Normal file
@@ -0,0 +1,328 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include <unistd.h>
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
//#define PRINT
|
||||
|
||||
int main( int argc, char** argv )
|
||||
{
|
||||
obj_t a, c;
|
||||
obj_t c_save;
|
||||
obj_t alpha;
|
||||
dim_t m, n;
|
||||
dim_t p;
|
||||
dim_t p_begin, p_end, p_inc;
|
||||
int m_input, n_input;
|
||||
ind_t ind;
|
||||
num_t dt;
|
||||
char dt_ch;
|
||||
int r, n_repeats;
|
||||
side_t side;
|
||||
uplo_t uploa;
|
||||
trans_t transa;
|
||||
diag_t diaga;
|
||||
f77_char f77_side;
|
||||
f77_char f77_uploa;
|
||||
f77_char f77_transa;
|
||||
f77_char f77_diaga;
|
||||
|
||||
double dtime;
|
||||
double dtime_save;
|
||||
double gflops;
|
||||
|
||||
//bli_init();
|
||||
|
||||
//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
|
||||
|
||||
n_repeats = 3;
|
||||
|
||||
dt = DT;
|
||||
|
||||
ind = IND;
|
||||
|
||||
p_begin = P_BEGIN;
|
||||
p_end = P_END;
|
||||
p_inc = P_INC;
|
||||
|
||||
m_input = -1;
|
||||
n_input = -1;
|
||||
|
||||
|
||||
// Supress compiler warnings about unused variable 'ind'.
|
||||
( void )ind;
|
||||
|
||||
#if 0
|
||||
|
||||
cntx_t* cntx;
|
||||
|
||||
ind_t ind_mod = ind;
|
||||
|
||||
// A hack to use 3m1 as 1mpb (with 1m as 1mbp).
|
||||
if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;
|
||||
|
||||
// Initialize a context for the current induced method and datatype.
|
||||
cntx = bli_gks_query_ind_cntx( ind_mod, dt );
|
||||
|
||||
// Set k to the kc blocksize for the current datatype.
|
||||
k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
|
||||
|
||||
#elif 1
|
||||
|
||||
//k_input = 256;
|
||||
|
||||
#endif
|
||||
|
||||
// Choose the char corresponding to the requested datatype.
|
||||
if ( bli_is_float( dt ) ) dt_ch = 's';
|
||||
else if ( bli_is_double( dt ) ) dt_ch = 'd';
|
||||
else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
|
||||
else dt_ch = 'z';
|
||||
|
||||
#if 0
|
||||
side = BLIS_LEFT;
|
||||
#else
|
||||
side = BLIS_RIGHT;
|
||||
#endif
|
||||
#if 0
|
||||
uploa = BLIS_LOWER;
|
||||
#else
|
||||
uploa = BLIS_UPPER;
|
||||
#endif
|
||||
transa = BLIS_NO_TRANSPOSE;
|
||||
diaga = BLIS_NONUNIT_DIAG;
|
||||
|
||||
bli_param_map_blis_to_netlib_side( side, &f77_side );
|
||||
bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa );
|
||||
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
|
||||
bli_param_map_blis_to_netlib_diag( diaga, &f77_diaga );
|
||||
|
||||
// Begin with initializing the last entry to zero so that
|
||||
// matlab allocates space for the entire array once up-front.
|
||||
for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ;
|
||||
#ifdef BLIS
|
||||
printf( "data_%s_%ctrmm_%s_blis", THR_STR, dt_ch, STR );
|
||||
#else
|
||||
printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, STR );
|
||||
#endif
|
||||
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n",
|
||||
( unsigned long )(p - p_begin + 1)/p_inc + 1,
|
||||
( unsigned long )0,
|
||||
( unsigned long )0, 0.0 );
|
||||
|
||||
|
||||
for ( p = p_begin; p <= p_end; p += p_inc )
|
||||
{
|
||||
|
||||
if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
|
||||
else m = ( dim_t ) m_input;
|
||||
if ( n_input < 0 ) n = p / ( dim_t )abs(n_input);
|
||||
else n = ( dim_t ) n_input;
|
||||
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &alpha );
|
||||
|
||||
if ( bli_does_trans( side ) )
|
||||
bli_obj_create( dt, m, m, 0, 0, &a );
|
||||
else
|
||||
bli_obj_create( dt, n, n, 0, 0, &a );
|
||||
bli_obj_create( dt, m, n, 0, 0, &c );
|
||||
bli_obj_create( dt, m, n, 0, 0, &c_save );
|
||||
|
||||
bli_randm( &a );
|
||||
bli_randm( &c );
|
||||
|
||||
bli_obj_set_struc( BLIS_TRIANGULAR, &a );
|
||||
bli_obj_set_uplo( uploa, &a );
|
||||
bli_obj_set_conjtrans( transa, &a );
|
||||
bli_obj_set_diag( diaga, &a );
|
||||
|
||||
bli_randm( &a );
|
||||
bli_mktrim( &a );
|
||||
|
||||
bli_setsc( (2.0/1.0), 0.0, &alpha );
|
||||
|
||||
bli_copym( &c, &c_save );
|
||||
|
||||
#ifdef BLIS
|
||||
bli_ind_disable_all_dt( dt );
|
||||
bli_ind_enable_dt( ind, dt );
|
||||
#endif
|
||||
|
||||
dtime_save = DBL_MAX;
|
||||
|
||||
for ( r = 0; r < n_repeats; ++r )
|
||||
{
|
||||
bli_copym( &c_save, &c );
|
||||
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "a", &a, "%4.1f", "" );
|
||||
bli_printm( "c", &c, "%4.1f", "" );
|
||||
#endif
|
||||
|
||||
#ifdef BLIS
|
||||
|
||||
bli_trmm( side,
|
||||
&alpha,
|
||||
&a,
|
||||
&c );
|
||||
|
||||
#else
|
||||
|
||||
if ( bli_is_float( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
float* alphap = bli_obj_buffer( &alpha );
|
||||
float* ap = bli_obj_buffer( &a );
|
||||
float* cp = bli_obj_buffer( &c );
|
||||
|
||||
strmm_( &f77_side,
|
||||
&f77_uploa,
|
||||
&f77_transa,
|
||||
&f77_diaga,
|
||||
&mm,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_double( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
double* alphap = bli_obj_buffer( &alpha );
|
||||
double* ap = bli_obj_buffer( &a );
|
||||
double* cp = bli_obj_buffer( &c );
|
||||
|
||||
dtrmm_( &f77_side,
|
||||
&f77_uploa,
|
||||
&f77_transa,
|
||||
&f77_diaga,
|
||||
&mm,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_scomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
scomplex* alphap = bli_obj_buffer( &alpha );
|
||||
scomplex* ap = bli_obj_buffer( &a );
|
||||
scomplex* cp = bli_obj_buffer( &c );
|
||||
|
||||
ctrmm_( &f77_side,
|
||||
&f77_uploa,
|
||||
&f77_transa,
|
||||
&f77_diaga,
|
||||
&mm,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_dcomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
dcomplex* alphap = bli_obj_buffer( &alpha );
|
||||
dcomplex* ap = bli_obj_buffer( &a );
|
||||
dcomplex* cp = bli_obj_buffer( &c );
|
||||
|
||||
ztrmm_( &f77_side,
|
||||
&f77_uploa,
|
||||
&f77_transa,
|
||||
&f77_diaga,
|
||||
&mm,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
cp, &ldc );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "c after", &c, "%4.1f", "" );
|
||||
exit(1);
|
||||
#endif
|
||||
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
}
|
||||
|
||||
if ( bli_is_left( side ) )
|
||||
gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 );
|
||||
else
|
||||
gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 );
|
||||
|
||||
if ( bli_is_complex( dt ) ) gflops *= 4.0;
|
||||
|
||||
#ifdef BLIS
|
||||
printf( "data_%s_%ctrmm_%s_blis", THR_STR, dt_ch, STR );
|
||||
#else
|
||||
printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, STR );
|
||||
#endif
|
||||
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n",
|
||||
( unsigned long )(p - p_begin + 1)/p_inc + 1,
|
||||
( unsigned long )m,
|
||||
( unsigned long )n, gflops );
|
||||
|
||||
bli_obj_free( &alpha );
|
||||
|
||||
bli_obj_free( &a );
|
||||
bli_obj_free( &c );
|
||||
bli_obj_free( &c_save );
|
||||
}
|
||||
|
||||
//bli_finalize();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
338
test/3m4m/test_trsm.c
Normal file
338
test/3m4m/test_trsm.c
Normal file
@@ -0,0 +1,338 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include <unistd.h>
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
//#define PRINT
|
||||
|
||||
int main( int argc, char** argv )
|
||||
{
|
||||
obj_t a, c, d;
|
||||
obj_t c_save;
|
||||
obj_t alpha;
|
||||
dim_t m, n;
|
||||
dim_t p;
|
||||
dim_t p_begin, p_end, p_inc;
|
||||
int m_input, n_input;
|
||||
ind_t ind;
|
||||
num_t dt;
|
||||
char dt_ch;
|
||||
int r, n_repeats;
|
||||
side_t side;
|
||||
uplo_t uploa;
|
||||
trans_t transa;
|
||||
diag_t diaga;
|
||||
f77_char f77_side;
|
||||
f77_char f77_uploa;
|
||||
f77_char f77_transa;
|
||||
f77_char f77_diaga;
|
||||
|
||||
double dtime;
|
||||
double dtime_save;
|
||||
double gflops;
|
||||
|
||||
//bli_init();
|
||||
|
||||
//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
|
||||
|
||||
n_repeats = 3;
|
||||
|
||||
dt = DT;
|
||||
|
||||
ind = IND;
|
||||
|
||||
p_begin = P_BEGIN;
|
||||
p_end = P_END;
|
||||
p_inc = P_INC;
|
||||
|
||||
m_input = -1;
|
||||
n_input = -1;
|
||||
|
||||
|
||||
// Supress compiler warnings about unused variable 'ind'.
|
||||
( void )ind;
|
||||
|
||||
#if 0
|
||||
|
||||
cntx_t* cntx;
|
||||
|
||||
ind_t ind_mod = ind;
|
||||
|
||||
// A hack to use 3m1 as 1mpb (with 1m as 1mbp).
|
||||
if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;
|
||||
|
||||
// Initialize a context for the current induced method and datatype.
|
||||
cntx = bli_gks_query_ind_cntx( ind_mod, dt );
|
||||
|
||||
// Set k to the kc blocksize for the current datatype.
|
||||
k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
|
||||
|
||||
#elif 1
|
||||
|
||||
//k_input = 256;
|
||||
|
||||
#endif
|
||||
|
||||
// Choose the char corresponding to the requested datatype.
|
||||
if ( bli_is_float( dt ) ) dt_ch = 's';
|
||||
else if ( bli_is_double( dt ) ) dt_ch = 'd';
|
||||
else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
|
||||
else dt_ch = 'z';
|
||||
|
||||
#if 0
|
||||
side = BLIS_LEFT;
|
||||
#else
|
||||
side = BLIS_RIGHT;
|
||||
#endif
|
||||
#if 0
|
||||
uploa = BLIS_LOWER;
|
||||
#else
|
||||
uploa = BLIS_UPPER;
|
||||
#endif
|
||||
transa = BLIS_NO_TRANSPOSE;
|
||||
diaga = BLIS_NONUNIT_DIAG;
|
||||
|
||||
bli_param_map_blis_to_netlib_side( side, &f77_side );
|
||||
bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa );
|
||||
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
|
||||
bli_param_map_blis_to_netlib_diag( diaga, &f77_diaga );
|
||||
|
||||
// Begin with initializing the last entry to zero so that
|
||||
// matlab allocates space for the entire array once up-front.
|
||||
for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ;
|
||||
#ifdef BLIS
|
||||
printf( "data_%s_%ctrsm_%s_blis", THR_STR, dt_ch, STR );
|
||||
#else
|
||||
printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR );
|
||||
#endif
|
||||
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n",
|
||||
( unsigned long )(p - p_begin + 1)/p_inc + 1,
|
||||
( unsigned long )0,
|
||||
( unsigned long )0, 0.0 );
|
||||
|
||||
|
||||
for ( p = p_begin; p <= p_end; p += p_inc )
|
||||
{
|
||||
|
||||
if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
|
||||
else m = ( dim_t ) m_input;
|
||||
if ( n_input < 0 ) n = p / ( dim_t )abs(n_input);
|
||||
else n = ( dim_t ) n_input;
|
||||
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &alpha );
|
||||
|
||||
if ( bli_does_trans( side ) )
|
||||
bli_obj_create( dt, m, m, 0, 0, &a );
|
||||
else
|
||||
bli_obj_create( dt, n, n, 0, 0, &a );
|
||||
bli_obj_create( dt, m, n, 0, 0, &c );
|
||||
//bli_obj_create( dt, m, n, n, 1, &c );
|
||||
bli_obj_create( dt, m, n, 0, 0, &c_save );
|
||||
|
||||
if ( bli_does_trans( side ) )
|
||||
bli_obj_create( dt, m, m, 0, 0, &d );
|
||||
else
|
||||
bli_obj_create( dt, n, n, 0, 0, &d );
|
||||
|
||||
bli_randm( &a );
|
||||
bli_randm( &c );
|
||||
|
||||
bli_obj_set_struc( BLIS_TRIANGULAR, &a );
|
||||
bli_obj_set_uplo( uploa, &a );
|
||||
bli_obj_set_conjtrans( transa, &a );
|
||||
bli_obj_set_diag( diaga, &a );
|
||||
|
||||
bli_randm( &a );
|
||||
bli_mktrim( &a );
|
||||
|
||||
bli_setd( &BLIS_TWO, &d );
|
||||
bli_addd( &d, &a );
|
||||
|
||||
bli_setsc( (2.0/1.0), 0.0, &alpha );
|
||||
|
||||
bli_copym( &c, &c_save );
|
||||
|
||||
#ifdef BLIS
|
||||
bli_ind_disable_all_dt( dt );
|
||||
bli_ind_enable_dt( ind, dt );
|
||||
#endif
|
||||
|
||||
dtime_save = DBL_MAX;
|
||||
|
||||
for ( r = 0; r < n_repeats; ++r )
|
||||
{
|
||||
bli_copym( &c_save, &c );
|
||||
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "a", &a, "%4.1f", "" );
|
||||
bli_printm( "c", &c, "%4.1f", "" );
|
||||
#endif
|
||||
|
||||
#ifdef BLIS
|
||||
|
||||
bli_trsm( side,
|
||||
&alpha,
|
||||
&a,
|
||||
&c );
|
||||
|
||||
#else
|
||||
|
||||
if ( bli_is_float( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
float* alphap = bli_obj_buffer( &alpha );
|
||||
float* ap = bli_obj_buffer( &a );
|
||||
float* cp = bli_obj_buffer( &c );
|
||||
|
||||
strsm_( &f77_side,
|
||||
&f77_uploa,
|
||||
&f77_transa,
|
||||
&f77_diaga,
|
||||
&mm,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_double( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
double* alphap = bli_obj_buffer( &alpha );
|
||||
double* ap = bli_obj_buffer( &a );
|
||||
double* cp = bli_obj_buffer( &c );
|
||||
|
||||
dtrsm_( &f77_side,
|
||||
&f77_uploa,
|
||||
&f77_transa,
|
||||
&f77_diaga,
|
||||
&mm,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_scomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
scomplex* alphap = bli_obj_buffer( &alpha );
|
||||
scomplex* ap = bli_obj_buffer( &a );
|
||||
scomplex* cp = bli_obj_buffer( &c );
|
||||
|
||||
ctrsm_( &f77_side,
|
||||
&f77_uploa,
|
||||
&f77_transa,
|
||||
&f77_diaga,
|
||||
&mm,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
cp, &ldc );
|
||||
}
|
||||
else if ( bli_is_dcomplex( dt ) )
|
||||
{
|
||||
f77_int mm = bli_obj_length( &c );
|
||||
f77_int kk = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
dcomplex* alphap = bli_obj_buffer( &alpha );
|
||||
dcomplex* ap = bli_obj_buffer( &a );
|
||||
dcomplex* cp = bli_obj_buffer( &c );
|
||||
|
||||
ztrsm_( &f77_side,
|
||||
&f77_uploa,
|
||||
&f77_transa,
|
||||
&f77_diaga,
|
||||
&mm,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
cp, &ldc );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "c after", &c, "%4.1f", "" );
|
||||
exit(1);
|
||||
#endif
|
||||
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
}
|
||||
|
||||
if ( bli_is_left( side ) )
|
||||
gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 );
|
||||
else
|
||||
gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 );
|
||||
|
||||
if ( bli_is_complex( dt ) ) gflops *= 4.0;
|
||||
|
||||
#ifdef BLIS
|
||||
printf( "data_%s_%ctrsm_%s_blis", THR_STR, dt_ch, STR );
|
||||
#else
|
||||
printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR );
|
||||
#endif
|
||||
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n",
|
||||
( unsigned long )(p - p_begin + 1)/p_inc + 1,
|
||||
( unsigned long )m,
|
||||
( unsigned long )n, gflops );
|
||||
|
||||
bli_obj_free( &alpha );
|
||||
|
||||
bli_obj_free( &a );
|
||||
bli_obj_free( &c );
|
||||
bli_obj_free( &c_save );
|
||||
bli_obj_free( &d );
|
||||
}
|
||||
|
||||
//bli_finalize();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -290,13 +291,13 @@ int main( int argc, char** argv )
|
||||
thrinfo.work_id = t;
|
||||
|
||||
if ( part_n_dim && go_fwd )
|
||||
area = bli_thread_get_range_weighted_l2r( &thrinfo, &a, &bfs, &start, &end );
|
||||
area = bli_thread_range_weighted_l2r( &thrinfo, &a, &bfs, &start, &end );
|
||||
else if ( part_n_dim && go_bwd )
|
||||
area = bli_thread_get_range_weighted_r2l( &thrinfo, &a, &bfs, &start, &end );
|
||||
area = bli_thread_range_weighted_r2l( &thrinfo, &a, &bfs, &start, &end );
|
||||
else if ( part_m_dim && go_fwd )
|
||||
area = bli_thread_get_range_weighted_t2b( &thrinfo, &a, &bfs, &start, &end );
|
||||
area = bli_thread_range_weighted_t2b( &thrinfo, &a, &bfs, &start, &end );
|
||||
else // ( part_m_dim && go_bwd )
|
||||
area = bli_thread_get_range_weighted_b2t( &thrinfo, &a, &bfs, &start, &end );
|
||||
area = bli_thread_range_weighted_b2t( &thrinfo, &a, &bfs, &start, &end );
|
||||
|
||||
width = end - start;
|
||||
|
||||
|
||||
@@ -1797,19 +1797,19 @@ bli_thread_get_jc_nt
|
||||
bli_thread_get_jr_nt
|
||||
bli_thread_get_num_threads
|
||||
bli_thread_get_pc_nt
|
||||
bli_thread_get_range_b2t
|
||||
bli_thread_get_range_l2r
|
||||
bli_thread_get_range_mdim
|
||||
bli_thread_get_range_ndim
|
||||
bli_thread_get_range_r2l
|
||||
bli_thread_get_range_sub
|
||||
bli_thread_get_range_t2b
|
||||
bli_thread_get_range_weighted_b2t
|
||||
bli_thread_get_range_weighted_l2r
|
||||
bli_thread_get_range_weighted_r2l
|
||||
bli_thread_get_range_weighted_sub
|
||||
bli_thread_get_range_weighted_t2b
|
||||
bli_thread_get_range_width_l
|
||||
bli_thread_range_b2t
|
||||
bli_thread_range_l2r
|
||||
bli_thread_range_mdim
|
||||
bli_thread_range_ndim
|
||||
bli_thread_range_r2l
|
||||
bli_thread_range_sub
|
||||
bli_thread_range_t2b
|
||||
bli_thread_range_weighted_b2t
|
||||
bli_thread_range_weighted_l2r
|
||||
bli_thread_range_weighted_r2l
|
||||
bli_thread_range_weighted_sub
|
||||
bli_thread_range_weighted_t2b
|
||||
bli_thread_range_width_l
|
||||
bli_thread_init
|
||||
bli_thread_init_rntm
|
||||
bli_thread_init_rntm_from_env
|
||||
|
||||
Reference in New Issue
Block a user