Multithreading optimizations for l3 macrokernels.

Details:
- Adjusted the method by which micropanels are assigned to threads in
  the 2nd (jr) and 1st (ir) loops around the microkernel to (mostly)
  employ contiguous "slab" partitioning rather than interleaved (round
  robin) partitioning. The new partitioning schemes and related details
  for specific families of operations are listed below:
  - gemm: slab partitioning.
  - herk: slab partitioning for region corresponding to non-triangular
          region of C; round robin partitioning for triangular region.
  - trmm: slab partitioning for region corresponding to non-triangular
          region of B; round robin partitioning for triangular region.
          (NOTE: This affects both left- and right-side macrokernels:
          trmm_ll, trmm_lu, trmm_rl, trmm_ru.)
  - trsm: slab partitioning.
          (NOTE: This affects only left-side macrokernels trsm_ll,
          trsm_lu; right-side macrokernels were not touched.)
  Also note that the previous macrokernels were preserved inside the
  'other' directory of each operation family directory (e.g.
  frame/3/gemm/other, frame/3/herk/other, etc.).
- Updated gemm macrokernel in sandbox/ref99 in light of above changes
  and fixed a stale function pointer type in blx_gemm_int.c
  (gemm_voft -> gemm_var_oft).
- Added standalone test drivers in test/3m4m for herk, trmm, and trsm
  and minor changes to test/3m4m/Makefile.
- Updated the arguments and definitions of bli_*_get_next_[ab]_upanel()
  and bli_trmm_?_?r_my_iter() macros defined in bli_l3_thrinfo.h.
- Renamed bli_thread_get_range*() APIs to bli_thread_range*().
This commit is contained in:
Field G. Van Zee
2018-09-30 18:54:56 -05:00
parent 60b2650d74
commit ac18949a4b
43 changed files with 8562 additions and 366 deletions

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -38,24 +39,28 @@
// gemm
#define bli_gemm_get_next_a_upanel( thread, a1, step ) ( a1 + step * thread->n_way )
#define bli_gemm_get_next_b_upanel( thread, b1, step ) ( b1 + step * thread->n_way )
#define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
#define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
// herk
#define bli_herk_get_next_a_upanel( thread, a1, step ) ( a1 + step * thread->n_way )
#define bli_herk_get_next_b_upanel( thread, b1, step ) ( b1 + step * thread->n_way )
#define bli_herk_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
#define bli_herk_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
// trmm
#define bli_trmm_r_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
#define bli_trmm_r_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
#define bli_trmm_l_ir_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
#define bli_trmm_l_jr_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc )
#define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc )
#define bli_trmm_my_iter( index, thread ) \
\
( index % thread->n_way == thread->work_id % thread->n_way )
// trsm
#define bli_trsm_my_iter( index, thread ) ( index % thread->n_way == thread->work_id % thread->n_way )
#define bli_trsm_my_iter( index, thread ) \
\
( index % thread->n_way == thread->work_id % thread->n_way )
//
// thrinfo_t APIs specific to level-3 operations.

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -60,7 +61,7 @@ void bli_gemm_blk_var1
bli_l3_prune_unref_mparts_m( a, b, c, cntl );
// Determine the current thread's subpartition range.
bli_thread_get_range_mdim
bli_thread_range_mdim
(
direct, thread, a, b, c, cntl, cntx,
&my_start, &my_end

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -60,7 +61,7 @@ void bli_gemm_blk_var2
bli_l3_prune_unref_mparts_n( a, b, c, cntl );
// Determine the current thread's subpartition range.
bli_thread_get_range_ndim
bli_thread_range_ndim
(
direct, thread, a, b, c, cntl, cntx,
&my_start, &my_end

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -268,14 +269,27 @@ void PASTEMAC(ch,varname) \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
dim_t jr_num_threads = bli_thread_n_way( thread ); \
dim_t jr_thread_id = bli_thread_work_id( thread ); \
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Determine the thread range and increment for each thrinfo_t node. */ \
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
@@ -290,7 +304,7 @@ void PASTEMAC(ch,varname) \
b2 = b1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
@@ -300,12 +314,12 @@ void PASTEMAC(ch,varname) \
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -251,6 +252,9 @@ void PASTEMAC(ch,varname) \
dim_t jr_thread_id = bli_thread_work_id( thread ); \
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
\
dim_t jr_inc = jr_num_threads; \
dim_t ir_inc = ir_num_threads; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
@@ -295,11 +299,11 @@ void PASTEMAC(ch,varname) \
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \
a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
{ \
a2 = a_cast; \
b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \
b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \

View File

@@ -0,0 +1,366 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function pointer type for the typed gemm macrokernel defined below. One
// instance is generated per datatype via the GENTFUNC mechanism and the
// resulting pointers are gathered into the 'ftypes' array for runtime
// dispatch on the execution datatype.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Array of typed macrokernel function pointers, indexed by datatype (num_t).
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2);
// Object-based wrapper for the gemm macrokernel (variant 2). Extracts the
// raw buffers, strides, panel dimensions, and pack schemas from the obj_t
// operands, resolves the alpha/beta scalars, and then dispatches to the
// typed macrokernel that matches C's execution datatype.
void bli_gemm_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	num_t     dt_exec   = bli_obj_exec_dt( c );
	pack_t    schema_a  = bli_obj_pack_schema( a );
	pack_t    schema_b  = bli_obj_pack_schema( b );
	dim_t     m         = bli_obj_length( c );
	dim_t     n         = bli_obj_width( c );
	dim_t     k         = bli_obj_width( a );
	void*     buf_a     = bli_obj_buffer_at_off( a );
	inc_t     cs_a      = bli_obj_col_stride( a );
	inc_t     is_a      = bli_obj_imag_stride( a );
	dim_t     pd_a      = bli_obj_panel_dim( a );
	inc_t     ps_a      = bli_obj_panel_stride( a );
	void*     buf_b     = bli_obj_buffer_at_off( b );
	inc_t     rs_b      = bli_obj_row_stride( b );
	inc_t     is_b      = bli_obj_imag_stride( b );
	dim_t     pd_b      = bli_obj_panel_dim( b );
	inc_t     ps_b      = bli_obj_panel_stride( b );
	void*     buf_c     = bli_obj_buffer_at_off( c );
	inc_t     rs_c      = bli_obj_row_stride( c );
	inc_t     cs_c      = bli_obj_col_stride( c );
	obj_t     scalar_a;
	obj_t     scalar_b;
	void*     buf_alpha;
	void*     buf_beta;
	FUNCPTR_T f;
	// Detach and multiply the scalars attached to A and B.
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );
	// Grab the addresses of the internal scalar buffers for the scalar
	// merged above and the scalar attached to C.
	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
	buf_beta  = bli_obj_internal_scalar_buffer( c );
	// If 1m is being employed on a column- or row-stored matrix with a
	// real-valued beta, we can use the real domain macro-kernel, which
	// eliminates a little overhead associated with the 1m virtual
	// micro-kernel.
#if 1
	if ( bli_is_1m_packed( schema_a ) )
	{
		// NOTE(review): this macro appears to rewrite dt_exec and the
		// dims/strides in place (its arguments are passed by name as
		// lvalues) so that 1m-packed operands can be computed upon as
		// real-domain data — confirm against the macro's definition.
		bli_l3_ind_recast_1m_params
		(
		  dt_exec,
		  schema_a,
		  c,
		  m, n, k,
		  pd_a, ps_a,
		  pd_b, ps_b,
		  rs_c, cs_c
		);
	}
#endif
	// Index into the type combination array to extract the correct
	// function pointer.
	f = ftypes[dt_exec];
	// Invoke the function.
	f( schema_a,
	   schema_b,
	   m,
	   n,
	   k,
	   buf_alpha,
	   buf_a, cs_a, is_a,
	   pd_a, ps_a,
	   buf_b, rs_b, is_b,
	   pd_b, ps_b,
	   buf_beta,
	   buf_c, rs_c, cs_c,
	   cntx,
	   rntm,
	   thread );
}
// Type-generating macro that defines the typed gemm macrokernel for each
// datatype. This is the preserved ("other") variant in which micropanels
// are assigned to threads in interleaved (round-robin) fashion in both the
// 2nd (jr) and 1st (ir) loops around the microkernel: a thread with work
// id t executes iterations t, t + n_way, t + 2*n_way, ... of its loop.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, inc_t is_a, \
                  dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, inc_t is_b, \
                  dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     ) \
{ \
	const num_t     dt         = PASTEMAC(ch,type); \
\
	/* Alias some constants to simpler names. */ \
	const dim_t     MR         = pd_a; \
	const dim_t     NR         = pd_b; \
	/*const dim_t     PACKMR     = cs_a;*/ \
	/*const dim_t     PACKNR     = rs_b;*/ \
\
	/* Query the context for the micro-kernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemm_ukr_ft) \
	                gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. Note that the strides of this
	   temporary buffer are set so that they match the storage of the
	   original C matrix. For example, if C is column-stored, ct will be
	   column-stored as well. */ \
	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
	                    / sizeof( ctype ) ] \
	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool_t    col_pref   = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t     rs_ct      = ( col_pref ? 1 : NR ); \
	const inc_t     cs_ct      = ( col_pref ? MR : 1 ); \
\
	ctype* restrict zero       = PASTEMAC(ch,0); \
	ctype* restrict a_cast     = a; \
	ctype* restrict b_cast     = b; \
	ctype* restrict c_cast     = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast  = beta; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	dim_t           m_iter, m_left; \
	dim_t           n_iter, n_left; \
	dim_t           i, j; \
	dim_t           m_cur; \
	dim_t           n_cur; \
	inc_t           rstep_a; \
	inc_t           cstep_b; \
	inc_t           rstep_c, cstep_c; \
	auxinfo_t       aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKMR
	     pd_a == MR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKNR
	     cs_b == 1
	     pd_b == NR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)
	*/ \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_is_a( is_a, &aux ); \
	bli_auxinfo_set_is_b( is_b, &aux ); \
\
	/* 'thread' is the thrinfo_t node for the 2nd (jr) loop around the
	   microkernel; its sub-node ('caucus') corresponds to the 1st (ir)
	   loop. Each node supplies the n_way and work id used to interleave
	   its loop's iterations across threads. */ \
	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
	dim_t jr_num_threads = bli_thread_n_way( thread ); \
	dim_t jr_thread_id   = bli_thread_work_id( thread ); \
	dim_t ir_num_threads = bli_thread_n_way( caucus ); \
	dim_t ir_thread_id   = bli_thread_work_id( caucus ); \
\
	/* Loop over the n dimension (NR columns at a time). Iterations are
	   assigned round-robin: this thread handles j = jr_thread_id,
	   jr_thread_id + jr_num_threads, ... */ \
	for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		b1 = b_cast + j * cstep_b; \
		c1 = c_cast + j * cstep_c; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		/* Loop over the m dimension (MR rows at a time), also assigned
		   round-robin across the threads of the ir loop's caucus. */ \
		for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
		{ \
			ctype* restrict a2; \
\
			a1  = a_cast + i * rstep_a; \
			c11 = c1     + i * rstep_c; \
\
			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
			/* Compute the addresses of the next panels of A and B (used
			   by the microkernel for prefetching); wrap back to the first
			   panel(s) on this thread's last iteration. */ \
			a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \
			if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
			{ \
				a2 = a_cast; \
				b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \
				if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
					b2 = b_cast; \
			} \
\
			/* Save addresses of next panels of A and B to the auxinfo_t
			   object. */ \
			bli_auxinfo_set_next_a( a2, &aux ); \
			bli_auxinfo_set_next_b( b2, &aux ); \
\
			/* Handle interior and edge cases separately. */ \
			if ( m_cur == MR && n_cur == NR ) \
			{ \
				/* Invoke the gemm micro-kernel. */ \
				gemm_ukr \
				( \
				  k, \
				  alpha_cast, \
				  a1, \
				  b1, \
				  beta_cast, \
				  c11, rs_c, cs_c, \
				  &aux, \
				  cntx \
				); \
			} \
			else \
			{ \
				/* Edge case: compute the full MR x NR microtile into the
				   temporary buffer with beta = 0 ... */ \
				/* Invoke the gemm micro-kernel. */ \
				gemm_ukr \
				( \
				  k, \
				  alpha_cast, \
				  a1, \
				  b1, \
				  zero, \
				  ct, rs_ct, cs_ct, \
				  &aux, \
				  cntx \
				); \
\
				/* ... then scale only the valid m_cur x n_cur portion of C
				   by beta and add in the corresponding part of ct. */ \
				/* Scale the bottom edge of C and add the result from above. */ \
				PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
				                        ct,  rs_ct, cs_ct, \
				                        beta_cast, \
				                        c11, rs_c,  cs_c ); \
			} \
		} \
	} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}

INSERT_GENTFUNC_BASIC0( gemm_ker_var2 )

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -279,17 +280,57 @@ void PASTEMAC(ch,varname) \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
dim_t jr_num_threads = bli_thread_n_way( thread ); \
dim_t jr_thread_id = bli_thread_work_id( thread ); \
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Note that we partition the 2nd loop into two regions: the rectangular
part of C, and the triangular portion. */ \
dim_t n_iter_rct; \
dim_t n_iter_tri; \
\
if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \
{ \
/* If the entire panel of C does not intersect the diagonal, there is
no triangular region, and therefore we can skip the second set of
loops. */ \
n_iter_rct = n_iter; \
n_iter_tri = 0; \
} \
else \
{ \
/* If the panel of C does intersect the diagonal, compute the number of
iterations in the rectangular region by dividing NR into the diagonal
offset. Any remainder from this integer division is discarded, which
is what we want. That is, we want the rectangular region to contain
as many columns of whole microtiles as possible without including any
microtiles that intersect the diagonal. The number of iterations in
the triangular (or trapezoidal) region is computed as the remaining
number of iterations in the n dimension. */ \
n_iter_rct = diagoffc / NR; \
n_iter_tri = n_iter - n_iter_rct; \
} \
\
/* Use contiguous assignment of micropanels to threads in the 2nd loop for
the initial rectangular region of C (if it exists). For both the
rectangular and triangular regions, use contiguous assignment for the
1st loop as well. */ \
bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
@@ -304,7 +345,112 @@ void PASTEMAC(ch,varname) \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* No need to compute the diagonal offset for the rectangular
region. */ \
/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* If the diagonal intersects the current MR x NR submatrix, we
compute it the temporary buffer and then add in the elements
on or below the diagonal.
Otherwise, if the submatrix is strictly below the diagonal,
we compute and store as we normally would.
And if we're strictly above the diagonal, we do nothing and
continue. */ \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
\
/* If there is no triangular region, then we're done. */ \
if ( n_iter_tri == 0 ) return; \
\
/* Use interleaved (round robin) assignment of micropanels to threads in
the 2nd loop for the remaining triangular region of C. */ \
bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Advance the start and end iteration offsets for the triangular region
by the number of iterations used for the rectangular region. */ \
jr_start += n_iter_rct; \
jr_end += n_iter_rct; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
@@ -317,12 +463,12 @@ void PASTEMAC(ch,varname) \
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -229,7 +230,9 @@ void PASTEMAC(ch,varname) \
\
/* If there is a zero region to the left of where the diagonal of C
intersects the top edge of the panel, adjust the pointer to C and B
and treat this case as if the diagonal offset were zero. */ \
and treat this case as if the diagonal offset were zero.
NOTE: It's possible that after this pruning that the diagonal offset
is still positive (though it is guaranteed to be less than NR). */ \
if ( diagoffc > 0 ) \
{ \
jp = diagoffc / NR; \
@@ -279,17 +282,57 @@ void PASTEMAC(ch,varname) \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
dim_t jr_num_threads = bli_thread_n_way( thread ); \
dim_t jr_thread_id = bli_thread_work_id( thread ); \
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Note that we partition the 2nd loop into two regions: the triangular
part of C, and the rectangular portion. */ \
dim_t n_iter_tri; \
dim_t n_iter_rct; \
\
if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \
{ \
/* If the entire panel of C does not intersect the diagonal, there is
no triangular region, and therefore we can skip the first set of
loops. */ \
n_iter_tri = 0; \
n_iter_rct = n_iter; \
} \
else \
{ \
/* If the panel of C does intersect the diagonal, compute the number of
iterations in the triangular (or trapezoidal) region by dividing NR
into the number of rows in C. A non-zero remainder means we need to
add one additional iteration. That is, we want the triangular region
to contain as few columns of whole microtiles as possible while still
including all microtiles that intersect the diagonal. The number of
iterations in the rectangular region is computed as the remaining
number of iterations in the n dimension. */ \
n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \
n_iter_rct = n_iter - n_iter_tri; \
} \
\
/* Use interleaved (round robin) assignment of micropanels to threads in the
2nd loop for the initial triangular region of C (if it exists). For both
the rectangular and triangular regions, use contiguous assignment for the
1st loop. */ \
bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
@@ -304,7 +347,7 @@ void PASTEMAC(ch,varname) \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
@@ -317,12 +360,12 @@ void PASTEMAC(ch,varname) \
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -402,6 +445,111 @@ void PASTEMAC(ch,varname) \
} \
} \
} \
\
/* If there is no rectangular region, then we're done. */ \
if ( n_iter_rct == 0 ) return; \
\
/* Use contiguous assignment of micropanels to threads in the 2nd loop for
the remaining triangular region of C. */ \
bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Advance the start and end iteration offsets for the rectangular region
by the number of iterations used for the triangular region. */ \
jr_start += n_iter_tri; \
jr_end += n_iter_tri; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* No need to compute the diagonal offset for the rectangular
region. */ \
/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* If the diagonal intersects the current MR x NR submatrix, we
compute it the temporary buffer and then add in the elements
on or below the diagonal.
Otherwise, if the submatrix is strictly above the diagonal,
we compute and store as we normally would.
And if we're strictly below the diagonal, we do nothing and
continue. */ \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 )

View File

@@ -0,0 +1,420 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* Shorthand name for the herk macrokernel function pointer type. */
#define FUNCPTR_T herk_fp
/* Signature of the type-specific (typed-API) macrokernels instantiated at
   the bottom of this file; the object-based wrapper below dispatches to one
   of them via the ftypes array. */
typedef void (*FUNCPTR_T)
(
doff_t diagoffc,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
/* Array of kernel function pointers, one entry per datatype. */
static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2);
void bli_herk_l_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	/* Object-based wrapper: extract all properties needed by the typed
	   macrokernel, then dispatch on the execution datatype of C. */
	num_t  dt_exec  = bli_obj_exec_dt( c );

	doff_t diagoffc = bli_obj_diag_offset( c );

	pack_t schema_a = bli_obj_pack_schema( a );
	pack_t schema_b = bli_obj_pack_schema( b );

	dim_t  m        = bli_obj_length( c );
	dim_t  n        = bli_obj_width( c );
	dim_t  k        = bli_obj_width( a );

	void*  buf_a    = bli_obj_buffer_at_off( a );
	inc_t  cs_a     = bli_obj_col_stride( a );
	inc_t  is_a     = bli_obj_imag_stride( a );
	dim_t  pd_a     = bli_obj_panel_dim( a );
	inc_t  ps_a     = bli_obj_panel_stride( a );

	void*  buf_b    = bli_obj_buffer_at_off( b );
	inc_t  rs_b     = bli_obj_row_stride( b );
	inc_t  is_b     = bli_obj_imag_stride( b );
	dim_t  pd_b     = bli_obj_panel_dim( b );
	inc_t  ps_b     = bli_obj_panel_stride( b );

	void*  buf_c    = bli_obj_buffer_at_off( c );
	inc_t  rs_c     = bli_obj_row_stride( c );
	inc_t  cs_c     = bli_obj_col_stride( c );

	obj_t  scalar_a;
	obj_t  scalar_b;

	/* Detach the scalars attached to A and B and fold them into one. */
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	/* The merged scalar acts as alpha; the scalar attached to C as beta. */
	void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
	void* buf_beta  = bli_obj_internal_scalar_buffer( c );

	/* Select the kernel instance for the execution datatype and invoke it. */
	FUNCPTR_T fp = ftypes[dt_exec];

	fp
	(
	  diagoffc,
	  schema_a,
	  schema_b,
	  m, n, k,
	  buf_alpha,
	  buf_a, cs_a, is_a, pd_a, ps_a,
	  buf_b, rs_b, is_b, pd_b, ps_b,
	  buf_beta,
	  buf_c, rs_c, cs_c,
	  cntx, rntm, thread
	);
}
#undef GENTFUNC
/* Instantiate the type-specific herk_l macrokernels (one per datatype). */
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffc, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffc_ij; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t i, j, ip; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of C is entirely above the diagonal,
it is not stored. So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \
\
/* If there is a zero region above where the diagonal of C intersects
the left edge of the panel, adjust the pointer to C and A and treat
this case as if the diagonal offset were zero. */ \
if ( diagoffc < 0 ) \
{ \
ip = -diagoffc / MR; \
i = ip * MR; \
m = m - i; \
diagoffc = -diagoffc % MR; \
c_cast = c_cast + (i )*rs_c; \
a_cast = a_cast + (ip )*ps_a; \
} \
\
/* If there is a zero region to the right of where the diagonal
of C intersects the bottom of the panel, shrink it to prevent
"no-op" iterations from executing. */ \
if ( diagoffc + m < n ) \
{ \
n = diagoffc + m; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Use interleaved (round robin) assignment of micropanels to threads in
the 2nd and 1st loops. NOTE(review): round robin is presumably used here
(instead of the slab partitioning used by gemm) because the diagonal of C
makes per-micropanel work uneven, which slabs would leave imbalanced --
confirm against the jr/ir partitioning used by the gemm macrokernels. */ \
bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* Compute the diagonal offset for the submatrix at (i,j). */ \
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* If the diagonal intersects the current MR x NR submatrix, we
compute it in the temporary buffer and then add in the elements
on or below the diagonal.
Otherwise, if the submatrix is strictly below the diagonal,
we compute and store as we normally would.
And if we're strictly above the diagonal, we do nothing and
continue. */ \
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale C and add the result to only the stored part. */ \
PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \
m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 )

View File

@@ -0,0 +1,409 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* Shorthand name for the herk macrokernel function pointer type. */
#define FUNCPTR_T herk_fp
/* Signature of the type-specific (typed-API) macrokernels instantiated at
   the bottom of this file; the object-based wrapper below dispatches to one
   of them via the ftypes array. */
typedef void (*FUNCPTR_T)
(
doff_t diagoffc,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
/* Array of kernel function pointers, one entry per datatype. */
static FUNCPTR_T GENARRAY(ftypes,herk_l_ker_var2);
void bli_herk_l_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	/* Object-based wrapper: extract all properties needed by the typed
	   macrokernel, then dispatch on the execution datatype of C. */
	num_t  dt_exec  = bli_obj_exec_dt( c );

	doff_t diagoffc = bli_obj_diag_offset( c );

	pack_t schema_a = bli_obj_pack_schema( a );
	pack_t schema_b = bli_obj_pack_schema( b );

	dim_t  m        = bli_obj_length( c );
	dim_t  n        = bli_obj_width( c );
	dim_t  k        = bli_obj_width( a );

	void*  buf_a    = bli_obj_buffer_at_off( a );
	inc_t  cs_a     = bli_obj_col_stride( a );
	inc_t  is_a     = bli_obj_imag_stride( a );
	dim_t  pd_a     = bli_obj_panel_dim( a );
	inc_t  ps_a     = bli_obj_panel_stride( a );

	void*  buf_b    = bli_obj_buffer_at_off( b );
	inc_t  rs_b     = bli_obj_row_stride( b );
	inc_t  is_b     = bli_obj_imag_stride( b );
	dim_t  pd_b     = bli_obj_panel_dim( b );
	inc_t  ps_b     = bli_obj_panel_stride( b );

	void*  buf_c    = bli_obj_buffer_at_off( c );
	inc_t  rs_c     = bli_obj_row_stride( c );
	inc_t  cs_c     = bli_obj_col_stride( c );

	obj_t  scalar_a;
	obj_t  scalar_b;

	/* Detach the scalars attached to A and B and fold them into one. */
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	/* The merged scalar acts as alpha; the scalar attached to C as beta. */
	void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
	void* buf_beta  = bli_obj_internal_scalar_buffer( c );

	/* Select the kernel instance for the execution datatype and invoke it. */
	FUNCPTR_T fp = ftypes[dt_exec];

	fp
	(
	  diagoffc,
	  schema_a,
	  schema_b,
	  m, n, k,
	  buf_alpha,
	  buf_a, cs_a, is_a, pd_a, ps_a,
	  buf_b, rs_b, is_b, pd_b, ps_b,
	  buf_beta,
	  buf_c, rs_c, cs_c,
	  cntx, rntm, thread
	);
}
#undef GENTFUNC
/* Instantiate the type-specific herk_l macrokernels (one per datatype).
   NOTE(review): this appears to be the legacy variant that uses interleaved
   (round robin) jr/ir thread partitioning and the older
   bli_herk_get_next_[ab]_upanel() argument order (thrinfo_t* first) --
   confirm it is the copy preserved under the 'other/' directory. */
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffc, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffc_ij; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t i, j, ip; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of C is entirely above the diagonal,
it is not stored. So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \
\
/* If there is a zero region above where the diagonal of C intersects
the left edge of the panel, adjust the pointer to C and A and treat
this case as if the diagonal offset were zero. */ \
if ( diagoffc < 0 ) \
{ \
ip = -diagoffc / MR; \
i = ip * MR; \
m = m - i; \
diagoffc = -diagoffc % MR; \
c_cast = c_cast + (i )*rs_c; \
a_cast = a_cast + (ip )*ps_a; \
} \
\
/* If there is a zero region to the right of where the diagonal
of C intersects the bottom of the panel, shrink it to prevent
"no-op" iterations from executing. */ \
if ( diagoffc + m < n ) \
{ \
n = diagoffc + m; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Query the thrinfo_t node for the 1st (ir) loop and the thread
counts/ids for the 2nd (jr) and 1st (ir) loops. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
dim_t jr_num_threads = bli_thread_n_way( thread ); \
dim_t jr_thread_id = bli_thread_work_id( thread ); \
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* Compute the diagonal offset for the submatrix at (i,j). */ \
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* If the diagonal intersects the current MR x NR submatrix, we
compute it in the temporary buffer and then add in the elements
on or below the diagonal.
Otherwise, if the submatrix is strictly below the diagonal,
we compute and store as we normally would.
And if we're strictly above the diagonal, we do nothing and
continue. */ \
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale C and add the result to only the stored part. */ \
PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \
m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC0( herk_l_ker_var2 )

View File

@@ -0,0 +1,420 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* Shorthand name for the herk macrokernel function pointer type. */
#define FUNCPTR_T herk_fp
/* Signature of the type-specific (typed-API) macrokernels instantiated at
   the bottom of this file; the object-based wrapper below dispatches to one
   of them via the ftypes array. */
typedef void (*FUNCPTR_T)
(
doff_t diagoffc,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
/* Array of kernel function pointers, one entry per datatype. */
static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2);
void bli_herk_u_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	/* Object-based wrapper: extract all properties needed by the typed
	   macrokernel, then dispatch on the execution datatype of C. */
	num_t  dt_exec  = bli_obj_exec_dt( c );

	doff_t diagoffc = bli_obj_diag_offset( c );

	pack_t schema_a = bli_obj_pack_schema( a );
	pack_t schema_b = bli_obj_pack_schema( b );

	dim_t  m        = bli_obj_length( c );
	dim_t  n        = bli_obj_width( c );
	dim_t  k        = bli_obj_width( a );

	void*  buf_a    = bli_obj_buffer_at_off( a );
	inc_t  cs_a     = bli_obj_col_stride( a );
	inc_t  is_a     = bli_obj_imag_stride( a );
	dim_t  pd_a     = bli_obj_panel_dim( a );
	inc_t  ps_a     = bli_obj_panel_stride( a );

	void*  buf_b    = bli_obj_buffer_at_off( b );
	inc_t  rs_b     = bli_obj_row_stride( b );
	inc_t  is_b     = bli_obj_imag_stride( b );
	dim_t  pd_b     = bli_obj_panel_dim( b );
	inc_t  ps_b     = bli_obj_panel_stride( b );

	void*  buf_c    = bli_obj_buffer_at_off( c );
	inc_t  rs_c     = bli_obj_row_stride( c );
	inc_t  cs_c     = bli_obj_col_stride( c );

	obj_t  scalar_a;
	obj_t  scalar_b;

	/* Detach the scalars attached to A and B and fold them into one. */
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	/* The merged scalar acts as alpha; the scalar attached to C as beta. */
	void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
	void* buf_beta  = bli_obj_internal_scalar_buffer( c );

	/* Select the kernel instance for the execution datatype and invoke it. */
	FUNCPTR_T fp = ftypes[dt_exec];

	fp
	(
	  diagoffc,
	  schema_a,
	  schema_b,
	  m, n, k,
	  buf_alpha,
	  buf_a, cs_a, is_a, pd_a, ps_a,
	  buf_b, rs_b, is_b, pd_b, ps_b,
	  buf_beta,
	  buf_c, rs_c, cs_c,
	  cntx, rntm, thread
	);
}
#undef GENTFUNC
/* Instantiate the type-specific herk_u macrokernels (one per datatype). */
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffc, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffc_ij; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t i, j, jp; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of C is entirely below the diagonal,
it is not stored. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
\
/* If there is a zero region to the left of where the diagonal of C
intersects the top edge of the panel, adjust the pointer to C and B
and treat this case as if the diagonal offset were zero. */ \
if ( diagoffc > 0 ) \
{ \
jp = diagoffc / NR; \
j = jp * NR; \
n = n - j; \
diagoffc = diagoffc % NR; \
c_cast = c_cast + (j )*cs_c; \
b_cast = b_cast + (jp )*ps_b; \
} \
\
/* If there is a zero region below where the diagonal of C intersects
the right edge of the panel, shrink it to prevent "no-op" iterations
from executing. */ \
if ( -diagoffc + n < m ) \
{ \
m = -diagoffc + n; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Use interleaved (round robin) assignment of micropanels to threads in
the 2nd and 1st loops. NOTE(review): round robin is presumably used here
(instead of the slab partitioning used by gemm) because the diagonal of C
makes per-micropanel work uneven, which slabs would leave imbalanced --
confirm against the jr/ir partitioning used by the gemm macrokernels. */ \
bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* Compute the diagonal offset for the submatrix at (i,j). */ \
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* If the diagonal intersects the current MR x NR submatrix, we
compute it in the temporary buffer and then add in the elements
on or above the diagonal.
Otherwise, if the submatrix is strictly above the diagonal,
we compute and store as we normally would.
And if we're strictly below the diagonal, we do nothing and
continue. */ \
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale C and add the result to only the stored part. */ \
PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 )

View File

@@ -0,0 +1,409 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* Type signature shared by all datatype-specific instances of the
   herk_u_ker_var2 macrokernel defined below. Each instance receives the
   diagonal offset and pack schemas, the problem dimensions, the packed
   buffers for A and B (with their panel dims/strides and imaginary
   strides), the output matrix C with its row/column strides, and the
   runtime/threading state. */
#define FUNCPTR_T herk_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffc,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
/* Array of type-specific function pointers, one per datatype, populated
   from the GENTFUNC-expanded definitions and indexed by the execution
   datatype (see ftypes[dt_exec] in bli_herk_u_ker_var2()). */
static FUNCPTR_T GENARRAY(ftypes,herk_u_ker_var2);
// Object-API entry point for the upper-stored herk macrokernel.
// Extracts all metadata from the obj_t operands, folds the scalars
// attached to A and B into a single effective alpha, and dispatches to
// the datatype-specific kernel instance via the ftypes[] array.
// ( cntl is accepted for signature uniformity; it is not referenced
//   in this function. )
void bli_herk_u_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Execution datatype and diagonal offset of the output matrix.
	const num_t  dt_exec  = bli_obj_exec_dt( c );
	const doff_t diagoffc = bli_obj_diag_offset( c );

	// Pack schemas of the (packed) A and B operands.
	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	// Problem dimensions: C is m x n; k is the shared dimension.
	const dim_t  m        = bli_obj_length( c );
	const dim_t  n        = bli_obj_width( c );
	const dim_t  k        = bli_obj_width( a );

	// Buffer, strides, and panel geometry for packed A.
	void*        buf_a    = bli_obj_buffer_at_off( a );
	const inc_t  cs_a     = bli_obj_col_stride( a );
	const inc_t  is_a     = bli_obj_imag_stride( a );
	const dim_t  pd_a     = bli_obj_panel_dim( a );
	const inc_t  ps_a     = bli_obj_panel_stride( a );

	// Buffer, strides, and panel geometry for packed B.
	void*        buf_b    = bli_obj_buffer_at_off( b );
	const inc_t  rs_b     = bli_obj_row_stride( b );
	const inc_t  is_b     = bli_obj_imag_stride( b );
	const dim_t  pd_b     = bli_obj_panel_dim( b );
	const inc_t  ps_b     = bli_obj_panel_stride( b );

	// Buffer and strides for the output matrix C.
	void*        buf_c    = bli_obj_buffer_at_off( c );
	const inc_t  rs_c     = bli_obj_row_stride( c );
	const inc_t  cs_c     = bli_obj_col_stride( c );

	// Detach the scalars attached to A and B and multiply them together
	// so that a single merged value (left in scal_b) can serve as the
	// effective alpha for the computation.
	obj_t scal_a;
	obj_t scal_b;
	bli_obj_scalar_detach( a, &scal_a );
	bli_obj_scalar_detach( b, &scal_b );
	bli_mulsc( &scal_a, &scal_b );

	// The merged scalar acts as alpha; the scalar attached to C acts
	// as beta.
	void* buf_alpha = bli_obj_internal_scalar_buffer( &scal_b );
	void* buf_beta  = bli_obj_internal_scalar_buffer( c );

	// Select the typed kernel instance that matches the execution
	// datatype.
	FUNCPTR_T f = ftypes[dt_exec];

	// Invoke the typed macrokernel.
	f( diagoffc,
	   schema_a,
	   schema_b,
	   m,
	   n,
	   k,
	   buf_alpha,
	   buf_a, cs_a, is_a,
	   pd_a, ps_a,
	   buf_b, rs_b, is_b,
	   pd_b, ps_b,
	   buf_beta,
	   buf_c, rs_c, cs_c,
	   cntx,
	   rntm,
	   thread );
}
/*
   Typed definition of the herk_u macrokernel (upper-stored C), generated
   for each datatype via the GENTFUNC mechanism.

   This variant partitions both the 2nd (jr) and 1st (ir) loops around the
   microkernel among threads in interleaved (round-robin) fashion: each
   thread begins at its work id and strides by the number of ways of
   parallelism in that loop (see the jr/ir loop headers below).

   Micro-tiles that intersect the diagonal are computed into a temporary
   buffer and accumulated into only the stored (upper) part of C; tiles
   strictly above the diagonal are computed directly (or via the temporary
   buffer for edge cases); tiles strictly below the diagonal are skipped.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffc, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffc_ij; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t i, j, jp; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of C is entirely below the diagonal,
it is not stored. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
\
/* If there is a zero region to the left of where the diagonal of C
intersects the top edge of the panel, adjust the pointer to C and B
and treat this case as if the diagonal offset were zero. */ \
if ( diagoffc > 0 ) \
{ \
jp = diagoffc / NR; \
j = jp * NR; \
n = n - j; \
diagoffc = diagoffc % NR; \
c_cast = c_cast + (j )*cs_c; \
b_cast = b_cast + (jp )*ps_b; \
} \
\
/* If there is a zero region below where the diagonal of C intersects
the right edge of the panel, shrink it to prevent "no-op" iterations
from executing. */ \
if ( -diagoffc + n < m ) \
{ \
m = -diagoffc + n; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Query the thrinfo_t node for the 1st (ir) loop around the microkernel
(the "caucus"), along with the number of ways of parallelism and the
work id for both the jr and ir loops. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
dim_t jr_num_threads = bli_thread_n_way( thread ); \
dim_t jr_thread_id = bli_thread_work_id( thread ); \
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
\
/* Loop over the n dimension (NR columns at a time). Iterations are
assigned to threads in interleaved (round-robin) fashion. */ \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time), likewise
partitioned round-robin among the threads of the ir loop. */ \
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* Compute the diagonal offset for the submatrix at (i,j). */ \
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_herk_get_next_a_upanel( caucus, a1, rstep_a ); \
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
{ \
a2 = a_cast; \
b2 = bli_herk_get_next_b_upanel( thread, b1, cstep_b ); \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* If the diagonal intersects the current MR x NR submatrix, we
compute it in the temporary buffer and then add in the elements
on or below the diagonal.
Otherwise, if the submatrix is strictly above the diagonal,
we compute and store as we normally would.
And if we're strictly below the diagonal, we do nothing and
continue. */ \
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale C and add the result to only the stored part. */ \
PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC0( herk_u_ker_var2 )

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -85,6 +86,10 @@ void bli_trmm_front
}
#if 0
// NOTE: This case casts right-side trmm in terms of left side. This
// reduces the number of macrokernels exercised to two (trmm_ll and
// trmm_lu) but can lead to the microkernel being executed with an
// output matrix that is stored counter to its output preference.
// If A is being multiplied from the right, transpose all operands
// so that we can perform the computation as if A were being multiplied
@@ -98,6 +103,11 @@ void bli_trmm_front
}
#else
// NOTE: This case computes right-side trmm natively with trmm_rl and
// trmm_ru macrokernels. This code path always gives us the opportunity
// to transpose the entire operation so that the effective storage format
// of the output matrix matches the microkernel's output preference.
// Thus, from a performance perspective, this case is preferred.
// An optimization: If C is stored by rows and the micro-kernel prefers
// contiguous columns, or if C is stored by columns and the micro-kernel

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -151,7 +152,7 @@ void PASTEMAC(ch,varname) \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* jr_thread \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
@@ -317,29 +318,45 @@ void PASTEMAC(ch,varname) \
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
\
thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \
dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \
dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
/*dim_t ir_nt = bli_thread_n_way( ir_thread ); \
dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
\
dim_t jr_start, jr_end; \
/*dim_t ir_start, ir_end;*/ \
dim_t jr_inc; \
\
/* Use contiguous assignment of micropanels to threads in the 2nd loop for
the initial rectangular region of C (if it exists). For both the
rectangular and triangular regions, use contiguous assignment for the
1st loop as well. */ \
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
if ( bli_trmm_l_jr_my_iter( j, jr_thread ) ) { \
\
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
a1 = a_cast; \
c11 = c1; \
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
a1 = a_cast; \
c11 = c1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
@@ -369,7 +386,8 @@ void PASTEMAC(ch,varname) \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \
\
@@ -379,7 +397,7 @@ void PASTEMAC(ch,varname) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -433,13 +451,13 @@ void PASTEMAC(ch,varname) \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
/*}*/ \
\
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
ctype* restrict a2; \
\
@@ -449,7 +467,7 @@ void PASTEMAC(ch,varname) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -498,17 +516,13 @@ void PASTEMAC(ch,varname) \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
/*}*/ \
\
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -151,7 +152,7 @@ void PASTEMAC(ch,varname) \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* jr_thread \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
@@ -324,29 +325,45 @@ void PASTEMAC(ch,varname) \
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
\
thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \
dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \
dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
/*dim_t ir_nt = bli_thread_n_way( ir_thread ); \
dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
\
dim_t jr_start, jr_end; \
/*dim_t ir_start, ir_end;*/ \
dim_t jr_inc; \
\
/* Use contiguous assignment of micropanels to threads in the 2nd loop for
the initial rectangular region of C (if it exists). For both the
rectangular and triangular regions, use contiguous assignment for the
1st loop as well. */ \
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
if ( bli_trmm_l_jr_my_iter( j, jr_thread ) ) { \
\
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
a1 = a_cast; \
c11 = c1; \
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
a1 = a_cast; \
c11 = c1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
@@ -376,7 +393,7 @@ void PASTEMAC(ch,varname) \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \
\
@@ -386,7 +403,7 @@ void PASTEMAC(ch,varname) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -440,13 +457,13 @@ void PASTEMAC(ch,varname) \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
/*}*/ \
\
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \
if ( bli_trmm_l_ir_my_iter( i, ir_thread ) ) { \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
ctype* restrict a2; \
\
@@ -456,7 +473,7 @@ void PASTEMAC(ch,varname) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -505,17 +522,13 @@ void PASTEMAC(ch,varname) \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
/*}*/ \
\
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -151,7 +152,7 @@ void PASTEMAC(ch,varname) \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* jr_thread \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
@@ -324,15 +325,151 @@ void PASTEMAC(ch,varname) \
/* Save the imaginary stride of A to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \
dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \
dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Note that we partition the 2nd loop into two regions: the rectangular
part of B, and the triangular portion. */ \
dim_t n_iter_rct; \
dim_t n_iter_tri; \
\
if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) \
{ \
/* If the entire panel of B does not intersect the diagonal, there is
no triangular region, and therefore we can skip the second set of
loops. */ \
n_iter_rct = n_iter; \
n_iter_tri = 0; \
} \
else \
{ \
/* If the panel of B does intersect the diagonal, compute the number of
iterations in the rectangular region by dividing NR into the diagonal
offset. (There should never be any remainder in this division.) The
number of iterations in the triangular (or trapezoidal) region is
computed as the remaining number of iterations in the n dimension. */ \
n_iter_rct = diagoffb / NR; \
n_iter_tri = n_iter - n_iter_rct; \
} \
\
/* Use contiguous assignment of micropanels to threads in the 2nd loop for
the initial rectangular region of B (if it exists). For both the
rectangular and triangular regions, use contiguous assignment for the
1st loop as well. */ \
bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
{ \
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
\
/* If there is no triangular region, then we're done. */ \
if ( n_iter_tri == 0 ) return; \
\
/* Use interleaved (round robin) assignment of micropanels to threads in
the 2nd loop for the remaining triangular region of B (if it exists).
NOTE: We don't need to call bli_thread_range_jrir*() here since we
employ a hack that calls for each thread to execute every iteration
of the jr loop but skip all but the pointer increment for iterations
that are not assigned to it. */ \
\
/* Advance the starting b1 and c1 pointers to the positions corresponding
to the start of the triangular region of B. */ \
jr_start = n_iter_rct; \
b1 = b_cast + jr_start * cstep_b; \
c1 = c_cast + jr_start * cstep_c; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < n_iter; ++j ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
@@ -358,7 +495,6 @@ void PASTEMAC(ch,varname) \
by beta. If it is strictly below the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
{ \
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
@@ -366,7 +502,7 @@ void PASTEMAC(ch,varname) \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \
if ( bli_trmm_my_iter( j, thread ) ) { \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
@@ -375,7 +511,7 @@ void PASTEMAC(ch,varname) \
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \
if ( bli_trmm_my_iter( i, caucus ) ) { \
\
ctype* restrict a1_i; \
ctype* restrict a2; \
@@ -390,7 +526,7 @@ void PASTEMAC(ch,varname) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -449,83 +585,6 @@ void PASTEMAC(ch,varname) \
\
b1 += ps_b_cur; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \
{ \
if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \
\
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
} \
\
b1 += cstep_b; \
} \
\
c1 += cstep_c; \
} \

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -151,7 +152,7 @@ void PASTEMAC(ch,varname) \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* jr_thread \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
@@ -196,7 +197,7 @@ void PASTEMAC(ch,varname) \
dim_t n_cur; \
dim_t k_b0111; \
dim_t off_b0111; \
dim_t i, j; \
dim_t i, j, jb0; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
@@ -324,16 +325,58 @@ void PASTEMAC(ch,varname) \
\
/* Save the imaginary stride of A to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Note that we partition the 2nd loop into two regions: the triangular
part of C, and the rectangular portion. */ \
dim_t n_iter_tri; \
dim_t n_iter_rct; \
\
if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) \
{ \
/* If the entire panel of B does not intersect the diagonal, there is
no triangular region, and therefore we can skip the first set of
loops. */ \
n_iter_tri = 0; \
n_iter_rct = n_iter; \
} \
else \
{ \
/* If the panel of B does intersect the diagonal, compute the number of
iterations in the triangular (or trapezoidal) region by dividing NR
into the number of rows in B. (There should never be any remainder
in this division.) The number of iterations in the rectangular region
is computed as the remaining number of iterations in the n dimension. */ \
n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 ); \
n_iter_rct = n_iter - n_iter_tri; \
} \
\
/* Use interleaved (round robin) assignment of micropanels to threads in
the 2nd loop for the initial triangular region of B (if it exists).
NOTE: We don't need to call bli_thread_range_jrir*() here since we
employ a hack that calls for each thread to execute every iteration
of the jr loop but skip all but the pointer increment for iterations
that are not assigned to it. */ \
\
b1 = b_cast; \
c1 = c_cast; \
\
thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \
dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \
dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
for ( j = 0; j < n_iter_tri; ++j ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
@@ -358,7 +401,6 @@ void PASTEMAC(ch,varname) \
by beta. If it is strictly below the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
{ \
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
@@ -366,7 +408,7 @@ void PASTEMAC(ch,varname) \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \
if ( bli_trmm_my_iter( j, thread ) ) { \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
@@ -375,7 +417,7 @@ void PASTEMAC(ch,varname) \
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \
if ( bli_trmm_my_iter( i, caucus ) ) { \
\
ctype* restrict a1_i; \
ctype* restrict a2; \
@@ -390,7 +432,7 @@ void PASTEMAC(ch,varname) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -449,30 +491,72 @@ void PASTEMAC(ch,varname) \
\
b1 += ps_b_cur; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \
{ \
if ( bli_trmm_r_jr_my_iter( j, jr_thread ) ) { \
\
c1 += cstep_c; \
} \
\
/* If there is no rectangular region, then we're done. */ \
if ( n_iter_rct == 0 ) return; \
\
/* Use contiguous assignment of micropanels to threads in both the 2nd and
1st loops the remaining triangular region of B. */ \
bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Advance the start and end iteration offsets for the rectangular region
by the number of iterations used for the triangular region. */ \
jr_start += n_iter_tri; \
jr_end += n_iter_tri; \
jb0 = n_iter_tri; \
\
/* Save the resulting value of b1 from the previous loop since it represents
the starting point for the rectangular region. */ \
b_cast = b1; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
/* NOTE: We must index through b_cast differently since it contains
the starting address of the rectangular region (which is already
n_iter_tri logical iterations through B). */ \
b1 = b_cast + (j-jb0) * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* If the current panel of B intersects the diagonal, scale C
by beta. If it is strictly below the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
{ \
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
if ( bli_trmm_r_ir_my_iter( i, ir_thread ) ) { \
\
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -517,19 +601,12 @@ void PASTEMAC(ch,varname) \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
} \
\
b1 += cstep_b; \
} \
\
c1 += cstep_c; \
} \
\
\
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}

View File

@@ -0,0 +1,519 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* Function-pointer type for the typed trmm_ll macro-kernel instances.
   NOTE: the underlying type name is gemm_fp because the argument list is
   identical to that of the gemm-style macro-kernels. */
#define FUNCPTR_T gemm_fp

typedef void (*FUNCPTR_T)
     (
       doff_t  diagoffa,
       pack_t  schema_a,
       pack_t  schema_b,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       void*   alpha,
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
       void*   beta,
       void*   c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx,
       rntm_t* rntm,
       thrinfo_t* thread
     );

/* ftypes: one typed kernel instance per datatype; the dispatcher below
   indexes this array with the execution datatype (num_t) of C. */
static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2);
void bli_trmm_ll_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	/* Object-API front end for the typed trmm_ll macro-kernels: unpack all
	   object fields into scalars/pointers, merge the scalars attached to A
	   and B, then dispatch on the execution datatype of C. */

	const num_t  dt_exec  = bli_obj_exec_dt( c );

	const doff_t diagoffa = bli_obj_diag_offset( a );
	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	const dim_t  m        = bli_obj_length( c );
	const dim_t  n        = bli_obj_width( c );
	const dim_t  k        = bli_obj_width( a );

	void*        buf_a    = bli_obj_buffer_at_off( a );
	const inc_t  cs_a     = bli_obj_col_stride( a );
	const dim_t  pd_a     = bli_obj_panel_dim( a );
	const inc_t  ps_a     = bli_obj_panel_stride( a );

	void*        buf_b    = bli_obj_buffer_at_off( b );
	const inc_t  rs_b     = bli_obj_row_stride( b );
	const dim_t  pd_b     = bli_obj_panel_dim( b );
	const inc_t  ps_b     = bli_obj_panel_stride( b );

	void*        buf_c    = bli_obj_buffer_at_off( c );
	const inc_t  rs_c     = bli_obj_row_stride( c );
	const inc_t  cs_c     = bli_obj_col_stride( c );

	obj_t scalar_a;
	obj_t scalar_b;

	/* Detach the scalars attached to A and B and fold them into one. */
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	/* The merged scalar acts as alpha; the scalar attached to C as beta. */
	void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
	void* buf_beta  = bli_obj_internal_scalar_buffer( c );

	/* Select the typed kernel instance for this datatype and invoke it. */
	FUNCPTR_T fp = ftypes[ dt_exec ];

	fp( diagoffa,
	    schema_a,
	    schema_b,
	    m, n, k,
	    buf_alpha,
	    buf_a, cs_a, pd_a, ps_a,
	    buf_b, rs_b, pd_b, ps_b,
	    buf_beta,
	    buf_c, rs_c, cs_c,
	    cntx,
	    rntm,
	    thread );
}
#undef  GENTFUNC

/* Typed trmm_ll macro-kernel (lower-triangular A on the left).
   Threads are assigned micropanels in the jr (2nd) and ir (1st) loops via
   interleaved (round-robin) partitioning: every thread executes every loop
   iteration but skips all work except the pointer increments for iterations
   that bli_trmm_my_iter() does not assign to it.
   FIX(review): the block opened by "if ( bli_trmm_my_iter( i, ir_thread ) ) {"
   in the diagonal-intersecting branch was never closed, leaving the braces
   unbalanced and placing "a1 += ps_a_cur;" inside the per-thread guard. The
   missing "}" is restored below, matching the strictly-below-diagonal branch
   of this same macro, so that all threads advance a1. */
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffa, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* jr_thread \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Alias some constants to simpler names. */ \
	const dim_t MR = pd_a; \
	const dim_t NR = pd_b; \
	const dim_t PACKMR = cs_a; \
	const dim_t PACKNR = rs_b; \
\
	/* Query the context for the micro-kernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemm_ukr_ft) \
	gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. Note that the strides of this
	   temporary buffer are set so that they match the storage of the
	   original C matrix. For example, if C is column-stored, ct will be
	   column-stored as well. */ \
	ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
	          / sizeof( ctype ) ] \
	          __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t rs_ct = ( col_pref ? 1 : NR ); \
	const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
	ctype* restrict one        = PASTEMAC(ch,1); \
	ctype* restrict zero       = PASTEMAC(ch,0); \
	ctype* restrict a_cast     = a; \
	ctype* restrict b_cast     = b; \
	ctype* restrict c_cast     = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast  = beta; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	doff_t diagoffa_i; \
	dim_t  k_full; \
	dim_t  m_iter, m_left; \
	dim_t  n_iter, n_left; \
	dim_t  m_cur; \
	dim_t  n_cur; \
	dim_t  k_a1011; \
	dim_t  off_a1011; \
	dim_t  i, j; \
	inc_t  rstep_a; \
	inc_t  cstep_b; \
	inc_t  rstep_c, cstep_c; \
	inc_t  istep_a; \
	inc_t  istep_b; \
	inc_t  off_scl; \
	inc_t  ss_a_num; \
	inc_t  ss_a_den; \
	inc_t  ps_a_cur; \
	inc_t  is_a_cur; \
	auxinfo_t aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKMR
	     pd_a == MR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKNR
	     cs_b == 1
	     pd_b == NR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)
	*/ \
\
	/* Safety trap: Certain indexing within this macro-kernel does not
	   work as intended if both MR and NR are odd. */ \
	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Safeguard: If the current block of A is entirely above the diagonal,
	   it is implicitly zero. So we do nothing. */ \
	if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
	/* Compute k_full. For all trmm, k_full is simply k. This is
	   needed because some parameter combinations of trmm reduce k
	   to advance past zero regions in the triangular matrix, and
	   when computing the imaginary stride of B (the non-triangular
	   matrix), which is used by 4m1/3m1 implementations, we need
	   this unreduced value of k. */ \
	k_full = k; \
\
	/* Compute indexing scaling factor for for 4m or 3m. This is
	   needed because one of the packing register blocksizes (PACKMR
	   or PACKNR) is used to index into the micro-panels of the non-
	   triangular matrix when computing with a diagonal-intersecting
	   micro-panel of the triangular matrix. In the case of 4m or 3m,
	   real values are stored in both sub-panels, and so the indexing
	   needs to occur in units of real values. The value computed
	   here is divided into the complex pointer offset to cause the
	   pointer to be advanced by the correct value. */ \
	if ( bli_is_4mi_packed( schema_a ) || \
	     bli_is_3mi_packed( schema_a ) || \
	     bli_is_rih_packed( schema_a ) ) off_scl = 2; \
	else                                 off_scl = 1; \
\
	/* Compute the storage stride scaling. Usually this is just 1.
	   However, in the case of interleaved 3m, we need to scale the
	   offset by 3/2. And if we are packing real-only, imag-only, or
	   summed-only, we need to scale the computed panel sizes by 1/2
	   to compensate for the fact that the pointer arithmetic occurs
	   in terms of complex elements rather than real elements. */ \
	if      ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
	else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
	else                                      { ss_a_num = 1; ss_a_den = 1; } \
\
	/* If there is a zero region above where the diagonal of A intersects the
	   left edge of the block, adjust the pointer to C and treat this case as
	   if the diagonal offset were zero. This skips over the region that was
	   not packed. (Note we assume the diagonal offset is a multiple of MR;
	   this assumption will hold as long as the cache blocksizes are each a
	   multiple of MR and NR.) */ \
	if ( diagoffa < 0 ) \
	{ \
		i        = -diagoffa; \
		m        = m - i; \
		diagoffa = 0; \
		c_cast   = c_cast + (i  )*rs_c; \
	} \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	istep_a = PACKMR * k; \
	istep_b = PACKNR * k_full; \
\
	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
	/* Save the imaginary stride of B to the auxinfo_t object. */ \
	bli_auxinfo_set_is_b( istep_b, &aux ); \
\
	b1 = b_cast; \
	c1 = c_cast; \
\
	/* Round-robin thread partitioning: each thread visits every (j,i)
	   iteration and performs only the pointer increments for iterations
	   it does not own. */ \
	thrinfo_t* ir_thread      = bli_thrinfo_sub_node( jr_thread ); \
	dim_t      jr_num_threads = bli_thread_n_way( jr_thread ); \
	dim_t      jr_thread_id   = bli_thread_work_id( jr_thread ); \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = 0; j < n_iter; ++j ) \
	{ \
		if ( bli_trmm_my_iter( j, jr_thread ) ) { \
\
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		a1  = a_cast; \
		c11 = c1; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		/* Loop over the m dimension (MR rows at a time). */ \
		for ( i = 0; i < m_iter; ++i ) \
		{ \
			diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
			/* If the current panel of A intersects the diagonal, scale C
			   by beta. If it is strictly below the diagonal, scale by one.
			   This allows the current macro-kernel to work for both trmm
			   and trmm3. */ \
			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
			{ \
				ctype* restrict b1_i; \
				ctype* restrict a2; \
\
				/* Determine the offset to and length of the panel that was
				   packed so we can index into the corresponding location in
				   b1. */ \
				off_a1011 = 0; \
				k_a1011   = bli_min( diagoffa_i + MR, k ); \
\
				/* Compute the panel stride for the current diagonal-
				   intersecting micro-panel. */ \
				is_a_cur  = k_a1011 * PACKMR; \
				is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
				ps_a_cur  = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
				if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
				b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1; \
				if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_a1011, \
					  alpha_cast, \
					  a1, \
					  b1_i, \
					  beta_cast, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Copy edge elements of C to the temporary buffer. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        c11, rs_c, cs_c, \
					                        ct, rs_ct, cs_ct ); \
\
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_a1011, \
					  alpha_cast, \
					  a1, \
					  b1_i, \
					  beta_cast, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Copy the result to the edge of C. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        ct, rs_ct, cs_ct, \
					                        c11, rs_c, cs_c ); \
				} \
				} /* end if ( bli_trmm_my_iter( i, ir_thread ) ); restored
				     closing brace so every thread advances a1 below. */ \
\
				a1 += ps_a_cur; \
			} \
			else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
			{ \
				if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
				ctype* restrict a2; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1; \
				if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_is_a( istep_a, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  one, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  zero, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Add the result to the edge of C. */ \
					PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
					                       ct, rs_ct, cs_ct, \
					                       c11, rs_c, cs_c ); \
				} \
				} \
\
				a1 += rstep_a; \
			} \
\
			c11 += rstep_c; \
		} \
		} \
\
		b1 += cstep_b; \
		c1 += cstep_c; \
	} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}

INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2 )

View File

@@ -0,0 +1,527 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* Function-pointer type for the typed trmm_lu macro-kernel instances.
   NOTE: the underlying type name is gemm_fp because the argument list is
   identical to that of the gemm-style macro-kernels. */
#define FUNCPTR_T gemm_fp

typedef void (*FUNCPTR_T)
     (
       doff_t  diagoffa,
       pack_t  schema_a,
       pack_t  schema_b,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       void*   alpha,
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
       void*   beta,
       void*   c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx,
       rntm_t* rntm,
       thrinfo_t* thread
     );

/* ftypes: one typed kernel instance per datatype; the dispatcher below
   indexes this array with the execution datatype (num_t) of C. */
static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2);
void bli_trmm_lu_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	/* Object-API front end for the typed trmm_lu macro-kernels: unpack all
	   object fields into scalars/pointers, merge the scalars attached to A
	   and B, then dispatch on the execution datatype of C. */

	const num_t  dt_exec  = bli_obj_exec_dt( c );

	const doff_t diagoffa = bli_obj_diag_offset( a );
	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	const dim_t  m        = bli_obj_length( c );
	const dim_t  n        = bli_obj_width( c );
	const dim_t  k        = bli_obj_width( a );

	void*        buf_a    = bli_obj_buffer_at_off( a );
	const inc_t  cs_a     = bli_obj_col_stride( a );
	const dim_t  pd_a     = bli_obj_panel_dim( a );
	const inc_t  ps_a     = bli_obj_panel_stride( a );

	void*        buf_b    = bli_obj_buffer_at_off( b );
	const inc_t  rs_b     = bli_obj_row_stride( b );
	const dim_t  pd_b     = bli_obj_panel_dim( b );
	const inc_t  ps_b     = bli_obj_panel_stride( b );

	void*        buf_c    = bli_obj_buffer_at_off( c );
	const inc_t  rs_c     = bli_obj_row_stride( c );
	const inc_t  cs_c     = bli_obj_col_stride( c );

	obj_t scalar_a;
	obj_t scalar_b;

	/* Detach the scalars attached to A and B and fold them into one. */
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	/* The merged scalar acts as alpha; the scalar attached to C as beta. */
	void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
	void* buf_beta  = bli_obj_internal_scalar_buffer( c );

	/* Select the typed kernel instance for this datatype and invoke it. */
	FUNCPTR_T fp = ftypes[ dt_exec ];

	fp( diagoffa,
	    schema_a,
	    schema_b,
	    m, n, k,
	    buf_alpha,
	    buf_a, cs_a, pd_a, ps_a,
	    buf_b, rs_b, pd_b, ps_b,
	    buf_beta,
	    buf_c, rs_c, cs_c,
	    cntx,
	    rntm,
	    thread );
}
#undef  GENTFUNC

/* Typed trmm_lu macro-kernel (upper-triangular A on the left).
   Threads are assigned micropanels in the jr (2nd) and ir (1st) loops via
   interleaved (round-robin) partitioning: every thread executes every loop
   iteration but skips all work except the pointer increments for iterations
   that bli_trmm_my_iter() does not assign to it. */
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffa, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* jr_thread \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Alias some constants to simpler names. */ \
	const dim_t MR = pd_a; \
	const dim_t NR = pd_b; \
	const dim_t PACKMR = cs_a; \
	const dim_t PACKNR = rs_b; \
\
	/* Query the context for the micro-kernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemm_ukr_ft) \
	gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. Note that the strides of this
	   temporary buffer are set so that they match the storage of the
	   original C matrix. For example, if C is column-stored, ct will be
	   column-stored as well. */ \
	ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
	          / sizeof( ctype ) ] \
	          __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t rs_ct = ( col_pref ? 1 : NR ); \
	const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
	ctype* restrict one        = PASTEMAC(ch,1); \
	ctype* restrict zero       = PASTEMAC(ch,0); \
	ctype* restrict a_cast     = a; \
	ctype* restrict b_cast     = b; \
	ctype* restrict c_cast     = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast  = beta; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	doff_t diagoffa_i; \
	dim_t  k_full; \
	dim_t  m_iter, m_left; \
	dim_t  n_iter, n_left; \
	dim_t  m_cur; \
	dim_t  n_cur; \
	dim_t  k_a1112; \
	dim_t  off_a1112; \
	dim_t  i, j; \
	inc_t  rstep_a; \
	inc_t  cstep_b; \
	inc_t  rstep_c, cstep_c; \
	inc_t  istep_a; \
	inc_t  istep_b; \
	inc_t  off_scl; \
	inc_t  ss_a_num; \
	inc_t  ss_a_den; \
	inc_t  ps_a_cur; \
	inc_t  is_a_cur; \
	auxinfo_t aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKMR
	     pd_a == MR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKNR
	     cs_b == 1
	     pd_b == NR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)
	*/ \
\
	/* Safety trap: Certain indexing within this macro-kernel does not
	   work as intended if both MR and NR are odd. */ \
	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Safeguard: If the current block of A is entirely below the diagonal,
	   it is implicitly zero. So we do nothing. */ \
	if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
\
	/* Compute k_full. For all trmm, k_full is simply k. This is
	   needed because some parameter combinations of trmm reduce k
	   to advance past zero regions in the triangular matrix, and
	   when computing the imaginary stride of B (the non-triangular
	   matrix), which is used by 4m1/3m1 implementations, we need
	   this unreduced value of k. */ \
	k_full = k; \
\
	/* Compute indexing scaling factor for for 4m or 3m. This is
	   needed because one of the packing register blocksizes (PACKMR
	   or PACKNR) is used to index into the micro-panels of the non-
	   triangular matrix when computing with a diagonal-intersecting
	   micro-panel of the triangular matrix. In the case of 4m or 3m,
	   real values are stored in both sub-panels, and so the indexing
	   needs to occur in units of real values. The value computed
	   here is divided into the complex pointer offset to cause the
	   pointer to be advanced by the correct value. */ \
	if ( bli_is_4mi_packed( schema_a ) || \
	     bli_is_3mi_packed( schema_a ) || \
	     bli_is_rih_packed( schema_a ) ) off_scl = 2; \
	else                                 off_scl = 1; \
\
	/* Compute the storage stride scaling. Usually this is just 1.
	   However, in the case of interleaved 3m, we need to scale the
	   offset by 3/2. And if we are packing real-only, imag-only, or
	   summed-only, we need to scale the computed panel sizes by 1/2
	   to compensate for the fact that the pointer arithmetic occurs
	   in terms of complex elements rather than real elements. */ \
	if      ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
	else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
	else                                      { ss_a_num = 1; ss_a_den = 1; } \
\
	/* If there is a zero region to the left of where the diagonal of A
	   intersects the top edge of the block, adjust the pointer to B and
	   treat this case as if the diagonal offset were zero. Note that we
	   don't need to adjust the pointer to A since packm would have simply
	   skipped over the region that was not stored. */ \
	if ( diagoffa > 0 ) \
	{ \
		i        = diagoffa; \
		k        = k - i; \
		diagoffa = 0; \
		b_cast   = b_cast + ( i * PACKNR ) / off_scl; \
	} \
\
	/* If there is a zero region below where the diagonal of A intersects the
	   right side of the block, shrink it to prevent "no-op" iterations from
	   executing. */ \
	if ( -diagoffa + k < m ) \
	{ \
		m = -diagoffa + k; \
	} \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	istep_a = PACKMR * k; \
	istep_b = PACKNR * k_full; \
\
	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
	/* Save the imaginary stride of B to the auxinfo_t object. */ \
	bli_auxinfo_set_is_b( istep_b, &aux ); \
\
	b1 = b_cast; \
	c1 = c_cast; \
\
	/* Round-robin thread partitioning: each thread visits every (j,i)
	   iteration and performs only the pointer increments for iterations
	   it does not own. */ \
	thrinfo_t* ir_thread      = bli_thrinfo_sub_node( jr_thread ); \
	dim_t      jr_num_threads = bli_thread_n_way( jr_thread ); \
	dim_t      jr_thread_id   = bli_thread_work_id( jr_thread ); \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = 0; j < n_iter; ++j ) \
	{ \
		if ( bli_trmm_my_iter( j, jr_thread ) ) { \
\
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		a1  = a_cast; \
		c11 = c1; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		/* Loop over the m dimension (MR rows at a time). */ \
		for ( i = 0; i < m_iter; ++i ) \
		{ \
			diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
			/* If the current panel of A intersects the diagonal, scale C
			   by beta. If it is strictly above the diagonal, scale by one.
			   This allows the current macro-kernel to work for both trmm
			   and trmm3. */ \
			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
			{ \
				ctype* restrict b1_i; \
				ctype* restrict a2; \
\
				/* Determine the offset to and length of the panel that was
				   packed so we can index into the corresponding location in
				   b1. */ \
				off_a1112 = diagoffa_i; \
				k_a1112   = k - off_a1112; \
\
				/* Compute the panel stride for the current diagonal-
				   intersecting micro-panel. */ \
				is_a_cur  = k_a1112 * PACKMR; \
				is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
				ps_a_cur  = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
				if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
				b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1; \
				if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_a1112, \
					  alpha_cast, \
					  a1, \
					  b1_i, \
					  beta_cast, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Copy edge elements of C to the temporary buffer. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        c11, rs_c, cs_c, \
					                        ct, rs_ct, cs_ct ); \
\
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_a1112, \
					  alpha_cast, \
					  a1, \
					  b1_i, \
					  beta_cast, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Copy the result to the edge of C. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        ct, rs_ct, cs_ct, \
					                        c11, rs_c, cs_c ); \
				} \
				} /* end if ( bli_trmm_my_iter( i, ir_thread ) ) */ \
\
				/* NOTE: executed by all threads so a1 stays in sync. */ \
				a1 += ps_a_cur; \
			} \
			else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
			{ \
				if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
				ctype* restrict a2; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1; \
				if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_is_a( istep_a, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  one, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  zero, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Add the result to the edge of C. */ \
					PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
					                       ct, rs_ct, cs_ct, \
					                       c11, rs_c, cs_c ); \
				} \
				} /* end if ( bli_trmm_my_iter( i, ir_thread ) ) */ \
\
				/* NOTE: executed by all threads so a1 stays in sync. */ \
				a1 += rstep_a; \
			} \
\
			c11 += rstep_c; \
		} \
		} /* end if ( bli_trmm_my_iter( j, jr_thread ) ) */ \
\
		b1 += cstep_b; \
		c1 += cstep_c; \
	} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}

INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2 )

View File

@@ -0,0 +1,539 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffb,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2);
void bli_trmm_rl_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	/* Object-based wrapper for the trmm_rl macrokernel: query all of the
	   properties needed by the typed kernel, resolve the scalars, and
	   dispatch to the datatype-specific implementation. */

	/* The execution datatype of C selects which typed kernel to call. */
	num_t   dt_exec  = bli_obj_exec_dt( c );

	/* Diagonal offset and pack schemas of the (packed) operands. */
	doff_t  diagoffb = bli_obj_diag_offset( b );
	pack_t  schema_a = bli_obj_pack_schema( a );
	pack_t  schema_b = bli_obj_pack_schema( b );

	/* Problem dimensions: C is m x n; k is the width of (packed) A. */
	dim_t   m        = bli_obj_length( c );
	dim_t   n        = bli_obj_width( c );
	dim_t   k        = bli_obj_width( a );

	/* Buffer addresses, packing strides, and panel geometry. */
	void*   a_buf    = bli_obj_buffer_at_off( a );
	inc_t   cs_a     = bli_obj_col_stride( a );
	dim_t   pd_a     = bli_obj_panel_dim( a );
	inc_t   ps_a     = bli_obj_panel_stride( a );

	void*   b_buf    = bli_obj_buffer_at_off( b );
	inc_t   rs_b     = bli_obj_row_stride( b );
	dim_t   pd_b     = bli_obj_panel_dim( b );
	inc_t   ps_b     = bli_obj_panel_stride( b );

	void*   c_buf    = bli_obj_buffer_at_off( c );
	inc_t   rs_c     = bli_obj_row_stride( c );
	inc_t   cs_c     = bli_obj_col_stride( c );

	obj_t   scalar_a;
	obj_t   scalar_b;

	/* Detach the scalars attached to A and B and fold them into a single
	   alpha (accumulated into scalar_b by bli_mulsc). */
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	/* The merged scalar serves as alpha; the scalar attached to C serves
	   as beta. */
	void*   alpha_buf = bli_obj_internal_scalar_buffer( &scalar_b );
	void*   beta_buf  = bli_obj_internal_scalar_buffer( c );

	/* Select the typed kernel for the execution datatype and invoke it. */
	FUNCPTR_T fp = ftypes[dt_exec];

	fp
	(
	  diagoffb,
	  schema_a,
	  schema_b,
	  m,
	  n,
	  k,
	  alpha_buf,
	  a_buf, cs_a, pd_a, ps_a,
	  b_buf, rs_b, pd_b, ps_b,
	  beta_buf,
	  c_buf, rs_c, cs_c,
	  cntx,
	  rntm,
	  thread
	);
}
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
/* Type-instantiated macrokernel for trmm_rl: C := beta * C + alpha * A * B,
   where B is a packed lower-triangular matrix applied from the right.
   Iterations of the 2nd (jr) and 1st (ir) loops around the microkernel are
   assigned to threads in round-robin fashion via bli_trmm_my_iter(). */ \
void PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffb, \
       pack_t schema_a, \
       pack_t schema_b, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       void* alpha, \
       void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
       void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
       void* beta, \
       void* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* jr_thread \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Alias some constants to simpler names. */ \
	const dim_t MR = pd_a; \
	const dim_t NR = pd_b; \
	const dim_t PACKMR = cs_a; \
	const dim_t PACKNR = rs_b; \
\
	/* Query the context for the micro-kernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemm_ukr_ft) \
	gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. NOTE(review): the strides of this
	   temporary buffer are chosen to match the micro-kernel's preferred
	   (row- or column-oriented) output layout, as queried below, not
	   necessarily the storage of the original C matrix. */ \
	ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
	          / sizeof( ctype ) ] \
	          __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t rs_ct = ( col_pref ? 1 : NR ); \
	const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
	ctype* restrict one = PASTEMAC(ch,1); \
	ctype* restrict zero = PASTEMAC(ch,0); \
	ctype* restrict a_cast = a; \
	ctype* restrict b_cast = b; \
	ctype* restrict c_cast = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast = beta; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	doff_t diagoffb_j; \
	dim_t k_full; \
	dim_t m_iter, m_left; \
	dim_t n_iter, n_left; \
	dim_t m_cur; \
	dim_t n_cur; \
	dim_t k_b1121; \
	dim_t off_b1121; \
	dim_t i, j; \
	inc_t rstep_a; \
	inc_t cstep_b; \
	inc_t rstep_c, cstep_c; \
	inc_t istep_a; \
	inc_t istep_b; \
	inc_t off_scl; \
	inc_t ss_b_num; \
	inc_t ss_b_den; \
	inc_t ps_b_cur; \
	inc_t is_b_cur; \
	auxinfo_t aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKMR
	     pd_a == MR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKNR
	     cs_b == 1
	     pd_b == NR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)
	*/ \
\
	/* Safety trap: Certain indexing within this macro-kernel does not
	   work as intended if both MR and NR are odd. */ \
	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Safeguard: If the current panel of B is entirely above the diagonal,
	   it is implicitly zero. So we do nothing. */ \
	if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
\
	/* Compute k_full. For all trmm, k_full is simply k. This is
	   needed because some parameter combinations of trmm reduce k
	   to advance past zero regions in the triangular matrix, and
	   when computing the imaginary stride of A (the non-triangular
	   matrix), which is used by 4m1/3m1 implementations, we need
	   this unreduced value of k. */ \
	k_full = k; \
\
	/* Compute indexing scaling factor for for 4m or 3m. This is
	   needed because one of the packing register blocksizes (PACKMR
	   or PACKNR) is used to index into the micro-panels of the non-
	   triangular matrix when computing with a diagonal-intersecting
	   micro-panel of the triangular matrix. In the case of 4m or 3m,
	   real values are stored in both sub-panels, and so the indexing
	   needs to occur in units of real values. The value computed
	   here is divided into the complex pointer offset to cause the
	   pointer to be advanced by the correct value. */ \
	if ( bli_is_4mi_packed( schema_b ) || \
	     bli_is_3mi_packed( schema_b ) || \
	     bli_is_rih_packed( schema_b ) ) off_scl = 2; \
	else off_scl = 1; \
\
	/* Compute the storage stride scaling. Usually this is just 1.
	   However, in the case of interleaved 3m, we need to scale the
	   offset by 3/2. And if we are packing real-only, imag-only, or
	   summed-only, we need to scale the computed panel sizes by 1/2
	   to compensate for the fact that the pointer arithmetic occurs
	   in terms of complex elements rather than real elements. */ \
	if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
	else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
	else { ss_b_num = 1; ss_b_den = 1; } \
\
	/* If there is a zero region above where the diagonal of B intersects
	   the left edge of the panel, adjust the pointer to A and treat this
	   case as if the diagonal offset were zero. Note that we don't need to
	   adjust the pointer to B since packm would have simply skipped over
	   the region that was not stored. */ \
	if ( diagoffb < 0 ) \
	{ \
		j = -diagoffb; \
		k = k - j; \
		diagoffb = 0; \
		a_cast = a_cast + ( j * PACKMR ) / off_scl; \
	} \
\
	/* If there is a zero region to the right of where the diagonal
	   of B intersects the bottom of the panel, shrink it to prevent
	   "no-op" iterations from executing. */ \
	if ( diagoffb + k < n ) \
	{ \
		n = diagoffb + k; \
	} \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	istep_a = PACKMR * k_full; \
	istep_b = PACKNR * k; \
\
	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
	/* Save the imaginary stride of A to the auxinfo_t object. */ \
	bli_auxinfo_set_is_a( istep_a, &aux ); \
\
	b1 = b_cast; \
	c1 = c_cast; \
\
	/* Query the thrinfo_t node for the 1st (ir) loop and the number of
	   threads / work id for the 2nd (jr) loop. */ \
	thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \
	dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \
	dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = 0; j < n_iter; ++j ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		diagoffb_j = diagoffb - ( doff_t )j*NR; \
\
		/* Determine the offset to the beginning of the panel that
		   was packed so we can index into the corresponding location
		   in A. Then compute the length of that panel. */ \
		off_b1121 = bli_max( -diagoffb_j, 0 ); \
		k_b1121 = k - off_b1121; \
\
		a1 = a_cast; \
		c11 = c1; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		/* If the current panel of B intersects the diagonal, scale C
		   by beta. If it is strictly below the diagonal, scale by one.
		   This allows the current macro-kernel to work for both trmm
		   and trmm3. */ \
		if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
		{ \
			/* Compute the panel stride for the current diagonal-
			   intersecting micro-panel. */ \
			is_b_cur = k_b1121 * PACKNR; \
			is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
			ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
			/* Round-robin: only execute this jr iteration if it is
			   assigned to the current thread. */ \
			if ( bli_trmm_my_iter( j, jr_thread ) ) { \
\
			/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
			   object. */ \
			bli_auxinfo_set_is_b( is_b_cur, &aux ); \
\
			/* Loop over the m dimension (MR rows at a time). */ \
			for ( i = 0; i < m_iter; ++i ) \
			{ \
				/* Round-robin assignment of ir iterations to threads. */ \
				if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
				ctype* restrict a1_i; \
				ctype* restrict a2; \
\
				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
				a1_i = a1 + ( off_b1121 * PACKMR ) / off_scl; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1; \
				if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_b1121, \
					  alpha_cast, \
					  a1_i, \
					  b1, \
					  beta_cast, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Copy edge elements of C to the temporary buffer. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        c11, rs_c, cs_c, \
					                        ct, rs_ct, cs_ct ); \
\
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_b1121, \
					  alpha_cast, \
					  a1_i, \
					  b1, \
					  beta_cast, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Copy the result to the edge of C. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        ct, rs_ct, cs_ct, \
					                        c11, rs_c, cs_c ); \
				} \
				} \
\
				a1 += rstep_a; \
				c11 += rstep_c; \
			} \
			} \
\
			/* NOTE: b1 advances on every jr iteration (outside the my_iter
			   guard) so that all threads track the packed panel pointers. */ \
			b1 += ps_b_cur; \
		} \
		else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \
		{ \
			/* Round-robin: only execute this jr iteration if it is
			   assigned to the current thread. */ \
			if ( bli_trmm_my_iter( j, jr_thread ) ) { \
\
			/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
			   object. */ \
			bli_auxinfo_set_is_b( istep_b, &aux ); \
\
			/* Loop over the m dimension (MR rows at a time). */ \
			for ( i = 0; i < m_iter; ++i ) \
			{ \
				/* Round-robin assignment of ir iterations to threads. */ \
				if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
				ctype* restrict a2; \
\
				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1; \
				if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  one, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  zero, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Add the result to the edge of C. */ \
					PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
					                       ct, rs_ct, cs_ct, \
					                       c11, rs_c, cs_c ); \
				} \
				} \
\
				a1 += rstep_a; \
				c11 += rstep_c; \
			} \
			} \
\
			b1 += cstep_b; \
		} \
\
		c1 += cstep_c; \
	} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}

INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2 )

View File

@@ -0,0 +1,539 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffb,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2);
void bli_trmm_ru_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	/* Object-based wrapper for the trmm_ru macrokernel: extract the
	   kernel parameters from the operand objects, fold the operand
	   scalars, and dispatch to the datatype-specific implementation. */

	/* The execution datatype of C selects which typed kernel to call. */
	num_t   dt_exec  = bli_obj_exec_dt( c );

	/* Diagonal offset and pack schemas of the (packed) operands. */
	doff_t  diagoffb = bli_obj_diag_offset( b );
	pack_t  schema_a = bli_obj_pack_schema( a );
	pack_t  schema_b = bli_obj_pack_schema( b );

	/* Problem dimensions: C is m x n; k is the width of (packed) A. */
	dim_t   m        = bli_obj_length( c );
	dim_t   n        = bli_obj_width( c );
	dim_t   k        = bli_obj_width( a );

	/* Buffer addresses, packing strides, and panel geometry. */
	void*   a_buf    = bli_obj_buffer_at_off( a );
	inc_t   cs_a     = bli_obj_col_stride( a );
	dim_t   pd_a     = bli_obj_panel_dim( a );
	inc_t   ps_a     = bli_obj_panel_stride( a );

	void*   b_buf    = bli_obj_buffer_at_off( b );
	inc_t   rs_b     = bli_obj_row_stride( b );
	dim_t   pd_b     = bli_obj_panel_dim( b );
	inc_t   ps_b     = bli_obj_panel_stride( b );

	void*   c_buf    = bli_obj_buffer_at_off( c );
	inc_t   rs_c     = bli_obj_row_stride( c );
	inc_t   cs_c     = bli_obj_col_stride( c );

	obj_t   scalar_a;
	obj_t   scalar_b;

	/* Detach the scalars attached to A and B and fold them into a single
	   alpha (accumulated into scalar_b by bli_mulsc). */
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	/* The merged scalar serves as alpha; the scalar attached to C serves
	   as beta. */
	void*   alpha_buf = bli_obj_internal_scalar_buffer( &scalar_b );
	void*   beta_buf  = bli_obj_internal_scalar_buffer( c );

	/* Select the typed kernel for the execution datatype and invoke it. */
	FUNCPTR_T fp = ftypes[dt_exec];

	fp
	(
	  diagoffb,
	  schema_a,
	  schema_b,
	  m,
	  n,
	  k,
	  alpha_buf,
	  a_buf, cs_a, pd_a, ps_a,
	  b_buf, rs_b, pd_b, ps_b,
	  beta_buf,
	  c_buf, rs_c, cs_c,
	  cntx,
	  rntm,
	  thread
	);
}
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
/* Type-instantiated macrokernel for trmm_ru: C := beta * C + alpha * A * B,
   where B is a packed upper-triangular matrix applied from the right.
   Iterations of the 2nd (jr) and 1st (ir) loops around the microkernel are
   assigned to threads in round-robin fashion via bli_trmm_my_iter(). */ \
void PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffb, \
       pack_t schema_a, \
       pack_t schema_b, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       void* alpha, \
       void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
       void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
       void* beta, \
       void* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* jr_thread \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Alias some constants to simpler names. */ \
	const dim_t MR = pd_a; \
	const dim_t NR = pd_b; \
	const dim_t PACKMR = cs_a; \
	const dim_t PACKNR = rs_b; \
\
	/* Query the context for the micro-kernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemm_ukr_ft) \
	gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. NOTE(review): the strides of this
	   temporary buffer are chosen to match the micro-kernel's preferred
	   (row- or column-oriented) output layout, as queried below, not
	   necessarily the storage of the original C matrix. */ \
	ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
	          / sizeof( ctype ) ] \
	          __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t rs_ct = ( col_pref ? 1 : NR ); \
	const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
	ctype* restrict one = PASTEMAC(ch,1); \
	ctype* restrict zero = PASTEMAC(ch,0); \
	ctype* restrict a_cast = a; \
	ctype* restrict b_cast = b; \
	ctype* restrict c_cast = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast = beta; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	doff_t diagoffb_j; \
	dim_t k_full; \
	dim_t m_iter, m_left; \
	dim_t n_iter, n_left; \
	dim_t m_cur; \
	dim_t n_cur; \
	dim_t k_b0111; \
	dim_t off_b0111; \
	dim_t i, j; \
	inc_t rstep_a; \
	inc_t cstep_b; \
	inc_t rstep_c, cstep_c; \
	inc_t istep_a; \
	inc_t istep_b; \
	inc_t off_scl; \
	inc_t ss_b_num; \
	inc_t ss_b_den; \
	inc_t ps_b_cur; \
	inc_t is_b_cur; \
	auxinfo_t aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKMR
	     pd_a == MR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKNR
	     cs_b == 1
	     pd_b == NR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)
	*/ \
\
	/* Safety trap: Certain indexing within this macro-kernel does not
	   work as intended if both MR and NR are odd. */ \
	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Safeguard: If the current panel of B is entirely below its diagonal,
	   it is implicitly zero. So we do nothing. */ \
	if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \
\
	/* Compute k_full. For all trmm, k_full is simply k. This is
	   needed because some parameter combinations of trmm reduce k
	   to advance past zero regions in the triangular matrix, and
	   when computing the imaginary stride of A (the non-triangular
	   matrix), which is used by 4m1/3m1 implementations, we need
	   this unreduced value of k. */ \
	k_full = k; \
\
	/* Compute indexing scaling factor for for 4m or 3m. This is
	   needed because one of the packing register blocksizes (PACKMR
	   or PACKNR) is used to index into the micro-panels of the non-
	   triangular matrix when computing with a diagonal-intersecting
	   micro-panel of the triangular matrix. In the case of 4m or 3m,
	   real values are stored in both sub-panels, and so the indexing
	   needs to occur in units of real values. The value computed
	   here is divided into the complex pointer offset to cause the
	   pointer to be advanced by the correct value. */ \
	if ( bli_is_4mi_packed( schema_b ) || \
	     bli_is_3mi_packed( schema_b ) || \
	     bli_is_rih_packed( schema_b ) ) off_scl = 2; \
	else off_scl = 1; \
\
	/* Compute the storage stride scaling. Usually this is just 1.
	   However, in the case of interleaved 3m, we need to scale the
	   offset by 3/2. And if we are packing real-only, imag-only, or
	   summed-only, we need to scale the computed panel sizes by 1/2
	   to compensate for the fact that the pointer arithmetic occurs
	   in terms of complex elements rather than real elements. */ \
	if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
	else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
	else { ss_b_num = 1; ss_b_den = 1; } \
\
	/* If there is a zero region to the left of where the diagonal of B
	   intersects the top edge of the panel, adjust the pointer to C and
	   treat this case as if the diagonal offset were zero. This skips over
	   the region that was not packed. (Note we assume the diagonal offset
	   is a multiple of MR; this assumption will hold as long as the cache
	   blocksizes are each a multiple of MR and NR.) */ \
	if ( diagoffb > 0 ) \
	{ \
		j = diagoffb; \
		n = n - j; \
		diagoffb = 0; \
		c_cast = c_cast + (j )*cs_c; \
	} \
\
	/* If there is a zero region below where the diagonal of B intersects the
	   right side of the block, shrink it to prevent "no-op" iterations from
	   executing. */ \
	if ( -diagoffb + n < k ) \
	{ \
		k = -diagoffb + n; \
	} \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	istep_a = PACKMR * k_full; \
	istep_b = PACKNR * k; \
\
	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
	/* Save the imaginary stride of A to the auxinfo_t object. */ \
	bli_auxinfo_set_is_a( istep_a, &aux ); \
\
	b1 = b_cast; \
	c1 = c_cast; \
\
	/* Query the thrinfo_t node for the 1st (ir) loop and the number of
	   threads / work id for the 2nd (jr) loop. */ \
	thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \
	dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \
	dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = 0; j < n_iter; ++j ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		diagoffb_j = diagoffb - ( doff_t )j*NR; \
\
		/* Determine the offset to and length of the panel that was packed
		   so we can index into the corresponding location in A. */ \
		off_b0111 = 0; \
		k_b0111 = bli_min( k, -diagoffb_j + NR ); \
\
		a1 = a_cast; \
		c11 = c1; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		/* If the current panel of B intersects the diagonal, scale C
		   by beta. If it is strictly above the diagonal, scale by one.
		   This allows the current macro-kernel to work for both trmm
		   and trmm3. */ \
		if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
		{ \
			/* Compute the panel stride for the current diagonal-
			   intersecting micro-panel. */ \
			is_b_cur = k_b0111 * PACKNR; \
			is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
			ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
			/* Round-robin: only execute this jr iteration if it is
			   assigned to the current thread. */ \
			if ( bli_trmm_my_iter( j, jr_thread ) ) { \
\
			/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
			   object. */ \
			bli_auxinfo_set_is_b( is_b_cur, &aux ); \
\
			/* Loop over the m dimension (MR rows at a time). */ \
			for ( i = 0; i < m_iter; ++i ) \
			{ \
				/* Round-robin assignment of ir iterations to threads. */ \
				if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
				ctype* restrict a1_i; \
				ctype* restrict a2; \
\
				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
				a1_i = a1 + ( off_b0111 * PACKMR ) / off_scl; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1; \
				if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_b0111, \
					  alpha_cast, \
					  a1_i, \
					  b1, \
					  beta_cast, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Copy edge elements of C to the temporary buffer. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        c11, rs_c, cs_c, \
					                        ct, rs_ct, cs_ct ); \
\
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_b0111, \
					  alpha_cast, \
					  a1_i, \
					  b1, \
					  beta_cast, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Copy the result to the edge of C. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        ct, rs_ct, cs_ct, \
					                        c11, rs_c, cs_c ); \
				} \
				} \
\
				a1 += rstep_a; \
				c11 += rstep_c; \
			} \
			} \
\
			/* NOTE: b1 advances on every jr iteration (outside the my_iter
			   guard) so that all threads track the packed panel pointers. */ \
			b1 += ps_b_cur; \
		} \
		else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \
		{ \
			/* Round-robin: only execute this jr iteration if it is
			   assigned to the current thread. */ \
			if ( bli_trmm_my_iter( j, jr_thread ) ) { \
\
			/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
			   object. */ \
			bli_auxinfo_set_is_b( istep_b, &aux ); \
\
			/* Loop over the m dimension (MR rows at a time). */ \
			for ( i = 0; i < m_iter; ++i ) \
			{ \
				/* Round-robin assignment of ir iterations to threads. */ \
				if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
				ctype* restrict a2; \
\
				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1; \
				if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  one, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  zero, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Add the result to the edge of C. */ \
					PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
					                       ct, rs_ct, cs_ct, \
					                       c11, rs_c, cs_c ); \
				} \
				} \
\
				a1 += rstep_a; \
				c11 += rstep_c; \
			} \
			} \
\
			b1 += cstep_b; \
		} \
\
		c1 += cstep_c; \
	} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}

INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2 )

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -60,7 +61,7 @@ void bli_trsm_blk_var1
bli_l3_prune_unref_mparts_m( a, b, c, cntl );
// Determine the current thread's subpartition range.
bli_thread_get_range_mdim
bli_thread_range_mdim
(
direct, thread, a, b, c, cntl, cntx,
&my_start, &my_end

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -60,7 +61,7 @@ void bli_trsm_blk_var2
bli_l3_prune_unref_mparts_n( a, b, c, cntl );
// Determine the current thread's subpartition range.
bli_thread_get_range_ndim
bli_thread_range_ndim
(
direct, thread, a, b, c, cntl, cntx,
&my_start, &my_end

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -339,25 +340,38 @@ void PASTEMAC(ch,varname) \
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
/* We don't bother querying the thrinfo_t node for the 1st loop because
we can't parallelize that loop in trsm due to the inter-iteration
dependencies that exist. */ \
/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
\
dim_t jr_start, jr_end; \
dim_t jr_inc; \
\
/* Use contiguous assignment of micropanels to threads in the 2nd loop. */ \
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
if( bli_trsm_my_iter( j, thread ) ) { \
\
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
a1 = a_cast; \
c11 = c1 + (0 )*rstep_c; \
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
a1 = a_cast; \
c11 = c1 + (0 )*rstep_c; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
@@ -409,8 +423,7 @@ void PASTEMAC(ch,varname) \
{ \
a2 = a_cast; \
b2 = b1; \
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -474,8 +487,7 @@ void PASTEMAC(ch,varname) \
{ \
a2 = a_cast; \
b2 = b1; \
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -531,10 +543,6 @@ void PASTEMAC(ch,varname) \
\
c11 += rstep_c; \
} \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
\
/*

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -347,25 +348,38 @@ void PASTEMAC(ch,varname) \
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
/* We don't bother querying the thrinfo_t node for the 1st loop because
we can't parallelize that loop in trsm due to the inter-iteration
dependencies that exist. */ \
/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
\
dim_t jr_start, jr_end; \
dim_t jr_inc; \
\
/* Use contiguous assignment of micropanels to threads in the 2nd loop. */ \
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
if( bli_trsm_my_iter( j, thread ) ) { \
\
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
a1 = a_cast; \
c11 = c1 + (m_iter-1)*rstep_c; \
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
a1 = a_cast; \
c11 = c1 + (m_iter-1)*rstep_c; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( ib = 0; ib < m_iter; ++ib ) \
@@ -419,8 +433,7 @@ void PASTEMAC(ch,varname) \
{ \
a2 = a_cast; \
b2 = b1; \
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -484,8 +497,7 @@ void PASTEMAC(ch,varname) \
{ \
a2 = a_cast; \
b2 = b1; \
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
@@ -541,10 +553,6 @@ void PASTEMAC(ch,varname) \
\
c11 -= rstep_c; \
} \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
\
/*

View File

@@ -0,0 +1,593 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T gemm_fp
/* Signature shared by every datatype instantiation of this macrokernel.
   Note the alias name is gemm_fp: the trsm macrokernels reuse the gemm
   macrokernel calling convention. alpha1 scales the gemmtrsm subproblems
   (attached to B); alpha2 is the "beta" for the gemm-only subproblems
   (attached to C). */
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha1,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
/* Table of typed implementations (one per datatype), generated by the
   GENTFUNC invocation at the bottom of this file and indexed by num_t. */
static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2);
void bli_trsm_ll_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Object-based wrapper for the lower/left trsm macrokernel: unpack the
	// operand properties from the obj_t's and dispatch to the typed
	// implementation selected by the execution datatype of C.
	const num_t  dt_exec  = bli_obj_exec_dt( c );

	// Diagonal offset of the triangular matrix A and pack schemas of the
	// packed operands.
	const doff_t diagoffa = bli_obj_diag_offset( a );
	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	// Problem dimensions: C is m x n; A supplies the k dimension.
	const dim_t  m        = bli_obj_length( c );
	const dim_t  n        = bli_obj_width( c );
	const dim_t  k        = bli_obj_width( a );

	// Packed buffer of A along with its panel geometry.
	void* const  buf_a    = bli_obj_buffer_at_off( a );
	const inc_t  cs_a     = bli_obj_col_stride( a );
	const dim_t  pd_a     = bli_obj_panel_dim( a );
	const inc_t  ps_a     = bli_obj_panel_stride( a );

	// Packed buffer of B along with its panel geometry.
	void* const  buf_b    = bli_obj_buffer_at_off( b );
	const inc_t  rs_b     = bli_obj_row_stride( b );
	const dim_t  pd_b     = bli_obj_panel_dim( b );
	const inc_t  ps_b     = bli_obj_panel_stride( b );

	// Output buffer and strides of C.
	void* const  buf_c    = bli_obj_buffer_at_off( c );
	const inc_t  rs_c     = bli_obj_row_stride( c );
	const inc_t  cs_c     = bli_obj_col_stride( c );

	// The scalar attached to B serves as the alpha for the gemmtrsm
	// subproblems; it may already be unit if it was applied when B was
	// packed.
	void* const  buf_alpha1 = bli_obj_internal_scalar_buffer( b );

	// The scalar attached to C serves as the "beta" for the gemm-only
	// subproblems (micro-panels that do not intersect the diagonal). It is
	// kept separate because the alpha on B may have been reset during
	// packing.
	void* const  buf_alpha2 = bli_obj_internal_scalar_buffer( c );

	// Look up the typed implementation for this datatype and invoke it.
	FUNCPTR_T    f          = ftypes[dt_exec];

	f
	(
	  diagoffa,
	  schema_a, schema_b,
	  m, n, k,
	  buf_alpha1,
	  buf_a, cs_a, pd_a, ps_a,
	  buf_b, rs_b, pd_b, ps_b,
	  buf_alpha2,
	  buf_c, rs_c, cs_c,
	  cntx, rntm, thread
	);
}
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
/* Typed lower/left trsm macrokernel: for each NR-wide column panel of B/C,
   walk the MR-tall row panels of A top to bottom. Panels of A that intersect
   the diagonal use the fused gemmtrsm micro-kernel; panels strictly below
   the diagonal use a plain gemm micro-kernel (with alpha -1); panels above
   the diagonal are implicitly zero and were never packed, so they are
   skipped. */ \
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha1, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Cast the micro-kernel address to its function pointer type. */ \
PASTECH(ch,gemmtrsm_ukr_ft) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha1_cast = alpha1; \
ctype* restrict alpha2_cast = alpha2; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1011; \
dim_t k_a10; \
dim_t off_a10; \
dim_t off_a11; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If matrix A is above the diagonal, it is implicitly zero.
So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full as k inflated up to a multiple of MR. This is
needed because some parameter combinations of trsm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
\
/* Compute indexing scaling factor for for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region above where the diagonal of A intersects the
left edge of the block, adjust the pointer to C and treat this case as
if the diagonal offset were zero. This skips over the region that was
not packed. (Note we assume the diagonal offset is a multiple of MR;
this assumption will hold as long as the cache blocksizes are each a
multiple of MR and NR.) */ \
if ( diagoffa < 0 ) \
{ \
i = -diagoffa; \
m = m - i; \
diagoffa = 0; \
c_cast = c_cast + (i )*rs_c; \
} \
\
/* Check the k dimension, which needs to be a multiple of MR. If k
isn't a multiple of MR, we adjust it higher to satisfy the micro-
kernel, which is expecting to perform an MR x MR triangular solve.
This adjustment of k is consistent with what happened when A was
packed: all of its bottom/right edges were zero-padded, and
furthermore, the panel that stores the bottom-right corner of the
matrix has its diagonal extended into the zero-padded region (as
identity). This allows the trsm of that bottom-right panel to
proceed without producing any infs or NaNs that would infect the
"good" values of the corresponding block of B. */ \
if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
/* NOTE: We don't need to check that m is a multiple of PACKMR since we
know that the underlying buffer was already allocated to have an m
dimension that is a multiple of PACKMR, with the region between the
last row and the next multiple of MR zero-padded accordingly. */ \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
/* Interleaved (round-robin) partitioning of the jr loop: this thread
only executes iterations for which j mod nthreads equals its work id.
NOTE(review): newer variants of this macrokernel use contiguous slab
partitioning via bli_thread_range_jrir_sl() instead. */ \
if( bli_trsm_my_iter( j, thread ) ) { \
\
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
a1 = a_cast; \
c11 = c1 + (0 )*rstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
If the current panel of A resides below the diagonal, use a
a regular gemm micro-kernel. Otherwise, if it is above the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a10; \
ctype* restrict a11; \
ctype* restrict b01; \
ctype* restrict b11; \
ctype* restrict a2; \
\
/* Compute various offsets into and lengths of parts of A. */ \
off_a10 = 0; \
k_a1011 = diagoffa_i + MR; \
k_a10 = k_a1011 - MR; \
off_a11 = k_a10; \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1011 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
/* Compute the addresses of the panel A10 and the triangular
block A11. */ \
a10 = a1; \
/* a11 = a1 + ( k_a10 * PACKMR ) / off_scl; */ \
a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, off_scl ); \
\
/* Compute the addresses of the panel B01 and the block
B11. */ \
b01 = b1 + ( off_a10 * PACKNR ) / off_scl; \
b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + ps_a_cur; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
/* On this thread's final jr iteration under round-robin
partitioning, wrap the B prefetch back to the first panel. */ \
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_a10, \
alpha1_cast, \
a10, \
a11, \
b01, \
b11, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_a10, \
alpha1_cast, \
a10, \
a11, \
b01, \
b11, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
alpha2_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
alpha2_cast, \
c11, rs_c, cs_c ); \
} \
\
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
\
/*
if ( bli_is_4mi_packed( schema_a ) ){ \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r before", k, n, \
( double* )b, rs_b, 1, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i before", k, n, \
( double* )b+72, rs_b, 1, "%4.1f", "" ); \
}else{ \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r before", k, n, \
( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i before", k, n, \
( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
} \
*/ \
\
/*
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \
( double* )a11, 1, PACKMR, "%4.1f", "" ); \
*/ \
\
/*
if ( bli_is_4mi_packed( schema_a ) ){ \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r after", k, n, \
( double* )b, rs_b, 1, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i after", k, n, \
( double* )b+72, rs_b, 1, "%4.1f", "" ); \
}else{ \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r after", k, n, \
( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i after", k, n, \
( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
} \
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_r", m, n, \
( double* )c, 1, cs_c, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_i", m, n, \
( double* )c + 8*9, 1, cs_c, "%4.1f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2 )

View File

@@ -0,0 +1,574 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T gemm_fp
/* Signature shared by every datatype instantiation of this macrokernel.
   Aliased to gemm_fp because the trsm macrokernels reuse the gemm
   macrokernel calling convention. alpha1 scales the gemmtrsm subproblems
   (attached to B); alpha2 is the "beta" for the gemm-only subproblems
   (attached to C). */
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha1,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
/* Table of typed implementations (one per datatype), generated by the
   GENTFUNC invocation at the bottom of this file and indexed by num_t. */
static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2);
void bli_trsm_lu_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Object-based wrapper for the upper/left trsm macrokernel: unpack the
	// operand properties from the obj_t's and dispatch to the typed
	// implementation selected by the execution datatype of C.
	const num_t  dt_exec  = bli_obj_exec_dt( c );

	// Diagonal offset of the triangular matrix A and pack schemas of the
	// packed operands.
	const doff_t diagoffa = bli_obj_diag_offset( a );
	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	// Problem dimensions: C is m x n; A supplies the k dimension.
	const dim_t  m        = bli_obj_length( c );
	const dim_t  n        = bli_obj_width( c );
	const dim_t  k        = bli_obj_width( a );

	// Packed buffer of A along with its panel geometry.
	void* const  buf_a    = bli_obj_buffer_at_off( a );
	const inc_t  cs_a     = bli_obj_col_stride( a );
	const dim_t  pd_a     = bli_obj_panel_dim( a );
	const inc_t  ps_a     = bli_obj_panel_stride( a );

	// Packed buffer of B along with its panel geometry.
	void* const  buf_b    = bli_obj_buffer_at_off( b );
	const inc_t  rs_b     = bli_obj_row_stride( b );
	const dim_t  pd_b     = bli_obj_panel_dim( b );
	const inc_t  ps_b     = bli_obj_panel_stride( b );

	// Output buffer and strides of C.
	void* const  buf_c    = bli_obj_buffer_at_off( c );
	const inc_t  rs_c     = bli_obj_row_stride( c );
	const inc_t  cs_c     = bli_obj_col_stride( c );

	// The scalar attached to B serves as the alpha for the gemmtrsm
	// subproblems; it may already be unit if it was applied when B was
	// packed.
	void* const  buf_alpha1 = bli_obj_internal_scalar_buffer( b );

	// The scalar attached to C serves as the "beta" for the gemm-only
	// subproblems (micro-panels that do not intersect the diagonal). It is
	// kept separate because the alpha on B may have been reset during
	// packing.
	void* const  buf_alpha2 = bli_obj_internal_scalar_buffer( c );

	// Look up the typed implementation for this datatype and invoke it.
	FUNCPTR_T    f          = ftypes[dt_exec];

	f
	(
	  diagoffa,
	  schema_a, schema_b,
	  m, n, k,
	  buf_alpha1,
	  buf_a, cs_a, pd_a, ps_a,
	  buf_b, rs_b, pd_b, ps_b,
	  buf_alpha2,
	  buf_c, rs_c, cs_c,
	  cntx, rntm, thread
	);
}
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
/* Typed upper/left trsm macrokernel: for each NR-wide column panel of B/C,
   walk the MR-tall row panels of A bottom to top (backward substitution).
   Panels of A that intersect the diagonal use the fused gemmtrsm micro-
   kernel; panels strictly above the diagonal use a plain gemm micro-kernel
   (with alpha -1); panels below the diagonal are implicitly zero and were
   never packed, so they are skipped.
   Fixes in this revision: removed a stray double semicolon in the
   computation of k_a1112, and corrected a stale comment that referred to
   panel B01 where the code computes B21. */ \
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha1, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Cast the micro-kernel address to its function pointer type. */ \
PASTECH(ch,gemmtrsm_ukr_ft) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha1_cast = alpha1; \
ctype* restrict alpha2_cast = alpha2; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1112; \
dim_t k_a11; \
dim_t k_a12; \
dim_t off_a11; \
dim_t off_a12; \
dim_t i, j, ib; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If matrix A is below the diagonal, it is implicitly zero.
So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full as k inflated up to a multiple of MR. This is
needed because some parameter combinations of trsm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
\
/* Compute indexing scaling factor for for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of A
intersects the top edge of the block, adjust the pointer to B and
treat this case as if the diagonal offset were zero. Note that we
don't need to adjust the pointer to A since packm would have simply
skipped over the region that was not stored. */ \
if ( diagoffa > 0 ) \
{ \
i = diagoffa; \
k = k - i; \
diagoffa = 0; \
b_cast = b_cast + ( i * PACKNR ) / off_scl; \
} \
\
/* If there is a zero region below where the diagonal of A intersects the
right side of the block, shrink it to prevent "no-op" iterations from
executing. */ \
if ( -diagoffa + k < m ) \
{ \
m = -diagoffa + k; \
} \
\
/* Check the k dimension, which needs to be a multiple of MR. If k
isn't a multiple of MR, we adjust it higher to satisfy the micro-
kernel, which is expecting to perform an MR x MR triangular solve.
This adjustment of k is consistent with what happened when A was
packed: all of its bottom/right edges were zero-padded, and
furthermore, the panel that stores the bottom-right corner of the
matrix has its diagonal extended into the zero-padded region (as
identity). This allows the trsm of that bottom-right panel to
proceed without producing any infs or NaNs that would infect the
"good" values of the corresponding block of B. */ \
if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
/* NOTE: We don't need to check that m is a multiple of PACKMR since we
know that the underlying buffer was already allocated to have an m
dimension that is a multiple of PACKMR, with the region between the
last row and the next multiple of MR zero-padded accordingly. */ \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
/* Interleaved (round-robin) partitioning of the jr loop: this thread
only executes iterations for which j mod nthreads equals its work id. */ \
if( bli_trsm_my_iter( j, thread ) ) { \
\
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
a1 = a_cast; \
c11 = c1 + (m_iter-1)*rstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Loop over the m dimension (MR rows at a time), bottom to top. */ \
for ( ib = 0; ib < m_iter; ++ib ) \
{ \
i = m_iter - 1 - ib; \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
If the current panel of A resides above the diagonal, use a
a regular gemm micro-kernel. Otherwise, if it is below the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a11; \
ctype* restrict a12; \
ctype* restrict b11; \
ctype* restrict b21; \
ctype* restrict a2; \
\
/* Compute various offsets into and lengths of parts of A. */ \
off_a11 = diagoffa_i; \
k_a1112 = k - off_a11; \
k_a11 = MR; \
k_a12 = k_a1112 - MR; \
off_a12 = off_a11 + k_a11; \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1112 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
/* Compute the addresses of the triangular block A11 and the
panel A12. */ \
a11 = a1; \
/* a12 = a1 + ( k_a11 * PACKMR ) / off_scl; */ \
a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, off_scl ); \
\
/* Compute the addresses of the panel B21 and the block
B11. */ \
b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
b21 = b1 + ( off_a12 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + ps_a_cur; \
if ( bli_is_last_iter( ib, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
/* On this thread's final jr iteration under round-robin
partitioning, wrap the B prefetch back to the first panel. */ \
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_a12, \
alpha1_cast, \
a12, \
a11, \
b21, \
b11, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_a12, \
alpha1_cast, \
a12, \
a11, \
b21, \
b11, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the bottom edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
if ( bli_is_last_iter( ib, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
alpha2_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
alpha2_cast, \
c11, rs_c, cs_c ); \
} \
\
a1 += rstep_a; \
} \
\
c11 -= rstep_c; \
} \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \
printf( "m_iter = %lu\n", m_iter ); \
printf( "m_cur = %lu\n", m_cur ); \
printf( "k = %lu\n", k ); \
printf( "diagoffa_i = %lu\n", diagoffa_i ); \
printf( "off_a1112 = %lu\n", off_a1112 ); \
printf( "k_a1112 = %lu\n", k_a1112 ); \
printf( "k_a12 = %lu\n", k_a12 ); \
printf( "k_a11 = %lu\n", k_a11 ); \
printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); \
printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2 )

View File

@@ -0,0 +1,591 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffb,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha1,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2);
// Object-based entry point for the trsm_rl (right-side, lower-triangular)
// macro-kernel variant 2. Queries all fields required by the type-specific
// kernel from the operand objects, selects the implementation matching the
// execution datatype of C, and invokes it.
void bli_trsm_rl_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Execution datatype of C and diagonal offset of the triangular matrix B.
	num_t  dt       = bli_obj_exec_dt( c );
	doff_t diagoffb = bli_obj_diag_offset( b );

	// Pack schemas recorded when A and B were packed.
	pack_t schema_a = bli_obj_pack_schema( a );
	pack_t schema_b = bli_obj_pack_schema( b );

	// Problem dimensions.
	dim_t  m        = bli_obj_length( c );
	dim_t  n        = bli_obj_width( c );
	dim_t  k        = bli_obj_width( a );

	// Buffer addresses and packed micro-panel geometry for A, B, and C.
	void*  buf_a    = bli_obj_buffer_at_off( a );
	inc_t  cs_a     = bli_obj_col_stride( a );
	dim_t  pd_a     = bli_obj_panel_dim( a );
	inc_t  ps_a     = bli_obj_panel_stride( a );

	void*  buf_b    = bli_obj_buffer_at_off( b );
	inc_t  rs_b     = bli_obj_row_stride( b );
	dim_t  pd_b     = bli_obj_panel_dim( b );
	inc_t  ps_b     = bli_obj_panel_stride( b );

	void*  buf_c    = bli_obj_buffer_at_off( c );
	inc_t  rs_c     = bli_obj_row_stride( c );
	inc_t  cs_c     = bli_obj_col_stride( c );

	// Alpha for the gemmtrsm subproblems: the internal scalar attached to A
	// (the non-triangular matrix). It may be unit if it was already applied
	// when A was packed.
	void*  buf_alpha1 = bli_obj_internal_scalar_buffer( a );

	// "Beta" for the gemm-only subproblems (micro-panels of B that do not
	// intersect the diagonal): the internal scalar attached to C. Kept
	// separate because the alpha attached to B may have been reset during
	// packing.
	void*  buf_alpha2 = bli_obj_internal_scalar_buffer( c );

	// Look up the datatype-specific implementation.
	FUNCPTR_T fp = ftypes[ dt ];

	// Dispatch to the typed macro-kernel.
	fp
	(
	  diagoffb,
	  schema_a,
	  schema_b,
	  m,
	  n,
	  k,
	  buf_alpha1,
	  buf_a, cs_a, pd_a, ps_a,
	  buf_b, rs_b, pd_b, ps_b,
	  buf_alpha2,
	  buf_c, rs_c, cs_c,
	  cntx,
	  rntm,
	  thread
	);
}
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffb, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha1, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
	/* Typed macro-kernel for right-side/lower-triangular trsm. The operation
	   is cast as a transposed left-side trsm, so the triangular "A" matrix of
	   the micro-kernel is actually contained within B, and A/B roles (and
	   MR/NR, PACKMR/PACKNR) are swapped throughout. */ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Cast the micro-kernel address to its function pointer type. */ \
/* NOTE: We use the upper-triangular gemmtrsm ukernel because, while
the current macro-kernel targets the "rl" case (right-side/lower-
triangular), it becomes upper-triangular after the kernel operation
is transposed so that all kernel instances are of the "left"
variety (since those are the only trsm ukernels that exist). */ \
PASTECH(ch,gemmtrsm_ukr_ft) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha1_cast = alpha1; \
ctype* restrict alpha2_cast = alpha2; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffb_j; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_b1121; \
dim_t k_b11; \
dim_t k_b21; \
dim_t off_b11; \
dim_t off_b21; \
dim_t i, j, jb; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_b_num; \
inc_t ss_b_den; \
inc_t ps_b_cur; \
inc_t is_b_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKNR
pd_a == NR
ps_a == stride to next micro-panel of A
rs_b == PACKMR
cs_b == 1
pd_b == MR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
Note that MR/NR and PACKMR/PACKNR have been swapped to reflect the
swapping of values in the control tree (ie: those values used when
packing). This swapping is needed since we cast right-hand trsm in
terms of transposed left-hand trsm. So, if we're going to be
transposing the operation, then A needs to be packed with NR and B
needs to be packed with MR (remember: B is the triangular matrix in
the right-hand side parameter case).
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of B is entirely above its diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
\
/* Compute k_full as k inflated up to a multiple of NR. This is
needed because some parameter combinations of trsm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \
\
/* Compute indexing scaling factor for for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_b ) || \
bli_is_3mi_packed( schema_b ) || \
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
else { ss_b_num = 1; ss_b_den = 1; } \
\
/* If there is a zero region above where the diagonal of B intersects
the left edge of the panel, adjust the pointer to A and treat this
case as if the diagonal offset were zero. Note that we don't need to
adjust the pointer to B since packm would have simply skipped over
the region that was not stored. */ \
if ( diagoffb < 0 ) \
{ \
j = -diagoffb; \
k = k - j; \
diagoffb = 0; \
a_cast = a_cast + ( j * PACKMR ) / off_scl; \
} \
\
/* If there is a zero region to the right of where the diagonal
of B intersects the bottom of the panel, shrink it so that
we can index to the correct place in C (corresponding to the
part of the panel of B that was packed).
NOTE: This is NOT being done to skip over "no-op" iterations,
as with the trsm_lu macro-kernel. This MUST be done for correct
execution because we use n (via n_iter) to compute diagonal and
index offsets for backwards movement through B. */ \
if ( diagoffb + k < n ) \
{ \
n = diagoffb + k; \
} \
\
/* Check the k dimension, which needs to be a multiple of NR. If k
isn't a multiple of NR, we adjust it higher to satisfy the micro-
kernel, which is expecting to perform an NR x NR triangular solve.
This adjustment of k is consistent with what happened when B was
packed: all of its bottom/right edges were zero-padded, and
furthermore, the panel that stores the bottom-right corner of the
matrix has its diagonal extended into the zero-padded region (as
identity). This allows the trsm of that bottom-right panel to
proceed without producing any infs or NaNs that would infect the
"good" values of the corresponding block of A. */ \
if ( k % NR != 0 ) k += NR - ( k % NR ); \
\
/* NOTE: We don't need to check that n is a multiple of PACKNR since we
know that the underlying buffer was already allocated to have an n
dimension that is a multiple of PACKNR, with the region between the
last column and the next multiple of NR zero-padded accordingly. */ \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k_full; \
istep_b = PACKNR * k; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_schema_a( schema_b, &aux ); \
bli_auxinfo_set_schema_b( schema_a, &aux ); \
\
/* Save the imaginary stride of A to the auxinfo_t object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_b( istep_a, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Loop over the n dimension (NR columns at a time). */ \
	/* (review) Column panels are traversed BACKWARD (j = n_iter-1-jb), as
	   required by lower-triangular right-side solves; c1 is decremented at
	   the bottom of each iteration to match. */ \
for ( jb = 0; jb < n_iter; ++jb ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b11; \
ctype* restrict b21; \
ctype* restrict b2; \
\
j = n_iter - 1 - jb; \
diagoffb_j = diagoffb - ( doff_t )j*NR; \
a1 = a_cast; \
c11 = c1 + (n_iter-1)*cstep_c; \
\
n_cur = ( bli_is_not_edge_b( jb, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* If the current panel of B intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
If the current panel of B resides below the diagonal, use a
a regular gemm micro-kernel. Otherwise, if it is above the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
{ \
/* Determine the offset to and length of the panel that was packed
so we can index into the corresponding location in A. */ \
off_b11 = bli_max( -diagoffb_j, 0 ); \
k_b1121 = k - off_b11; \
k_b11 = NR; \
k_b21 = k_b1121 - NR; \
off_b21 = off_b11 + k_b11; \
\
/* Compute the addresses of the triangular block B11 and the
panel B21. */ \
b11 = b1; \
/* b21 = b1 + ( k_b11 * PACKNR ) / off_scl; */ \
b21 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b11 * PACKNR, off_scl ); \
\
/* Compute the panel stride for the current micro-panel. */ \
is_b_cur = k_b1121 * PACKNR; \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_a( is_b_cur, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
	/* (review) Iterations over MR rows are assigned to threads in
	   round-robin (interleaved) fashion via bli_trsm_my_iter(). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( bli_trsm_my_iter( i, thread ) ){ \
\
ctype* restrict a11; \
ctype* restrict a12; \
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the A11 block and A12 panel. */ \
a11 = a1 + ( off_b11 * PACKMR ) / off_scl; \
a12 = a1 + ( off_b21 * PACKMR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
	/* (review) "last iteration for this thread" test: true when no
	   further i-iteration is assigned to this thread. */ \
if ( i + bli_thread_num_threads(thread) >= m_iter ) \
{ \
a2 = a_cast; \
b2 = b1 + ps_b_cur; \
if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. NOTE: We swap the values for A and B since the
triangular "A" matrix is actually contained within B. */ \
bli_auxinfo_set_next_a( b2, &aux ); \
bli_auxinfo_set_next_b( a2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_b21, \
alpha1_cast, \
b21, \
b11, \
a12, \
a11, \
c11, cs_c, rs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_b21, \
alpha1_cast, \
b21, \
b11, \
a12, \
a11, \
ct, cs_ct, rs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the bottom edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
\
b1 += ps_b_cur; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \
{ \
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_a( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( bli_trsm_my_iter( i, thread ) ){ \
\
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
if ( i + bli_thread_num_threads(thread) >= m_iter ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. NOTE: We swap the values for A and B since the
triangular "A" matrix is actually contained within B. */ \
bli_auxinfo_set_next_a( b2, &aux ); \
bli_auxinfo_set_next_b( a2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
b1, \
a1, \
alpha2_cast, \
c11, cs_c, rs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
b1, \
a1, \
zero, \
ct, cs_ct, rs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
alpha2_cast, \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
\
b1 += cstep_b; \
} \
\
	/* (review) Move C backward one NR-wide column panel to mirror the
	   backward traversal of B's column panels. */ \
c1 -= cstep_c; \
} \
}
INSERT_GENTFUNC_BASIC0( trsm_rl_ker_var2 )

View File

@@ -0,0 +1,584 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffb,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha1,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2);
// Object-based entry point for the trsm_ru (right-side, upper-triangular)
// macro-kernel variant 2. Queries all fields required by the type-specific
// kernel from the operand objects, selects the implementation matching the
// execution datatype of C, and invokes it.
void bli_trsm_ru_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Execution datatype of C and diagonal offset of the triangular matrix B.
	num_t  dt       = bli_obj_exec_dt( c );
	doff_t diagoffb = bli_obj_diag_offset( b );

	// Pack schemas recorded when A and B were packed.
	pack_t schema_a = bli_obj_pack_schema( a );
	pack_t schema_b = bli_obj_pack_schema( b );

	// Problem dimensions.
	dim_t  m        = bli_obj_length( c );
	dim_t  n        = bli_obj_width( c );
	dim_t  k        = bli_obj_width( a );

	// Buffer addresses and packed micro-panel geometry for A, B, and C.
	void*  buf_a    = bli_obj_buffer_at_off( a );
	inc_t  cs_a     = bli_obj_col_stride( a );
	dim_t  pd_a     = bli_obj_panel_dim( a );
	inc_t  ps_a     = bli_obj_panel_stride( a );

	void*  buf_b    = bli_obj_buffer_at_off( b );
	inc_t  rs_b     = bli_obj_row_stride( b );
	dim_t  pd_b     = bli_obj_panel_dim( b );
	inc_t  ps_b     = bli_obj_panel_stride( b );

	void*  buf_c    = bli_obj_buffer_at_off( c );
	inc_t  rs_c     = bli_obj_row_stride( c );
	inc_t  cs_c     = bli_obj_col_stride( c );

	// Alpha for the gemmtrsm subproblems: the internal scalar attached to A
	// (the non-triangular matrix). It may be unit if it was already applied
	// when A was packed.
	void*  buf_alpha1 = bli_obj_internal_scalar_buffer( a );

	// "Beta" for the gemm-only subproblems (micro-panels of B that do not
	// intersect the diagonal): the internal scalar attached to C. Kept
	// separate because the alpha attached to B may have been reset during
	// packing.
	void*  buf_alpha2 = bli_obj_internal_scalar_buffer( c );

	// Look up the datatype-specific implementation.
	FUNCPTR_T fp = ftypes[ dt ];

	// Dispatch to the typed macro-kernel.
	fp
	(
	  diagoffb,
	  schema_a,
	  schema_b,
	  m,
	  n,
	  k,
	  buf_alpha1,
	  buf_a, cs_a, pd_a, ps_a,
	  buf_b, rs_b, pd_b, ps_b,
	  buf_alpha2,
	  buf_c, rs_c, cs_c,
	  cntx,
	  rntm,
	  thread
	);
}
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffb, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha1, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
	/* Typed macro-kernel for right-side/upper-triangular trsm. The operation
	   is cast as a transposed left-side trsm, so the triangular "A" matrix of
	   the micro-kernel is actually contained within B, and A/B roles (and
	   MR/NR, PACKMR/PACKNR) are swapped throughout. Unlike the "rl" case,
	   column panels are traversed FORWARD (left to right). */ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Cast the micro-kernel address to its function pointer type. */ \
/* NOTE: We use the lower-triangular gemmtrsm ukernel because, while
the current macro-kernel targets the "ru" case (right-side/upper-
triangular), it becomes lower-triangular after the kernel operation
is transposed so that all kernel instances are of the "left"
variety (since those are the only trsm ukernels that exist). */ \
PASTECH(ch,gemmtrsm_ukr_ft) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha1_cast = alpha1; \
ctype* restrict alpha2_cast = alpha2; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffb_j; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_b0111; \
dim_t k_b01; \
dim_t off_b01; \
dim_t off_b11; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_b_num; \
inc_t ss_b_den; \
inc_t ps_b_cur; \
inc_t is_b_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKNR
pd_a == NR
ps_a == stride to next micro-panel of A
rs_b == PACKMR
cs_b == 1
pd_b == MR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
Note that MR/NR and PACKMR/PACKNR have been swapped to reflect the
swapping of values in the control tree (ie: those values used when
packing). This swapping is needed since we cast right-hand trsm in
terms of transposed left-hand trsm. So, if we're going to be
transposing the operation, then A needs to be packed with NR and B
needs to be packed with MR (remember: B is the triangular matrix in
the right-hand side parameter case).
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of B is entirely below its diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \
\
/* Compute k_full as k inflated up to a multiple of NR. This is
needed because some parameter combinations of trsm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \
\
/* Compute indexing scaling factor for for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_b ) || \
bli_is_3mi_packed( schema_b ) || \
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
else { ss_b_num = 1; ss_b_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of B
intersects the top edge of the panel, adjust the pointer to C and
treat this case as if the diagonal offset were zero. This skips over
the region that was not packed. (Note we assume the diagonal offset
is a multiple of MR; this assumption will hold as long as the cache
blocksizes are each a multiple of MR and NR.) */ \
if ( diagoffb > 0 ) \
{ \
j = diagoffb; \
n = n - j; \
diagoffb = 0; \
c_cast = c_cast + (j )*cs_c; \
} \
\
/* If there is a zero region below where the diagonal of B intersects the
right side of the block, shrink it to prevent "no-op" iterations from
executing. */ \
if ( -diagoffb + n < k ) \
{ \
k = -diagoffb + n; \
} \
\
/* Check the k dimension, which needs to be a multiple of NR. If k
isn't a multiple of NR, we adjust it higher to satisfy the micro-
kernel, which is expecting to perform an NR x NR triangular solve.
This adjustment of k is consistent with what happened when B was
packed: all of its bottom/right edges were zero-padded, and
furthermore, the panel that stores the bottom-right corner of the
matrix has its diagonal extended into the zero-padded region (as
identity). This allows the trsm of that bottom-right panel to
proceed without producing any infs or NaNs that would infect the
"good" values of the corresponding block of A. */ \
if ( k % NR != 0 ) k += NR - ( k % NR ); \
\
/* NOTE: We don't need to check that n is a multiple of PACKNR since we
know that the underlying buffer was already allocated to have an n
dimension that is a multiple of PACKNR, with the region between the
last column and the next multiple of NR zero-padded accordingly. */ \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k_full; \
istep_b = PACKNR * k; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_schema_a( schema_b, &aux ); \
bli_auxinfo_set_schema_b( schema_a, &aux ); \
\
/* Save the imaginary stride of A to the auxinfo_t object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_b( istep_a, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b01; \
ctype* restrict b11; \
ctype* restrict b2; \
\
diagoffb_j = diagoffb - ( doff_t )j*NR; \
a1 = a_cast; \
c11 = c1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* If the current panel of B intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
If the current panel of B resides above the diagonal, use a
a regular gemm micro-kernel. Otherwise, if it is below the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
{ \
/* Determine the offset to and length of the panel that was packed
so we can index into the corresponding location in A. */ \
off_b01 = 0; \
k_b0111 = bli_min( k, -diagoffb_j + NR ); \
k_b01 = k_b0111 - NR; \
off_b11 = k_b01; \
\
/* Compute the addresses of the panel B10 and the triangular
block B11. */ \
b01 = b1; \
/* b11 = b1 + ( k_b01 * PACKNR ) / off_scl; */ \
b11 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b01 * PACKNR, off_scl ); \
\
/* Compute the panel stride for the current micro-panel. */ \
is_b_cur = k_b0111 * PACKNR; \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_a( is_b_cur, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
	/* (review) Iterations over MR rows are assigned to threads in
	   round-robin (interleaved) fashion via bli_trsm_my_iter(). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( bli_trsm_my_iter( i, thread ) ){ \
\
ctype* restrict a10; \
ctype* restrict a11; \
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the A10 panel and A11 block. */ \
a10 = a1 + ( off_b01 * PACKMR ) / off_scl; \
a11 = a1 + ( off_b11 * PACKMR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
	/* (review) "last iteration for this thread" test: true when no
	   further i-iteration is assigned to this thread. */ \
if ( i + bli_thread_num_threads(thread) >= m_iter ) \
{ \
a2 = a_cast; \
b2 = b1 + ps_b_cur; \
if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. NOTE: We swap the values for A and B since the
triangular "A" matrix is actually contained within B. */ \
bli_auxinfo_set_next_a( b2, &aux ); \
bli_auxinfo_set_next_b( a2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_b01, \
alpha1_cast, \
b01, \
b11, \
a10, \
a11, \
c11, cs_c, rs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_b01, \
alpha1_cast, \
b01, \
b11, \
a10, \
a11, \
ct, cs_ct, rs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the bottom edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
\
b1 += ps_b_cur; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \
{ \
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_a( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( bli_trsm_my_iter( i, thread ) ){ \
\
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
if ( i + bli_thread_num_threads(thread) >= m_iter ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. NOTE: We swap the values for A and B since the
triangular "A" matrix is actually contained within B. */ \
bli_auxinfo_set_next_a( b2, &aux ); \
bli_auxinfo_set_next_b( a2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
b1, \
a1, \
alpha2_cast, \
c11, cs_c, rs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
b1, \
a1, \
zero, \
ct, cs_ct, rs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
alpha2_cast, \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
\
b1 += cstep_b; \
} \
\
	/* (review) Advance C forward one NR-wide column panel, matching the
	   forward traversal of B's column panels. */ \
c1 += cstep_c; \
} \
}
INSERT_GENTFUNC_BASIC0( trsm_ru_ker_var2 )

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -45,7 +46,7 @@ void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p,
// matrix is empty. This is not strictly needed but rather a minor
// optimization, as it would prevent threads that would otherwise get
// subproblems on BLIS_ZEROS operands from calling the macro-kernel,
// because bli_thread_get_range*() would return empty ranges, which would
// because bli_thread_range*() would return empty ranges, which would
// prevent the variant's for loop from executing any iterations.
// NOTE: this should only ever execute if the primary object is
// triangular because that is the only structure type with subpartitions

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -638,6 +639,13 @@ static bool_t bli_intersects_diag_n( doff_t diagoff, dim_t m, dim_t n )
!bli_is_strictly_below_diag_n( diagoff, m, n ) );
}
static bool_t bli_is_outside_diag_n( doff_t diagoff, dim_t m, dim_t n )
{
	// Return TRUE iff an m x n submatrix with diagonal offset diagoff does
	// not intersect the diagonal -- i.e., it lies entirely (strictly) above
	// or entirely (strictly) below the diagonal.
	const bool_t is_above = bli_is_strictly_above_diag_n( diagoff, m, n );
	const bool_t is_below = bli_is_strictly_below_diag_n( diagoff, m, n );

	return ( bool_t )( is_above || is_below );
}
static bool_t bli_is_stored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n )
{
return ( bool_t )
@@ -784,10 +792,14 @@ static bool_t bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left )
( i != 0 || n_left == 0 );
}
static bool_t bli_is_last_iter( dim_t i, dim_t n_iter, dim_t tid, dim_t nth )
static bool_t bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
return ( bool_t )
( i == n_iter - 1 - ( ( n_iter - tid - 1 ) % nth ) );
#ifdef BLIS_JRIR_INTERLEAVE
( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) );
#else
( i == end_iter - 1 );
#endif
}

View File

@@ -59,9 +59,35 @@ void bli_thread_finalize( void )
{
}
// -----------------------------------------------------------------------------
#if 0
void bli_thread_range_jrir
(
thrinfo_t* thread,
dim_t n,
dim_t bf,
bool_t handle_edge_low,
dim_t* start,
dim_t* end,
dim_t* inc
)
{
//#ifdef BLIS_JRIR_INTERLEAVE
#if 1
// Use interleaved partitioning of jr/ir loops.
*start = bli_thread_work_id( thread );
*inc = bli_thread_n_way( thread );
*end = n;
#else
// Use contiguous slab partitioning for jr/ir loops.
bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end );
*inc = 1;
#endif
}
#endif
// -----------------------------------------------------------------------------
void bli_thread_get_range_sub
void bli_thread_range_sub
(
thrinfo_t* thread,
dim_t n,
@@ -72,6 +98,9 @@ void bli_thread_get_range_sub
)
{
dim_t n_way = bli_thread_n_way( thread );
if ( n_way == 1 ) { *start = 0; *end = n; return; }
dim_t work_id = bli_thread_work_id( thread );
dim_t all_start = 0;
@@ -202,7 +231,7 @@ void bli_thread_get_range_sub
}
}
siz_t bli_thread_get_range_l2r
siz_t bli_thread_range_l2r
(
thrinfo_t* thr,
obj_t* a,
@@ -216,13 +245,13 @@ siz_t bli_thread_get_range_l2r
dim_t n = bli_obj_width_after_trans( a );
dim_t bf = bli_blksz_get_def( dt, bmult );
bli_thread_get_range_sub( thr, n, bf,
FALSE, start, end );
bli_thread_range_sub( thr, n, bf,
FALSE, start, end );
return m * ( *end - *start );
}
siz_t bli_thread_get_range_r2l
siz_t bli_thread_range_r2l
(
thrinfo_t* thr,
obj_t* a,
@@ -236,13 +265,13 @@ siz_t bli_thread_get_range_r2l
dim_t n = bli_obj_width_after_trans( a );
dim_t bf = bli_blksz_get_def( dt, bmult );
bli_thread_get_range_sub( thr, n, bf,
TRUE, start, end );
bli_thread_range_sub( thr, n, bf,
TRUE, start, end );
return m * ( *end - *start );
}
siz_t bli_thread_get_range_t2b
siz_t bli_thread_range_t2b
(
thrinfo_t* thr,
obj_t* a,
@@ -256,13 +285,13 @@ siz_t bli_thread_get_range_t2b
dim_t n = bli_obj_width_after_trans( a );
dim_t bf = bli_blksz_get_def( dt, bmult );
bli_thread_get_range_sub( thr, m, bf,
FALSE, start, end );
bli_thread_range_sub( thr, m, bf,
FALSE, start, end );
return n * ( *end - *start );
}
siz_t bli_thread_get_range_b2t
siz_t bli_thread_range_b2t
(
thrinfo_t* thr,
obj_t* a,
@@ -276,15 +305,15 @@ siz_t bli_thread_get_range_b2t
dim_t n = bli_obj_width_after_trans( a );
dim_t bf = bli_blksz_get_def( dt, bmult );
bli_thread_get_range_sub( thr, m, bf,
TRUE, start, end );
bli_thread_range_sub( thr, m, bf,
TRUE, start, end );
return n * ( *end - *start );
}
// -----------------------------------------------------------------------------
dim_t bli_thread_get_range_width_l
dim_t bli_thread_range_width_l
(
doff_t diagoff_j,
dim_t m,
@@ -495,17 +524,17 @@ siz_t bli_find_area_trap_l
// -----------------------------------------------------------------------------
siz_t bli_thread_get_range_weighted_sub
siz_t bli_thread_range_weighted_sub
(
thrinfo_t* thread,
doff_t diagoff,
uplo_t uplo,
dim_t m,
dim_t n,
dim_t bf,
bool_t handle_edge_low,
dim_t* j_start_thr,
dim_t* j_end_thr
thrinfo_t* restrict thread,
doff_t diagoff,
uplo_t uplo,
dim_t m,
dim_t n,
dim_t bf,
bool_t handle_edge_low,
dim_t* restrict j_start_thr,
dim_t* restrict j_end_thr
)
{
dim_t n_way = bli_thread_n_way( thread );
@@ -570,7 +599,7 @@ siz_t bli_thread_get_range_weighted_sub
// Compute the width of the jth subpartition, taking the
// current diagonal offset into account, if needed.
width_j =
bli_thread_get_range_width_l
bli_thread_range_width_l
(
diagoff_j, m, n_left,
j, n_way,
@@ -614,7 +643,7 @@ siz_t bli_thread_get_range_weighted_sub
bli_toggle_bool( &handle_edge_low );
// Compute the appropriate range for the rotated trapezoid.
area = bli_thread_get_range_weighted_sub
area = bli_thread_range_weighted_sub
(
thread, diagoff, uplo, m, n, bf,
handle_edge_low,
@@ -632,7 +661,7 @@ siz_t bli_thread_get_range_weighted_sub
return area;
}
siz_t bli_thread_get_range_mdim
siz_t bli_thread_range_mdim
(
dir_t direct,
thrinfo_t* thr,
@@ -678,20 +707,20 @@ siz_t bli_thread_get_range_mdim
if ( use_weighted )
{
if ( direct == BLIS_FWD )
return bli_thread_get_range_weighted_t2b( thr, x, bmult, start, end );
return bli_thread_range_weighted_t2b( thr, x, bmult, start, end );
else
return bli_thread_get_range_weighted_b2t( thr, x, bmult, start, end );
return bli_thread_range_weighted_b2t( thr, x, bmult, start, end );
}
else
{
if ( direct == BLIS_FWD )
return bli_thread_get_range_t2b( thr, x, bmult, start, end );
return bli_thread_range_t2b( thr, x, bmult, start, end );
else
return bli_thread_get_range_b2t( thr, x, bmult, start, end );
return bli_thread_range_b2t( thr, x, bmult, start, end );
}
}
siz_t bli_thread_get_range_ndim
siz_t bli_thread_range_ndim
(
dir_t direct,
thrinfo_t* thr,
@@ -737,20 +766,20 @@ siz_t bli_thread_get_range_ndim
if ( use_weighted )
{
if ( direct == BLIS_FWD )
return bli_thread_get_range_weighted_l2r( thr, x, bmult, start, end );
return bli_thread_range_weighted_l2r( thr, x, bmult, start, end );
else
return bli_thread_get_range_weighted_r2l( thr, x, bmult, start, end );
return bli_thread_range_weighted_r2l( thr, x, bmult, start, end );
}
else
{
if ( direct == BLIS_FWD )
return bli_thread_get_range_l2r( thr, x, bmult, start, end );
return bli_thread_range_l2r( thr, x, bmult, start, end );
else
return bli_thread_get_range_r2l( thr, x, bmult, start, end );
return bli_thread_range_r2l( thr, x, bmult, start, end );
}
}
siz_t bli_thread_get_range_weighted_l2r
siz_t bli_thread_range_weighted_l2r
(
thrinfo_t* thr,
obj_t* a,
@@ -782,7 +811,7 @@ siz_t bli_thread_get_range_weighted_l2r
}
area =
bli_thread_get_range_weighted_sub
bli_thread_range_weighted_sub
(
thr, diagoff, uplo, m, n, bf,
FALSE, start, end
@@ -790,7 +819,7 @@ siz_t bli_thread_get_range_weighted_l2r
}
else // if dense or zeros
{
area = bli_thread_get_range_l2r
area = bli_thread_range_l2r
(
thr, a, bmult,
start, end
@@ -800,7 +829,7 @@ siz_t bli_thread_get_range_weighted_l2r
return area;
}
siz_t bli_thread_get_range_weighted_r2l
siz_t bli_thread_range_weighted_r2l
(
thrinfo_t* thr,
obj_t* a,
@@ -834,7 +863,7 @@ siz_t bli_thread_get_range_weighted_r2l
bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n );
area =
bli_thread_get_range_weighted_sub
bli_thread_range_weighted_sub
(
thr, diagoff, uplo, m, n, bf,
TRUE, start, end
@@ -842,7 +871,7 @@ siz_t bli_thread_get_range_weighted_r2l
}
else // if dense or zeros
{
area = bli_thread_get_range_r2l
area = bli_thread_range_r2l
(
thr, a, bmult,
start, end
@@ -852,7 +881,7 @@ siz_t bli_thread_get_range_weighted_r2l
return area;
}
siz_t bli_thread_get_range_weighted_t2b
siz_t bli_thread_range_weighted_t2b
(
thrinfo_t* thr,
obj_t* a,
@@ -886,7 +915,7 @@ siz_t bli_thread_get_range_weighted_t2b
bli_reflect_about_diag( &diagoff, &uplo, &m, &n );
area =
bli_thread_get_range_weighted_sub
bli_thread_range_weighted_sub
(
thr, diagoff, uplo, m, n, bf,
FALSE, start, end
@@ -894,7 +923,7 @@ siz_t bli_thread_get_range_weighted_t2b
}
else // if dense or zeros
{
area = bli_thread_get_range_t2b
area = bli_thread_range_t2b
(
thr, a, bmult,
start, end
@@ -904,7 +933,7 @@ siz_t bli_thread_get_range_weighted_t2b
return area;
}
siz_t bli_thread_get_range_weighted_b2t
siz_t bli_thread_range_weighted_b2t
(
thrinfo_t* thr,
obj_t* a,
@@ -939,7 +968,7 @@ siz_t bli_thread_get_range_weighted_b2t
bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n );
area = bli_thread_get_range_weighted_sub
area = bli_thread_range_weighted_sub
(
thr, diagoff, uplo, m, n, bf,
TRUE, start, end
@@ -947,7 +976,7 @@ siz_t bli_thread_get_range_weighted_b2t
}
else // if dense or zeros
{
area = bli_thread_get_range_b2t
area = bli_thread_range_b2t
(
thr, a, bmult,
start, end

View File

@@ -6,6 +6,7 @@
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -56,7 +57,21 @@ void bli_thread_finalize( void );
#endif
// Thread range-related prototypes.
void bli_thread_get_range_sub
#if 0
void bli_thread_range_jrir
(
thrinfo_t* thread,
dim_t n,
dim_t bf,
bool_t handle_edge_low,
dim_t* start,
dim_t* end,
dim_t* inc
);
#endif
// -----------------------------------------------------------------------------
void bli_thread_range_sub
(
thrinfo_t* thread,
dim_t n,
@@ -82,8 +97,8 @@ siz_t PASTEMAC0( opname ) \
dim_t* end \
);
GENPROT( thread_get_range_mdim )
GENPROT( thread_get_range_ndim )
GENPROT( thread_range_mdim )
GENPROT( thread_range_ndim )
#undef GENPROT
#define GENPROT( opname ) \
@@ -97,18 +112,18 @@ siz_t PASTEMAC0( opname ) \
dim_t* end \
);
GENPROT( thread_get_range_l2r )
GENPROT( thread_get_range_r2l )
GENPROT( thread_get_range_t2b )
GENPROT( thread_get_range_b2t )
GENPROT( thread_range_l2r )
GENPROT( thread_range_r2l )
GENPROT( thread_range_t2b )
GENPROT( thread_range_b2t )
GENPROT( thread_get_range_weighted_l2r )
GENPROT( thread_get_range_weighted_r2l )
GENPROT( thread_get_range_weighted_t2b )
GENPROT( thread_get_range_weighted_b2t )
GENPROT( thread_range_weighted_l2r )
GENPROT( thread_range_weighted_r2l )
GENPROT( thread_range_weighted_t2b )
GENPROT( thread_range_weighted_b2t )
dim_t bli_thread_get_range_width_l
dim_t bli_thread_range_width_l
(
doff_t diagoff_j,
dim_t m,
@@ -126,17 +141,17 @@ siz_t bli_find_area_trap_l
dim_t n,
doff_t diagoff
);
siz_t bli_thread_get_range_weighted_sub
siz_t bli_thread_range_weighted_sub
(
thrinfo_t* thread,
doff_t diagoff,
uplo_t uplo,
dim_t m,
dim_t n,
dim_t bf,
bool_t handle_edge_low,
dim_t* j_start_thr,
dim_t* j_end_thr
thrinfo_t* restrict thread,
doff_t diagoff,
uplo_t uplo,
dim_t m,
dim_t n,
dim_t bf,
bool_t handle_edge_low,
dim_t* restrict j_start_thr,
dim_t* restrict j_end_thr
);
@@ -215,5 +230,112 @@ void bli_thread_init_rntm( rntm_t* rntm );
void bli_thread_init_rntm_from_env( rntm_t* rntm );
// -----------------------------------------------------------------------------
//printf( "bli_thread_range_jrir: inlv: th%d: start end inc: %d %d %d\n", (int)bli_thread_work_id( thread ), (int)*start, (int)*end, (int)*inc );
static void bli_thread_range_jrir_rr
     (
       thrinfo_t* thread,
       dim_t      n,
       dim_t      bf,
       bool_t     handle_edge_low,
       dim_t*     start,
       dim_t*     end,
       dim_t*     inc
     )
{
	// Use interleaved (round robin) partitioning of the jr/ir loops: the
	// thread with work id t executes iterations t, t + nway, t + 2*nway, ...
	// up to (but not including) n, where nway is the n_way parallelism of
	// this thrinfo_t node.
	// NOTE: bf and handle_edge_low are not used by round robin partitioning;
	// they exist only so that this function's signature matches that of
	// bli_thread_range_jrir_sl(). Cast them to void to avoid unused-parameter
	// warnings in every translation unit that includes this header.
	( void )bf;
	( void )handle_edge_low;

	*start = bli_thread_work_id( thread );
	*inc   = bli_thread_n_way( thread );
	*end   = n;
}
static void bli_thread_range_jrir_sl
     (
       thrinfo_t* thread,
       dim_t      n,
       dim_t      bf,
       bool_t     handle_edge_low,
       dim_t*     start,
       dim_t*     end,
       dim_t*     inc
     )
{
	// Use contiguous slab partitioning of the jr/ir loops: each thread is
	// assigned a single contiguous [*start, *end) range of the n iterations,
	// computed by bli_thread_range_sub() from the blocking factor bf and the
	// edge-case placement policy handle_edge_low. Slabs are walked with unit
	// stride.
	*inc = 1;

	bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end );
}
static void bli_thread_range_jrir
     (
       thrinfo_t* thread,
       dim_t      n,
       dim_t      bf,
       bool_t     handle_edge_low,
       dim_t*     start,
       dim_t*     end,
       dim_t*     inc
     )
{
	// Compute this thread's [*start, *end) range and stride *inc over the
	// n iterations of a jr/ir loop. Honor the BLIS_JRIR_INTERLEAVE
	// configuration macro -- consistent with bli_thread_range_weighted_jrir()
	// -- rather than hard-coding the choice via '#if 0'. By default
	// (macro undefined) contiguous slab partitioning is used.
#ifdef BLIS_JRIR_INTERLEAVE
	bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc );
#else
	bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc );
#endif
}
static void bli_thread_range_weighted_jrir
     (
       thrinfo_t* thread,
       doff_t     diagoff,
       uplo_t     uplo,
       dim_t      m,
       dim_t      n,
       dim_t      bf,
       bool_t     handle_edge_low,
       dim_t*     start,
       dim_t*     end,
       dim_t*     inc
     )
{
	// Compute this thread's [*start, *end) range and stride *inc over the
	// jr/ir loop iterations of an m x n region whose diagonal offset is
	// diagoff, weighting the partitioning by the stored (uplo) area.
	// NOTE: the dead '#if 0' experiment that switched between slab and
	// round robin based on problem size has been removed.
#ifdef BLIS_JRIR_INTERLEAVE
	// Use interleaved (round robin) partitioning of the jr/ir loops; the
	// diagonal offset and uplo are ignored, and every thread strides over
	// all n iterations.
	*start = bli_thread_work_id( thread );
	*inc   = bli_thread_n_way( thread );
	*end   = n;
#else
	// Use contiguous slab partitioning of the jr/ir loops, weighted by the
	// diagonal's location so each slab covers a comparable stored area.
	bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf,
	                               handle_edge_low, start, end );

	// bli_thread_range_weighted_sub() returns the range in units of
	// elements; convert to units of micropanels of width bf, rounding the
	// end of the range up so that no partial trailing micropanel is lost.
	*start = *start / bf;
	*end   = ( *end + bf - 1 ) / bf;
	*inc   = 1;
#endif
}
#endif

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -46,10 +47,10 @@ void blx_gemm_int
thrinfo_t* thread
)
{
obj_t a_local;
obj_t b_local;
obj_t c_local;
gemm_voft f;
obj_t a_local;
obj_t b_local;
obj_t c_local;
gemm_var_oft f;
// Alias A, B, and C in case we need to update attached scalars.
bli_obj_alias_to( a, &a_local );

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -52,7 +53,7 @@ void blx_gemm_blk_var1
dim_t my_start, my_end;
// Determine the current thread's subpartition range.
bli_thread_get_range_mdim
bli_thread_range_mdim
(
BLIS_FWD, thread, a, b, c, cntl, cntx,
&my_start, &my_end

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -52,7 +53,7 @@ void blx_gemm_blk_var2
dim_t my_start, my_end;
// Determine the current thread's subpartition range.
bli_thread_get_range_ndim
bli_thread_range_ndim
(
BLIS_FWD, thread, a, b, c, cntl, cntx,
&my_start, &my_end

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -255,14 +256,27 @@ void PASTECH2(blx_,ch,varname) \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
dim_t jr_num_threads = bli_thread_n_way( thread ); \
dim_t jr_thread_id = bli_thread_work_id( thread ); \
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Determine the thread range and increment for each thrinfo_t node. */ \
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
@@ -277,7 +291,7 @@ void PASTECH2(blx_,ch,varname) \
b2 = b1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
@@ -287,12 +301,12 @@ void PASTECH2(blx_,ch,varname) \
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\

View File

@@ -5,6 +5,7 @@
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2018, Advanced Micro Devices, Inc.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -200,13 +201,13 @@ STR_ST := -DTHR_STR=\"st\"
STR_MT := -DTHR_STR=\"mt\"
# Problem size specification
PDEF_ST := -DP_BEGIN=40 \
PDEF_ST := -DP_BEGIN=96 \
-DP_END=2000 \
-DP_INC=40
-DP_INC=96
PDEF_MT := -DP_BEGIN=200 \
-DP_END=10000 \
-DP_INC=200
PDEF_MT := -DP_BEGIN=192 \
-DP_END=3000 \
-DP_INC=192
@@ -226,9 +227,6 @@ all-mt: blis-mt openblas-mt mkl-mt
blis-st: blis-gemm-st
blis-mt: blis-gemm-mt
blis-nat-st: blis-gemm-nat-st
blis-nat-mt: blis-gemm-nat-mt
openblas-st: openblas-gemm-st
openblas-mt: openblas-gemm-mt
@@ -240,6 +238,42 @@ blis-gemm-st: blis-gemm-nat-st \
blis-gemm-mt: blis-gemm-nat-mt \
blis-gemm-ind-mt
blis-nat-st: \
test_sgemm_asm_blis_st.x \
test_dgemm_asm_blis_st.x \
test_cgemm_asm_blis_st.x \
test_zgemm_asm_blis_st.x \
test_sherk_asm_blis_st.x \
test_dherk_asm_blis_st.x \
test_cherk_asm_blis_st.x \
test_zherk_asm_blis_st.x \
test_strmm_asm_blis_st.x \
test_dtrmm_asm_blis_st.x \
test_ctrmm_asm_blis_st.x \
test_ztrmm_asm_blis_st.x \
test_strsm_asm_blis_st.x \
test_dtrsm_asm_blis_st.x \
test_ctrsm_asm_blis_st.x \
test_ztrsm_asm_blis_st.x
blis-nat-mt: \
test_sgemm_asm_blis_mt.x \
test_dgemm_asm_blis_mt.x \
test_cgemm_asm_blis_mt.x \
test_zgemm_asm_blis_mt.x \
test_sherk_asm_blis_mt.x \
test_dherk_asm_blis_mt.x \
test_cherk_asm_blis_mt.x \
test_zherk_asm_blis_mt.x \
test_strmm_asm_blis_mt.x \
test_dtrmm_asm_blis_mt.x \
test_ctrmm_asm_blis_mt.x \
test_ztrmm_asm_blis_mt.x \
test_strsm_asm_blis_mt.x \
test_dtrsm_asm_blis_mt.x \
test_ctrsm_asm_blis_mt.x \
test_ztrsm_asm_blis_mt.x
blis-gemm-nat-st: \
test_sgemm_asm_blis_st.x \
test_dgemm_asm_blis_st.x \
@@ -390,28 +424,28 @@ test_c%_1m_blis_mt.o: test_%.c
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLI_DEF) $(D1M) $(STR_1M) $(STR_MT) -c $< -o $@
# blis asm
test_d%_asm_blis_st.o: test_%.c
test_d%_asm_blis_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_D) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@
test_s%_asm_blis_st.o: test_%.c
test_s%_asm_blis_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_S) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@
test_z%_asm_blis_st.o: test_%.c
test_z%_asm_blis_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_Z) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@
test_c%_asm_blis_st.o: test_%.c
test_c%_asm_blis_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@
test_d%_asm_blis_mt.o: test_%.c
test_d%_asm_blis_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@
test_s%_asm_blis_mt.o: test_%.c
test_s%_asm_blis_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_S) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@
test_z%_asm_blis_mt.o: test_%.c
test_z%_asm_blis_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@
test_c%_asm_blis_mt.o: test_%.c
test_c%_asm_blis_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@
# openblas

314
test/3m4m/test_herk.c Normal file
View File

@@ -0,0 +1,314 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <unistd.h>
#include "blis.h"
//#define PRINT
// Standalone performance driver for herk: times either BLIS (bli_herk) or a
// vendor BLAS (?syrk_/?herk_) over a sweep of problem sizes and prints one
// matlab-style data line per size. Datatype, induced method, size sweep, and
// threading label are injected at compile time via DT, IND, P_BEGIN/P_END/
// P_INC, THR_STR, and STR; BLIS vs. BLAS is selected by the BLIS macro.
int main( int argc, char** argv )
{
obj_t a, c;
obj_t c_save;
obj_t alpha, beta;
dim_t m, k;
dim_t p;
dim_t p_begin, p_end, p_inc;
int m_input, k_input;
ind_t ind;
num_t dt, dt_real;
char dt_ch;
int r, n_repeats;
uplo_t uploc;
trans_t transa;
f77_char f77_uploc;
f77_char f77_transa;
double dtime;
double dtime_save;
double gflops;
//bli_init();
//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
// Each problem size is timed n_repeats times; the best (minimum) time wins.
n_repeats = 3;
dt = DT;
// alpha for herk is real-valued, so project the datatype to its real analog.
dt_real = bli_dt_proj_to_real( DT );
ind = IND;
p_begin = P_BEGIN;
p_end = P_END;
p_inc = P_INC;
// Negative values mean "bind this dimension to p / |value|"; positive values
// would pin the dimension to a constant.
m_input = -1;
k_input = -1;
// Suppress compiler warnings about unused variable 'ind'.
( void )ind;
#if 0
cntx_t* cntx;
ind_t ind_mod = ind;
// A hack to use 3m1 as 1mpb (with 1m as 1mbp).
if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;
// Initialize a context for the current induced method and datatype.
cntx = bli_gks_query_ind_cntx( ind_mod, dt );
// Set k to the kc blocksize for the current datatype.
k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
#elif 1
//k_input = 256;
#endif
// Choose the char corresponding to the requested datatype.
if ( bli_is_float( dt ) ) dt_ch = 's';
else if ( bli_is_double( dt ) ) dt_ch = 'd';
else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
else dt_ch = 'z';
// Fixed operation parameters: update the lower triangle of C, no transpose.
uploc = BLIS_LOWER;
transa = BLIS_NO_TRANSPOSE;
// Translate the BLIS parameters to their Fortran-77 BLAS characters for the
// non-BLIS code path below.
bli_param_map_blis_to_netlib_uplo( uploc, &f77_uploc );
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
// Begin with initializing the last entry to zero so that
// matlab allocates space for the entire array once up-front.
for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ;
#ifdef BLIS
printf( "data_%s_%cherk_%s_blis", THR_STR, dt_ch, STR );
#else
printf( "data_%s_%cherk_%s", THR_STR, dt_ch, STR );
#endif
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin + 1)/p_inc + 1,
( unsigned long )0,
( unsigned long )0, 0.0 );
// Sweep over the requested problem sizes.
for ( p = p_begin; p <= p_end; p += p_inc )
{
// Derive m and k for this iteration from p (see m_input/k_input above).
if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
else m = ( dim_t ) m_input;
if ( k_input < 0 ) k = p / ( dim_t )abs(k_input);
else k = ( dim_t ) k_input;
bli_obj_create( dt_real, 1, 1, 0, 0, &alpha );
bli_obj_create( dt, 1, 1, 0, 0, &beta );
// A is m x k (or k x m if transposed); C is the m x m Hermitian result.
if ( bli_does_trans( transa ) )
bli_obj_create( dt, k, m, 0, 0, &a );
else
bli_obj_create( dt, m, k, 0, 0, &a );
bli_obj_create( dt, m, m, 0, 0, &c );
//bli_obj_create( dt, m, k, 2, 2*m, &a );
//bli_obj_create( dt, k, n, 2, 2*k, &b );
//bli_obj_create( dt, m, n, 2, 2*m, &c );
bli_obj_create( dt, m, m, 0, 0, &c_save );
bli_randm( &a );
bli_randm( &c );
// Mark C Hermitian with the chosen uplo so herk sees a valid operand.
bli_obj_set_struc( BLIS_HERMITIAN, &c );
bli_obj_set_uplo( uploc, &c );
bli_obj_set_conjtrans( transa, &a );
bli_setsc( (2.0/1.0), 0.0, &alpha );
bli_setsc( (1.0/1.0), 0.0, &beta );
// Keep a pristine copy of C so each repeat starts from identical data.
bli_copym( &c, &c_save );
#ifdef BLIS
// Enable exactly the requested induced method for this datatype.
bli_ind_disable_all_dt( dt );
bli_ind_enable_dt( ind, dt );
#endif
dtime_save = DBL_MAX;
for ( r = 0; r < n_repeats; ++r )
{
bli_copym( &c_save, &c );
dtime = bli_clock();
#ifdef PRINT
bli_printm( "a", &a, "%4.1f", "" );
bli_printm( "c", &c, "%4.1f", "" );
#endif
#ifdef BLIS
bli_herk( &alpha,
&a,
&beta,
&c );
#else
// Vendor BLAS path: dispatch on datatype to the corresponding
// ?syrk_/?herk_ routine with column-major leading dimensions.
if ( bli_is_float( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
float* alphap = bli_obj_buffer( &alpha );
float* ap = bli_obj_buffer( &a );
float* betap = bli_obj_buffer( &beta );
float* cp = bli_obj_buffer( &c );
ssyrk_( &f77_uploc,
&f77_transa,
&mm,
&kk,
alphap,
ap, &lda,
betap,
cp, &ldc );
}
else if ( bli_is_double( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
double* alphap = bli_obj_buffer( &alpha );
double* ap = bli_obj_buffer( &a );
double* betap = bli_obj_buffer( &beta );
double* cp = bli_obj_buffer( &c );
dsyrk_( &f77_uploc,
&f77_transa,
&mm,
&kk,
alphap,
ap, &lda,
betap,
cp, &ldc );
}
else if ( bli_is_scomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
float* alphap = bli_obj_buffer( &alpha );
scomplex* ap = bli_obj_buffer( &a );
scomplex* betap = bli_obj_buffer( &beta );
scomplex* cp = bli_obj_buffer( &c );
cherk_( &f77_uploc,
&f77_transa,
&mm,
&kk,
alphap,
ap, &lda,
betap,
cp, &ldc );
}
else if ( bli_is_dcomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
double* alphap = bli_obj_buffer( &alpha );
dcomplex* ap = bli_obj_buffer( &a );
dcomplex* betap = bli_obj_buffer( &beta );
dcomplex* cp = bli_obj_buffer( &c );
zherk_( &f77_uploc,
&f77_transa,
&mm,
&kk,
alphap,
ap, &lda,
betap,
cp, &ldc );
}
#endif
#ifdef PRINT
bli_printm( "c after", &c, "%4.1f", "" );
exit(1);
#endif
// Keep the minimum wall-clock time observed across repeats.
dtime_save = bli_clock_min_diff( dtime_save, dtime );
}
// Flop count is normalized as m*m*k; complex datatypes perform ~4x the
// real flops, hence the 4.0 factor below.
gflops = ( 1.0 * m * k * m ) / ( dtime_save * 1.0e9 );
if ( bli_is_complex( dt ) ) gflops *= 4.0;
#ifdef BLIS
printf( "data_%s_%cherk_%s_blis", THR_STR, dt_ch, STR );
#else
printf( "data_%s_%cherk_%s", THR_STR, dt_ch, STR );
#endif
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin + 1)/p_inc + 1,
( unsigned long )m,
( unsigned long )k, gflops );
// Free the per-problem-size objects before the next iteration.
bli_obj_free( &alpha );
bli_obj_free( &beta );
bli_obj_free( &a );
bli_obj_free( &c );
bli_obj_free( &c_save );
}
//bli_finalize();
return 0;
}

328
test/3m4m/test_trmm.c Normal file
View File

@@ -0,0 +1,328 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <unistd.h>
#include "blis.h"
//#define PRINT
// Standalone performance driver for trmm (triangular matrix-matrix multiply).
// Sweeps problem sizes p from P_BEGIN to P_END in steps of P_INC, timing
// either the native BLIS implementation (when BLIS is #defined) or the
// Fortran-77 BLAS ?trmm_ routines, and prints matlab-readable gflops data.
// Returns 0 on completion; command-line arguments are ignored.
int main( int argc, char** argv )
{
    obj_t    a, c;
    obj_t    c_save;
    obj_t    alpha;
    dim_t    m, n;
    dim_t    p;
    dim_t    p_begin, p_end, p_inc;
    int      m_input, n_input;
    ind_t    ind;
    num_t    dt;
    char     dt_ch;
    int      r, n_repeats;
    side_t   side;
    uplo_t   uploa;
    trans_t  transa;
    diag_t   diaga;
    f77_char f77_side;
    f77_char f77_uploa;
    f77_char f77_transa;
    f77_char f77_diaga;
    double   dtime;
    double   dtime_save;
    double   gflops;

    //bli_init();
    //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );

    // Take the best of three timed repeats per problem size.
    n_repeats = 3;

    // DT, IND, P_BEGIN, P_END, P_INC, THR_STR, STR are compile-time
    // macros supplied by the Makefile.
    dt      = DT;
    ind     = IND;
    p_begin = P_BEGIN;
    p_end   = P_END;
    p_inc   = P_INC;

    // Negative values mean "set this dimension to p / |value|".
    m_input = -1;
    n_input = -1;

    // Suppress compiler warnings about unused variable 'ind'.
    ( void )ind;

#if 0
    cntx_t* cntx;
    ind_t ind_mod = ind;

    // A hack to use 3m1 as 1mpb (with 1m as 1mbp).
    if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;

    // Initialize a context for the current induced method and datatype.
    cntx = bli_gks_query_ind_cntx( ind_mod, dt );

    // Set k to the kc blocksize for the current datatype.
    k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
#elif 1
    //k_input = 256;
#endif

    // Choose the char corresponding to the requested datatype.
    if      ( bli_is_float( dt ) )    dt_ch = 's';
    else if ( bli_is_double( dt ) )   dt_ch = 'd';
    else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
    else                              dt_ch = 'z';

#if 0
    side = BLIS_LEFT;
#else
    side = BLIS_RIGHT;
#endif
#if 0
    uploa = BLIS_LOWER;
#else
    uploa = BLIS_UPPER;
#endif
    transa = BLIS_NO_TRANSPOSE;
    diaga  = BLIS_NONUNIT_DIAG;

    // Map BLIS parameter enums to their Fortran-77 character equivalents
    // for the BLAS code path.
    bli_param_map_blis_to_netlib_side( side, &f77_side );
    bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa );
    bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
    bli_param_map_blis_to_netlib_diag( diaga, &f77_diaga );

    // Begin with initializing the last entry to zero so that
    // matlab allocates space for the entire array once up-front.
    for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ;

#ifdef BLIS
    printf( "data_%s_%ctrmm_%s_blis", THR_STR, dt_ch, STR );
#else
    printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, STR );
#endif
    printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n",
            ( unsigned long )(p - p_begin + 1)/p_inc + 1,
            ( unsigned long )0,
            ( unsigned long )0, 0.0 );

    for ( p = p_begin; p <= p_end; p += p_inc )
    {
        if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
        else               m = ( dim_t )    m_input;
        if ( n_input < 0 ) n = p / ( dim_t )abs(n_input);
        else               n = ( dim_t )    n_input;

        bli_obj_create( dt, 1, 1, 0, 0, &alpha );

        // BUGFIX: the dimensions of the triangular matrix A depend on the
        // side parameter (left: m x m; right: n x n), not on transposition.
        // The original code queried bli_does_trans( side ), which applies a
        // trans_t predicate to a side_t value and yields an n x n A even
        // when side is BLIS_LEFT. The gflops computation below already uses
        // bli_is_left( side ), confirming the intended predicate.
        if ( bli_is_left( side ) )
            bli_obj_create( dt, m, m, 0, 0, &a );
        else
            bli_obj_create( dt, n, n, 0, 0, &a );
        bli_obj_create( dt, m, n, 0, 0, &c );
        bli_obj_create( dt, m, n, 0, 0, &c_save );

        bli_randm( &a );
        bli_randm( &c );

        // Mark A as triangular and record its uplo/trans/diag properties.
        bli_obj_set_struc( BLIS_TRIANGULAR, &a );
        bli_obj_set_uplo( uploa, &a );
        bli_obj_set_conjtrans( transa, &a );
        bli_obj_set_diag( diaga, &a );

        // Re-randomize A (now flagged triangular) and zero the unstored
        // region.
        bli_randm( &a );
        bli_mktrim( &a );

        bli_setsc( (2.0/1.0), 0.0, &alpha );

        // Keep a pristine copy of C so each repeat starts from the same
        // operands.
        bli_copym( &c, &c_save );

#ifdef BLIS
        bli_ind_disable_all_dt( dt );
        bli_ind_enable_dt( ind, dt );
#endif

        dtime_save = DBL_MAX;

        for ( r = 0; r < n_repeats; ++r )
        {
            bli_copym( &c_save, &c );

            dtime = bli_clock();

#ifdef PRINT
            bli_printm( "a", &a, "%4.1f", "" );
            bli_printm( "c", &c, "%4.1f", "" );
#endif

#ifdef BLIS
            bli_trmm( side,
                      &alpha,
                      &a,
                      &c );
#else
            if ( bli_is_float( dt ) )
            {
                f77_int  mm     = bli_obj_length( &c );
                f77_int  kk     = bli_obj_width( &c );
                f77_int  lda    = bli_obj_col_stride( &a );
                f77_int  ldc    = bli_obj_col_stride( &c );
                float*   alphap = bli_obj_buffer( &alpha );
                float*   ap     = bli_obj_buffer( &a );
                float*   cp     = bli_obj_buffer( &c );

                strmm_( &f77_side,
                        &f77_uploa,
                        &f77_transa,
                        &f77_diaga,
                        &mm,
                        &kk,
                        alphap,
                        ap, &lda,
                        cp, &ldc );
            }
            else if ( bli_is_double( dt ) )
            {
                f77_int  mm     = bli_obj_length( &c );
                f77_int  kk     = bli_obj_width( &c );
                f77_int  lda    = bli_obj_col_stride( &a );
                f77_int  ldc    = bli_obj_col_stride( &c );
                double*  alphap = bli_obj_buffer( &alpha );
                double*  ap     = bli_obj_buffer( &a );
                double*  cp     = bli_obj_buffer( &c );

                dtrmm_( &f77_side,
                        &f77_uploa,
                        &f77_transa,
                        &f77_diaga,
                        &mm,
                        &kk,
                        alphap,
                        ap, &lda,
                        cp, &ldc );
            }
            else if ( bli_is_scomplex( dt ) )
            {
                f77_int   mm     = bli_obj_length( &c );
                f77_int   kk     = bli_obj_width( &c );
                f77_int   lda    = bli_obj_col_stride( &a );
                f77_int   ldc    = bli_obj_col_stride( &c );
                scomplex* alphap = bli_obj_buffer( &alpha );
                scomplex* ap     = bli_obj_buffer( &a );
                scomplex* cp     = bli_obj_buffer( &c );

                ctrmm_( &f77_side,
                        &f77_uploa,
                        &f77_transa,
                        &f77_diaga,
                        &mm,
                        &kk,
                        alphap,
                        ap, &lda,
                        cp, &ldc );
            }
            else if ( bli_is_dcomplex( dt ) )
            {
                f77_int   mm     = bli_obj_length( &c );
                f77_int   kk     = bli_obj_width( &c );
                f77_int   lda    = bli_obj_col_stride( &a );
                f77_int   ldc    = bli_obj_col_stride( &c );
                dcomplex* alphap = bli_obj_buffer( &alpha );
                dcomplex* ap     = bli_obj_buffer( &a );
                dcomplex* cp     = bli_obj_buffer( &c );

                ztrmm_( &f77_side,
                        &f77_uploa,
                        &f77_transa,
                        &f77_diaga,
                        &mm,
                        &kk,
                        alphap,
                        ap, &lda,
                        cp, &ldc );
            }
#endif

#ifdef PRINT
            bli_printm( "c after", &c, "%4.1f", "" );
            exit(1);
#endif

            dtime_save = bli_clock_min_diff( dtime_save, dtime );
        }

        // trmm performs roughly m*m*n (left) or m*n*n (right) flops;
        // complex arithmetic costs ~4x the real flop count.
        if ( bli_is_left( side ) )
            gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 );
        else
            gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 );

        if ( bli_is_complex( dt ) ) gflops *= 4.0;

#ifdef BLIS
        printf( "data_%s_%ctrmm_%s_blis", THR_STR, dt_ch, STR );
#else
        printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, STR );
#endif
        printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n",
                ( unsigned long )(p - p_begin + 1)/p_inc + 1,
                ( unsigned long )m,
                ( unsigned long )n, gflops );

        bli_obj_free( &alpha );
        bli_obj_free( &a );
        bli_obj_free( &c );
        bli_obj_free( &c_save );
    }

    //bli_finalize();

    return 0;
}

338
test/3m4m/test_trsm.c Normal file
View File

@@ -0,0 +1,338 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <unistd.h>
#include "blis.h"
//#define PRINT
// Standalone performance driver for trsm (triangular solve with multiple
// right-hand sides). Sweeps problem sizes p from P_BEGIN to P_END in steps
// of P_INC, timing either the native BLIS implementation (when BLIS is
// #defined) or the Fortran-77 BLAS ?trsm_ routines, and prints
// matlab-readable gflops data. Returns 0 on completion; command-line
// arguments are ignored.
int main( int argc, char** argv )
{
    obj_t    a, c, d;
    obj_t    c_save;
    obj_t    alpha;
    dim_t    m, n;
    dim_t    p;
    dim_t    p_begin, p_end, p_inc;
    int      m_input, n_input;
    ind_t    ind;
    num_t    dt;
    char     dt_ch;
    int      r, n_repeats;
    side_t   side;
    uplo_t   uploa;
    trans_t  transa;
    diag_t   diaga;
    f77_char f77_side;
    f77_char f77_uploa;
    f77_char f77_transa;
    f77_char f77_diaga;
    double   dtime;
    double   dtime_save;
    double   gflops;

    //bli_init();
    //bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );

    // Take the best of three timed repeats per problem size.
    n_repeats = 3;

    // DT, IND, P_BEGIN, P_END, P_INC, THR_STR, STR are compile-time
    // macros supplied by the Makefile.
    dt      = DT;
    ind     = IND;
    p_begin = P_BEGIN;
    p_end   = P_END;
    p_inc   = P_INC;

    // Negative values mean "set this dimension to p / |value|".
    m_input = -1;
    n_input = -1;

    // Suppress compiler warnings about unused variable 'ind'.
    ( void )ind;

#if 0
    cntx_t* cntx;
    ind_t ind_mod = ind;

    // A hack to use 3m1 as 1mpb (with 1m as 1mbp).
    if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M;

    // Initialize a context for the current induced method and datatype.
    cntx = bli_gks_query_ind_cntx( ind_mod, dt );

    // Set k to the kc blocksize for the current datatype.
    k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx );
#elif 1
    //k_input = 256;
#endif

    // Choose the char corresponding to the requested datatype.
    if      ( bli_is_float( dt ) )    dt_ch = 's';
    else if ( bli_is_double( dt ) )   dt_ch = 'd';
    else if ( bli_is_scomplex( dt ) ) dt_ch = 'c';
    else                              dt_ch = 'z';

#if 0
    side = BLIS_LEFT;
#else
    side = BLIS_RIGHT;
#endif
#if 0
    uploa = BLIS_LOWER;
#else
    uploa = BLIS_UPPER;
#endif
    transa = BLIS_NO_TRANSPOSE;
    diaga  = BLIS_NONUNIT_DIAG;

    // Map BLIS parameter enums to their Fortran-77 character equivalents
    // for the BLAS code path.
    bli_param_map_blis_to_netlib_side( side, &f77_side );
    bli_param_map_blis_to_netlib_uplo( uploa, &f77_uploa );
    bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
    bli_param_map_blis_to_netlib_diag( diaga, &f77_diaga );

    // Begin with initializing the last entry to zero so that
    // matlab allocates space for the entire array once up-front.
    for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ;

#ifdef BLIS
    printf( "data_%s_%ctrsm_%s_blis", THR_STR, dt_ch, STR );
#else
    printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR );
#endif
    printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n",
            ( unsigned long )(p - p_begin + 1)/p_inc + 1,
            ( unsigned long )0,
            ( unsigned long )0, 0.0 );

    for ( p = p_begin; p <= p_end; p += p_inc )
    {
        if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
        else               m = ( dim_t )    m_input;
        if ( n_input < 0 ) n = p / ( dim_t )abs(n_input);
        else               n = ( dim_t )    n_input;

        bli_obj_create( dt, 1, 1, 0, 0, &alpha );

        // BUGFIX: the dimensions of the triangular matrix A (and of the
        // diagonal-shift matrix d below) depend on the side parameter
        // (left: m x m; right: n x n), not on transposition. The original
        // code queried bli_does_trans( side ), which applies a trans_t
        // predicate to a side_t value and yields n x n objects even when
        // side is BLIS_LEFT. The gflops computation below already uses
        // bli_is_left( side ), confirming the intended predicate.
        if ( bli_is_left( side ) )
            bli_obj_create( dt, m, m, 0, 0, &a );
        else
            bli_obj_create( dt, n, n, 0, 0, &a );
        bli_obj_create( dt, m, n, 0, 0, &c );
        //bli_obj_create( dt, m, n, n, 1, &c );
        bli_obj_create( dt, m, n, 0, 0, &c_save );
        if ( bli_is_left( side ) )
            bli_obj_create( dt, m, m, 0, 0, &d );
        else
            bli_obj_create( dt, n, n, 0, 0, &d );

        bli_randm( &a );
        bli_randm( &c );

        // Mark A as triangular and record its uplo/trans/diag properties.
        bli_obj_set_struc( BLIS_TRIANGULAR, &a );
        bli_obj_set_uplo( uploa, &a );
        bli_obj_set_conjtrans( transa, &a );
        bli_obj_set_diag( diaga, &a );

        // Re-randomize A (now flagged triangular) and zero the unstored
        // region.
        bli_randm( &a );
        bli_mktrim( &a );

        // Shift A's diagonal by 2.0 so the triangular system is
        // well-conditioned for the solve.
        bli_setd( &BLIS_TWO, &d );
        bli_addd( &d, &a );

        bli_setsc( (2.0/1.0), 0.0, &alpha );

        // Keep a pristine copy of C so each repeat starts from the same
        // operands.
        bli_copym( &c, &c_save );

#ifdef BLIS
        bli_ind_disable_all_dt( dt );
        bli_ind_enable_dt( ind, dt );
#endif

        dtime_save = DBL_MAX;

        for ( r = 0; r < n_repeats; ++r )
        {
            bli_copym( &c_save, &c );

            dtime = bli_clock();

#ifdef PRINT
            bli_printm( "a", &a, "%4.1f", "" );
            bli_printm( "c", &c, "%4.1f", "" );
#endif

#ifdef BLIS
            bli_trsm( side,
                      &alpha,
                      &a,
                      &c );
#else
            if ( bli_is_float( dt ) )
            {
                f77_int  mm     = bli_obj_length( &c );
                f77_int  kk     = bli_obj_width( &c );
                f77_int  lda    = bli_obj_col_stride( &a );
                f77_int  ldc    = bli_obj_col_stride( &c );
                float*   alphap = bli_obj_buffer( &alpha );
                float*   ap     = bli_obj_buffer( &a );
                float*   cp     = bli_obj_buffer( &c );

                strsm_( &f77_side,
                        &f77_uploa,
                        &f77_transa,
                        &f77_diaga,
                        &mm,
                        &kk,
                        alphap,
                        ap, &lda,
                        cp, &ldc );
            }
            else if ( bli_is_double( dt ) )
            {
                f77_int  mm     = bli_obj_length( &c );
                f77_int  kk     = bli_obj_width( &c );
                f77_int  lda    = bli_obj_col_stride( &a );
                f77_int  ldc    = bli_obj_col_stride( &c );
                double*  alphap = bli_obj_buffer( &alpha );
                double*  ap     = bli_obj_buffer( &a );
                double*  cp     = bli_obj_buffer( &c );

                dtrsm_( &f77_side,
                        &f77_uploa,
                        &f77_transa,
                        &f77_diaga,
                        &mm,
                        &kk,
                        alphap,
                        ap, &lda,
                        cp, &ldc );
            }
            else if ( bli_is_scomplex( dt ) )
            {
                f77_int   mm     = bli_obj_length( &c );
                f77_int   kk     = bli_obj_width( &c );
                f77_int   lda    = bli_obj_col_stride( &a );
                f77_int   ldc    = bli_obj_col_stride( &c );
                scomplex* alphap = bli_obj_buffer( &alpha );
                scomplex* ap     = bli_obj_buffer( &a );
                scomplex* cp     = bli_obj_buffer( &c );

                ctrsm_( &f77_side,
                        &f77_uploa,
                        &f77_transa,
                        &f77_diaga,
                        &mm,
                        &kk,
                        alphap,
                        ap, &lda,
                        cp, &ldc );
            }
            else if ( bli_is_dcomplex( dt ) )
            {
                f77_int   mm     = bli_obj_length( &c );
                f77_int   kk     = bli_obj_width( &c );
                f77_int   lda    = bli_obj_col_stride( &a );
                f77_int   ldc    = bli_obj_col_stride( &c );
                dcomplex* alphap = bli_obj_buffer( &alpha );
                dcomplex* ap     = bli_obj_buffer( &a );
                dcomplex* cp     = bli_obj_buffer( &c );

                ztrsm_( &f77_side,
                        &f77_uploa,
                        &f77_transa,
                        &f77_diaga,
                        &mm,
                        &kk,
                        alphap,
                        ap, &lda,
                        cp, &ldc );
            }
#endif

#ifdef PRINT
            bli_printm( "c after", &c, "%4.1f", "" );
            exit(1);
#endif

            dtime_save = bli_clock_min_diff( dtime_save, dtime );
        }

        // trsm performs roughly m*m*n (left) or m*n*n (right) flops;
        // complex arithmetic costs ~4x the real flop count.
        if ( bli_is_left( side ) )
            gflops = ( 1.0 * m * m * n ) / ( dtime_save * 1.0e9 );
        else
            gflops = ( 1.0 * m * n * n ) / ( dtime_save * 1.0e9 );

        if ( bli_is_complex( dt ) ) gflops *= 4.0;

#ifdef BLIS
        printf( "data_%s_%ctrsm_%s_blis", THR_STR, dt_ch, STR );
#else
        printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR );
#endif
        printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n",
                ( unsigned long )(p - p_begin + 1)/p_inc + 1,
                ( unsigned long )m,
                ( unsigned long )n, gflops );

        bli_obj_free( &alpha );
        bli_obj_free( &a );
        bli_obj_free( &c );
        bli_obj_free( &c_save );
        bli_obj_free( &d );
    }

    //bli_finalize();

    return 0;
}

View File

@@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -290,13 +291,13 @@ int main( int argc, char** argv )
thrinfo.work_id = t;
if ( part_n_dim && go_fwd )
area = bli_thread_get_range_weighted_l2r( &thrinfo, &a, &bfs, &start, &end );
area = bli_thread_range_weighted_l2r( &thrinfo, &a, &bfs, &start, &end );
else if ( part_n_dim && go_bwd )
area = bli_thread_get_range_weighted_r2l( &thrinfo, &a, &bfs, &start, &end );
area = bli_thread_range_weighted_r2l( &thrinfo, &a, &bfs, &start, &end );
else if ( part_m_dim && go_fwd )
area = bli_thread_get_range_weighted_t2b( &thrinfo, &a, &bfs, &start, &end );
area = bli_thread_range_weighted_t2b( &thrinfo, &a, &bfs, &start, &end );
else // ( part_m_dim && go_bwd )
area = bli_thread_get_range_weighted_b2t( &thrinfo, &a, &bfs, &start, &end );
area = bli_thread_range_weighted_b2t( &thrinfo, &a, &bfs, &start, &end );
width = end - start;

View File

@@ -1797,19 +1797,19 @@ bli_thread_get_jc_nt
bli_thread_get_jr_nt
bli_thread_get_num_threads
bli_thread_get_pc_nt
bli_thread_get_range_b2t
bli_thread_get_range_l2r
bli_thread_get_range_mdim
bli_thread_get_range_ndim
bli_thread_get_range_r2l
bli_thread_get_range_sub
bli_thread_get_range_t2b
bli_thread_get_range_weighted_b2t
bli_thread_get_range_weighted_l2r
bli_thread_get_range_weighted_r2l
bli_thread_get_range_weighted_sub
bli_thread_get_range_weighted_t2b
bli_thread_get_range_width_l
bli_thread_range_b2t
bli_thread_range_l2r
bli_thread_range_mdim
bli_thread_range_ndim
bli_thread_range_r2l
bli_thread_range_sub
bli_thread_range_t2b
bli_thread_range_weighted_b2t
bli_thread_range_weighted_l2r
bli_thread_range_weighted_r2l
bli_thread_range_weighted_sub
bli_thread_range_weighted_t2b
bli_thread_range_width_l
bli_thread_init
bli_thread_init_rntm
bli_thread_init_rntm_from_env