mirror of
https://github.com/amd/blis.git
synced 2026-04-20 07:38:53 +00:00
Added level-3 support for complex via 4m-/3m.
Details: - Added the ability to induce complex domain level-3 operations via new virtual complex micro-kernels which are implemented via only real domain micro-kernels. Two new implementations are provided: 4m and 3m. 4m implements complex matrix multiplication in terms of four real matrix multiplications, whereas 3m uses only three and thus is capable of even higher (than peak) performance. However, the 3m method has somewhat weaker numerical properties, making it less desirable in general. - Further refined packing routines, which were recently revamped, and added packing functionality for 4m and 3m. - Some modifications to trmm and trsm macro-kernels to facilitate indexing into micro-panels which were packed for 4m/3m virtual kernels. - Added 4m and 3m interfaces for each level-3 operation. - Various other minor changes to facilitate 4m/3m methods.
This commit is contained in:
@@ -64,6 +64,14 @@
|
||||
// scomplex and dcomplex being defined in terms of simple structs.
|
||||
//#define BLIS_ENABLE_C99_COMPLEX
|
||||
|
||||
// Enable use of the 4m method to implement complex domain level-3
|
||||
// operations. By enabling this option, special code is activated that
|
||||
// induces complex level-3 operations using ONLY the real domain
|
||||
// micro-kernels. This allows kernel authors to focus on optimizing
|
||||
// the real micro-kernels, which can then be leveraged to provide their
|
||||
// complex counterparts "for free".
|
||||
#define BLIS_ENABLE_COMPLEX_VIA_4M
|
||||
|
||||
|
||||
|
||||
// -- MULTITHREADING -----------------------------------------------------------
|
||||
|
||||
@@ -55,20 +55,40 @@
|
||||
//
|
||||
|
||||
#define BLIS_DEFAULT_MC_S 768
|
||||
#define BLIS_DEFAULT_KC_S 256
|
||||
#define BLIS_DEFAULT_NC_S 8192
|
||||
#define BLIS_DEFAULT_KC_S 384
|
||||
#define BLIS_DEFAULT_NC_S 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_D 384
|
||||
#define BLIS_DEFAULT_KC_D 384
|
||||
#define BLIS_DEFAULT_NC_D 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_C 128
|
||||
#define BLIS_DEFAULT_KC_C 256
|
||||
#define BLIS_DEFAULT_MC_C 384
|
||||
#define BLIS_DEFAULT_KC_C 384
|
||||
#define BLIS_DEFAULT_NC_C 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_Z 64
|
||||
#define BLIS_DEFAULT_KC_Z 256
|
||||
#define BLIS_DEFAULT_NC_Z 2048
|
||||
#define BLIS_DEFAULT_MC_Z 192
|
||||
#define BLIS_DEFAULT_KC_Z 384
|
||||
#define BLIS_DEFAULT_NC_Z 4096
|
||||
|
||||
// NOTE: If 4m blocksizes are not defined here, they will be determined
|
||||
// from the corresponding real domain blocksizes.
|
||||
#define BLIS_DEFAULT_4M_MC_C 384
|
||||
#define BLIS_DEFAULT_4M_KC_C 512
|
||||
#define BLIS_DEFAULT_4M_NC_C 4096
|
||||
|
||||
#define BLIS_DEFAULT_4M_MC_Z 192
|
||||
#define BLIS_DEFAULT_4M_KC_Z 256
|
||||
#define BLIS_DEFAULT_4M_NC_Z 4096
|
||||
|
||||
// NOTE: If 3m blocksizes are not defined here, they will be determined
|
||||
// from the corresponding real domain blocksizes.
|
||||
#define BLIS_DEFAULT_3M_MC_C 384
|
||||
#define BLIS_DEFAULT_3M_KC_C 512
|
||||
#define BLIS_DEFAULT_3M_NC_C 4096
|
||||
|
||||
#define BLIS_DEFAULT_3M_MC_Z 192
|
||||
#define BLIS_DEFAULT_3M_KC_Z 256
|
||||
#define BLIS_DEFAULT_3M_NC_Z 4096
|
||||
|
||||
// -- Cache blocksize extensions (for optimizing edge cases) --
|
||||
|
||||
@@ -107,10 +127,10 @@
|
||||
#define BLIS_DEFAULT_NR_D 4
|
||||
|
||||
#define BLIS_DEFAULT_MR_C 4
|
||||
#define BLIS_DEFAULT_NR_C 4
|
||||
#define BLIS_DEFAULT_NR_C 2
|
||||
|
||||
#define BLIS_DEFAULT_MR_Z 4
|
||||
#define BLIS_DEFAULT_NR_Z 4
|
||||
#define BLIS_DEFAULT_MR_Z 2
|
||||
#define BLIS_DEFAULT_NR_Z 2
|
||||
|
||||
// NOTE: If the micro-kernel, which is typically unrolled to a factor
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
@@ -238,10 +258,10 @@
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_opt_d4x4
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_opt_d4x4
|
||||
//#define GEMMTRSM_L_UKERNEL gemmtrsm_l_ref_mxn
|
||||
//#define GEMMTRSM_U_UKERNEL gemmtrsm_u_ref_mxn
|
||||
#define GEMMTRSM_L_UKERNEL gemmtrsm_l_opt_d4x4
|
||||
#define GEMMTRSM_U_UKERNEL gemmtrsm_u_opt_d4x4
|
||||
|
||||
//#define TRSM_L_UKERNEL trsm_l_ref_4x4
|
||||
//#define TRSM_U_UKERNEL trsm_u_ref_4x4
|
||||
|
||||
@@ -83,7 +83,7 @@ CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 # -fopenmp -pg
|
||||
CDBGFLAGS := #-g
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -O2 -mfpmath=sse #-fomit-frame-pointer
|
||||
COPTFLAGS := -O2 -mfpmath=sse -fomit-frame-pointer
|
||||
CKOPTFLAGS := -O2 -mfpmath=sse -fomit-frame-pointer
|
||||
CVECFLAGS := -msse3 -march=native
|
||||
|
||||
|
||||
@@ -42,9 +42,13 @@
|
||||
#include "bli_packm_unb_var1.h"
|
||||
|
||||
#include "bli_packm_blk_var1.h"
|
||||
#include "bli_packm_blk_var3.h"
|
||||
#include "bli_packm_blk_var4.h"
|
||||
|
||||
#include "bli_packm_cxk.h"
|
||||
#include "bli_packm_gen_cxk.h"
|
||||
#include "bli_packm_herm_cxk.h"
|
||||
#include "bli_packm_tri_cxk.h"
|
||||
|
||||
#include "bli_packm_cxk.h"
|
||||
#include "bli_packm_cxk_ri.h"
|
||||
#include "bli_packm_cxk_ri3.h"
|
||||
|
||||
@@ -124,24 +124,24 @@ void bli_packm_blk_var1( obj_t* c,
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname )( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
trans_t transc, \
|
||||
bool_t invdiag, \
|
||||
bool_t revifup, \
|
||||
bool_t reviflo, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p \
|
||||
) \
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
trans_t transc, \
|
||||
bool_t invdiag, \
|
||||
bool_t revifup, \
|
||||
bool_t reviflo, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p \
|
||||
) \
|
||||
{ \
|
||||
ctype* restrict kappa_cast = kappa; \
|
||||
ctype* restrict c_cast = c; \
|
||||
@@ -353,8 +353,7 @@ void PASTEMAC(ch,varname )( \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
\
|
||||
/* NOTE: p_inc should be set to ps_p to properly support
|
||||
BLIS_CONTIG_STRIDE_ALIGN_SIZE. */ \
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
} \
|
||||
else \
|
||||
@@ -378,8 +377,7 @@ void PASTEMAC(ch,varname )( \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
\
|
||||
/* NOTE: p_inc should be set to ps_p to properly support
|
||||
BLIS_CONTIG_STRIDE_ALIGN_SIZE. */ \
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ldp * panel_len_max_i; \
|
||||
} \
|
||||
\
|
||||
|
||||
446
frame/1m/packm/bli_packm_blk_var3.c
Normal file
446
frame/1m/packm/bli_packm_blk_var3.c
Normal file
@@ -0,0 +1,446 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T packm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
struc_t strucc,
|
||||
doff_t diagoffc,
|
||||
diag_t diagc,
|
||||
uplo_t uploc,
|
||||
trans_t transc,
|
||||
bool_t invdiag,
|
||||
bool_t revifup,
|
||||
bool_t reviflo,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t m_max,
|
||||
dim_t n_max,
|
||||
void* kappa,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* p, inc_t rs_p, inc_t cs_p,
|
||||
dim_t pd_p, inc_t ps_p
|
||||
);
|
||||
|
||||
//static FUNCPTR_T GENARRAY(ftypes,packm_blk_var3);
|
||||
|
||||
|
||||
void bli_packm_blk_var3( obj_t* c,
|
||||
obj_t* p )
|
||||
{
|
||||
num_t dt_cp = bli_obj_datatype( *c );
|
||||
|
||||
struc_t strucc = bli_obj_struc( *c );
|
||||
doff_t diagoffc = bli_obj_diag_offset( *c );
|
||||
diag_t diagc = bli_obj_diag( *c );
|
||||
uplo_t uploc = bli_obj_uplo( *c );
|
||||
trans_t transc = bli_obj_conjtrans_status( *c );
|
||||
bool_t invdiag = bli_obj_has_inverted_diag( *p );
|
||||
bool_t revifup = bli_obj_is_pack_rev_if_upper( *p );
|
||||
bool_t reviflo = bli_obj_is_pack_rev_if_lower( *p );
|
||||
|
||||
dim_t m_p = bli_obj_length( *p );
|
||||
dim_t n_p = bli_obj_width( *p );
|
||||
dim_t m_max_p = bli_obj_padded_length( *p );
|
||||
dim_t n_max_p = bli_obj_padded_width( *p );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( *c );
|
||||
inc_t rs_c = bli_obj_row_stride( *c );
|
||||
inc_t cs_c = bli_obj_col_stride( *c );
|
||||
|
||||
void* buf_p = bli_obj_buffer_at_off( *p );
|
||||
inc_t rs_p = bli_obj_row_stride( *p );
|
||||
inc_t cs_p = bli_obj_col_stride( *p );
|
||||
dim_t pd_p = bli_obj_panel_dim( *p );
|
||||
inc_t ps_p = bli_obj_panel_stride( *p );
|
||||
|
||||
obj_t kappa;
|
||||
obj_t* kappa_p;
|
||||
void* buf_kappa;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
|
||||
// We want this variant to behave identically to that of variant 1
|
||||
// in the real domain.
|
||||
if ( bli_is_real( dt_cp ) )
|
||||
{
|
||||
bli_packm_blk_var1( c, p );
|
||||
return;
|
||||
}
|
||||
|
||||
// The value for kappa we use will depend on whether the scalar
|
||||
// attached to A has a nonzero imaginary component. If it does,
|
||||
// then we will apply the scalar during packing to facilitate
|
||||
// implementing complex domain micro-kernels in terms of their
|
||||
// real domain counterparts. (In the aforementioned situation,
|
||||
// applying a real scalar is easy, but applying a complex one is
|
||||
// harder, so we avoid the need altogether with the code below.)
|
||||
if ( bli_obj_scalar_has_nonzero_imag( p ) )
|
||||
{
|
||||
// Detach the scalar.
|
||||
bli_obj_scalar_detach( p, &kappa );
|
||||
|
||||
// Reset the attached scalar (to 1.0).
|
||||
bli_obj_scalar_reset( p );
|
||||
|
||||
kappa_p = κ
|
||||
}
|
||||
else
|
||||
{
|
||||
// If the internal scalar of A has only a real component, then
|
||||
// we will apply it later (in the micro-kernel), and so we will
|
||||
// use BLIS_ONE to indicate no scaling during packing.
|
||||
kappa_p = &BLIS_ONE;
|
||||
}
|
||||
|
||||
|
||||
// Acquire the buffer to the kappa chosen above.
|
||||
buf_kappa = bli_obj_buffer_for_1x1( dt_cp, *kappa_p );
|
||||
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
//f = ftypes[dt_cp];
|
||||
if ( bli_is_scomplex( dt_cp ) ) f = bli_cpackm_blk_var3;
|
||||
else f = bli_zpackm_blk_var3;
|
||||
|
||||
// Invoke the function.
|
||||
f( strucc,
|
||||
diagoffc,
|
||||
diagc,
|
||||
uploc,
|
||||
transc,
|
||||
invdiag,
|
||||
revifup,
|
||||
reviflo,
|
||||
m_p,
|
||||
n_p,
|
||||
m_max_p,
|
||||
n_max_p,
|
||||
buf_kappa,
|
||||
buf_c, rs_c, cs_c,
|
||||
buf_p, rs_p, cs_p,
|
||||
pd_p, ps_p );
|
||||
}
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
trans_t transc, \
|
||||
bool_t invdiag, \
|
||||
bool_t revifup, \
|
||||
bool_t reviflo, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p \
|
||||
) \
|
||||
{ \
|
||||
ctype* restrict kappa_cast = kappa; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict p_cast = p; \
|
||||
ctype* restrict c_begin; \
|
||||
ctype* restrict p_begin; \
|
||||
\
|
||||
dim_t iter_dim; \
|
||||
dim_t num_iter; \
|
||||
dim_t it, ic, ip; \
|
||||
dim_t ic0, ip0; \
|
||||
doff_t ic_inc, ip_inc; \
|
||||
doff_t diagoffc_i; \
|
||||
doff_t diagoffc_inc; \
|
||||
dim_t panel_len_full; \
|
||||
dim_t panel_len_i; \
|
||||
dim_t panel_len_max; \
|
||||
dim_t panel_len_max_i; \
|
||||
dim_t panel_dim_i; \
|
||||
dim_t panel_dim_max; \
|
||||
dim_t panel_off_i; \
|
||||
inc_t vs_c; \
|
||||
inc_t ldc; \
|
||||
inc_t ldp, p_inc; \
|
||||
dim_t* m_panel_full; \
|
||||
dim_t* n_panel_full; \
|
||||
dim_t* m_panel_use; \
|
||||
dim_t* n_panel_use; \
|
||||
dim_t* m_panel_max; \
|
||||
dim_t* n_panel_max; \
|
||||
conj_t conjc; \
|
||||
\
|
||||
ctype* restrict c_use; \
|
||||
ctype* restrict p_use; \
|
||||
doff_t diagoffp_i; \
|
||||
\
|
||||
\
|
||||
/* If C is zeros and part of a triangular matrix, then we don't need
|
||||
to pack it. */ \
|
||||
if ( bli_is_zeros( uploc ) && \
|
||||
bli_is_triangular( strucc ) ) return; \
|
||||
\
|
||||
/* Extract the conjugation bit from the transposition argument. */ \
|
||||
conjc = bli_extract_conj( transc ); \
|
||||
\
|
||||
/* If c needs a transposition, induce it so that we can more simply
|
||||
express the remaining parameters and code. */ \
|
||||
if ( bli_does_trans( transc ) ) \
|
||||
{ \
|
||||
bli_swap_incs( rs_c, cs_c ); \
|
||||
bli_negate_diag_offset( diagoffc ); \
|
||||
bli_toggle_uplo( uploc ); \
|
||||
bli_toggle_trans( transc ); \
|
||||
} \
|
||||
\
|
||||
/* If the strides of P indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panels. */ \
|
||||
iter_dim = n; \
|
||||
panel_len_full = m; \
|
||||
panel_len_max = m_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
ldc = rs_c; \
|
||||
vs_c = cs_c; \
|
||||
diagoffc_inc = -( doff_t)panel_dim_max; \
|
||||
ldp = rs_p; \
|
||||
m_panel_full = &m; \
|
||||
n_panel_full = &panel_dim_i; \
|
||||
m_panel_use = &panel_len_i; \
|
||||
n_panel_use = &panel_dim_i; \
|
||||
m_panel_max = &panel_len_max_i; \
|
||||
n_panel_max = &panel_dim_max; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panels. */ \
|
||||
iter_dim = m; \
|
||||
panel_len_full = n; \
|
||||
panel_len_max = n_max; \
|
||||
panel_dim_max = pd_p; \
|
||||
ldc = cs_c; \
|
||||
vs_c = rs_c; \
|
||||
diagoffc_inc = ( doff_t )panel_dim_max; \
|
||||
ldp = cs_p; \
|
||||
m_panel_full = &panel_dim_i; \
|
||||
n_panel_full = &n; \
|
||||
m_panel_use = &panel_dim_i; \
|
||||
n_panel_use = &panel_len_i; \
|
||||
m_panel_max = &panel_dim_max; \
|
||||
n_panel_max = &panel_len_max_i; \
|
||||
} \
|
||||
\
|
||||
/* Compute the total number of iterations we'll need. */ \
|
||||
num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
|
||||
\
|
||||
/* Set the initial values and increments for indices related to C and P
|
||||
based on whether reverse iteration was requested. */ \
|
||||
if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
|
||||
( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
|
||||
{ \
|
||||
ic0 = (num_iter - 1) * panel_dim_max; \
|
||||
ic_inc = -panel_dim_max; \
|
||||
ip0 = num_iter - 1; \
|
||||
ip_inc = -1; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
ic0 = 0; \
|
||||
ic_inc = panel_dim_max; \
|
||||
ip0 = 0; \
|
||||
ip_inc = 1; \
|
||||
} \
|
||||
\
|
||||
p_begin = p_cast; \
|
||||
\
|
||||
for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
|
||||
ic += ic_inc, ip += ip_inc, it += 1 ) \
|
||||
{ \
|
||||
panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
|
||||
\
|
||||
diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
|
||||
c_begin = c_cast + (ic )*vs_c; \
|
||||
\
|
||||
if ( bli_is_triangular( strucc ) && \
|
||||
bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
|
||||
{ \
|
||||
/* This case executes if the panel belongs to a triangular
|
||||
matrix AND is completely unstored (ie: zero). If the panel
|
||||
is unstored, we do nothing. (Notice that we don't even
|
||||
increment p_begin.) */ \
|
||||
\
|
||||
continue; \
|
||||
} \
|
||||
else if ( bli_is_triangular( strucc ) && \
|
||||
bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
|
||||
{ \
|
||||
/* This case executes if the panel belongs to a triangular
|
||||
matrix AND is diagonal-intersecting. Notice that we
|
||||
cannot bury the following conditional logic into
|
||||
packm_tri_cxk() because we need to know the value of
|
||||
panel_len_max_i so we can properly increment p_inc. */ \
|
||||
\
|
||||
/* Sanity check. Diagonals should not intersect the short end of
|
||||
a micro-panel. If they do, then somehow the constraints on
|
||||
cache blocksizes being a whole multiple of the register
|
||||
blocksizes was somehow violated. */ \
|
||||
if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc_i < 0 ) || \
|
||||
( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc_i > 0 ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
|
||||
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
|
||||
{ \
|
||||
panel_off_i = 0; \
|
||||
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i; \
|
||||
panel_len_max_i = bli_abs( diagoffc_i ) + panel_dim_max; \
|
||||
diagoffp_i = diagoffc_i; \
|
||||
} \
|
||||
else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
|
||||
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
|
||||
{ \
|
||||
panel_off_i = bli_abs( diagoffc_i ); \
|
||||
panel_len_i = panel_len_full - panel_off_i; \
|
||||
panel_len_max_i = panel_len_max - panel_off_i; \
|
||||
diagoffp_i = 0; \
|
||||
} \
|
||||
\
|
||||
c_use = c_begin + (panel_off_i )*ldc; \
|
||||
p_use = p_begin; \
|
||||
\
|
||||
PASTEMAC(ch,packm_tri_cxk_ri3)( strucc, \
|
||||
diagoffp_i, \
|
||||
diagc, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
invdiag, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_use, rs_c, cs_c, \
|
||||
p_use, rs_p, cs_p ); \
|
||||
\
|
||||
\
|
||||
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
|
||||
\
|
||||
/*
|
||||
if ( cs_p == 1 ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var3: bp_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var3: bp_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_begin + (p_inc*2)/3, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/ \
|
||||
\
|
||||
} \
|
||||
else if ( bli_is_herm_or_symm( strucc ) ) \
|
||||
{ \
|
||||
/* This case executes if the panel belongs to a Hermitian or
|
||||
symmetric matrix, which includes stored, unstored, and
|
||||
diagonal-intersecting panels. */ \
|
||||
\
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
PASTEMAC(ch,packm_herm_cxk_ri3)( strucc, \
|
||||
diagoffc_i, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* This case executes if the panel is general, or, if the
|
||||
panel is part of a triangular matrix and is neither unstored
|
||||
(ie: zero) nor diagonal-intersecting. */ \
|
||||
\
|
||||
panel_len_i = panel_len_full; \
|
||||
panel_len_max_i = panel_len_max; \
|
||||
\
|
||||
PASTEMAC(ch,packm_gen_cxk_ri3)( BLIS_GENERAL, \
|
||||
0, \
|
||||
BLIS_DENSE, \
|
||||
conjc, \
|
||||
*m_panel_use, \
|
||||
*n_panel_use, \
|
||||
*m_panel_max, \
|
||||
*n_panel_max, \
|
||||
kappa_cast, \
|
||||
c_begin, rs_c, cs_c, \
|
||||
p_begin, rs_p, cs_p ); \
|
||||
\
|
||||
/* NOTE: This value is equivalent to ps_p. */ \
|
||||
p_inc = ( ldp * panel_len_max_i * 3 ) / 2; \
|
||||
\
|
||||
/*
|
||||
if ( cs_p == 1 ) { \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var3: bp_r", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_begin, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_var3: bp_i", *m_panel_max, *n_panel_max, \
|
||||
( ctype_r* )p_begin + (p_inc*2)/3, rs_p, cs_p, "%4.1f", "" ); \
|
||||
} \
|
||||
*/ \
|
||||
\
|
||||
} \
|
||||
\
|
||||
p_begin += p_inc; \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_blk_var3 )
|
||||
|
||||
62
frame/1m/packm/bli_packm_blk_var3.h
Normal file
62
frame/1m/packm/bli_packm_blk_var3.h
Normal file
@@ -0,0 +1,62 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
void bli_packm_blk_var3( obj_t* c,
|
||||
obj_t* p );
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
trans_t transc, \
|
||||
bool_t invdiag, \
|
||||
bool_t revifup, \
|
||||
bool_t reviflo, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t m_max, \
|
||||
dim_t n_max, \
|
||||
void* kappa, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
void* p, inc_t rs_p, inc_t cs_p, \
|
||||
dim_t pd_p, inc_t ps_p \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_blk_var3 )
|
||||
|
||||
461
frame/1m/packm/bli_packm_blk_var4.c
Normal file
461
frame/1m/packm/bli_packm_blk_var4.c
Normal file
@@ -0,0 +1,461 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T packm_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
struc_t strucc,
|
||||
doff_t diagoffc,
|
||||
diag_t diagc,
|
||||
uplo_t uploc,
|
||||
trans_t transc,
|
||||
bool_t invdiag,
|
||||
bool_t revifup,
|
||||
bool_t reviflo,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t m_max,
|
||||
dim_t n_max,
|
||||
void* kappa,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
void* p, inc_t rs_p, inc_t cs_p,
|
||||
dim_t pd_p, inc_t ps_p
|
||||
);
|
||||
|
||||
//static FUNCPTR_T GENARRAY(ftypes,packm_blk_var4);
|
||||
|
||||
|
||||
void bli_packm_blk_var4( obj_t* c,
|
||||
obj_t* p )
|
||||
{
|
||||
num_t dt_cp = bli_obj_datatype( *c );
|
||||
|
||||
struc_t strucc = bli_obj_struc( *c );
|
||||
doff_t diagoffc = bli_obj_diag_offset( *c );
|
||||
diag_t diagc = bli_obj_diag( *c );
|
||||
uplo_t uploc = bli_obj_uplo( *c );
|
||||
trans_t transc = bli_obj_conjtrans_status( *c );
|
||||
bool_t invdiag = bli_obj_has_inverted_diag( *p );
|
||||
bool_t revifup = bli_obj_is_pack_rev_if_upper( *p );
|
||||
bool_t reviflo = bli_obj_is_pack_rev_if_lower( *p );
|
||||
|
||||
dim_t m_p = bli_obj_length( *p );
|
||||
dim_t n_p = bli_obj_width( *p );
|
||||
dim_t m_max_p = bli_obj_padded_length( *p );
|
||||
dim_t n_max_p = bli_obj_padded_width( *p );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( *c );
|
||||
inc_t rs_c = bli_obj_row_stride( *c );
|
||||
inc_t cs_c = bli_obj_col_stride( *c );
|
||||
|
||||
void* buf_p = bli_obj_buffer_at_off( *p );
|
||||
inc_t rs_p = bli_obj_row_stride( *p );
|
||||
inc_t cs_p = bli_obj_col_stride( *p );
|
||||
dim_t pd_p = bli_obj_panel_dim( *p );
|
||||
inc_t ps_p = bli_obj_panel_stride( *p );
|
||||
|
||||
obj_t kappa;
|
||||
obj_t* kappa_p;
|
||||
void* buf_kappa;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
|
||||
// We want this variant to behave identically to that of variant 1
|
||||
// in the real domain.
|
||||
if ( bli_is_real( dt_cp ) )
|
||||
{
|
||||
bli_packm_blk_var1( c, p );
|
||||
return;
|
||||
}
|
||||
|
||||
// The value for kappa we use will depend on whether the scalar
|
||||
// attached to A has a nonzero imaginary component. If it does,
|
||||
// then we will apply the scalar during packing to facilitate
|
||||
// implementing complex domain micro-kernels in terms of their
|
||||
// real domain counterparts. (In the aforementioned situation,
|
||||
// applying a real scalar is easy, but applying a complex one is
|
||||
// harder, so we avoid the need altogether with the code below.)
|
||||
if ( bli_obj_scalar_has_nonzero_imag( p ) )
|
||||
{
|
||||
// Detach the scalar.
|
||||
bli_obj_scalar_detach( p, &kappa );
|
||||
|
||||
// Reset the attached scalar (to 1.0).
|
||||
bli_obj_scalar_reset( p );
|
||||
|
||||
kappa_p = κ
|
||||
}
|
||||
else
|
||||
{
|
||||
// If the internal scalar of A has only a real component, then
|
||||
// we will apply it later (in the micro-kernel), and so we will
|
||||
// use BLIS_ONE to indicate no scaling during packing.
|
||||
kappa_p = &BLIS_ONE;
|
||||
}
|
||||
|
||||
|
||||
// Acquire the buffer to the kappa chosen above.
|
||||
buf_kappa = bli_obj_buffer_for_1x1( dt_cp, *kappa_p );
|
||||
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
//f = ftypes[dt_cp];
|
||||
if ( bli_is_scomplex( dt_cp ) ) f = bli_cpackm_blk_var4;
|
||||
else f = bli_zpackm_blk_var4;
|
||||
|
||||
// Invoke the function.
|
||||
f( strucc,
|
||||
diagoffc,
|
||||
diagc,
|
||||
uploc,
|
||||
transc,
|
||||
invdiag,
|
||||
revifup,
|
||||
reviflo,
|
||||
m_p,
|
||||
n_p,
|
||||
m_max_p,
|
||||
n_max_p,
|
||||
buf_kappa,
|
||||
buf_c, rs_c, cs_c,
|
||||
buf_p, rs_p, cs_p,
|
||||
pd_p, ps_p );
|
||||
}
|
||||
|
||||
|
||||
#undef  GENTFUNCCO
/*
   Typed implementation template for packm_blk_var4.

   Packs the matrix c into p one micro-panel at a time. The strides of p
   select the orientation: row storage in p means column panels are packed
   (iterating over n), otherwise row panels are packed (iterating over m).
   Each panel is handed to one of three helpers:

     - packm_tri_cxk_ri   triangular structure, panel intersects the diagonal
     - packm_herm_cxk_ri  Hermitian or symmetric structure
     - packm_gen_cxk_ri   everything else (general, or a fully stored
                          off-diagonal panel of a triangular matrix)

   Panels of a triangular matrix that lie entirely in the unstored region
   are skipped and do not consume any space in p.

   Parameters:
     strucc, diagoffc, diagc, uploc  structure, diagonal offset, diagonal
                                     type and stored triangle of c
     transc            transposition/conjugation to apply to c; a
                       transposition is induced by swapping the strides of c
     invdiag           forwarded to packm_tri_cxk_ri
     revifup, reviflo  iterate over the panels in reverse order when c is
                       triangular and upper (resp. lower) stored
     m, n              dimensions of the region to pack
     m_max, n_max      padded dimensions of the packed region
     kappa             scalar forwarded to the per-panel packing helpers
     c, rs_c, cs_c     source matrix and its strides
     p, rs_p, cs_p     destination buffer and its strides
     pd_p              panel dimension (width of one micro-panel)
     ps_p              panel stride; not referenced by this implementation,
                       which derives the advance from ldp and the panel
                       length instead
*/
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname)( \
                           struc_t strucc, \
                           doff_t  diagoffc, \
                           diag_t  diagc, \
                           uplo_t  uploc, \
                           trans_t transc, \
                           bool_t  invdiag, \
                           bool_t  revifup, \
                           bool_t  reviflo, \
                           dim_t   m, \
                           dim_t   n, \
                           dim_t   m_max, \
                           dim_t   n_max, \
                           void*   kappa, \
                           void*   c, inc_t rs_c, inc_t cs_c, \
                           void*   p, inc_t rs_p, inc_t cs_p, \
                                      dim_t pd_p, inc_t ps_p  \
                         ) \
{ \
    ctype* restrict kappa_cast = kappa; \
    ctype* restrict c_cast     = c; \
    ctype* restrict p_cast     = p; \
    ctype* restrict c_begin; \
    ctype* restrict p_begin; \
\
    dim_t           iter_dim; \
    dim_t           num_iter; \
    dim_t           it, ic, ip; \
    dim_t           ic0, ip0; \
    doff_t          ic_inc, ip_inc; \
    doff_t          diagoffc_i; \
    doff_t          diagoffc_inc; \
    dim_t           panel_len_full; \
    dim_t           panel_len_i; \
    dim_t           panel_len_max; \
    dim_t           panel_len_max_i; \
    dim_t           panel_dim_i; \
    dim_t           panel_dim_max; \
    dim_t           panel_off_i; \
    inc_t           vs_c; \
    inc_t           ldc; \
    inc_t           ldp, p_inc; \
    dim_t*          m_panel_full; \
    dim_t*          n_panel_full; \
    dim_t*          m_panel_use; \
    dim_t*          n_panel_use; \
    dim_t*          m_panel_max; \
    dim_t*          n_panel_max; \
    conj_t          conjc; \
\
    ctype* restrict c_use; \
    ctype* restrict p_use; \
    doff_t          diagoffp_i; \
\
\
    /* If C is zeros and part of a triangular matrix, then we don't need
       to pack it. */ \
    if ( bli_is_zeros( uploc ) && \
         bli_is_triangular( strucc ) ) return; \
\
    /* Extract the conjugation bit from the transposition argument. */ \
    conjc = bli_extract_conj( transc ); \
\
    /* If c needs a transposition, induce it so that we can more simply
       express the remaining parameters and code. */ \
    if ( bli_does_trans( transc ) ) \
    { \
        bli_swap_incs( rs_c, cs_c ); \
        bli_negate_diag_offset( diagoffc ); \
        bli_toggle_uplo( uploc ); \
        bli_toggle_trans( transc ); \
    } \
\
    /* If the strides of P indicate row storage, then we are packing to
       column panels; otherwise, if the strides indicate column storage,
       we are packing to row panels. The m_panel_* and n_panel_* pointers
       alias whichever of the panel length/dimension variables plays the
       role of m or n for the chosen orientation, so the per-panel calls
       below can be written once. */ \
    if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
    { \
        /* Prepare to pack to row-stored column panels. */ \
        iter_dim       = n; \
        panel_len_full = m; \
        panel_len_max  = m_max; \
        panel_dim_max  = pd_p; \
        ldc            = rs_c; \
        vs_c           = cs_c; \
        diagoffc_inc   = -( doff_t )panel_dim_max; \
        ldp            = rs_p; \
        m_panel_full   = &m; \
        n_panel_full   = &panel_dim_i; \
        m_panel_use    = &panel_len_i; \
        n_panel_use    = &panel_dim_i; \
        m_panel_max    = &panel_len_max_i; \
        n_panel_max    = &panel_dim_max; \
    } \
    else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
    { \
        /* Prepare to pack to column-stored row panels. */ \
        iter_dim       = m; \
        panel_len_full = n; \
        panel_len_max  = n_max; \
        panel_dim_max  = pd_p; \
        ldc            = cs_c; \
        vs_c           = rs_c; \
        diagoffc_inc   = ( doff_t )panel_dim_max; \
        ldp            = cs_p; \
        m_panel_full   = &panel_dim_i; \
        n_panel_full   = &n; \
        m_panel_use    = &panel_dim_i; \
        n_panel_use    = &panel_len_i; \
        m_panel_max    = &panel_dim_max; \
        n_panel_max    = &panel_len_max_i; \
    } \
\
    /* Compute the total number of iterations we'll need. */ \
    num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
\
    /* Set the initial values and increments for indices related to C and P
       based on whether reverse iteration was requested. */ \
    if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) || \
         ( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) ) \
    { \
        ic0    = (num_iter - 1) * panel_dim_max; \
        /* Convert to the signed offset type before negating, exactly as is
           done for diagoffc_inc above, so the step is correct regardless of
           the signedness or width of dim_t. */ \
        ic_inc = -( doff_t )panel_dim_max; \
        ip0    = num_iter - 1; \
        ip_inc = -1; \
    } \
    else \
    { \
        ic0    = 0; \
        ic_inc = panel_dim_max; \
        ip0    = 0; \
        ip_inc = 1; \
    } \
\
    p_begin = p_cast; \
\
    for ( ic  = ic0,    ip  = ip0,    it  = 0; it < num_iter; \
          ic += ic_inc, ip += ip_inc, it += 1 ) \
    { \
        panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
        diagoffc_i  = diagoffc + (ip  )*diagoffc_inc; \
        c_begin     = c_cast   + (ic  )*vs_c; \
\
        if ( bli_is_triangular( strucc ) && \
             bli_is_unstored_subpart_n( diagoffc_i, uploc, *m_panel_full, *n_panel_full ) ) \
        { \
            /* This case executes if the panel belongs to a triangular
               matrix AND is completely unstored (ie: zero). If the panel
               is unstored, we do nothing. (Notice that we don't even
               increment p_begin.) */ \
\
            continue; \
        } \
        else if ( bli_is_triangular( strucc ) && \
                  bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) ) \
        { \
            /* This case executes if the panel belongs to a triangular
               matrix AND is diagonal-intersecting. Notice that we
               cannot bury the following conditional logic into
               packm_tri_cxk() because we need to know the value of
               panel_len_max_i so we can properly increment p_inc. */ \
\
            /* Sanity check. Diagonals should not intersect the short end of
               a micro-panel. If they do, then the constraint that cache
               blocksizes be a whole multiple of the register blocksizes
               was somehow violated. */ \
            if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc_i < 0 ) || \
                 ( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc_i > 0 ) ) \
                bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
            if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
                 ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
            { \
                panel_off_i     = 0; \
                panel_len_i     = bli_abs( diagoffc_i ) + panel_dim_i; \
                panel_len_max_i = bli_abs( diagoffc_i ) + panel_dim_max; \
                diagoffp_i      = diagoffc_i; \
            } \
            else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
                         ( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
            { \
                panel_off_i     = bli_abs( diagoffc_i ); \
                panel_len_i     = panel_len_full - panel_off_i; \
                panel_len_max_i = panel_len_max  - panel_off_i; \
                diagoffp_i      = 0; \
            } \
\
            c_use = c_begin + (panel_off_i  )*ldc; \
            p_use = p_begin; \
\
            PASTEMAC(ch,packm_tri_cxk_ri)( strucc, \
                                           diagoffp_i, \
                                           diagc, \
                                           uploc, \
                                           conjc, \
                                           invdiag, \
                                           *m_panel_use, \
                                           *n_panel_use, \
                                           *m_panel_max, \
                                           *n_panel_max, \
                                           kappa_cast, \
                                           c_use, rs_c, cs_c, \
                                           p_use, rs_p, cs_p ); \
\
            p_inc = ldp * panel_len_max_i; \
        } \
        else if ( bli_is_herm_or_symm( strucc ) ) \
        { \
            /* This case executes if the panel belongs to a Hermitian or
               symmetric matrix, which includes stored, unstored, and
               diagonal-intersecting panels. */ \
\
            panel_len_i     = panel_len_full; \
            panel_len_max_i = panel_len_max; \
\
            PASTEMAC(ch,packm_herm_cxk_ri)( strucc, \
                                            diagoffc_i, \
                                            uploc, \
                                            conjc, \
                                            *m_panel_use, \
                                            *n_panel_use, \
                                            *m_panel_max, \
                                            *n_panel_max, \
                                            kappa_cast, \
                                            c_begin, rs_c, cs_c, \
                                            p_begin, rs_p, cs_p ); \
\
            /* NOTE: This value is equivalent to ps_p. */ \
            p_inc = ldp * panel_len_max_i; \
        } \
        else \
        { \
            /* This case executes if the panel is general, or, if the
               panel is part of a triangular matrix and is neither unstored
               (ie: zero) nor diagonal-intersecting. */ \
\
            panel_len_i     = panel_len_full; \
            panel_len_max_i = panel_len_max; \
\
            PASTEMAC(ch,packm_gen_cxk_ri)( BLIS_GENERAL, \
                                           0, \
                                           BLIS_DENSE, \
                                           conjc, \
                                           *m_panel_use, \
                                           *n_panel_use, \
                                           *m_panel_max, \
                                           *n_panel_max, \
                                           kappa_cast, \
                                           c_begin, rs_c, cs_c, \
                                           p_begin, rs_p, cs_p ); \
\
            /* NOTE: This value is equivalent to ps_p. */ \
            p_inc = ldp * panel_len_max_i; \
        } \
\
        /* Advance to where the next packed micro-panel will be written. */ \
        p_begin += p_inc; \
    } \
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_blk_var4 )
|
||||
|
||||
62
frame/1m/packm/bli_packm_blk_var4.h
Normal file
62
frame/1m/packm/bli_packm_blk_var4.h
Normal file
@@ -0,0 +1,62 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
/*
   Object-based front-end for packm variant 4: packs matrix object c into
   the pack object p. For real datatypes the implementation defers to
   bli_packm_blk_var1(); for complex datatypes it dispatches to the typed
   c/z packm_blk_var4 functions declared below.
*/
void bli_packm_blk_var4( obj_t* c, obj_t* p );
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
/*
   Prototype template for the typed packm_blk_var4 functions. The
   parameter list mirrors the definition in bli_packm_blk_var4.c:
   structure/diagonal/uplo/trans descriptors of c, the reverse-iteration
   flags, the used and padded dimensions, the scalar kappa, the source
   and destination buffers with their strides, and finally the panel
   dimension and panel stride of p.
*/
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname)( \
                           struc_t strucc,  doff_t  diagoffc, \
                           diag_t  diagc,   uplo_t  uploc, \
                           trans_t transc, \
                           bool_t  invdiag, \
                           bool_t  revifup, bool_t  reviflo, \
                           dim_t   m,       dim_t   n, \
                           dim_t   m_max,   dim_t   n_max, \
                           void*   kappa, \
                           void*   c, inc_t rs_c, inc_t cs_c, \
                           void*   p, inc_t rs_p, inc_t cs_p, \
                                      dim_t pd_p, inc_t ps_p  \
                         );
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_blk_var4 )
|
||||
|
||||
@@ -39,7 +39,7 @@
|
||||
typedef void (*FUNCPTR_T)(
|
||||
conj_t conja,
|
||||
dim_t n,
|
||||
void* beta,
|
||||
void* kappa,
|
||||
void* a, inc_t inca, inc_t lda,
|
||||
void* p, inc_t ldp
|
||||
);
|
||||
@@ -169,7 +169,7 @@ void PASTEMAC(ch,opname)( \
|
||||
conj_t conja, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* kappa, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t ldp \
|
||||
) \
|
||||
@@ -184,7 +184,7 @@ void PASTEMAC(ch,opname)( \
|
||||
{ \
|
||||
PASTEMAC3(ch,ch,ch,scal2vker)( conja, \
|
||||
n, \
|
||||
beta, \
|
||||
kappa, \
|
||||
a, lda, \
|
||||
p, ldp ); \
|
||||
return; \
|
||||
@@ -209,7 +209,7 @@ void PASTEMAC(ch,opname)( \
|
||||
{ \
|
||||
f( conja, \
|
||||
n, \
|
||||
beta, \
|
||||
kappa, \
|
||||
a, inca, lda, \
|
||||
p, ldp ); \
|
||||
} \
|
||||
@@ -222,7 +222,7 @@ void PASTEMAC(ch,opname)( \
|
||||
conja, \
|
||||
m, \
|
||||
n, \
|
||||
beta, \
|
||||
kappa, \
|
||||
a, inca, lda, \
|
||||
p, 1, ldp ); \
|
||||
} \
|
||||
|
||||
265
frame/1m/packm/bli_packm_cxk_ri.c
Normal file
265
frame/1m/packm/bli_packm_cxk_ri.c
Normal file
@@ -0,0 +1,265 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T packm_cxk_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
conj_t conja,
|
||||
dim_t n,
|
||||
void* kappa,
|
||||
void* a, inc_t inca, inc_t lda,
|
||||
void* p, inc_t psp, inc_t ldp
|
||||
);
|
||||
|
||||
#undef FUNCPTR_ARRAY_LENGTH
|
||||
#define FUNCPTR_ARRAY_LENGTH 18
|
||||
|
||||
#undef GENARRAY
|
||||
#define GENARRAY( kername2, kername4, kername6, kername8, \
|
||||
kername10, kername12, kername14, kername16 ) \
|
||||
\
|
||||
static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = \
|
||||
{ \
|
||||
/* panel width = 0 */ \
|
||||
{ \
|
||||
NULL, NULL, NULL, NULL, \
|
||||
}, \
|
||||
/* panel width = 1 */ \
|
||||
{ \
|
||||
NULL, NULL, NULL, NULL, \
|
||||
}, \
|
||||
/* panel width = 2 */ \
|
||||
{ \
|
||||
NULL, \
|
||||
PASTEMAC2(c,kername2,_ri), \
|
||||
NULL, \
|
||||
PASTEMAC2(z,kername2,_ri), \
|
||||
}, \
|
||||
/* panel width = 3 */ \
|
||||
{ \
|
||||
NULL, NULL, NULL, NULL, \
|
||||
}, \
|
||||
/* panel width = 4 */ \
|
||||
{ \
|
||||
NULL, \
|
||||
PASTEMAC2(c,kername4,_ri), \
|
||||
NULL, \
|
||||
PASTEMAC2(z,kername4,_ri), \
|
||||
}, \
|
||||
/* panel width = 5 */ \
|
||||
{ \
|
||||
NULL, NULL, NULL, NULL, \
|
||||
}, \
|
||||
/* panel width = 6 */ \
|
||||
{ \
|
||||
NULL, \
|
||||
PASTEMAC2(c,kername6,_ri), \
|
||||
NULL, \
|
||||
PASTEMAC2(z,kername6,_ri), \
|
||||
}, \
|
||||
/* panel width = 7 */ \
|
||||
{ \
|
||||
NULL, NULL, NULL, NULL, \
|
||||
}, \
|
||||
/* panel width = 8 */ \
|
||||
{ \
|
||||
NULL, \
|
||||
PASTEMAC2(c,kername8,_ri), \
|
||||
NULL, \
|
||||
PASTEMAC2(z,kername8,_ri), \
|
||||
}, \
|
||||
/* panel width = 9 */ \
|
||||
{ \
|
||||
NULL, NULL, NULL, NULL, \
|
||||
}, \
|
||||
/* panel width = 10 */ \
|
||||
{ \
|
||||
NULL, \
|
||||
PASTEMAC2(c,kername10,_ri), \
|
||||
NULL, \
|
||||
PASTEMAC2(z,kername10,_ri), \
|
||||
}, \
|
||||
/* panel width = 11 */ \
|
||||
{ \
|
||||
NULL, NULL, NULL, NULL, \
|
||||
}, \
|
||||
/* panel width = 12 */ \
|
||||
{ \
|
||||
NULL, \
|
||||
PASTEMAC2(c,kername12,_ri), \
|
||||
NULL, \
|
||||
PASTEMAC2(z,kername12,_ri), \
|
||||
}, \
|
||||
/* panel width = 13 */ \
|
||||
{ \
|
||||
NULL, NULL, NULL, NULL, \
|
||||
}, \
|
||||
/* panel width = 14 */ \
|
||||
{ \
|
||||
NULL, \
|
||||
PASTEMAC2(c,kername14,_ri), \
|
||||
NULL, \
|
||||
PASTEMAC2(z,kername14,_ri), \
|
||||
}, \
|
||||
/* panel width = 15 */ \
|
||||
{ \
|
||||
NULL, NULL, NULL, NULL, \
|
||||
}, \
|
||||
/* panel width = 16 */ \
|
||||
{ \
|
||||
NULL, \
|
||||
PASTEMAC2(c,kername16,_ri), \
|
||||
NULL, \
|
||||
PASTEMAC2(z,kername16,_ri), \
|
||||
}, \
|
||||
/* panel width = 17 */ \
|
||||
{ \
|
||||
NULL, NULL, NULL, NULL, \
|
||||
}, \
|
||||
};
|
||||
|
||||
GENARRAY( PACKM_2XK_KERNEL,
|
||||
PACKM_4XK_KERNEL,
|
||||
PACKM_6XK_KERNEL,
|
||||
PACKM_8XK_KERNEL,
|
||||
PACKM_10XK_KERNEL,
|
||||
PACKM_12XK_KERNEL,
|
||||
PACKM_14XK_KERNEL,
|
||||
PACKM_16XK_KERNEL )
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
/*
   Typed implementation template for packm_cxk_ri.

   Packs an m x n panel of the complex matrix a (row stride inca, column
   stride lda, both in units of complex elements), scaled by kappa, into p
   with the real and imaginary parts stored separately: real parts start
   at p and imaginary parts start psp real-typed elements after p. Within
   each part the panel has unit row stride and column stride ldp.

   If the ftypes table holds a kernel for panel dimension m and this
   datatype, that kernel does the work. Otherwise the panel is packed by
   the generic element-by-element loops below, using scal2jris when conja
   requests conjugation and scal2ris otherwise.
*/
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname)( \
                           conj_t  conja, \
                           dim_t   m, \
                           dim_t   n, \
                           void*   kappa, \
                           void*   a, inc_t inca, inc_t lda, \
                           void*   p, inc_t psp,  inc_t ldp  \
                         ) \
{ \
    dim_t     panel_dim; \
    dim_t     i, j; \
    num_t     dt; \
    FUNCPTR_T f; \
\
    /* The panel dimension is always equal to the m dimension of p. */ \
    panel_dim = m; \
\
    /* Acquire the datatype for the current function. */ \
    dt = PASTEMAC(ch,type); \
\
    /* Index into the array to extract the correct function pointer.
       If the panel dimension is too big to be within the array of
       explicitly handled kernels, then we treat that kernel the same
       as if it were in range but unimplemented. */ \
    if ( panel_dim < FUNCPTR_ARRAY_LENGTH ) f = ftypes[panel_dim][dt]; \
    else                                    f = NULL; \
\
    /* If there exists a kernel implementation for the panel dimension
       provided, we invoke the implementation. Otherwise, we use scal2m. */ \
    if ( f != NULL ) \
    { \
        f( conja, \
           n, \
           kappa, \
           a, inca, lda, \
           p, psp,  ldp ); \
    } \
    else \
    { \
        ctype_r* restrict kappa_r = ( ctype_r* )kappa; \
        ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \
        ctype_r* restrict a_r     = ( ctype_r* )a; \
        ctype_r* restrict a_i     = ( ctype_r* )a + 1; \
        ctype_r* restrict p_r     = ( ctype_r* )p; \
        ctype_r* restrict p_i     = ( ctype_r* )p + psp; \
        /* a is addressed through real-typed pointers, so its strides are
           doubled. These are strides, not dimensions: keep them in the
           signed stride type so that negative strides survive intact. */ \
        inc_t             inca2   = 2*inca; \
        inc_t             lda2    = 2*lda; \
\
        /* Treat the panel as m x n and column-stored (unit row stride). */ \
\
        /* NOTE: The loops below are inlined versions of scal2m, but
           for separated real/imaginary storage. */ \
\
        if ( bli_is_conj( conja ) ) \
        { \
            for ( j = 0; j < n; ++j ) \
            { \
                for ( i = 0; i < m; ++i ) \
                { \
                    ctype_r* restrict alpha11_r = a_r + (i  )*inca2 + (j  )*lda2; \
                    ctype_r* restrict alpha11_i = a_i + (i  )*inca2 + (j  )*lda2; \
                    ctype_r* restrict pi11_r    = p_r + (i  )*1     + (j  )*ldp; \
                    ctype_r* restrict pi11_i    = p_i + (i  )*1     + (j  )*ldp; \
\
                    PASTEMAC(ch,scal2jris)( *kappa_r, \
                                            *kappa_i, \
                                            *alpha11_r, \
                                            *alpha11_i, \
                                            *pi11_r, \
                                            *pi11_i ); \
                } \
            } \
        } \
        else /* if ( bli_is_noconj( conja ) ) */ \
        { \
            for ( j = 0; j < n; ++j ) \
            { \
                for ( i = 0; i < m; ++i ) \
                { \
                    ctype_r* restrict alpha11_r = a_r + (i  )*inca2 + (j  )*lda2; \
                    ctype_r* restrict alpha11_i = a_i + (i  )*inca2 + (j  )*lda2; \
                    ctype_r* restrict pi11_r    = p_r + (i  )*1     + (j  )*ldp; \
                    ctype_r* restrict pi11_i    = p_i + (i  )*1     + (j  )*ldp; \
\
                    PASTEMAC(ch,scal2ris)( *kappa_r, \
                                           *kappa_i, \
                                           *alpha11_r, \
                                           *alpha11_i, \
                                           *pi11_r, \
                                           *pi11_i ); \
                } \
            } \
        } \
    } \
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_cxk_ri )
|
||||
|
||||
59
frame/1m/packm/bli_packm_cxk_ri.h
Normal file
59
frame/1m/packm/bli_packm_cxk_ri.h
Normal file
@@ -0,0 +1,59 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// Include headers for various packm kernels.
|
||||
#include "bli_packm_ref_2xk.h"
|
||||
#include "bli_packm_ref_4xk.h"
|
||||
#include "bli_packm_ref_6xk.h"
|
||||
#include "bli_packm_ref_8xk.h"
|
||||
#include "bli_packm_ref_10xk.h"
|
||||
#include "bli_packm_ref_12xk.h"
|
||||
#include "bli_packm_ref_14xk.h"
|
||||
#include "bli_packm_ref_16xk.h"
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
/*
   Prototype template for the typed packm_cxk_ri functions. The scalar
   argument is named kappa to match the definition in bli_packm_cxk_ri.c
   and the kernel function-pointer type it dispatches through.
*/
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname)( \
                           conj_t  conja, \
                           dim_t   m, \
                           dim_t   n, \
                           void*   kappa, \
                           void*   a, inc_t inca, inc_t lda, \
                           void*   p, inc_t psp,  inc_t ldp  \
                         );
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_cxk_ri )
|
||||
|
||||
270
frame/1m/packm/bli_packm_cxk_ri3.c
Normal file
270
frame/1m/packm/bli_packm_cxk_ri3.c
Normal file
@@ -0,0 +1,270 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T packm_cxk_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)(
|
||||
conj_t conja,
|
||||
dim_t n,
|
||||
void* kappa,
|
||||
void* a, inc_t inca, inc_t lda,
|
||||
void* p, inc_t psp, inc_t ldp
|
||||
);
|
||||
|
||||
#undef FUNCPTR_ARRAY_LENGTH
|
||||
#define FUNCPTR_ARRAY_LENGTH 18
|
||||
|
||||
#undef GENARRAY
|
||||
#define GENARRAY( kername2, kername4, kername6, kername8, \
|
||||
kername10, kername12, kername14, kername16 ) \
|
||||
\
|
||||
static FUNCPTR_T ftypes[FUNCPTR_ARRAY_LENGTH][BLIS_NUM_FP_TYPES] = \
|
||||
{ \
|
||||
/* panel width = 0 */ \
|
||||
{ \
|
||||
NULL, NULL, NULL, NULL, \
|
||||
}, \
|
||||
/* panel width = 1 */ \
|
||||
{ \
|
||||
NULL, NULL, NULL, NULL, \
|
||||
}, \
|
||||
/* panel width = 2 */ \
|
||||
{ \
|
||||
NULL, \
|
||||
PASTEMAC2(c,kername2,_ri3), \
|
||||
NULL, \
|
||||
PASTEMAC2(z,kername2,_ri3), \
|
||||
}, \
|
||||
/* panel width = 3 */ \
|
||||
{ \
|
||||
NULL, NULL, NULL, NULL, \
|
||||
}, \
|
||||
/* panel width = 4 */ \
|
||||
{ \
|
||||
NULL, \
|
||||
PASTEMAC2(c,kername4,_ri3), \
|
||||
NULL, \
|
||||
PASTEMAC2(z,kername4,_ri3), \
|
||||
}, \
|
||||
/* panel width = 5 */ \
|
||||
{ \
|
||||
NULL, NULL, NULL, NULL, \
|
||||
}, \
|
||||
/* panel width = 6 */ \
|
||||
{ \
|
||||
NULL, \
|
||||
PASTEMAC2(c,kername6,_ri3), \
|
||||
NULL, \
|
||||
PASTEMAC2(z,kername6,_ri3), \
|
||||
}, \
|
||||
/* panel width = 7 */ \
|
||||
{ \
|
||||
NULL, NULL, NULL, NULL, \
|
||||
}, \
|
||||
/* panel width = 8 */ \
|
||||
{ \
|
||||
NULL, \
|
||||
PASTEMAC2(c,kername8,_ri3), \
|
||||
NULL, \
|
||||
PASTEMAC2(z,kername8,_ri3), \
|
||||
}, \
|
||||
/* panel width = 9 */ \
|
||||
{ \
|
||||
NULL, NULL, NULL, NULL, \
|
||||
}, \
|
||||
/* panel width = 10 */ \
|
||||
{ \
|
||||
NULL, \
|
||||
PASTEMAC2(c,kername10,_ri3), \
|
||||
NULL, \
|
||||
PASTEMAC2(z,kername10,_ri3), \
|
||||
}, \
|
||||
/* panel width = 11 */ \
|
||||
{ \
|
||||
NULL, NULL, NULL, NULL, \
|
||||
}, \
|
||||
/* panel width = 12 */ \
|
||||
{ \
|
||||
NULL, \
|
||||
PASTEMAC2(c,kername12,_ri3), \
|
||||
NULL, \
|
||||
PASTEMAC2(z,kername12,_ri3), \
|
||||
}, \
|
||||
/* panel width = 13 */ \
|
||||
{ \
|
||||
NULL, NULL, NULL, NULL, \
|
||||
}, \
|
||||
/* panel width = 14 */ \
|
||||
{ \
|
||||
NULL, \
|
||||
PASTEMAC2(c,kername14,_ri3), \
|
||||
NULL, \
|
||||
PASTEMAC2(z,kername14,_ri3), \
|
||||
}, \
|
||||
/* panel width = 15 */ \
|
||||
{ \
|
||||
NULL, NULL, NULL, NULL, \
|
||||
}, \
|
||||
/* panel width = 16 */ \
|
||||
{ \
|
||||
NULL, \
|
||||
PASTEMAC2(c,kername16,_ri3), \
|
||||
NULL, \
|
||||
PASTEMAC2(z,kername16,_ri3), \
|
||||
}, \
|
||||
/* panel width = 17 */ \
|
||||
{ \
|
||||
NULL, NULL, NULL, NULL, \
|
||||
}, \
|
||||
};
|
||||
|
||||
GENARRAY( PACKM_2XK_KERNEL,
|
||||
PACKM_4XK_KERNEL,
|
||||
PACKM_6XK_KERNEL,
|
||||
PACKM_8XK_KERNEL,
|
||||
PACKM_10XK_KERNEL,
|
||||
PACKM_12XK_KERNEL,
|
||||
PACKM_14XK_KERNEL,
|
||||
PACKM_16XK_KERNEL )
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
/*
   Typed implementation template for packm_cxk_ri3.

   Packs an m x n panel of the complex matrix a (row stride inca, column
   stride lda, both in units of complex elements), scaled by kappa, into p
   as three separate real-typed parts: the first starts at p, the second
   psp real-typed elements after p, and the third 2*psp real-typed
   elements after p. Within each part the panel has unit row stride and
   column stride ldp.

   If the ftypes table holds a kernel for panel dimension m and this
   datatype, that kernel does the work. Otherwise the panel is packed by
   the generic element-by-element loops below, using scal2jri3s when conja
   requests conjugation and scal2ri3s otherwise.

   NOTE: the scal2vker template argument is accepted for compatibility
   with the instantiating macro but is not referenced in the body.
*/
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, scal2vker ) \
\
void PASTEMAC(ch,opname)( \
                          conj_t  conja, \
                          dim_t   m, \
                          dim_t   n, \
                          void*   kappa, \
                          void*   a, inc_t inca, inc_t lda, \
                          void*   p, inc_t psp,  inc_t ldp  \
                        ) \
{ \
    dim_t     panel_dim; \
    dim_t     i, j; \
    num_t     dt; \
    FUNCPTR_T f; \
\
    /* The panel dimension is always equal to the m dimension of p. */ \
    panel_dim = m; \
\
    /* Acquire the datatype for the current function. */ \
    dt = PASTEMAC(ch,type); \
\
    /* Index into the array to extract the correct function pointer.
       If the panel dimension is too big to be within the array of
       explicitly handled kernels, then we treat that kernel the same
       as if it were in range but unimplemented. */ \
    if ( panel_dim < FUNCPTR_ARRAY_LENGTH ) f = ftypes[panel_dim][dt]; \
    else                                    f = NULL; \
\
    /* If there exists a kernel implementation for the panel dimension
       provided, we invoke the implementation. Otherwise, we use scal2m. */ \
    if ( f != NULL ) \
    { \
        f( conja, \
           n, \
           kappa, \
           a, inca, lda, \
           p, psp,  ldp ); \
    } \
    else \
    { \
        ctype_r* restrict kappa_r = ( ctype_r* )kappa; \
        ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \
        ctype_r* restrict a_r     = ( ctype_r* )a; \
        ctype_r* restrict a_i     = ( ctype_r* )a + 1; \
        ctype_r* restrict p_r     = ( ctype_r* )p; \
        ctype_r* restrict p_i     = ( ctype_r* )p +   psp; \
        ctype_r* restrict p_ri    = ( ctype_r* )p + 2*psp; \
        /* a is addressed through real-typed pointers, so its strides are
           doubled. These are strides, not dimensions: keep them in the
           signed stride type so that negative strides survive intact. */ \
        inc_t             inca2   = 2*inca; \
        inc_t             lda2    = 2*lda; \
\
        /* Treat the panel as m x n and column-stored (unit row stride). */ \
\
        /* NOTE: The loops below are inlined versions of scal2m, but
           for separated real/imaginary storage. */ \
\
        if ( bli_is_conj( conja ) ) \
        { \
            for ( j = 0; j < n; ++j ) \
            { \
                for ( i = 0; i < m; ++i ) \
                { \
                    ctype_r* restrict alpha11_r = a_r  + (i  )*inca2 + (j  )*lda2; \
                    ctype_r* restrict alpha11_i = a_i  + (i  )*inca2 + (j  )*lda2; \
                    ctype_r* restrict pi11_r    = p_r  + (i  )*1     + (j  )*ldp; \
                    ctype_r* restrict pi11_i    = p_i  + (i  )*1     + (j  )*ldp; \
                    ctype_r* restrict pi11_ri   = p_ri + (i  )*1     + (j  )*ldp; \
\
                    PASTEMAC(ch,scal2jri3s)( *kappa_r, \
                                             *kappa_i, \
                                             *alpha11_r, \
                                             *alpha11_i, \
                                             *pi11_r, \
                                             *pi11_i, \
                                             *pi11_ri ); \
                } \
            } \
        } \
        else /* if ( bli_is_noconj( conja ) ) */ \
        { \
            for ( j = 0; j < n; ++j ) \
            { \
                for ( i = 0; i < m; ++i ) \
                { \
                    ctype_r* restrict alpha11_r = a_r  + (i  )*inca2 + (j  )*lda2; \
                    ctype_r* restrict alpha11_i = a_i  + (i  )*inca2 + (j  )*lda2; \
                    ctype_r* restrict pi11_r    = p_r  + (i  )*1     + (j  )*ldp; \
                    ctype_r* restrict pi11_i    = p_i  + (i  )*1     + (j  )*ldp; \
                    ctype_r* restrict pi11_ri   = p_ri + (i  )*1     + (j  )*ldp; \
\
                    PASTEMAC(ch,scal2ri3s)( *kappa_r, \
                                            *kappa_i, \
                                            *alpha11_r, \
                                            *alpha11_i, \
                                            *pi11_r, \
                                            *pi11_i, \
                                            *pi11_ri ); \
                } \
            } \
        } \
    } \
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC( packm_cxk_ri3, SCAL2V_KERNEL )
|
||||
|
||||
59
frame/1m/packm/bli_packm_cxk_ri3.h
Normal file
59
frame/1m/packm/bli_packm_cxk_ri3.h
Normal file
@@ -0,0 +1,59 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// Include headers for various packm kernels.
|
||||
#include "bli_packm_ref_2xk.h"
|
||||
#include "bli_packm_ref_4xk.h"
|
||||
#include "bli_packm_ref_6xk.h"
|
||||
#include "bli_packm_ref_8xk.h"
|
||||
#include "bli_packm_ref_10xk.h"
|
||||
#include "bli_packm_ref_12xk.h"
|
||||
#include "bli_packm_ref_14xk.h"
|
||||
#include "bli_packm_ref_16xk.h"
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_cxk_ri3 )
|
||||
|
||||
@@ -105,13 +105,13 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n_edge = n_panel_max; \
|
||||
ctype* p_edge = p + (i )*rs_p; \
|
||||
\
|
||||
PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p ); \
|
||||
PASTEMAC(ch,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
if ( n_panel != n_panel_max ) \
|
||||
@@ -121,16 +121,281 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n_edge = n_panel_max - j; \
|
||||
ctype* p_edge = p + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p ); \
|
||||
PASTEMAC(ch,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( packm_gen_cxk )
|
||||
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
) \
|
||||
{ \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
\
|
||||
dim_t panel_dim; \
|
||||
dim_t panel_len; \
|
||||
dim_t panel_len_max; \
|
||||
inc_t incc, ldc; \
|
||||
inc_t psp, ldp; \
|
||||
\
|
||||
\
|
||||
/* If the strides of p indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
panel_len = m_panel; \
|
||||
panel_len_max = m_panel_max; \
|
||||
incc = cs_c; \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
panel_len = n_panel; \
|
||||
panel_len_max = n_panel_max; \
|
||||
incc = rs_c; \
|
||||
ldc = cs_c; \
|
||||
ldp = cs_p; \
|
||||
} \
|
||||
\
|
||||
/* Compute the panel stride (ie: the element offset to the imaginary
|
||||
panel). */ \
|
||||
psp = ldp * panel_len_max; \
|
||||
\
|
||||
\
|
||||
/* Pack the panel. */ \
|
||||
PASTEMAC(ch,packm_cxk_ri)( conjc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, psp, ldp ); \
|
||||
\
|
||||
\
|
||||
/* The packed memory region was acquired/allocated with "aligned"
|
||||
dimensions (ie: dimensions that were possibly inflated up to a
|
||||
multiple). When these dimension are inflated, it creates empty
|
||||
regions along the bottom and/or right edges of the matrix. If
|
||||
either region exists, we set them to zero. This allows the
|
||||
micro-kernel to remain simple since it does not need to support
|
||||
different register blockings for the edge cases. */ \
|
||||
if ( m_panel != m_panel_max ) \
|
||||
{ \
|
||||
dim_t i = m_panel; \
|
||||
dim_t m_edge = m_panel_max - i; \
|
||||
dim_t n_edge = n_panel_max; \
|
||||
ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \
|
||||
ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_i, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
if ( n_panel != n_panel_max ) \
|
||||
{ \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_edge = m_panel_max; \
|
||||
dim_t n_edge = n_panel_max - j; \
|
||||
ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \
|
||||
ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_i, rs_p, cs_p ); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_gen_cxk_ri )
|
||||
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
) \
|
||||
{ \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
\
|
||||
dim_t panel_dim; \
|
||||
dim_t panel_len; \
|
||||
dim_t panel_len_max; \
|
||||
inc_t incc, ldc; \
|
||||
inc_t psp, ldp; \
|
||||
\
|
||||
\
|
||||
/* If the strides of p indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
panel_len = m_panel; \
|
||||
panel_len_max = m_panel_max; \
|
||||
incc = cs_c; \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
panel_len = n_panel; \
|
||||
panel_len_max = n_panel_max; \
|
||||
incc = rs_c; \
|
||||
ldc = cs_c; \
|
||||
ldp = cs_p; \
|
||||
} \
|
||||
\
|
||||
/* Compute the panel stride (ie: the element offset to the imaginary
|
||||
panel). */ \
|
||||
psp = ldp * panel_len_max; \
|
||||
\
|
||||
\
|
||||
/* Pack the panel. */ \
|
||||
PASTEMAC(ch,packm_cxk_ri3)( conjc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, psp, ldp ); \
|
||||
\
|
||||
\
|
||||
/* The packed memory region was acquired/allocated with "aligned"
|
||||
dimensions (ie: dimensions that were possibly inflated up to a
|
||||
multiple). When these dimension are inflated, it creates empty
|
||||
regions along the bottom and/or right edges of the matrix. If
|
||||
either region exists, we set them to zero. This allows the
|
||||
micro-kernel to remain simple since it does not need to support
|
||||
different register blockings for the edge cases. */ \
|
||||
if ( m_panel != m_panel_max ) \
|
||||
{ \
|
||||
dim_t i = m_panel; \
|
||||
dim_t m_edge = m_panel_max - i; \
|
||||
dim_t n_edge = n_panel_max; \
|
||||
ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \
|
||||
ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \
|
||||
ctype_r* p_edge_ri = ( ctype_r* )p + 2*psp + (i )*rs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_i, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_ri, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
if ( n_panel != n_panel_max ) \
|
||||
{ \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_edge = m_panel_max; \
|
||||
dim_t n_edge = n_panel_max - j; \
|
||||
ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \
|
||||
ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \
|
||||
ctype_r* p_edge_ri = ( ctype_r* )p + 2*psp + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_i, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_ri, rs_p, cs_p ); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_gen_cxk_ri3 )
|
||||
|
||||
|
||||
@@ -51,3 +51,25 @@ void PASTEMAC(ch,varname)( \
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_gen_cxk )
|
||||
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_gen_cxk_ri )
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_gen_cxk_ri3 )
|
||||
|
||||
@@ -87,8 +87,8 @@ void PASTEMAC(ch,varname)( \
|
||||
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_len = m_panel; \
|
||||
panel_dim = n_panel; \
|
||||
panel_len = m_panel; \
|
||||
incc = cs_c; \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
@@ -98,8 +98,8 @@ void PASTEMAC(ch,varname)( \
|
||||
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_len = n_panel; \
|
||||
panel_dim = m_panel; \
|
||||
panel_len = n_panel; \
|
||||
incc = rs_c; \
|
||||
ldc = cs_c; \
|
||||
ldp = cs_p; \
|
||||
@@ -167,12 +167,6 @@ void PASTEMAC(ch,varname)( \
|
||||
incc12 = ldc; \
|
||||
ldc12 = incc; \
|
||||
conjc12 = conjc; \
|
||||
\
|
||||
p11_m = panel_dim; \
|
||||
p11_n = panel_dim; \
|
||||
j = diagoffc_abs; \
|
||||
p11 = p + (j )*ldp; \
|
||||
c11 = c + (j )*ldc; \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc12 ); \
|
||||
@@ -199,19 +193,13 @@ void PASTEMAC(ch,varname)( \
|
||||
incc12 = incc; \
|
||||
ldc12 = ldc; \
|
||||
conjc12 = conjc; \
|
||||
\
|
||||
p11_m = panel_dim; \
|
||||
p11_n = panel_dim; \
|
||||
j = diagoffc_abs; \
|
||||
p11 = p + (j )*ldp; \
|
||||
c11 = c + (j )*ldc; \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc10 ); \
|
||||
} \
|
||||
\
|
||||
/* Pack to P10. For upper storage, this includes the unstored
|
||||
triangle of C11. */ \
|
||||
/* Pack to p10. For upper storage, this includes the unstored
|
||||
triangle of c11. */ \
|
||||
PASTEMAC(ch,packm_cxk)( conjc10, \
|
||||
p10_dim, \
|
||||
p10_len, \
|
||||
@@ -219,8 +207,8 @@ void PASTEMAC(ch,varname)( \
|
||||
c10, incc10, ldc10, \
|
||||
p10, ldp ); \
|
||||
\
|
||||
/* Pack to P12. For lower storage, this includes the unstored
|
||||
triangle of C11. */ \
|
||||
/* Pack to p12. For lower storage, this includes the unstored
|
||||
triangle of c11. */ \
|
||||
PASTEMAC(ch,packm_cxk)( conjc12, \
|
||||
p12_dim, \
|
||||
p12_len, \
|
||||
@@ -228,29 +216,37 @@ void PASTEMAC(ch,varname)( \
|
||||
c12, incc12, ldc12, \
|
||||
p12, ldp ); \
|
||||
\
|
||||
/* Pack the stored triangule of C11 to P11. */ \
|
||||
PASTEMAC3(ch,ch,ch,scal2m_unb_var1)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
p11_m, \
|
||||
p11_n, \
|
||||
kappa, \
|
||||
c11, rs_c, cs_c, \
|
||||
p11, rs_p11, cs_p11 ); \
|
||||
\
|
||||
/* If source matrix C is Hermitian, we have to zero out the
|
||||
imaginary components of the diagonal of P11 in case the
|
||||
corresponding elements in C11 were not already zero. */ \
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
/* Pack the stored triangle of c11 to p11. */ \
|
||||
{ \
|
||||
/* NOTE: We can directly increment p11 since we are done
|
||||
using p11 for the remainder of the function. */ \
|
||||
for ( i = 0; i < p11_m; ++i ) \
|
||||
{ \
|
||||
PASTEMAC(ch,seti0s)( *p11 ); \
|
||||
p11_m = panel_dim; \
|
||||
p11_n = panel_dim; \
|
||||
j = diagoffc_abs; \
|
||||
p11 = p + (j )*ldp; \
|
||||
c11 = c + (j )*ldc; \
|
||||
\
|
||||
p11 += rs_p11 + cs_p11; \
|
||||
PASTEMAC(ch,scal2m)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uploc, \
|
||||
conjc, \
|
||||
p11_m, \
|
||||
p11_n, \
|
||||
kappa, \
|
||||
c11, rs_c, cs_c, \
|
||||
p11, rs_p11, cs_p11 ); \
|
||||
\
|
||||
/* If source matrix c is Hermitian, we have to zero out the
|
||||
imaginary components of the diagonal of p11 in case the
|
||||
corresponding elements in c11 were not already zero. */ \
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
{ \
|
||||
/* NOTE: We can directly increment p11 since we are done
|
||||
using p11 for the remainder of the function. */ \
|
||||
for ( i = 0; i < p11_m; ++i ) \
|
||||
{ \
|
||||
PASTEMAC(ch,seti0s)( *p11 ); \
|
||||
\
|
||||
p11 += rs_p11 + cs_p11; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
@@ -269,13 +265,13 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n_edge = n_panel_max; \
|
||||
ctype* p_edge = p + (i )*rs_p; \
|
||||
\
|
||||
PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p ); \
|
||||
PASTEMAC(ch,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
if ( n_panel != n_panel_max ) \
|
||||
@@ -285,16 +281,721 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n_edge = n_panel_max - j; \
|
||||
ctype* p_edge = p + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p ); \
|
||||
PASTEMAC(ch,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( packm_herm_cxk )
|
||||
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
) \
|
||||
{ \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
ctype_r* restrict one_r = PASTEMAC(chr,1); \
|
||||
ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \
|
||||
\
|
||||
ctype_r* restrict p_r = ( ctype_r* )p; \
|
||||
\
|
||||
dim_t i, j; \
|
||||
dim_t panel_len; \
|
||||
dim_t panel_len_max; \
|
||||
doff_t diagoffc_abs; \
|
||||
dim_t panel_dim; \
|
||||
inc_t incc, ldc; \
|
||||
inc_t psp, ldp; \
|
||||
\
|
||||
ctype* restrict c10; \
|
||||
ctype_r* restrict p10; \
|
||||
dim_t p10_dim, p10_len; \
|
||||
inc_t incc10, ldc10; \
|
||||
doff_t diagoffc10; \
|
||||
conj_t conjc10; \
|
||||
\
|
||||
ctype* restrict c12; \
|
||||
ctype_r* restrict p12; \
|
||||
dim_t p12_dim, p12_len; \
|
||||
inc_t incc12, ldc12; \
|
||||
doff_t diagoffc12; \
|
||||
conj_t conjc12; \
|
||||
\
|
||||
inc_t rs_p11, cs_p11; \
|
||||
\
|
||||
\
|
||||
/* If the strides of p indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
panel_len = m_panel; \
|
||||
panel_len_max = m_panel_max; \
|
||||
incc = cs_c; \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
rs_p11 = rs_p; \
|
||||
cs_p11 = 1; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
panel_len = n_panel; \
|
||||
panel_len_max = n_panel_max; \
|
||||
incc = rs_c; \
|
||||
ldc = cs_c; \
|
||||
ldp = cs_p; \
|
||||
rs_p11 = 1; \
|
||||
cs_p11 = cs_p; \
|
||||
} \
|
||||
\
|
||||
/* Compute the panel stride (ie: the element offset to the imaginary
|
||||
panel). */ \
|
||||
psp = ldp * panel_len_max; \
|
||||
\
|
||||
\
|
||||
if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \
|
||||
{ \
|
||||
/* If the current panel is unstored, we need to make a few
|
||||
adjustments so we refer to the data where it is actually
|
||||
stored, also taking conjugation into account. (Note this
|
||||
implicitly assumes we are operating on a dense panel
|
||||
within a larger symmetric or Hermitian matrix, since a
|
||||
general matrix would not contain any unstored region.) */ \
|
||||
if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \
|
||||
{ \
|
||||
c = c + diagoffc * ( doff_t )cs_c + \
|
||||
-diagoffc * ( doff_t )rs_c; \
|
||||
bli_swap_incs( incc, ldc ); \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc ); \
|
||||
} \
|
||||
\
|
||||
/* Pack the full panel. */ \
|
||||
PASTEMAC(ch,packm_cxk_ri)( conjc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, psp, ldp ); \
|
||||
} \
|
||||
else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \
|
||||
{ \
|
||||
/* Sanity check. Diagonals should not intersect the short end of
|
||||
a micro-panel. If they do, then somehow the constraints on
|
||||
cache blocksizes being a whole multiple of the register
|
||||
blocksizes was somehow violated. */ \
|
||||
if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc < 0 ) || \
|
||||
( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc > 0 ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
diagoffc_abs = bli_abs( diagoffc ); \
|
||||
\
|
||||
if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
|
||||
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs; \
|
||||
p10 = p_r; \
|
||||
c10 = c; \
|
||||
incc10 = incc; \
|
||||
ldc10 = ldc; \
|
||||
conjc10 = conjc; \
|
||||
\
|
||||
p12_dim = panel_dim; \
|
||||
p12_len = panel_len - p10_len; \
|
||||
j = p10_len; \
|
||||
diagoffc12 = diagoffc_abs - j; \
|
||||
p12 = p_r + (j )*ldp; \
|
||||
c12 = c + (j )*ldc; \
|
||||
c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
|
||||
-diagoffc12 * ( doff_t )rs_c; \
|
||||
incc12 = ldc; \
|
||||
ldc12 = incc; \
|
||||
conjc12 = conjc; \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc12 ); \
|
||||
} \
|
||||
else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
|
||||
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs + panel_dim; \
|
||||
diagoffc10 = diagoffc; \
|
||||
p10 = p_r; \
|
||||
c10 = c; \
|
||||
c10 = c10 + diagoffc10 * ( doff_t )cs_c + \
|
||||
-diagoffc10 * ( doff_t )rs_c; \
|
||||
incc10 = ldc; \
|
||||
ldc10 = incc; \
|
||||
conjc10 = conjc; \
|
||||
\
|
||||
p12_dim = panel_dim; \
|
||||
p12_len = panel_len - p10_len; \
|
||||
j = p10_len; \
|
||||
p12 = p_r + (j )*ldp; \
|
||||
c12 = c + (j )*ldc; \
|
||||
incc12 = incc; \
|
||||
ldc12 = ldc; \
|
||||
conjc12 = conjc; \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc10 ); \
|
||||
} \
|
||||
\
|
||||
/* Pack to p10. For upper storage, this includes the unstored
|
||||
triangle of c11. */ \
|
||||
PASTEMAC(ch,packm_cxk_ri)( conjc10, \
|
||||
p10_dim, \
|
||||
p10_len, \
|
||||
kappa, \
|
||||
c10, incc10, ldc10, \
|
||||
p10, psp, ldp ); \
|
||||
\
|
||||
/* Pack to p12. For lower storage, this includes the unstored
|
||||
triangle of c11. */ \
|
||||
PASTEMAC(ch,packm_cxk_ri)( conjc12, \
|
||||
p12_dim, \
|
||||
p12_len, \
|
||||
kappa, \
|
||||
c12, incc12, ldc12, \
|
||||
p12, psp, ldp ); \
|
||||
\
|
||||
/* Pack the stored triangle of c11 to p11. */ \
|
||||
{ \
|
||||
dim_t p11_m = panel_dim; \
|
||||
dim_t p11_n = panel_dim; \
|
||||
inc_t rs_c11 = 2*rs_c; \
|
||||
inc_t cs_c11 = 2*cs_c; \
|
||||
dim_t j = diagoffc_abs; \
|
||||
ctype* c11 = ( ctype* )c + (j )*ldc; \
|
||||
ctype_r* p11 = ( ctype_r* )p_r + (j )*ldp; \
|
||||
ctype_r* c11_r = ( ctype_r* )c11; \
|
||||
ctype_r* c11_i = ( ctype_r* )c11 + 1; \
|
||||
ctype_r* p11_r = ( ctype_r* )p11; \
|
||||
ctype_r* p11_i = ( ctype_r* )p11 + psp; \
|
||||
ctype_r* kappa_r = &PASTEMAC(ch,real)( *kappa ); \
|
||||
ctype_r* kappa_i = &PASTEMAC(ch,imag)( *kappa ); \
|
||||
ctype_r* alpha_r = one_r; \
|
||||
ctype_r* alpha_i = ( bli_is_conj( conjc ) ? minus_one_r : one_r ); \
|
||||
\
|
||||
/* Copy the real part of the stored triangle of c11 to p11_r. */ \
|
||||
PASTEMAC(chr,scal2m)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uploc, \
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
p11_m, \
|
||||
p11_n, \
|
||||
alpha_r, \
|
||||
c11_r, rs_c11, cs_c11, \
|
||||
p11_r, rs_p11, cs_p11 ); \
|
||||
\
|
||||
/* Copy the imaginary part of the stored triangle of c11 to p11_i,
|
||||
scaling by -1 if conjugation on c was requested. */ \
|
||||
PASTEMAC(chr,scal2m)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uploc, \
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
p11_m, \
|
||||
p11_n, \
|
||||
alpha_i, \
|
||||
c11_i, rs_c11, cs_c11, \
|
||||
p11_i, rs_p11, cs_p11 ); \
|
||||
\
|
||||
/* If source matrix c is Hermitian, we have to zero out the
|
||||
imaginary components of the diagonal of p11 in case the
|
||||
corresponding elements in c11 were not already zero. */ \
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
{ \
|
||||
for ( i = 0; i < p11_m; ++i ) \
|
||||
{ \
|
||||
ctype_r* pi11_i = p11_i + (i )*rs_p11 + (i )*cs_p11; \
|
||||
\
|
||||
PASTEMAC(chr,set0s)( *pi11_i ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* Apply kappa to the part of p11 that corresponds to the stored
|
||||
part of c11 that was copied above. */ \
|
||||
if ( bli_is_upper( uploc ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scalris_mxn_u)( 0, \
|
||||
p11_m, \
|
||||
p11_n, \
|
||||
kappa_r, \
|
||||
kappa_i, \
|
||||
p11_r, \
|
||||
p11_i, rs_p11, cs_p11 ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(ch,scalris_mxn_l)( 0, \
|
||||
p11_m, \
|
||||
p11_n, \
|
||||
kappa_r, \
|
||||
kappa_i, \
|
||||
p11_r, \
|
||||
p11_i, rs_p11, cs_p11 ); \
|
||||
} \
|
||||
/*
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_r copied", m_panel_max, n_panel_max, \
|
||||
p_r + 0*psp, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_i copied", m_panel_max, n_panel_max, \
|
||||
p_r + 1*psp, rs_p, cs_p, "%4.1f", "" ); \
|
||||
*/ \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* The packed memory region was acquired/allocated with "aligned"
|
||||
dimensions (ie: dimensions that were possibly inflated up to a
|
||||
multiple). When these dimension are inflated, it creates empty
|
||||
regions along the bottom and/or right edges of the matrix. If
|
||||
either region exists, we set them to zero. This allows the
|
||||
micro-kernel to remain simple since it does not need to support
|
||||
different register blockings for the edge cases. */ \
|
||||
if ( m_panel != m_panel_max ) \
|
||||
{ \
|
||||
dim_t i = m_panel; \
|
||||
dim_t m_edge = m_panel_max - i; \
|
||||
dim_t n_edge = n_panel_max; \
|
||||
ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \
|
||||
ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_i, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
if ( n_panel != n_panel_max ) \
|
||||
{ \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_edge = m_panel_max; \
|
||||
dim_t n_edge = n_panel_max - j; \
|
||||
ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \
|
||||
ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_i, rs_p, cs_p ); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_herm_cxk_ri )
|
||||
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
) \
|
||||
{ \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
ctype_r* restrict one_r = PASTEMAC(chr,1); \
|
||||
ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \
|
||||
\
|
||||
ctype_r* restrict p_r = ( ctype_r* )p; \
|
||||
\
|
||||
dim_t i, j; \
|
||||
dim_t panel_len; \
|
||||
dim_t panel_len_max; \
|
||||
doff_t diagoffc_abs; \
|
||||
dim_t panel_dim; \
|
||||
inc_t incc, ldc; \
|
||||
inc_t psp, ldp; \
|
||||
\
|
||||
ctype* restrict c10; \
|
||||
ctype_r* restrict p10; \
|
||||
dim_t p10_dim, p10_len; \
|
||||
inc_t incc10, ldc10; \
|
||||
doff_t diagoffc10; \
|
||||
conj_t conjc10; \
|
||||
\
|
||||
ctype* restrict c12; \
|
||||
ctype_r* restrict p12; \
|
||||
dim_t p12_dim, p12_len; \
|
||||
inc_t incc12, ldc12; \
|
||||
doff_t diagoffc12; \
|
||||
conj_t conjc12; \
|
||||
\
|
||||
inc_t rs_p11, cs_p11; \
|
||||
\
|
||||
\
|
||||
/* If the strides of p indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
panel_len = m_panel; \
|
||||
panel_len_max = m_panel_max; \
|
||||
incc = cs_c; \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
rs_p11 = rs_p; \
|
||||
cs_p11 = 1; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
panel_len = n_panel; \
|
||||
panel_len_max = n_panel_max; \
|
||||
incc = rs_c; \
|
||||
ldc = cs_c; \
|
||||
ldp = cs_p; \
|
||||
rs_p11 = 1; \
|
||||
cs_p11 = cs_p; \
|
||||
} \
|
||||
\
|
||||
/* Compute the panel stride (ie: the element offset to the imaginary
|
||||
panel). */ \
|
||||
psp = ldp * panel_len_max; \
|
||||
\
|
||||
\
|
||||
if ( !bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) \
|
||||
{ \
|
||||
/* If the current panel is unstored, we need to make a few
|
||||
adjustments so we refer to the data where it is actually
|
||||
stored, also taking conjugation into account. (Note this
|
||||
implicitly assumes we are operating on a dense panel
|
||||
within a larger symmetric or Hermitian matrix, since a
|
||||
general matrix would not contain any unstored region.) */ \
|
||||
if ( bli_is_unstored_subpart_n( diagoffc, uploc, m_panel, n_panel ) ) \
|
||||
{ \
|
||||
c = c + diagoffc * ( doff_t )cs_c + \
|
||||
-diagoffc * ( doff_t )rs_c; \
|
||||
bli_swap_incs( incc, ldc ); \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc ); \
|
||||
} \
|
||||
\
|
||||
/* Pack the full panel. */ \
|
||||
PASTEMAC(ch,packm_cxk_ri3)( conjc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, psp, ldp ); \
|
||||
} \
|
||||
else /* if ( bli_intersects_diag_n( diagoffc, m_panel, n_panel ) ) */ \
|
||||
{ \
|
||||
/* Sanity check. Diagonals should not intersect the short end of
|
||||
a micro-panel. If they do, then somehow the constraints on
|
||||
cache blocksizes being a whole multiple of the register
|
||||
blocksizes was somehow violated. */ \
|
||||
if ( ( bli_is_col_stored_f( rs_p, cs_p ) && diagoffc < 0 ) || \
|
||||
( bli_is_row_stored_f( rs_p, cs_p ) && diagoffc > 0 ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
diagoffc_abs = bli_abs( diagoffc ); \
|
||||
\
|
||||
if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) || \
|
||||
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) ) \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs; \
|
||||
p10 = p_r; \
|
||||
c10 = c; \
|
||||
incc10 = incc; \
|
||||
ldc10 = ldc; \
|
||||
conjc10 = conjc; \
|
||||
\
|
||||
p12_dim = panel_dim; \
|
||||
p12_len = panel_len - p10_len; \
|
||||
j = p10_len; \
|
||||
diagoffc12 = diagoffc_abs - j; \
|
||||
p12 = p_r + (j )*ldp; \
|
||||
c12 = c + (j )*ldc; \
|
||||
c12 = c12 + diagoffc12 * ( doff_t )cs_c + \
|
||||
-diagoffc12 * ( doff_t )rs_c; \
|
||||
incc12 = ldc; \
|
||||
ldc12 = incc; \
|
||||
conjc12 = conjc; \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc12 ); \
|
||||
} \
|
||||
else /* if ( ( bli_is_row_stored_f( rs_p, cs_p ) && bli_is_lower( uploc ) ) || \
|
||||
( bli_is_col_stored_f( rs_p, cs_p ) && bli_is_upper( uploc ) ) ) */ \
|
||||
{ \
|
||||
p10_dim = panel_dim; \
|
||||
p10_len = diagoffc_abs + panel_dim; \
|
||||
diagoffc10 = diagoffc; \
|
||||
p10 = p_r; \
|
||||
c10 = c; \
|
||||
c10 = c10 + diagoffc10 * ( doff_t )cs_c + \
|
||||
-diagoffc10 * ( doff_t )rs_c; \
|
||||
incc10 = ldc; \
|
||||
ldc10 = incc; \
|
||||
conjc10 = conjc; \
|
||||
\
|
||||
p12_dim = panel_dim; \
|
||||
p12_len = panel_len - p10_len; \
|
||||
j = p10_len; \
|
||||
p12 = p_r + (j )*ldp; \
|
||||
c12 = c + (j )*ldc; \
|
||||
incc12 = incc; \
|
||||
ldc12 = ldc; \
|
||||
conjc12 = conjc; \
|
||||
\
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
bli_toggle_conj( conjc10 ); \
|
||||
} \
|
||||
\
|
||||
/* Pack to p10. For upper storage, this includes the unstored
|
||||
triangle of c11. */ \
|
||||
PASTEMAC(ch,packm_cxk_ri3)( conjc10, \
|
||||
p10_dim, \
|
||||
p10_len, \
|
||||
kappa, \
|
||||
c10, incc10, ldc10, \
|
||||
p10, psp, ldp ); \
|
||||
\
|
||||
/* Pack to p12. For lower storage, this includes the unstored
|
||||
triangle of c11. */ \
|
||||
PASTEMAC(ch,packm_cxk_ri3)( conjc12, \
|
||||
p12_dim, \
|
||||
p12_len, \
|
||||
kappa, \
|
||||
c12, incc12, ldc12, \
|
||||
p12, psp, ldp ); \
|
||||
\
|
||||
/* Pack the stored triangle of c11 to p11. */ \
|
||||
{ \
|
||||
dim_t p11_m = panel_dim; \
|
||||
dim_t p11_n = panel_dim; \
|
||||
inc_t rs_c11 = 2*rs_c; \
|
||||
inc_t cs_c11 = 2*cs_c; \
|
||||
dim_t j = diagoffc_abs; \
|
||||
ctype* c11 = ( ctype* )c + (j )*ldc; \
|
||||
ctype_r* p11 = ( ctype_r* )p_r + (j )*ldp; \
|
||||
ctype_r* c11_r = ( ctype_r* )c11; \
|
||||
ctype_r* c11_i = ( ctype_r* )c11 + 1; \
|
||||
ctype_r* p11_r = ( ctype_r* )p11; \
|
||||
ctype_r* p11_i = ( ctype_r* )p11 + psp; \
|
||||
ctype_r* kappa_r = &PASTEMAC(ch,real)( *kappa ); \
|
||||
ctype_r* kappa_i = &PASTEMAC(ch,imag)( *kappa ); \
|
||||
ctype_r* alpha_r = one_r; \
|
||||
ctype_r* alpha_i = ( bli_is_conj( conjc ) ? minus_one_r : one_r ); \
|
||||
\
|
||||
/* Copy the real part of the stored triangle of c11 to p11_r. */ \
|
||||
PASTEMAC(chr,scal2m)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uploc, \
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
p11_m, \
|
||||
p11_n, \
|
||||
alpha_r, \
|
||||
c11_r, rs_c11, cs_c11, \
|
||||
p11_r, rs_p11, cs_p11 ); \
|
||||
\
|
||||
/* Copy the imaginary part of the stored triangle of c11 to p11_i,
|
||||
scaling by -1 if conjugation on c was requested. */ \
|
||||
PASTEMAC(chr,scal2m)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uploc, \
|
||||
BLIS_NO_TRANSPOSE, \
|
||||
p11_m, \
|
||||
p11_n, \
|
||||
alpha_i, \
|
||||
c11_i, rs_c11, cs_c11, \
|
||||
p11_i, rs_p11, cs_p11 ); \
|
||||
\
|
||||
/* If source matrix c is Hermitian, we have to zero out the
|
||||
imaginary components of the diagonal of p11 in case the
|
||||
corresponding elements in c11 were not already zero. */ \
|
||||
if ( bli_is_hermitian( strucc ) ) \
|
||||
{ \
|
||||
for ( i = 0; i < p11_m; ++i ) \
|
||||
{ \
|
||||
ctype_r* pi11_i = p11_i + (i )*rs_p11 + (i )*cs_p11; \
|
||||
\
|
||||
PASTEMAC(chr,set0s)( *pi11_i ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* Apply kappa to the part of p11 that corresponds to the stored
|
||||
part of c11 that was copied above. */ \
|
||||
if ( bli_is_upper( uploc ) ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scalris_mxn_u)( 0, \
|
||||
p11_m, \
|
||||
p11_n, \
|
||||
kappa_r, \
|
||||
kappa_i, \
|
||||
p11_r, \
|
||||
p11_i, rs_p11, cs_p11 ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(ch,scalris_mxn_l)( 0, \
|
||||
p11_m, \
|
||||
p11_n, \
|
||||
kappa_r, \
|
||||
kappa_i, \
|
||||
p11_r, \
|
||||
p11_i, rs_p11, cs_p11 ); \
|
||||
} \
|
||||
\
|
||||
/* Update the p11 section of the ri panel. It simply needs
|
||||
to contain the sum of p11_r + p11_i. */ \
|
||||
{ \
|
||||
ctype_r* p11_ri = p11_i + psp; \
|
||||
\
|
||||
for ( j = 0; j < p11_n; ++j ) \
|
||||
for ( i = 0; i < p11_m; ++i ) \
|
||||
{ \
|
||||
ctype_r* pi11_r = p11_r + (i )*rs_p11 + (j )*cs_p11; \
|
||||
ctype_r* pi11_i = p11_i + (i )*rs_p11 + (j )*cs_p11; \
|
||||
ctype_r* pi11_ri = p11_ri + (i )*rs_p11 + (j )*cs_p11; \
|
||||
\
|
||||
PASTEMAC(chr,add3s)( *pi11_r, \
|
||||
*pi11_i, \
|
||||
*pi11_ri ); \
|
||||
} \
|
||||
} \
|
||||
/*
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_r copied", m_panel_max, n_panel_max, \
|
||||
p_r + 0*psp, rs_p, cs_p, "%4.1f", "" ); \
|
||||
PASTEMAC(chr,fprintm)( stdout, "packm_herm_cxk: ap_i copied", m_panel_max, n_panel_max, \
|
||||
p_r + 1*psp, rs_p, cs_p, "%4.1f", "" ); \
|
||||
*/ \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* The packed memory region was acquired/allocated with "aligned"
|
||||
dimensions (ie: dimensions that were possibly inflated up to a
|
||||
multiple). When these dimension are inflated, it creates empty
|
||||
regions along the bottom and/or right edges of the matrix. If
|
||||
either region exists, we set them to zero. This allows the
|
||||
micro-kernel to remain simple since it does not need to support
|
||||
different register blockings for the edge cases. */ \
|
||||
if ( m_panel != m_panel_max ) \
|
||||
{ \
|
||||
dim_t i = m_panel; \
|
||||
dim_t m_edge = m_panel_max - i; \
|
||||
dim_t n_edge = n_panel_max; \
|
||||
ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \
|
||||
ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \
|
||||
ctype_r* p_edge_ri = ( ctype_r* )p + 2*psp + (i )*rs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_i, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_ri, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
if ( n_panel != n_panel_max ) \
|
||||
{ \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_edge = m_panel_max; \
|
||||
dim_t n_edge = n_panel_max - j; \
|
||||
ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \
|
||||
ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \
|
||||
ctype_r* p_edge_ri = ( ctype_r* )p + 2*psp + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_i, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_ri, rs_p, cs_p ); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_herm_cxk_ri3 )
|
||||
|
||||
|
||||
@@ -51,3 +51,25 @@ void PASTEMAC(ch,varname)( \
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_herm_cxk )
|
||||
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_herm_cxk_ri )
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_herm_cxk_ri3 )
|
||||
|
||||
@@ -303,7 +303,9 @@ void bli_packm_init_pack( bool_t densify,
|
||||
// Compute the size of the packed buffer.
|
||||
size_p = cs_p * n_p_pad * elem_size_p;
|
||||
}
|
||||
else if ( pack_schema == BLIS_PACKED_ROW_PANELS )
|
||||
else if ( pack_schema == BLIS_PACKED_ROW_PANELS ||
|
||||
pack_schema == BLIS_PACKED_ROW_PANELS_4M ||
|
||||
pack_schema == BLIS_PACKED_ROW_PANELS_3M )
|
||||
{
|
||||
dim_t m_panel;
|
||||
dim_t ps_p;
|
||||
@@ -331,11 +333,14 @@ void bli_packm_init_pack( bool_t densify,
|
||||
// dimension of the matrix is not a whole multiple of MR.
|
||||
ps_p = cs_p * n_p_pad;
|
||||
|
||||
if ( pack_schema == BLIS_PACKED_ROW_PANELS_3M )
|
||||
ps_p = ( ps_p * 3 ) / 2;
|
||||
|
||||
// Align the panel dimension according to the contiguous memory
|
||||
// stride alignment size so that the second, third, etc panels begin
|
||||
// at aligned addresses.
|
||||
ps_p = bli_align_dim_to_size( ps_p, elem_size_p,
|
||||
BLIS_CONTIG_STRIDE_ALIGN_SIZE );
|
||||
//ps_p = bli_align_dim_to_size( ps_p, elem_size_p,
|
||||
// BLIS_CONTIG_STRIDE_ALIGN_SIZE );
|
||||
|
||||
// Store the strides and panel dimension in p.
|
||||
bli_obj_set_incs( rs_p, cs_p, *p );
|
||||
@@ -345,7 +350,9 @@ void bli_packm_init_pack( bool_t densify,
|
||||
// Compute the size of the packed buffer.
|
||||
size_p = ps_p * (m_p_pad / m_panel) * elem_size_p;
|
||||
}
|
||||
else if ( pack_schema == BLIS_PACKED_COL_PANELS )
|
||||
else if ( pack_schema == BLIS_PACKED_COL_PANELS ||
|
||||
pack_schema == BLIS_PACKED_COL_PANELS_4M ||
|
||||
pack_schema == BLIS_PACKED_COL_PANELS_3M )
|
||||
{
|
||||
dim_t n_panel;
|
||||
dim_t ps_p;
|
||||
@@ -373,11 +380,14 @@ void bli_packm_init_pack( bool_t densify,
|
||||
// dimension of the matrix is not a whole multiple of NR.
|
||||
ps_p = m_p_pad * rs_p;
|
||||
|
||||
if ( pack_schema == BLIS_PACKED_COL_PANELS_3M )
|
||||
ps_p = ( ps_p * 3 ) / 2;
|
||||
|
||||
// Align the panel dimension according to the contiguous memory
|
||||
// stride alignment size so that the second, third, etc panels begin
|
||||
// at aligned addresses.
|
||||
ps_p = bli_align_dim_to_size( ps_p, elem_size_p,
|
||||
BLIS_CONTIG_STRIDE_ALIGN_SIZE );
|
||||
//ps_p = bli_align_dim_to_size( ps_p, elem_size_p,
|
||||
// BLIS_CONTIG_STRIDE_ALIGN_SIZE );
|
||||
|
||||
// Store the strides and panel dimension in p.
|
||||
bli_obj_set_incs( rs_p, cs_p, *p );
|
||||
|
||||
@@ -44,8 +44,8 @@ static FUNCPTR_T vars[6][3] =
|
||||
// unblocked optimized unblocked blocked
|
||||
{ bli_packm_unb_var1, NULL, bli_packm_blk_var1 },
|
||||
{ NULL, NULL, NULL, },
|
||||
{ NULL, NULL, NULL, },
|
||||
{ NULL, NULL, NULL, },
|
||||
{ NULL, NULL, bli_packm_blk_var3 },
|
||||
{ NULL, NULL, bli_packm_blk_var4 },
|
||||
{ NULL, NULL, NULL, },
|
||||
{ NULL, NULL, NULL, },
|
||||
};
|
||||
|
||||
@@ -82,6 +82,7 @@ void PASTEMAC(ch,varname)( \
|
||||
ldc = cs_c; \
|
||||
ldp = cs_p; \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* Pack the panel. */ \
|
||||
PASTEMAC(ch,packm_cxk)( conjc, \
|
||||
@@ -91,44 +92,48 @@ void PASTEMAC(ch,varname)( \
|
||||
c, incc, ldc, \
|
||||
p, ldp ); \
|
||||
\
|
||||
/* If the diagonal of C is implicitly unit, set the diagonal of
|
||||
the packed panel to unit. */ \
|
||||
\
|
||||
/* If the diagonal of c is implicitly unit, explicitly set the
|
||||
diagonal of the packed panel to kappa. */ \
|
||||
if ( bli_is_unit_diag( diagc ) ) \
|
||||
{ \
|
||||
PASTEMAC2(ch,ch,setd_unb_var1)( diagoffp, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
kappa, \
|
||||
p, rs_p, cs_p ); \
|
||||
PASTEMAC(ch,setd)( diagoffp, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
kappa, \
|
||||
p, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* If requested, invert the diagonal of the packed panel. */ \
|
||||
if ( invdiag == TRUE ) \
|
||||
{ \
|
||||
PASTEMAC(ch,invertd_unb_var1)( diagoffp, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
p, rs_p, cs_p ); \
|
||||
PASTEMAC(ch,invertd)( diagoffp, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
p, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* Set the region opposite the diagonal of P to zero. To do this,
|
||||
/* Set the region opposite the diagonal of p to zero. To do this,
|
||||
we need to reference the "unstored" region on the other side of
|
||||
the diagonal. This amounts to toggling uploc and then shifting
|
||||
the diagonal offset to shrink the newly referenced region (by
|
||||
one diagonal). */ \
|
||||
one diagonal). Note that this zero-filling is not needed for
|
||||
trsm, since the unstored region is not referenced by the trsm
|
||||
micro-kernel; however, zero-filling is needed for trmm, which
|
||||
uses the gemm micro-kernel.*/ \
|
||||
{ \
|
||||
uplo_t uplop = uploc; \
|
||||
\
|
||||
bli_toggle_uplo( uplop ); \
|
||||
bli_shift_diag_offset_to_shrink_uplo( uplop, diagoffp ); \
|
||||
\
|
||||
PASTEMAC2(ch,ch,setm_unb_var1)( diagoffp, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
zero, \
|
||||
p, rs_p, cs_p ); \
|
||||
PASTEMAC(ch,setm)( diagoffp, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
zero, \
|
||||
p, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* The packed memory region was acquired/allocated with "aligned"
|
||||
@@ -145,13 +150,13 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n_edge = n_panel_max; \
|
||||
ctype* p_edge = p + (i )*rs_p; \
|
||||
\
|
||||
PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p ); \
|
||||
PASTEMAC(ch,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
if ( n_panel != n_panel_max ) \
|
||||
@@ -161,18 +166,23 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t n_edge = n_panel_max - j; \
|
||||
ctype* p_edge = p + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC2(ch,ch,setm_unb_var1)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p ); \
|
||||
PASTEMAC(ch,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero, \
|
||||
p_edge, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* If this panel is an edge case in both panel dimension and length,
|
||||
then it must be a bottom-right corner case. Set the part of the
|
||||
diagonal that extends into the zero-padded region to identity. */ \
|
||||
diagonal that extends into the zero-padded region to identity.
|
||||
NOTE: This is actually only necessary when packing for trsm, as
|
||||
it helps prevent NaNs and Infs from creeping into the computation.
|
||||
However, we set the region to identity for trmm as well. Those
|
||||
1.0's end up getting multiplied by the 0.0's in the zero-padded
|
||||
region of the other matrix, so there is no harm in this. */ \
|
||||
if ( m_panel != m_panel_max && \
|
||||
n_panel != n_panel_max ) \
|
||||
{ \
|
||||
@@ -181,27 +191,530 @@ void PASTEMAC(ch,varname)( \
|
||||
dim_t m_br = m_panel_max - i; \
|
||||
dim_t n_br = n_panel_max - j; \
|
||||
ctype* one = PASTEMAC(ch,1); \
|
||||
ctype* p_edge = p + (i )*rs_p + (j )*cs_p; \
|
||||
ctype* p_br = p + (i )*rs_p + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC2(ch,ch,setd_unb_var1)( 0, \
|
||||
m_br, \
|
||||
n_br, \
|
||||
one, \
|
||||
p_edge, rs_p, cs_p ); \
|
||||
/*
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var3: setting br unit diag", m_br, n_br, \
|
||||
p_edge, rs_p, cs_p, "%4.1f", "" ); \
|
||||
*/ \
|
||||
PASTEMAC(ch,setd)( 0, \
|
||||
m_br, \
|
||||
n_br, \
|
||||
one, \
|
||||
p_br, rs_p, cs_p ); \
|
||||
} \
|
||||
/*
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var1: setting br unit diag", m_br, n_br, \
|
||||
p_edge, rs_p, cs_p, "%4.1f", "" ); \
|
||||
*/ \
|
||||
/*
|
||||
if ( rs_p == 1 ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var3: ap copied", m_panel_max, n_panel_max, \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var1: ap copied", m_panel_max, n_panel_max, \
|
||||
p, rs_p, cs_p, "%4.1f", "" ); \
|
||||
if ( cs_p == 1 ) \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var3: bp copied", m_panel_max, n_panel_max, \
|
||||
PASTEMAC(ch,fprintm)( stdout, "packm_var1: bp copied", m_panel_max, n_panel_max, \
|
||||
p, rs_p, cs_p, "%4.1f", "" ); \
|
||||
*/ \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( packm_tri_cxk )
|
||||
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffp, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
bool_t invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
) \
|
||||
{ \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
ctype_r* restrict one_r = PASTEMAC(chr,1); \
|
||||
\
|
||||
dim_t i; \
|
||||
dim_t panel_dim; \
|
||||
dim_t panel_len; \
|
||||
dim_t panel_len_max; \
|
||||
inc_t incc, ldc; \
|
||||
inc_t psp, ldp; \
|
||||
\
|
||||
inc_t rs_p11, cs_p11; \
|
||||
\
|
||||
\
|
||||
/* If the strides of p indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
panel_len = m_panel; \
|
||||
panel_len_max = m_panel_max; \
|
||||
incc = cs_c; \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
rs_p11 = rs_p; \
|
||||
cs_p11 = 1; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
panel_len = n_panel; \
|
||||
panel_len_max = n_panel_max; \
|
||||
incc = rs_c; \
|
||||
ldc = cs_c; \
|
||||
ldp = cs_p; \
|
||||
rs_p11 = 1; \
|
||||
cs_p11 = cs_p; \
|
||||
} \
|
||||
\
|
||||
/* Compute the panel stride (ie: the element offset to the imaginary
|
||||
panel). */ \
|
||||
psp = ldp * panel_len_max; \
|
||||
\
|
||||
\
|
||||
/* Pack the panel. */ \
|
||||
PASTEMAC(ch,packm_cxk_ri)( conjc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, psp, ldp ); \
|
||||
\
|
||||
\
|
||||
/* Tweak the panel according to its triangular structure */ \
|
||||
{ \
|
||||
dim_t j = bli_abs( diagoffp ); \
|
||||
ctype_r* p11_r = ( ctype_r* )p + (j )*ldp; \
|
||||
ctype_r* p11_i = ( ctype_r* )p + psp + (j )*ldp; \
|
||||
\
|
||||
/* If the diagonal of c is implicitly unit, explicitly set the
|
||||
the diagonal of the packed panel to kappa. */ \
|
||||
if ( bli_is_unit_diag( diagc ) ) \
|
||||
{ \
|
||||
ctype_r* kappa_r = &PASTEMAC(ch,real)( *kappa ); \
|
||||
ctype_r* kappa_i = &PASTEMAC(ch,imag)( *kappa ); \
|
||||
\
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
kappa_r, \
|
||||
p11_r, rs_p11, cs_p11 ); \
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
kappa_i, \
|
||||
p11_i, rs_p11, cs_p11 ); \
|
||||
} \
|
||||
\
|
||||
/* If requested, invert the diagonal of the packed panel. */ \
|
||||
if ( invdiag == TRUE ) \
|
||||
{ \
|
||||
for ( i = 0; i < panel_dim; ++i ) \
|
||||
{ \
|
||||
ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \
|
||||
ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \
|
||||
\
|
||||
PASTEMAC(ch,invertris)( *pi11_r, *pi11_i ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* Set the region opposite the diagonal of p to zero. To do this,
|
||||
we need to reference the "unstored" region on the other side of
|
||||
the diagonal. This amounts to toggling uploc and then shifting
|
||||
the diagonal offset to shrink the newly referenced region (by
|
||||
one diagonal). Note that this zero-filling is not needed for
|
||||
trsm, since the unstored region is not referenced by the trsm
|
||||
micro-kernel; however, zero-filling is needed for trmm, which
|
||||
uses the gemm micro-kernel.*/ \
|
||||
{ \
|
||||
uplo_t uplop11 = uploc; \
|
||||
doff_t diagoffp11 = 0; \
|
||||
\
|
||||
bli_toggle_uplo( uplop11 ); \
|
||||
bli_shift_diag_offset_to_shrink_uplo( uplop11, diagoffp11 ); \
|
||||
\
|
||||
PASTEMAC(chr,setm)( diagoffp11, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop11, \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
zero_r, \
|
||||
p11_r, rs_p11, cs_p11 ); \
|
||||
PASTEMAC(chr,setm)( diagoffp11, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop11, \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
zero_r, \
|
||||
p11_i, rs_p11, cs_p11 ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* The packed memory region was acquired/allocated with "aligned"
|
||||
dimensions (ie: dimensions that were possibly inflated up to a
|
||||
multiple). When these dimension are inflated, it creates empty
|
||||
regions along the bottom and/or right edges of the matrix. If
|
||||
either region exists, we set them to zero. This allows the
|
||||
micro-kernel to remain simple since it does not need to support
|
||||
different register blockings for the edge cases. */ \
|
||||
if ( m_panel != m_panel_max ) \
|
||||
{ \
|
||||
dim_t i = m_panel; \
|
||||
dim_t m_edge = m_panel_max - i; \
|
||||
dim_t n_edge = n_panel_max; \
|
||||
ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \
|
||||
ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_i, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
if ( n_panel != n_panel_max ) \
|
||||
{ \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_edge = m_panel_max; \
|
||||
dim_t n_edge = n_panel_max - j; \
|
||||
ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \
|
||||
ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_i, rs_p, cs_p ); \
|
||||
\
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* If this panel is an edge case in both panel dimension and length,
|
||||
then it must be a bottom-right corner case. Set the part of the
|
||||
diagonal that extends into the zero-padded region to identity.
|
||||
NOTE: This is actually only necessary when packing for trsm, as
|
||||
it helps prevent NaNs and Infs from creeping into the computation.
|
||||
However, we set the region to identity for trmm as well. Those
|
||||
1.0's end up getting muliplied by the 0.0's in the zero-padded
|
||||
region of the other matrix, so there is no harm in this. */ \
|
||||
if ( m_panel != m_panel_max && \
|
||||
n_panel != n_panel_max ) \
|
||||
{ \
|
||||
dim_t i = m_panel; \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_br = m_panel_max - i; \
|
||||
dim_t n_br = n_panel_max - j; \
|
||||
ctype_r* p_br_r = ( ctype_r* )p + (i )*rs_p + (j )*cs_p; \
|
||||
ctype_r* p_br_i = ( ctype_r* )p + psp + (i )*rs_p + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_br, \
|
||||
n_br, \
|
||||
one_r, \
|
||||
p_br_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_br, \
|
||||
n_br, \
|
||||
zero_r, \
|
||||
p_br_i, rs_p, cs_p ); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_tri_cxk_ri )
|
||||
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffp, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
bool_t invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
) \
|
||||
{ \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
ctype_r* restrict one_r = PASTEMAC(chr,1); \
|
||||
\
|
||||
dim_t i; \
|
||||
dim_t panel_dim; \
|
||||
dim_t panel_len; \
|
||||
dim_t panel_len_max; \
|
||||
inc_t incc, ldc; \
|
||||
inc_t psp, ldp; \
|
||||
\
|
||||
inc_t rs_p11, cs_p11; \
|
||||
\
|
||||
\
|
||||
/* If the strides of p indicate row storage, then we are packing to
|
||||
column panels; otherwise, if the strides indicate column storage,
|
||||
we are packing to row panels. */ \
|
||||
if ( bli_is_row_stored_f( rs_p, cs_p ) ) \
|
||||
{ \
|
||||
/* Prepare to pack to row-stored column panel. */ \
|
||||
panel_dim = n_panel; \
|
||||
panel_len = m_panel; \
|
||||
panel_len_max = m_panel_max; \
|
||||
incc = cs_c; \
|
||||
ldc = rs_c; \
|
||||
ldp = rs_p; \
|
||||
rs_p11 = rs_p; \
|
||||
cs_p11 = 1; \
|
||||
} \
|
||||
else /* if ( bli_is_col_stored_f( rs_p, cs_p ) ) */ \
|
||||
{ \
|
||||
/* Prepare to pack to column-stored row panel. */ \
|
||||
panel_dim = m_panel; \
|
||||
panel_len = n_panel; \
|
||||
panel_len_max = n_panel_max; \
|
||||
incc = rs_c; \
|
||||
ldc = cs_c; \
|
||||
ldp = cs_p; \
|
||||
rs_p11 = 1; \
|
||||
cs_p11 = cs_p; \
|
||||
} \
|
||||
\
|
||||
/* Compute the panel stride (ie: the element offset to the imaginary
|
||||
panel). */ \
|
||||
psp = ldp * panel_len_max; \
|
||||
\
|
||||
\
|
||||
/* Pack the panel. */ \
|
||||
PASTEMAC(ch,packm_cxk_ri3)( conjc, \
|
||||
panel_dim, \
|
||||
panel_len, \
|
||||
kappa, \
|
||||
c, incc, ldc, \
|
||||
p, psp, ldp ); \
|
||||
\
|
||||
\
|
||||
/* Tweak the panel according to its triangular structure */ \
|
||||
{ \
|
||||
dim_t j = bli_abs( diagoffp ); \
|
||||
ctype_r* p11_r = ( ctype_r* )p + (j )*ldp; \
|
||||
ctype_r* p11_i = ( ctype_r* )p + psp + (j )*ldp; \
|
||||
ctype_r* p11_ri = ( ctype_r* )p + 2*psp + (j )*ldp; \
|
||||
\
|
||||
/* If the diagonal of c is implicitly unit, explicitly set the
|
||||
the diagonal of the packed panel to kappa. */ \
|
||||
if ( bli_is_unit_diag( diagc ) ) \
|
||||
{ \
|
||||
ctype_r* kappa_r = &PASTEMAC(ch,real)( *kappa ); \
|
||||
ctype_r* kappa_i = &PASTEMAC(ch,imag)( *kappa ); \
|
||||
\
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
kappa_r, \
|
||||
p11_r, rs_p11, cs_p11 ); \
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
kappa_i, \
|
||||
p11_i, rs_p11, cs_p11 ); \
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_panel, \
|
||||
n_panel, \
|
||||
kappa_i, \
|
||||
p11_ri, rs_p11, cs_p11 ); \
|
||||
} \
|
||||
\
|
||||
/* If requested, invert the diagonal of the packed panel. Note
|
||||
that we do not need to update the ri panel since inverted
|
||||
diagonals are only needed by trsm, which does not use the
|
||||
p11 section of the ri panel. */ \
|
||||
if ( invdiag == TRUE ) \
|
||||
{ \
|
||||
for ( i = 0; i < panel_dim; ++i ) \
|
||||
{ \
|
||||
ctype_r* pi11_r = p11_r + (i )*rs_p + (i )*cs_p; \
|
||||
ctype_r* pi11_i = p11_i + (i )*rs_p + (i )*cs_p; \
|
||||
\
|
||||
PASTEMAC(ch,invertris)( *pi11_r, *pi11_i ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* Set the region opposite the diagonal of p to zero. To do this,
|
||||
we need to reference the "unstored" region on the other side of
|
||||
the diagonal. This amounts to toggling uploc and then shifting
|
||||
the diagonal offset to shrink the newly referenced region (by
|
||||
one diagonal). Note that this zero-filling is not needed for
|
||||
trsm, since the unstored region is not referenced by the trsm
|
||||
micro-kernel; however, zero-filling is needed for trmm, which
|
||||
uses the gemm micro-kernel.*/ \
|
||||
{ \
|
||||
uplo_t uplop11 = uploc; \
|
||||
doff_t diagoffp11 = 0; \
|
||||
\
|
||||
bli_toggle_uplo( uplop11 ); \
|
||||
bli_shift_diag_offset_to_shrink_uplo( uplop11, diagoffp11 ); \
|
||||
\
|
||||
PASTEMAC(chr,setm)( diagoffp11, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop11, \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
zero_r, \
|
||||
p11_r, rs_p11, cs_p11 ); \
|
||||
PASTEMAC(chr,setm)( diagoffp11, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop11, \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
zero_r, \
|
||||
p11_i, rs_p11, cs_p11 ); \
|
||||
PASTEMAC(chr,setm)( diagoffp11, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
uplop11, \
|
||||
panel_dim, \
|
||||
panel_dim, \
|
||||
zero_r, \
|
||||
p11_ri, rs_p11, cs_p11 ); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* The packed memory region was acquired/allocated with "aligned"
|
||||
dimensions (ie: dimensions that were possibly inflated up to a
|
||||
multiple). When these dimension are inflated, it creates empty
|
||||
regions along the bottom and/or right edges of the matrix. If
|
||||
either region exists, we set them to zero. This allows the
|
||||
micro-kernel to remain simple since it does not need to support
|
||||
different register blockings for the edge cases. */ \
|
||||
if ( m_panel != m_panel_max ) \
|
||||
{ \
|
||||
dim_t i = m_panel; \
|
||||
dim_t m_edge = m_panel_max - i; \
|
||||
dim_t n_edge = n_panel_max; \
|
||||
ctype_r* p_edge_r = ( ctype_r* )p + (i )*rs_p; \
|
||||
ctype_r* p_edge_i = ( ctype_r* )p + psp + (i )*rs_p; \
|
||||
ctype_r* p_edge_ri = ( ctype_r* )p + 2*psp + (i )*rs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_i, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_ri, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
if ( n_panel != n_panel_max ) \
|
||||
{ \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_edge = m_panel_max; \
|
||||
dim_t n_edge = n_panel_max - j; \
|
||||
ctype_r* p_edge_r = ( ctype_r* )p + (j )*cs_p; \
|
||||
ctype_r* p_edge_i = ( ctype_r* )p + psp + (j )*cs_p; \
|
||||
ctype_r* p_edge_ri = ( ctype_r* )p + 2*psp + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_i, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setm)( 0, \
|
||||
BLIS_NONUNIT_DIAG, \
|
||||
BLIS_DENSE, \
|
||||
m_edge, \
|
||||
n_edge, \
|
||||
zero_r, \
|
||||
p_edge_ri, rs_p, cs_p ); \
|
||||
} \
|
||||
\
|
||||
/* If this panel is an edge case in both panel dimension and length,
|
||||
then it must be a bottom-right corner case. Set the part of the
|
||||
diagonal that extends into the zero-padded region to identity.
|
||||
NOTE: This is actually only necessary when packing for trsm, as
|
||||
it helps prevent NaNs and Infs from creeping into the computation.
|
||||
However, we set the region to identity for trmm as well. Those
|
||||
1.0's end up getting muliplied by the 0.0's in the zero-padded
|
||||
region of the other matrix, so there is no harm in this. */ \
|
||||
if ( m_panel != m_panel_max && \
|
||||
n_panel != n_panel_max ) \
|
||||
{ \
|
||||
dim_t i = m_panel; \
|
||||
dim_t j = n_panel; \
|
||||
dim_t m_br = m_panel_max - i; \
|
||||
dim_t n_br = n_panel_max - j; \
|
||||
ctype_r* p_br_r = ( ctype_r* )p + (i )*rs_p + (j )*cs_p; \
|
||||
ctype_r* p_br_i = ( ctype_r* )p + psp + (i )*rs_p + (j )*cs_p; \
|
||||
\
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_br, \
|
||||
n_br, \
|
||||
one_r, \
|
||||
p_br_r, rs_p, cs_p ); \
|
||||
PASTEMAC(chr,setd)( 0, \
|
||||
m_br, \
|
||||
n_br, \
|
||||
zero_r, \
|
||||
p_br_i, rs_p, cs_p ); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_tri_cxk_ri3 )
|
||||
|
||||
|
||||
@@ -53,3 +53,27 @@ void PASTEMAC(ch,varname)( \
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_tri_cxk )
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
struc_t strucc, \
|
||||
doff_t diagoffp, \
|
||||
diag_t diagc, \
|
||||
uplo_t uploc, \
|
||||
conj_t conjc, \
|
||||
bool_t invdiag, \
|
||||
dim_t m_panel, \
|
||||
dim_t n_panel, \
|
||||
dim_t m_panel_max, \
|
||||
dim_t n_panel_max, \
|
||||
ctype* restrict kappa, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
ctype* restrict p, inc_t rs_p, inc_t cs_p \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_tri_cxk_ri )
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_tri_cxk_ri3 )
|
||||
|
||||
|
||||
@@ -35,7 +35,7 @@
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
@@ -55,16 +55,16 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -74,16 +74,16 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -96,16 +96,16 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -115,16 +115,16 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -133,5 +133,246 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( packm_ref_10xk, packm_ref_10xk )
|
||||
INSERT_GENTFUNC_BASIC0( packm_ref_10xk )
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
const inc_t lda2 = 2 * lda; \
|
||||
\
|
||||
ctype* beta_cast = beta; \
|
||||
ctype_r* restrict beta_r = ( ctype_r* )beta; \
|
||||
ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_ref_10xk_ri )
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
const inc_t lda2 = 2 * lda; \
|
||||
\
|
||||
ctype* beta_cast = beta; \
|
||||
ctype_r* restrict beta_r = ( ctype_r* )beta; \
|
||||
ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_ref_10xk_ri3 )
|
||||
|
||||
|
||||
@@ -44,3 +44,31 @@ void PASTEMAC(ch,varname)( \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_ref_10xk )
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_ref_10xk_ri )
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_ref_10xk_ri3 )
|
||||
|
||||
@@ -35,7 +35,7 @@
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
@@ -55,18 +55,18 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 10*inca), *(pi1 + 10) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 11*inca), *(pi1 + 11) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -76,18 +76,18 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 10*inca), *(pi1 + 10) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 11*inca), *(pi1 + 11) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -100,18 +100,18 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 10*inca), *(pi1 + 10) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 11*inca), *(pi1 + 11) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +10*inca), *(pi1 +10) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +11*inca), *(pi1 +11) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -121,18 +121,18 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 10*inca), *(pi1 + 10) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 11*inca), *(pi1 + 11) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +10*inca), *(pi1 +10) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +11*inca), *(pi1 +11) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -141,5 +141,262 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( packm_ref_12xk, packm_ref_12xk )
|
||||
INSERT_GENTFUNC_BASIC0( packm_ref_12xk )
|
||||
|
||||
|
||||
|
||||
// packm_ref_12xk_ri: reference packing kernel template. The function name is
// formed by PASTEMAC(ch,varname) and the template is instantiated by
// INSERT_GENTFUNCCO_BASIC0 at the end of this block (presumably once per
// complex datatype -- confirm against that macro's definition).
//
// Parameters:
//   conja          - selects the "j" macro variants when bli_is_conj() is true.
//   n              - number of iterations; used directly as the loop counter,
//                    so n == 0 performs no work.
//   beta           - scalar; compared against one via the eq1 macro, and also
//                    viewed as two ctype_r values (beta_r, beta_i = beta_r + 1).
//   a, inca, lda   - source. It is walked through ctype_r pointers (alpha1_r at
//                    a, alpha1_i one ctype_r later), which is why the element
//                    stride and per-iteration stride are doubled (inca2, lda2).
//   p, psp, ldp    - destination, addressed as two ctype_r regions: pi1_r at p
//                    and pi1_i at p + psp (psp counted in ctype_r units). Both
//                    advance by ldp per iteration.
//
// Each iteration handles 12 elements, fully unrolled. The macro applied is:
//   *beta == 1, conjugate     -> copyjris
//   *beta == 1, no conjugate  -> copyris
//   *beta != 1, conjugate     -> scal2jris (also given beta_r, beta_i)
//   *beta != 1, no conjugate  -> scal2ris  (also given beta_r, beta_i)
//
// NOTE(review): the *ris macro bodies are not visible here; presumably they
// store the (optionally conjugated, optionally beta-scaled) real part through
// pi1_r and the imaginary part through pi1_i -- confirm. Presumably this split
// real/imaginary layout is the packing format for the 4m method -- confirm.
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
const inc_t lda2 = 2 * lda; \
|
||||
\
|
||||
ctype* beta_cast = beta; \
|
||||
ctype_r* restrict beta_r = ( ctype_r* )beta; \
|
||||
ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_ref_12xk_ri )
|
||||
|
||||
|
||||
|
||||
// packm_ref_12xk_ri3: reference packing kernel template. The function name is
// formed by PASTEMAC(ch,varname) and the template is instantiated by
// INSERT_GENTFUNCCO_BASIC0 at the end of this block (presumably once per
// complex datatype -- confirm against that macro's definition).
//
// Parameters:
//   conja          - selects the "j" macro variants when bli_is_conj() is true.
//   n              - number of iterations; used directly as the loop counter,
//                    so n == 0 performs no work.
//   beta           - scalar; compared against one via the eq1 macro, and also
//                    viewed as two ctype_r values (beta_r, beta_i = beta_r + 1).
//   a, inca, lda   - source. It is walked through ctype_r pointers (alpha1_r at
//                    a, alpha1_i one ctype_r later), which is why the element
//                    stride and per-iteration stride are doubled (inca2, lda2).
//   p, psp, ldp    - destination, addressed as three ctype_r regions: pi1_r at
//                    p, pi1_i at p + psp and pi1_ri at p + 2*psp (psp counted in
//                    ctype_r units). All three advance by ldp per iteration.
//
// Each iteration handles 12 elements, fully unrolled. The macro applied is:
//   *beta == 1, conjugate     -> copyjri3s
//   *beta == 1, no conjugate  -> copyri3s
//   *beta != 1, conjugate     -> scal2jri3s (also given beta_r, beta_i)
//   *beta != 1, no conjugate  -> scal2ri3s  (also given beta_r, beta_i)
//
// NOTE(review): the *ri3s macro bodies are not visible here; presumably they
// store the real part through pi1_r, the imaginary part through pi1_i, and a
// combined real+imaginary value through pi1_ri as needed by the 3m method --
// confirm against the macro definitions.
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
const inc_t lda2 = 2 * lda; \
|
||||
\
|
||||
ctype* beta_cast = beta; \
|
||||
ctype_r* restrict beta_r = ( ctype_r* )beta; \
|
||||
ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_ref_12xk_ri3 )
|
||||
|
||||
|
||||
@@ -44,3 +44,31 @@ void PASTEMAC(ch,varname)( \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_ref_12xk )
|
||||
|
||||
|
||||
// Prototype template for packm_ref_12xk_ri. The declared name is formed by
// PASTEMAC(ch,varname); INSERT_GENTPROTCO_BASIC at the end of this block
// expands the template (presumably once per complex datatype -- confirm).
//
// Declared parameters, in order: conja (conj_t), n (dim_t), beta (void*),
// the source a with increments inca and lda, and the destination p with
// increments psp and ldp. NOTE(review): psp is presumably the offset between
// the separately stored real and imaginary parts in p -- confirm against the
// function definition.
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_ref_12xk_ri )
|
||||
|
||||
|
||||
// Prototype template for packm_ref_12xk_ri3. The declared name is formed by
// PASTEMAC(ch,varname); INSERT_GENTPROTCO_BASIC at the end of this block
// expands the template (presumably once per complex datatype -- confirm).
//
// Declared parameters, in order: conja (conj_t), n (dim_t), beta (void*),
// the source a with increments inca and lda, and the destination p with
// increments psp and ldp. NOTE(review): psp is presumably the offset between
// the separately stored sub-panels of p (real, imaginary, and a third
// combined panel) -- confirm against the function definition.
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_ref_12xk_ri3 )
|
||||
|
||||
@@ -35,7 +35,7 @@
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
@@ -55,20 +55,20 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 10*inca), *(pi1 + 10) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 11*inca), *(pi1 + 11) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 12*inca), *(pi1 + 12) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 13*inca), *(pi1 + 13) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 +12*inca), *(pi1 +12) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 +13*inca), *(pi1 +13) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -78,20 +78,20 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 10*inca), *(pi1 + 10) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 11*inca), *(pi1 + 11) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 12*inca), *(pi1 + 12) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 13*inca), *(pi1 + 13) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 +12*inca), *(pi1 +12) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 +13*inca), *(pi1 +13) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -104,20 +104,20 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 10*inca), *(pi1 + 10) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 11*inca), *(pi1 + 11) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 12*inca), *(pi1 + 12) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 13*inca), *(pi1 + 13) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +10*inca), *(pi1 +10) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +11*inca), *(pi1 +11) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +12*inca), *(pi1 +12) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +13*inca), *(pi1 +13) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -127,20 +127,20 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 10*inca), *(pi1 + 10) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 11*inca), *(pi1 + 11) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 12*inca), *(pi1 + 12) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 13*inca), *(pi1 + 13) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +10*inca), *(pi1 +10) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +11*inca), *(pi1 +11) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +12*inca), *(pi1 +12) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +13*inca), *(pi1 +13) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -149,5 +149,278 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( packm_ref_14xk, packm_ref_14xk )
|
||||
INSERT_GENTFUNC_BASIC0( packm_ref_14xk )
|
||||
|
||||
|
||||
|
||||
// Function template for the 14-row "ri" packing kernel, which writes its
// output as two separate arrays of ctype_r values instead of one array of
// ctype values. (Presumably this is the packed format consumed by the 4m
// method -- confirm against the packm front-end.)
//
// Each expansion of GENTFUNCCO defines one function whose name is produced
// by PASTEMAC(ch,varname). Its arguments are:
//
//   conja         - if bli_is_conj( conja ) is true, the "j" element macros
//                   (copyjris / scal2jris) are used in place of copyris /
//                   scal2ris. Presumably these conjugate the source
//                   element; verify against the macro definitions.
//   n             - number of loop iterations. The loop counts n down to
//                   exactly zero, so n == 0 performs no work.
//   beta          - address of a single ctype scalar. When eq1 reports that
//                   it equals one, the copy macros are used and beta is not
//                   applied; otherwise its two ctype_r components (*beta_r,
//                   *beta_i) are handed to the scal2 macros.
//   a, inca, lda  - source data. Within one iteration the 14 elements read
//                   are inca apart; a advances by lda between iterations.
//                   Both strides are doubled (inca2, lda2) because a is
//                   walked through ctype_r pointers.
//   p, psp, ldp   - destination, addressed as ctype_r. Each iteration
//                   writes 14 contiguous values starting at p (pi1_r) and
//                   14 more starting psp ctype_r elements later (pi1_i);
//                   both output pointers advance by ldp per iteration.
//
// NOTE(review): the pointer casts assume a ctype object consists of two
// adjacent ctype_r values, with the "_i" component at offset 1 -- confirm
// this holds for every type the INSERT_ macro instantiates.
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
const inc_t lda2 = 2 * lda; \
|
||||
\
|
||||
ctype* beta_cast = beta; \
|
||||
ctype_r* restrict beta_r = ( ctype_r* )beta; \
|
||||
ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_ref_14xk_ri )
|
||||
|
||||
|
||||
|
||||
// Function template for the 14-row "ri3" packing kernel, which writes its
// output as three separate arrays of ctype_r values. (Presumably this is
// the packed format consumed by the 3m method -- confirm against the packm
// front-end.)
//
// Each expansion of GENTFUNCCO defines one function whose name is produced
// by PASTEMAC(ch,varname). Its arguments are:
//
//   conja         - if bli_is_conj( conja ) is true, the "j" element macros
//                   (copyjri3s / scal2jri3s) are used in place of copyri3s /
//                   scal2ri3s. Presumably these conjugate the source
//                   element; verify against the macro definitions.
//   n             - number of loop iterations. The loop counts n down to
//                   exactly zero, so n == 0 performs no work.
//   beta          - address of a single ctype scalar. When eq1 reports that
//                   it equals one, the copy macros are used and beta is not
//                   applied; otherwise its two ctype_r components (*beta_r,
//                   *beta_i) are handed to the scal2 macros.
//   a, inca, lda  - source data. Within one iteration the 14 elements read
//                   are inca apart; a advances by lda between iterations.
//                   Both strides are doubled (inca2, lda2) because a is
//                   walked through ctype_r pointers.
//   p, psp, ldp   - destination, addressed as ctype_r. Each iteration
//                   writes 14 contiguous values at each of three places:
//                   p (pi1_r), p + psp (pi1_i) and p + 2*psp (pi1_ri). All
//                   three output pointers advance by ldp per iteration.
//
// What lands in the third (pi1_ri) array is decided entirely by the *ri3s
// element macros; presumably it is the sum of the real and imaginary parts
// -- TODO confirm.
//
// NOTE(review): the pointer casts assume a ctype object consists of two
// adjacent ctype_r values, with the "_i" component at offset 1 -- confirm
// this holds for every type the INSERT_ macro instantiates.
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
const inc_t lda2 = 2 * lda; \
|
||||
\
|
||||
ctype* beta_cast = beta; \
|
||||
ctype_r* restrict beta_r = ( ctype_r* )beta; \
|
||||
ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_ref_14xk_ri3 )
|
||||
|
||||
|
||||
@@ -44,3 +44,31 @@ void PASTEMAC(ch,varname)( \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_ref_14xk )
|
||||
|
||||
|
||||
// Prototype template for the packm_ref_14xk_ri kernels. Each expansion of
// GENTPROTCO declares one function named by PASTEMAC(ch,varname) taking:
//   conja         - conjugation selector for the source elements
//   n             - iteration count
//   beta          - pointer to the scaling scalar
//   a, inca, lda  - source pointer and its two strides
//   p, psp, ldp   - destination pointer, an extra offset (psp) and a stride
// NOTE(review): the parameter roles above are inferred from the names only;
// this declaration carries no body. The ctype, ctype_r and chr template
// arguments are not used by the declaration itself.
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_ref_14xk_ri )
|
||||
|
||||
|
||||
// Prototype template for the packm_ref_14xk_ri3 kernels. Each expansion of
// GENTPROTCO declares one function named by PASTEMAC(ch,varname) taking:
//   conja         - conjugation selector for the source elements
//   n             - iteration count
//   beta          - pointer to the scaling scalar
//   a, inca, lda  - source pointer and its two strides
//   p, psp, ldp   - destination pointer, an extra offset (psp) and a stride
// NOTE(review): the parameter roles above are inferred from the names only;
// this declaration carries no body. The ctype, ctype_r and chr template
// arguments are not used by the declaration itself.
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_ref_14xk_ri3 )
|
||||
|
||||
@@ -35,7 +35,7 @@
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
@@ -55,22 +55,22 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 10*inca), *(pi1 + 10) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 11*inca), *(pi1 + 11) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 12*inca), *(pi1 + 12) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 13*inca), *(pi1 + 13) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 14*inca), *(pi1 + 14) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 15*inca), *(pi1 + 15) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 +12*inca), *(pi1 +12) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 +13*inca), *(pi1 +13) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 +14*inca), *(pi1 +14) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 +15*inca), *(pi1 +15) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -80,22 +80,22 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 10*inca), *(pi1 + 10) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 11*inca), *(pi1 + 11) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 12*inca), *(pi1 + 12) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 13*inca), *(pi1 + 13) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 14*inca), *(pi1 + 14) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 15*inca), *(pi1 + 15) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 +12*inca), *(pi1 +12) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 +13*inca), *(pi1 +13) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 +14*inca), *(pi1 +14) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 +15*inca), *(pi1 +15) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -108,22 +108,22 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 10*inca), *(pi1 + 10) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 11*inca), *(pi1 + 11) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 12*inca), *(pi1 + 12) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 13*inca), *(pi1 + 13) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 14*inca), *(pi1 + 14) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 15*inca), *(pi1 + 15) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +10*inca), *(pi1 +10) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +11*inca), *(pi1 +11) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +12*inca), *(pi1 +12) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +13*inca), *(pi1 +13) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +14*inca), *(pi1 +14) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 +15*inca), *(pi1 +15) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -133,22 +133,22 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 10*inca), *(pi1 + 10) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 11*inca), *(pi1 + 11) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 12*inca), *(pi1 + 12) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 13*inca), *(pi1 + 13) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 14*inca), *(pi1 + 14) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 15*inca), *(pi1 + 15) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +10*inca), *(pi1 +10) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +11*inca), *(pi1 +11) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +12*inca), *(pi1 +12) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +13*inca), *(pi1 +13) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +14*inca), *(pi1 +14) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 +15*inca), *(pi1 +15) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -157,5 +157,294 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( packm_ref_16xk, packm_ref_16xk )
|
||||
INSERT_GENTFUNC_BASIC0( packm_ref_16xk )
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
const inc_t lda2 = 2 * lda; \
|
||||
\
|
||||
ctype* beta_cast = beta; \
|
||||
ctype_r* restrict beta_r = ( ctype_r* )beta; \
|
||||
ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_ref_16xk_ri )
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
const inc_t lda2 = 2 * lda; \
|
||||
\
|
||||
ctype* beta_cast = beta; \
|
||||
ctype_r* restrict beta_r = ( ctype_r* )beta; \
|
||||
ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14), *(pi1_ri +14) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15), *(pi1_ri +15) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14), *(pi1_ri +14) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15), *(pi1_ri +15) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14), *(pi1_ri +14) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15), *(pi1_ri +15) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8), *(pi1_ri + 8) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9), *(pi1_ri + 9) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10), *(pi1_ri +10) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11), *(pi1_ri +11) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12), *(pi1_ri +12) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13), *(pi1_ri +13) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14), *(pi1_ri +14) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15), *(pi1_ri +15) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_ref_16xk_ri3 )
|
||||
|
||||
|
||||
@@ -44,3 +44,31 @@ void PASTEMAC(ch,varname)( \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_ref_16xk )
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_ref_16xk_ri )
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_ref_16xk_ri3 )
|
||||
|
||||
@@ -35,7 +35,7 @@
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
@@ -55,8 +55,8 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -66,8 +66,8 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -80,8 +80,8 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -91,8 +91,8 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -101,5 +101,182 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( packm_ref_2xk, packm_ref_2xk )
|
||||
INSERT_GENTFUNC_BASIC0( packm_ref_2xk )
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
const inc_t lda2 = 2 * lda; \
|
||||
\
|
||||
ctype* beta_cast = beta; \
|
||||
ctype_r* restrict beta_r = ( ctype_r* )beta; \
|
||||
ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_ref_2xk_ri )
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
const inc_t lda2 = 2 * lda; \
|
||||
\
|
||||
ctype* beta_cast = beta; \
|
||||
ctype_r* restrict beta_r = ( ctype_r* )beta; \
|
||||
ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_ref_2xk_ri3 )
|
||||
|
||||
|
||||
@@ -44,3 +44,31 @@ void PASTEMAC(ch,varname)( \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_ref_2xk )
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_ref_2xk_ri )
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_ref_2xk_ri3 )
|
||||
|
||||
@@ -35,7 +35,7 @@
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
@@ -109,5 +109,198 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( packm_ref_4xk, packm_ref_4xk )
|
||||
INSERT_GENTFUNC_BASIC0( packm_ref_4xk )
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
const inc_t lda2 = 2 * lda; \
|
||||
\
|
||||
ctype* beta_cast = beta; \
|
||||
ctype_r* restrict beta_r = ( ctype_r* )beta; \
|
||||
ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_ref_4xk_ri )
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
const inc_t lda2 = 2 * lda; \
|
||||
\
|
||||
ctype* beta_cast = beta; \
|
||||
ctype_r* restrict beta_r = ( ctype_r* )beta; \
|
||||
ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_ref_4xk_ri3 )
|
||||
|
||||
|
||||
@@ -44,3 +44,32 @@ void PASTEMAC(ch,varname)( \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_ref_4xk )
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_ref_4xk_ri )
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_ref_4xk_ri3 )
|
||||
|
||||
|
||||
@@ -35,7 +35,7 @@
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
@@ -55,12 +55,12 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -70,12 +70,12 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -88,12 +88,12 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -103,12 +103,12 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -117,5 +117,214 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( packm_ref_6xk, packm_ref_6xk )
|
||||
INSERT_GENTFUNC_BASIC0( packm_ref_6xk )
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
const inc_t lda2 = 2 * lda; \
|
||||
\
|
||||
ctype* beta_cast = beta; \
|
||||
ctype_r* restrict beta_r = ( ctype_r* )beta; \
|
||||
ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_ref_6xk_ri )
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
const inc_t lda2 = 2 * lda; \
|
||||
\
|
||||
ctype* beta_cast = beta; \
|
||||
ctype_r* restrict beta_r = ( ctype_r* )beta; \
|
||||
ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_ref_6xk_ri3 )
|
||||
|
||||
|
||||
@@ -44,3 +44,31 @@ void PASTEMAC(ch,varname)( \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_ref_6xk )
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_ref_6xk_ri )
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_ref_6xk_ri3 )
|
||||
|
||||
@@ -35,7 +35,7 @@
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
@@ -55,14 +55,14 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -72,14 +72,14 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC2(ch,ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -92,14 +92,14 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,scal2js)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -109,14 +109,14 @@ void PASTEMAC(ch,varname)( \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC3(ch,ch,ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
|
||||
PASTEMAC(ch,scal2s)( *beta_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
|
||||
\
|
||||
alpha1 += lda; \
|
||||
pi1 += ldp; \
|
||||
@@ -125,5 +125,230 @@ void PASTEMAC(ch,varname)( \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( packm_ref_8xk, packm_ref_8xk )
|
||||
INSERT_GENTFUNC_BASIC0( packm_ref_8xk )
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
const inc_t lda2 = 2 * lda; \
|
||||
\
|
||||
ctype* beta_cast = beta; \
|
||||
ctype_r* restrict beta_r = ( ctype_r* )beta; \
|
||||
ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
|
||||
PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
|
||||
PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
|
||||
PASTEMAC(ch,scal2jris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_ref_8xk_ri )
|
||||
|
||||
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
) \
|
||||
{ \
|
||||
const inc_t inca2 = 2 * inca; \
|
||||
const inc_t lda2 = 2 * lda; \
|
||||
\
|
||||
ctype* beta_cast = beta; \
|
||||
ctype_r* restrict beta_r = ( ctype_r* )beta; \
|
||||
ctype_r* restrict beta_i = ( ctype_r* )beta + 1; \
|
||||
ctype_r* restrict alpha1_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \
|
||||
ctype_r* restrict pi1_r = ( ctype_r* )p; \
|
||||
ctype_r* restrict pi1_i = ( ctype_r* )p + psp; \
|
||||
ctype_r* restrict pi1_ri = ( ctype_r* )p + 2*psp; \
|
||||
\
|
||||
if ( PASTEMAC(ch,eq1)( *beta_cast ) ) \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \
|
||||
PASTEMAC(ch,copyjri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \
|
||||
PASTEMAC(ch,copyri3s)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
if ( bli_is_conj( conja ) ) \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \
|
||||
PASTEMAC(ch,scal2jri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for ( ; n != 0; --n ) \
|
||||
{ \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0), *(pi1_ri + 0) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1), *(pi1_ri + 1) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2), *(pi1_ri + 2) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3), *(pi1_ri + 3) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4), *(pi1_ri + 4) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5), *(pi1_ri + 5) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6), *(pi1_ri + 6) ); \
|
||||
PASTEMAC(ch,scal2ri3s)( *beta_r, *beta_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7), *(pi1_ri + 7) ); \
|
||||
\
|
||||
alpha1_r += lda2; \
|
||||
alpha1_i += lda2; \
|
||||
pi1_r += ldp; \
|
||||
pi1_i += ldp; \
|
||||
pi1_ri += ldp; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC0( packm_ref_8xk_ri3 )
|
||||
|
||||
|
||||
@@ -44,3 +44,31 @@ void PASTEMAC(ch,varname)( \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( packm_ref_8xk )
|
||||
|
||||
|
||||
// Prototype template for the packm_ref_8xk_ri packing routines. The
// INSERT_GENTPROTCO_BASIC line below instantiates it (presumably once per
// complex datatype -- confirm against that macro's definition).
//   conja         - conjugation to apply to a while packing.
//   n             - number of iterations (slices of eight elements) to pack.
//   beta          - scalar applied to a while packing (passed as void*).
//   a, inca, lda  - source matrix and its element and leading-dimension strides.
//   p, psp, ldp   - destination buffer; psp is presumably the offset between
//                   the separately stored parts of p (confirm against the
//                   definition), ldp the stride between packed slices.
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_ref_8xk_ri )
|
||||
|
||||
|
||||
// Prototype template for the packm_ref_8xk_ri3 packing routines (the 3m
// packing format). The INSERT_GENTPROTCO_BASIC line below instantiates it
// (presumably once per complex datatype -- confirm against that macro).
//   conja         - conjugation to apply to a while packing.
//   n             - number of iterations (slices of eight elements) to pack.
//   beta          - scalar applied to a while packing (passed as void*).
//   a, inca, lda  - source matrix and its element and leading-dimension strides.
//   p, psp, ldp   - destination buffer; the definition writes three regions
//                   starting at p, p + psp and p + 2*psp (in real elements),
//                   each advanced by ldp per iteration.
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
conj_t conja, \
|
||||
dim_t n, \
|
||||
void* beta, \
|
||||
void* a, inc_t inca, inc_t lda, \
|
||||
void* p, inc_t psp, inc_t ldp \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( packm_ref_8xk_ri3 )
|
||||
|
||||
107
frame/3/gemm/3m/bli_gemm3m.c
Normal file
107
frame/3/gemm/3m/bli_gemm3m.c
Normal file
@@ -0,0 +1,107 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern gemm_t* gemm3m_cntl;
|
||||
extern gemm_t* gemm_cntl;
|
||||
|
||||
//
|
||||
// Define object-based interface.
|
||||
//
|
||||
void bli_gemm3m( obj_t*  alpha,
                 obj_t*  a,
                 obj_t*  b,
                 obj_t*  beta,
                 obj_t*  c )
{
	// The 3m method only exists for the complex domain, so a real-domain c
	// is routed through the conventional gemm control tree instead.
	gemm_t* const tree = ( bli_obj_is_complex( *c ) ? gemm3m_cntl
	                                                : gemm_cntl );

	// Hand all operands, plus the selected control tree, to the shared
	// gemm front-end.
	bli_gemm_front( alpha, a, b, beta, c, tree );
}
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
trans_t transa, \
|
||||
trans_t transb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
obj_t alphao, ao, bo, betao, co; \
|
||||
\
|
||||
dim_t m_a, n_a; \
|
||||
dim_t m_b, n_b; \
|
||||
\
|
||||
bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \
|
||||
bli_set_dims_with_trans( transb, k, n, m_b, n_b ); \
|
||||
\
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \
|
||||
\
|
||||
bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \
|
||||
\
|
||||
bli_obj_set_conjtrans( transa, ao ); \
|
||||
bli_obj_set_conjtrans( transb, bo ); \
|
||||
\
|
||||
PASTEMAC0(opname)( &alphao, \
|
||||
&ao, \
|
||||
&bo, \
|
||||
&betao, \
|
||||
&co ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( gemm3m, gemm3m )
|
||||
|
||||
70
frame/3/gemm/3m/bli_gemm3m.h
Normal file
70
frame/3/gemm/3m/bli_gemm3m.h
Normal file
@@ -0,0 +1,70 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "bli_gemm3m_cntl.h"
|
||||
|
||||
#include "bli_gemm3m_ref_mxn.h"
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
//
|
||||
// Object-based gemm entry point for the 3m method; takes the scalars alpha
// and beta and the matrix operands a, b and c as obj_t pointers.
void bli_gemm3m( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c );
|
||||
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
// Prototype template for the typed gemm3m interface; instantiated by the
// INSERT_GENTPROT_BASIC line below (presumably once per datatype -- confirm
// against that macro's definition).
//   transa, transb   - trans_t settings for a and b.
//   m, n, k          - problem dimensions.
//   alpha, beta      - pointers to the scalars.
//   a, b, c          - matrix buffers, each followed by its row stride (rs_*)
//                      and column stride (cs_*).
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
trans_t transa, \
|
||||
trans_t transb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemm3m )
|
||||
|
||||
220
frame/3/gemm/3m/bli_gemm3m_cntl.c
Normal file
220
frame/3/gemm/3m/bli_gemm3m_cntl.c
Normal file
@@ -0,0 +1,220 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern scalm_t* scalm_cntl;
|
||||
|
||||
// File-scope state for the 3m gemm control tree. Every pointer below is
// assigned in bli_gemm3m_cntl_init(); all except the gemm3m_cntl alias are
// released in bli_gemm3m_cntl_finalize().

// Blocksize objects, one per blocked dimension.
blksz_t* gemm3m_mc;
|
||||
blksz_t* gemm3m_nc;
|
||||
blksz_t* gemm3m_kc;
|
||||
blksz_t* gemm3m_mr;
|
||||
blksz_t* gemm3m_nr;
|
||||
blksz_t* gemm3m_kr;
|
||||
|
||||
// Function-pointer object holding the 3m gemm micro-kernels.
func_t* gemm3m_ukrs;
|
||||
|
||||
// Control trees for packing a and b.
packm_t* gemm3m_packa_cntl;
|
||||
packm_t* gemm3m_packb_cntl;
|
||||
|
||||
// The nested gemm control-tree nodes, listed from innermost to outermost.
gemm_t* gemm3m_cntl_bp_ke;
|
||||
gemm_t* gemm3m_cntl_op_bp;
|
||||
gemm_t* gemm3m_cntl_mm_op;
|
||||
gemm_t* gemm3m_cntl_vl_mm;
|
||||
|
||||
// Alias for the outermost node (set to gemm3m_cntl_vl_mm by the init routine).
gemm_t* gemm3m_cntl;
|
||||
|
||||
|
||||
void bli_gemm3m_cntl_init()
|
||||
{
|
||||
// Create blocksize objects for each dimension.
|
||||
gemm3m_mc
|
||||
=
|
||||
bli_blksz_obj_create( 0, 0, 0, 0,
|
||||
BLIS_DEFAULT_3M_MC_C, BLIS_EXTEND_3M_MC_C,
|
||||
BLIS_DEFAULT_3M_MC_Z, BLIS_EXTEND_3M_MC_Z );
|
||||
|
||||
gemm3m_nc
|
||||
=
|
||||
bli_blksz_obj_create( 0, 0, 0, 0,
|
||||
BLIS_DEFAULT_3M_NC_C, BLIS_EXTEND_3M_NC_C,
|
||||
BLIS_DEFAULT_3M_NC_Z, BLIS_EXTEND_3M_NC_Z );
|
||||
|
||||
gemm3m_kc
|
||||
=
|
||||
bli_blksz_obj_create( 0, 0, 0, 0,
|
||||
BLIS_DEFAULT_3M_KC_C, BLIS_EXTEND_3M_KC_C,
|
||||
BLIS_DEFAULT_3M_KC_Z, BLIS_EXTEND_3M_KC_Z );
|
||||
|
||||
gemm3m_mr
|
||||
=
|
||||
bli_blksz_obj_create( 0, 0, 0, 0,
|
||||
BLIS_DEFAULT_3M_MR_C, BLIS_EXTEND_3M_MR_C,
|
||||
BLIS_DEFAULT_3M_MR_Z, BLIS_EXTEND_3M_MR_Z );
|
||||
|
||||
gemm3m_nr
|
||||
=
|
||||
bli_blksz_obj_create( 0, 0, 0, 0,
|
||||
BLIS_DEFAULT_3M_NR_C, BLIS_EXTEND_3M_NR_C,
|
||||
BLIS_DEFAULT_3M_NR_Z, BLIS_EXTEND_3M_NR_Z );
|
||||
|
||||
gemm3m_kr
|
||||
=
|
||||
bli_blksz_obj_create( 0, 0, 0, 0,
|
||||
BLIS_DEFAULT_3M_KR_C, BLIS_EXTEND_3M_KR_C,
|
||||
BLIS_DEFAULT_3M_KR_Z, BLIS_EXTEND_3M_KR_Z );
|
||||
|
||||
|
||||
|
||||
// Create function pointer object for each datatype-specific gemm
|
||||
// micro-kernel.
|
||||
gemm3m_ukrs = bli_func_obj_create( NULL,
|
||||
NULL,
|
||||
BLIS_CGEMM3M_UKERNEL,
|
||||
BLIS_ZGEMM3M_UKERNEL );
|
||||
|
||||
|
||||
// Create control tree objects for packm operations.
|
||||
gemm3m_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
gemm3m_mr,
|
||||
gemm3m_kr,
|
||||
TRUE, // densify; used by hemm/symm
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_ROW_PANELS_3M,
|
||||
BLIS_BUFFER_FOR_A_BLOCK );
|
||||
|
||||
gemm3m_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
gemm3m_kr,
|
||||
gemm3m_nr,
|
||||
TRUE, // densify; used by hemm/symm
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_COL_PANELS_3M,
|
||||
BLIS_BUFFER_FOR_B_PANEL );
|
||||
|
||||
|
||||
//
|
||||
// Create a control tree for packing A and B, and streaming C.
|
||||
//
|
||||
|
||||
// Create control tree object for lowest-level block-panel kernel.
|
||||
gemm3m_cntl_bp_ke
|
||||
=
|
||||
bli_gemm_cntl_obj_create( BLIS_UNB_OPT,
|
||||
BLIS_VARIANT2,
|
||||
NULL,
|
||||
gemm3m_ukrs,
|
||||
NULL, NULL, NULL,
|
||||
NULL, NULL, NULL );
|
||||
|
||||
// Create control tree object for outer panel (to block-panel)
|
||||
// problem.
|
||||
gemm3m_cntl_op_bp
|
||||
=
|
||||
bli_gemm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
gemm3m_mc,
|
||||
NULL,
|
||||
NULL,
|
||||
gemm3m_packa_cntl,
|
||||
gemm3m_packb_cntl,
|
||||
NULL,
|
||||
gemm3m_cntl_bp_ke,
|
||||
NULL );
|
||||
|
||||
// Create control tree object for general problem via multiple
|
||||
// rank-k (outer panel) updates.
|
||||
gemm3m_cntl_mm_op
|
||||
=
|
||||
bli_gemm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
gemm3m_kc,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
gemm3m_cntl_op_bp,
|
||||
NULL );
|
||||
|
||||
// Create control tree object for very large problem via multiple
|
||||
// general problems.
|
||||
gemm3m_cntl_vl_mm
|
||||
=
|
||||
bli_gemm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
gemm3m_nc,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
gemm3m_cntl_mm_op,
|
||||
NULL );
|
||||
|
||||
// Alias the "master" gemm control tree to a shorter name.
|
||||
gemm3m_cntl = gemm3m_cntl_vl_mm;
|
||||
|
||||
}
|
||||
|
||||
void bli_gemm3m_cntl_finalize()
|
||||
{
|
||||
bli_blksz_obj_free( gemm3m_mc );
|
||||
bli_blksz_obj_free( gemm3m_nc );
|
||||
bli_blksz_obj_free( gemm3m_kc );
|
||||
bli_blksz_obj_free( gemm3m_mr );
|
||||
bli_blksz_obj_free( gemm3m_nr );
|
||||
bli_blksz_obj_free( gemm3m_kr );
|
||||
|
||||
bli_func_obj_free( gemm3m_ukrs );
|
||||
|
||||
bli_cntl_obj_free( gemm3m_packa_cntl );
|
||||
bli_cntl_obj_free( gemm3m_packb_cntl );
|
||||
|
||||
bli_cntl_obj_free( gemm3m_cntl_bp_ke );
|
||||
bli_cntl_obj_free( gemm3m_cntl_op_bp );
|
||||
bli_cntl_obj_free( gemm3m_cntl_mm_op );
|
||||
bli_cntl_obj_free( gemm3m_cntl_vl_mm );
|
||||
|
||||
}
|
||||
|
||||
37
frame/3/gemm/3m/bli_gemm3m_cntl.h
Normal file
37
frame/3/gemm/3m/bli_gemm3m_cntl.h
Normal file
@@ -0,0 +1,37 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// Creates the blocksize, micro-kernel, packm, and gemm control-tree objects
// used by the 3m gemm implementation.
void bli_gemm3m_cntl_init( void );
|
||||
// Frees the objects created by bli_gemm3m_cntl_init().
void bli_gemm3m_cntl_finalize( void );
|
||||
|
||||
208
frame/3/gemm/3m/ukernels/bli_gemm3m_ref_mxn.c
Normal file
208
frame/3/gemm/3m/ukernels/bli_gemm3m_ref_mxn.c
Normal file
@@ -0,0 +1,208 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmukr ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* data \
|
||||
) \
|
||||
{ \
|
||||
ctype_r ct_r[ PASTEMAC(chr,mr) * \
|
||||
PASTEMAC(chr,nr) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
ctype_r ct_i[ PASTEMAC(chr,mr) * \
|
||||
PASTEMAC(chr,nr) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const inc_t rs_ct = 1; \
|
||||
const inc_t cs_ct = PASTEMAC(chr,mr); \
|
||||
\
|
||||
\
|
||||
ctype_r ab_r[ PASTEMAC(chr,mr) * \
|
||||
PASTEMAC(chr,nr) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
ctype_r ab_i[ PASTEMAC(chr,mr) * \
|
||||
PASTEMAC(chr,nr) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const inc_t rs_ab = 1; \
|
||||
const inc_t cs_ab = PASTEMAC(chr,mr); \
|
||||
\
|
||||
\
|
||||
const dim_t m = PASTEMAC(chr,mr); \
|
||||
const dim_t n = PASTEMAC(chr,nr); \
|
||||
\
|
||||
const inc_t ps_a = ( bli_auxinfo_ps_a( data ) * 2 ) / 3; \
|
||||
const inc_t ps_b = ( bli_auxinfo_ps_b( data ) * 2 ) / 3; \
|
||||
\
|
||||
ctype_r* restrict a_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict a_i = ( ctype_r* )a + ps_a; \
|
||||
ctype_r* restrict a_ri = ( ctype_r* )a + 2*ps_a; \
|
||||
\
|
||||
ctype_r* restrict b_r = ( ctype_r* )b; \
|
||||
ctype_r* restrict b_i = ( ctype_r* )b + ps_b; \
|
||||
ctype_r* restrict b_ri = ( ctype_r* )b + 2*ps_b; \
|
||||
\
|
||||
ctype_r* restrict c_r = ( ctype_r* )c; \
|
||||
ctype_r* restrict c_i = ( ctype_r* )c + 1; \
|
||||
\
|
||||
const inc_t rs_c2 = 2 * rs_c; \
|
||||
const inc_t cs_c2 = 2 * cs_c; \
|
||||
\
|
||||
ctype_r* restrict one_r = PASTEMAC(chr,1); \
|
||||
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
|
||||
\
|
||||
ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \
|
||||
ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \
|
||||
\
|
||||
ctype_r* restrict beta_r = &PASTEMAC(ch,real)( *beta ); \
|
||||
ctype_r* restrict beta_i = &PASTEMAC(ch,imag)( *beta ); \
|
||||
\
|
||||
void* a_next = bli_auxinfo_next_a( data ); \
|
||||
void* b_next = bli_auxinfo_next_b( data ); \
|
||||
\
|
||||
dim_t i, j; \
|
||||
\
|
||||
\
|
||||
/* SAFETY CHECK: The higher level implementation should never
|
||||
allow an alpha with non-zero imaginary component to be passed
|
||||
in, because it can't be applied properly using the 3m method.
|
||||
If alpha is not real, then something is very wrong. */ \
|
||||
if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
\
|
||||
/* Copy the contents of c to a temporary buffer ct. */ \
|
||||
if ( !PASTEMAC(chr,eq0)( *beta_i ) ) \
|
||||
{ \
|
||||
/* We can handle a non-zero imaginary component on beta, but to do
|
||||
so we have to manually scale c and then use beta == 1 for the
|
||||
micro-kernel calls. */ \
|
||||
for ( i = 0; i < m; ++i ) \
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, \
|
||||
*beta_i, \
|
||||
*(c_r + i*rs_c2 + j*cs_c2), \
|
||||
*(c_i + i*rs_c2 + j*cs_c2), \
|
||||
*(ct_r + i*rs_ct + j*cs_ct), \
|
||||
*(ct_i + i*rs_ct + j*cs_ct) ); \
|
||||
\
|
||||
/* Use beta.r == 1.0. */ \
|
||||
beta_r = one_r; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Copy c to ct without scaling. */ \
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
for ( i = 0; i < m; ++i ) \
|
||||
PASTEMAC(ch,copyris)( *(c_r + i*rs_c2 + j*cs_c2), \
|
||||
*(c_i + i*rs_c2 + j*cs_c2), \
|
||||
*(ct_r + i*rs_ct + j*cs_ct), \
|
||||
*(ct_i + i*rs_ct + j*cs_ct) ); \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* c.r = beta.r * c.r + a.r * b.r - a.i * b.i;
|
||||
c.i = beta.r * c.i + (a.r + a.i)(b.r + b.i) - a.r * b.r - a.i * b.i; */ \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_i, b_i, *data ); \
|
||||
\
|
||||
/* ab.r = a.r * b.r; */ \
|
||||
PASTEMAC(chr,gemmukr)( k, \
|
||||
alpha_r, \
|
||||
a_r, \
|
||||
b_r, \
|
||||
zero_r, \
|
||||
ab_r, rs_ab, cs_ab, \
|
||||
data ); \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_ri, b_ri, *data ); \
|
||||
\
|
||||
/* ab.i = a.i * b.i; */ \
|
||||
PASTEMAC(chr,gemmukr)( k, \
|
||||
alpha_r, \
|
||||
a_i, \
|
||||
b_i, \
|
||||
zero_r, \
|
||||
ab_i, rs_ab, cs_ab, \
|
||||
data ); \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_next, b_next, *data ); \
|
||||
\
|
||||
/* ct.i = a.ri * b.ri; */ \
|
||||
PASTEMAC(chr,gemmukr)( k, \
|
||||
alpha_r, \
|
||||
a_ri, \
|
||||
b_ri, \
|
||||
beta_r, \
|
||||
ct_i, rs_ct, cs_ct, \
|
||||
data ); \
|
||||
\
|
||||
\
|
||||
/* ct.r = beta.r * ct.r + ab.r;
|
||||
ct.r = ct.r - ab.i;
|
||||
ct.i = ct.i - ab.r;
|
||||
ct.i = ct.i - ab.i; */ \
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
for ( i = 0; i < m; ++i ) \
|
||||
{ \
|
||||
ctype_r alphabeta_r = *(ab_r + i*rs_ab + j*cs_ab); \
|
||||
ctype_r alphabeta_i = *(ab_i + i*rs_ab + j*cs_ab); \
|
||||
ctype_r gammat_r = *(ct_r + i*rs_ct + j*cs_ct); \
|
||||
ctype_r gammat_i = *(ct_i + i*rs_ct + j*cs_ct); \
|
||||
\
|
||||
PASTEMAC(chr,scals)( *beta_r, gammat_r ); \
|
||||
\
|
||||
PASTEMAC(chr,adds)( alphabeta_r, gammat_r ); \
|
||||
PASTEMAC(chr,subs)( alphabeta_i, gammat_r ); \
|
||||
PASTEMAC(chr,subs)( alphabeta_r, gammat_i ); \
|
||||
PASTEMAC(chr,subs)( alphabeta_i, gammat_i ); \
|
||||
\
|
||||
/* Store the local values (from ct) back to c. */ \
|
||||
PASTEMAC(ch,copyris)( gammat_r, \
|
||||
gammat_i, \
|
||||
*(c_r + i*rs_c2 + j*cs_c2), \
|
||||
*(c_i + i*rs_c2 + j*cs_c2) ); \
|
||||
} \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC( gemm3m_ref_mxn, GEMM_UKERNEL )
|
||||
|
||||
50
frame/3/gemm/3m/ukernels/bli_gemm3m_ref_mxn.h
Normal file
50
frame/3/gemm/3m/ukernels/bli_gemm3m_ref_mxn.h
Normal file
@@ -0,0 +1,50 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* data \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( gemm3m_ref_mxn )
|
||||
|
||||
107
frame/3/gemm/4m/bli_gemm4m.c
Normal file
107
frame/3/gemm/4m/bli_gemm4m.c
Normal file
@@ -0,0 +1,107 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern gemm_t* gemm4m_cntl;
|
||||
extern gemm_t* gemm_cntl;
|
||||
|
||||
//
|
||||
// Define object-based interface.
|
||||
//
|
||||
void bli_gemm4m( obj_t* alpha,
                 obj_t* a,
                 obj_t* b,
                 obj_t* beta,
                 obj_t* c )
{
	// The 4m method is only defined for the complex domain, so the 4m
	// control tree is selected only when c is complex. Real-domain
	// problems are routed through the regular gemm control tree.
	gemm_t* cntl = ( bli_obj_is_complex( *c ) ? gemm4m_cntl
	                                          : gemm_cntl );

	bli_gemm_front( alpha, a, b, beta, c, cntl );
}
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
trans_t transa, \
|
||||
trans_t transb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
obj_t alphao, ao, bo, betao, co; \
|
||||
\
|
||||
dim_t m_a, n_a; \
|
||||
dim_t m_b, n_b; \
|
||||
\
|
||||
bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \
|
||||
bli_set_dims_with_trans( transb, k, n, m_b, n_b ); \
|
||||
\
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \
|
||||
\
|
||||
bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \
|
||||
\
|
||||
bli_obj_set_conjtrans( transa, ao ); \
|
||||
bli_obj_set_conjtrans( transb, bo ); \
|
||||
\
|
||||
PASTEMAC0(opname)( &alphao, \
|
||||
&ao, \
|
||||
&bo, \
|
||||
&betao, \
|
||||
&co ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( gemm4m, gemm4m )
|
||||
|
||||
70
frame/3/gemm/4m/bli_gemm4m.h
Normal file
70
frame/3/gemm/4m/bli_gemm4m.h
Normal file
@@ -0,0 +1,70 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "bli_gemm4m_cntl.h"
|
||||
|
||||
#include "bli_gemm4m_ref_mxn.h"
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
//
|
||||
// Object-based gemm interface for the 4m method. Takes the scalars alpha
// and beta, the input operands a and b, and the output operand c.
void bli_gemm4m
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c
     );
|
||||
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
trans_t transa, \
|
||||
trans_t transb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( gemm4m )
|
||||
|
||||
219
frame/3/gemm/4m/bli_gemm4m_cntl.c
Normal file
219
frame/3/gemm/4m/bli_gemm4m_cntl.c
Normal file
@@ -0,0 +1,219 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern scalm_t* scalm_cntl;
|
||||
|
||||
blksz_t* gemm4m_mc;
|
||||
blksz_t* gemm4m_nc;
|
||||
blksz_t* gemm4m_kc;
|
||||
blksz_t* gemm4m_mr;
|
||||
blksz_t* gemm4m_nr;
|
||||
blksz_t* gemm4m_kr;
|
||||
|
||||
func_t* gemm4m_ukrs;
|
||||
|
||||
packm_t* gemm4m_packa_cntl;
|
||||
packm_t* gemm4m_packb_cntl;
|
||||
|
||||
gemm_t* gemm4m_cntl_bp_ke;
|
||||
gemm_t* gemm4m_cntl_op_bp;
|
||||
gemm_t* gemm4m_cntl_mm_op;
|
||||
gemm_t* gemm4m_cntl_vl_mm;
|
||||
|
||||
gemm_t* gemm4m_cntl;
|
||||
|
||||
|
||||
void bli_gemm4m_cntl_init()
|
||||
{
|
||||
// Create blocksize objects for each dimension.
|
||||
gemm4m_mc
|
||||
=
|
||||
bli_blksz_obj_create( 0, 0, 0, 0,
|
||||
BLIS_DEFAULT_4M_MC_C, BLIS_EXTEND_4M_MC_C,
|
||||
BLIS_DEFAULT_4M_MC_Z, BLIS_EXTEND_4M_MC_Z );
|
||||
|
||||
gemm4m_nc
|
||||
=
|
||||
bli_blksz_obj_create( 0, 0, 0, 0,
|
||||
BLIS_DEFAULT_4M_NC_C, BLIS_EXTEND_4M_NC_C,
|
||||
BLIS_DEFAULT_4M_NC_Z, BLIS_EXTEND_4M_NC_Z );
|
||||
|
||||
gemm4m_kc
|
||||
=
|
||||
bli_blksz_obj_create( 0, 0, 0, 0,
|
||||
BLIS_DEFAULT_4M_KC_C, BLIS_EXTEND_4M_KC_C,
|
||||
BLIS_DEFAULT_4M_KC_Z, BLIS_EXTEND_4M_KC_Z );
|
||||
|
||||
gemm4m_mr
|
||||
=
|
||||
bli_blksz_obj_create( 0, 0, 0, 0,
|
||||
BLIS_DEFAULT_4M_MR_C, BLIS_EXTEND_4M_MR_C,
|
||||
BLIS_DEFAULT_4M_MR_Z, BLIS_EXTEND_4M_MR_Z );
|
||||
|
||||
gemm4m_nr
|
||||
=
|
||||
bli_blksz_obj_create( 0, 0, 0, 0,
|
||||
BLIS_DEFAULT_4M_NR_C, BLIS_EXTEND_4M_NR_C,
|
||||
BLIS_DEFAULT_4M_NR_Z, BLIS_EXTEND_4M_NR_Z );
|
||||
|
||||
gemm4m_kr
|
||||
=
|
||||
bli_blksz_obj_create( 0, 0, 0, 0,
|
||||
BLIS_DEFAULT_4M_KR_C, BLIS_EXTEND_4M_KR_C,
|
||||
BLIS_DEFAULT_4M_KR_Z, BLIS_EXTEND_4M_KR_Z );
|
||||
|
||||
|
||||
|
||||
// Create function pointer object for each datatype-specific gemm
|
||||
// micro-kernel.
|
||||
gemm4m_ukrs = bli_func_obj_create( NULL,
|
||||
NULL,
|
||||
BLIS_CGEMM4M_UKERNEL,
|
||||
BLIS_ZGEMM4M_UKERNEL );
|
||||
|
||||
|
||||
// Create control tree objects for packm operations.
|
||||
gemm4m_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT4,
|
||||
gemm4m_mr,
|
||||
gemm4m_kr,
|
||||
TRUE, // densify; used by hemm/symm
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_ROW_PANELS_4M,
|
||||
BLIS_BUFFER_FOR_A_BLOCK );
|
||||
|
||||
gemm4m_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT4,
|
||||
gemm4m_kr,
|
||||
gemm4m_nr,
|
||||
TRUE, // densify; used by hemm/symm
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_COL_PANELS_4M,
|
||||
BLIS_BUFFER_FOR_B_PANEL );
|
||||
|
||||
|
||||
//
|
||||
// Create a control tree for packing A and B, and streaming C.
|
||||
//
|
||||
|
||||
// Create control tree object for lowest-level block-panel kernel.
|
||||
gemm4m_cntl_bp_ke
|
||||
=
|
||||
bli_gemm_cntl_obj_create( BLIS_UNB_OPT,
|
||||
BLIS_VARIANT2,
|
||||
NULL,
|
||||
gemm4m_ukrs,
|
||||
NULL, NULL, NULL,
|
||||
NULL, NULL, NULL );
|
||||
|
||||
// Create control tree object for outer panel (to block-panel)
|
||||
// problem.
|
||||
gemm4m_cntl_op_bp
|
||||
=
|
||||
bli_gemm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
gemm4m_mc,
|
||||
NULL,
|
||||
NULL,
|
||||
gemm4m_packa_cntl,
|
||||
gemm4m_packb_cntl,
|
||||
NULL,
|
||||
gemm4m_cntl_bp_ke,
|
||||
NULL );
|
||||
|
||||
// Create control tree object for general problem via multiple
|
||||
// rank-k (outer panel) updates.
|
||||
gemm4m_cntl_mm_op
|
||||
=
|
||||
bli_gemm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
gemm4m_kc,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
gemm4m_cntl_op_bp,
|
||||
NULL );
|
||||
|
||||
// Create control tree object for very large problem via multiple
|
||||
// general problems.
|
||||
gemm4m_cntl_vl_mm
|
||||
=
|
||||
bli_gemm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
gemm4m_nc,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
gemm4m_cntl_mm_op,
|
||||
NULL );
|
||||
|
||||
// Alias the "master" gemm control tree to a shorter name.
|
||||
gemm4m_cntl = gemm4m_cntl_vl_mm;
|
||||
|
||||
}
|
||||
|
||||
void bli_gemm4m_cntl_finalize()
|
||||
{
|
||||
bli_blksz_obj_free( gemm4m_mc );
|
||||
bli_blksz_obj_free( gemm4m_nc );
|
||||
bli_blksz_obj_free( gemm4m_kc );
|
||||
bli_blksz_obj_free( gemm4m_mr );
|
||||
bli_blksz_obj_free( gemm4m_nr );
|
||||
bli_blksz_obj_free( gemm4m_kr );
|
||||
|
||||
bli_func_obj_free( gemm4m_ukrs );
|
||||
|
||||
bli_cntl_obj_free( gemm4m_packa_cntl );
|
||||
bli_cntl_obj_free( gemm4m_packb_cntl );
|
||||
|
||||
bli_cntl_obj_free( gemm4m_cntl_bp_ke );
|
||||
bli_cntl_obj_free( gemm4m_cntl_op_bp );
|
||||
bli_cntl_obj_free( gemm4m_cntl_mm_op );
|
||||
bli_cntl_obj_free( gemm4m_cntl_vl_mm );
|
||||
}
|
||||
|
||||
37
frame/3/gemm/4m/bli_gemm4m_cntl.h
Normal file
37
frame/3/gemm/4m/bli_gemm4m_cntl.h
Normal file
@@ -0,0 +1,37 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// Creates the blocksize, micro-kernel, packm and gemm control tree objects
// used by the 4m gemm implementation.
void bli_gemm4m_cntl_init( void );
|
||||
// Frees the objects created by bli_gemm4m_cntl_init().
void bli_gemm4m_cntl_finalize( void );
|
||||
|
||||
192
frame/3/gemm/4m/ukernels/bli_gemm4m_ref_mxn.c
Normal file
192
frame/3/gemm/4m/ukernels/bli_gemm4m_ref_mxn.c
Normal file
@@ -0,0 +1,192 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#undef GENTFUNCCO
|
||||
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, gemmukr ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* data \
|
||||
) \
|
||||
{ \
|
||||
ctype_r ct_r[ PASTEMAC(chr,mr) * \
|
||||
PASTEMAC(chr,nr) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
ctype_r ct_i[ PASTEMAC(chr,mr) * \
|
||||
PASTEMAC(chr,nr) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const inc_t rs_ct = 1; \
|
||||
const inc_t cs_ct = PASTEMAC(chr,mr); \
|
||||
\
|
||||
\
|
||||
const dim_t m = PASTEMAC(chr,mr); \
|
||||
const dim_t n = PASTEMAC(chr,nr); \
|
||||
\
|
||||
const inc_t ps_a = bli_auxinfo_ps_a( data ); \
|
||||
const inc_t ps_b = bli_auxinfo_ps_b( data ); \
|
||||
\
|
||||
ctype_r* restrict a_r = ( ctype_r* )a; \
|
||||
ctype_r* restrict a_i = ( ctype_r* )a + ps_a; \
|
||||
\
|
||||
ctype_r* restrict b_r = ( ctype_r* )b; \
|
||||
ctype_r* restrict b_i = ( ctype_r* )b + ps_b; \
|
||||
\
|
||||
ctype_r* restrict c_r = ( ctype_r* )c; \
|
||||
ctype_r* restrict c_i = ( ctype_r* )c + 1; \
|
||||
\
|
||||
const inc_t rs_c2 = 2 * rs_c; \
|
||||
const inc_t cs_c2 = 2 * cs_c; \
|
||||
\
|
||||
ctype_r* restrict one_r = PASTEMAC(chr,1); \
|
||||
\
|
||||
ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \
|
||||
ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \
|
||||
\
|
||||
ctype_r* restrict beta_r = &PASTEMAC(ch,real)( *beta ); \
|
||||
ctype_r* restrict beta_i = &PASTEMAC(ch,imag)( *beta ); \
|
||||
\
|
||||
ctype_r m_alpha_r = -PASTEMAC(ch,real)( *alpha ); \
|
||||
\
|
||||
void* a_next = bli_auxinfo_next_a( data ); \
|
||||
void* b_next = bli_auxinfo_next_b( data ); \
|
||||
\
|
||||
dim_t i, j; \
|
||||
\
|
||||
\
|
||||
/* SAFETY CHECK: The higher level implementation should never
|
||||
allow an alpha with non-zero imaginary component to be passed
|
||||
in, because it can't be applied properly using the 4m method.
|
||||
If alpha is not real, then something is very wrong. */ \
|
||||
if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \
|
||||
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
|
||||
\
|
||||
\
|
||||
/* Copy the contents of c to a temporary buffer ct. */ \
|
||||
if ( !PASTEMAC(chr,eq0)( *beta_i ) ) \
|
||||
{ \
|
||||
/* We can handle a non-zero imaginary component on beta, but to do
|
||||
so we have to manually scale c and then use beta == 1 for the
|
||||
micro-kernel calls. */ \
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
for ( i = 0; i < m; ++i ) \
|
||||
PASTEMAC(ch,scal2ris)( *beta_r, \
|
||||
*beta_i, \
|
||||
*(c_r + i*rs_c2 + j*cs_c2), \
|
||||
*(c_i + i*rs_c2 + j*cs_c2), \
|
||||
*(ct_r + i*rs_ct + j*cs_ct), \
|
||||
*(ct_i + i*rs_ct + j*cs_ct) ); \
|
||||
\
|
||||
/* Use beta.r == 1.0. */ \
|
||||
beta_r = one_r; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Copy c to ct without scaling. */ \
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
for ( i = 0; i < m; ++i ) \
|
||||
PASTEMAC(ch,copyris)( *(c_r + i*rs_c2 + j*cs_c2), \
|
||||
*(c_i + i*rs_c2 + j*cs_c2), \
|
||||
*(ct_r + i*rs_ct + j*cs_ct), \
|
||||
*(ct_i + i*rs_ct + j*cs_ct) ); \
|
||||
} \
|
||||
\
|
||||
\
|
||||
/* c.r = beta.r * c.r + alpha.r * a.r * b.r
|
||||
- alpha.r * a.i * b.i;
|
||||
c.i = beta.r * c.i + alpha.r * a.r * b.i
|
||||
+ alpha.r * a.i * b.r; */ \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_r, b_i, *data ); \
|
||||
\
|
||||
/* c.r = beta * c.r + a.r * b.r; */ \
|
||||
PASTEMAC(chr,gemmukr)( k, \
|
||||
alpha_r, \
|
||||
a_r, \
|
||||
b_r, \
|
||||
beta_r, \
|
||||
ct_r, rs_ct, cs_ct, \
|
||||
data ); \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_i, b_r, *data ); \
|
||||
\
|
||||
/* c.i = beta * c.i + a.r * b.i; */ \
|
||||
PASTEMAC(chr,gemmukr)( k, \
|
||||
alpha_r, \
|
||||
a_r, \
|
||||
b_i, \
|
||||
beta_r, \
|
||||
ct_i, rs_ct, cs_ct, \
|
||||
data ); \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_i, b_i, *data ); \
|
||||
\
|
||||
/* c.i = 1.0 * c.i + a.i * b.r; */ \
|
||||
PASTEMAC(chr,gemmukr)( k, \
|
||||
alpha_r, \
|
||||
a_i, \
|
||||
b_r, \
|
||||
one_r, \
|
||||
ct_i, rs_ct, cs_ct, \
|
||||
data ); \
|
||||
\
|
||||
bli_auxinfo_set_next_ab( a_next, b_next, *data ); \
|
||||
\
|
||||
/* c.r = 1.0 * c.r - a.i * b.i; */ \
|
||||
PASTEMAC(chr,gemmukr)( k, \
|
||||
&m_alpha_r, \
|
||||
a_i, \
|
||||
b_i, \
|
||||
one_r, \
|
||||
ct_r, rs_ct, cs_ct, \
|
||||
data ); \
|
||||
\
|
||||
\
|
||||
/* Copy the final result in ct back to c. */ \
|
||||
for ( j = 0; j < n; ++j ) \
|
||||
for ( i = 0; i < m; ++i ) \
|
||||
PASTEMAC(ch,copyris)( *(ct_r + i*rs_ct + j*cs_ct), \
|
||||
*(ct_i + i*rs_ct + j*cs_ct), \
|
||||
*(c_r + i*rs_c2 + j*cs_c2), \
|
||||
*(c_i + i*rs_c2 + j*cs_c2) ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCCO_BASIC( gemm4m_ref_mxn, GEMM_UKERNEL )
|
||||
|
||||
50
frame/3/gemm/4m/ukernels/bli_gemm4m_ref_mxn.h
Normal file
50
frame/3/gemm/4m/ukernels/bli_gemm4m_ref_mxn.h
Normal file
@@ -0,0 +1,50 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
#undef GENTPROTCO
|
||||
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,varname)( \
|
||||
dim_t k, \
|
||||
ctype* restrict alpha, \
|
||||
ctype* restrict a, \
|
||||
ctype* restrict b, \
|
||||
ctype* restrict beta, \
|
||||
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
|
||||
auxinfo_t* data \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTCO_BASIC( gemm4m_ref_mxn )
|
||||
|
||||
@@ -45,6 +45,14 @@ void bli_gemm( obj_t* alpha,
|
||||
obj_t* beta,
|
||||
obj_t* c )
|
||||
{
|
||||
#ifdef BLIS_ENABLE_COMPLEX_VIA_4M
|
||||
if ( bli_obj_is_complex( *c ) )
|
||||
{
|
||||
bli_gemm4m( alpha, a, b, beta, c );
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
bli_gemm_front( alpha, a, b, beta, c,
|
||||
gemm_cntl );
|
||||
}
|
||||
@@ -88,11 +96,11 @@ void PASTEMAC(ch,opname)( \
|
||||
bli_obj_set_conjtrans( transa, ao ); \
|
||||
bli_obj_set_conjtrans( transb, bo ); \
|
||||
\
|
||||
PASTEMAC0(varname)( &alphao, \
|
||||
&ao, \
|
||||
&bo, \
|
||||
&betao, \
|
||||
&co ); \
|
||||
PASTEMAC0(opname)( &alphao, \
|
||||
&ao, \
|
||||
&bo, \
|
||||
&betao, \
|
||||
&co ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( gemm, gemm )
|
||||
|
||||
@@ -47,6 +47,9 @@
|
||||
|
||||
#include "bli_gemm_ref_mxn.h"
|
||||
|
||||
#include "bli_gemm4m.h"
|
||||
#include "bli_gemm3m.h"
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
|
||||
@@ -89,6 +89,8 @@ void bli_gemm_cntl_init()
|
||||
BLIS_DEFAULT_KR_C, BLIS_EXTEND_KR_C,
|
||||
BLIS_DEFAULT_KR_Z, BLIS_EXTEND_KR_Z );
|
||||
|
||||
// Create function pointer object for each datatype-specific gemm
|
||||
// micro-kernel.
|
||||
gemm_ukrs = bli_func_obj_create( BLIS_SGEMM_UKERNEL,
|
||||
BLIS_DGEMM_UKERNEL,
|
||||
BLIS_CGEMM_UKERNEL,
|
||||
|
||||
105
frame/3/hemm/3m/bli_hemm3m.c
Normal file
105
frame/3/hemm/3m/bli_hemm3m.c
Normal file
@@ -0,0 +1,105 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern gemm_t* gemm3m_cntl;
|
||||
|
||||
//
|
||||
// Define object-based interface.
|
||||
//
|
||||
void bli_hemm3m( side_t side,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c )
|
||||
{
|
||||
bli_hemm_front( side, alpha, a, b, beta, c,
|
||||
gemm3m_cntl );
|
||||
}
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
side_t side, \
|
||||
uplo_t uploa, \
|
||||
conj_t conja, \
|
||||
trans_t transb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
obj_t alphao, ao, bo, betao, co; \
|
||||
\
|
||||
dim_t mn_a; \
|
||||
dim_t m_b, n_b; \
|
||||
\
|
||||
bli_set_dim_with_side( side, m, n, mn_a ); \
|
||||
bli_set_dims_with_trans( transb, m, n, m_b, n_b ); \
|
||||
\
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \
|
||||
\
|
||||
bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \
|
||||
\
|
||||
bli_obj_set_uplo( uploa, ao ); \
|
||||
bli_obj_set_conj( conja, ao ); \
|
||||
bli_obj_set_conjtrans( transb, bo ); \
|
||||
\
|
||||
bli_obj_set_struc( BLIS_HERMITIAN, ao ); \
|
||||
\
|
||||
PASTEMAC0(opname)( side, \
|
||||
&alphao, \
|
||||
&ao, \
|
||||
&bo, \
|
||||
&betao, \
|
||||
&co ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( hemm3m, hemm3m )
|
||||
|
||||
67
frame/3/hemm/3m/bli_hemm3m.h
Normal file
67
frame/3/hemm/3m/bli_hemm3m.h
Normal file
@@ -0,0 +1,67 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
//
|
||||
void bli_hemm3m( side_t side,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c );
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
side_t side, \
|
||||
uplo_t uploa, \
|
||||
conj_t conja, \
|
||||
trans_t transb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( hemm3m )
|
||||
|
||||
105
frame/3/hemm/4m/bli_hemm4m.c
Normal file
105
frame/3/hemm/4m/bli_hemm4m.c
Normal file
@@ -0,0 +1,105 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern gemm_t* gemm4m_cntl;
|
||||
|
||||
//
|
||||
// Define object-based interface.
|
||||
//
|
||||
void bli_hemm4m( side_t side,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c )
|
||||
{
|
||||
bli_hemm_front( side, alpha, a, b, beta, c,
|
||||
gemm4m_cntl );
|
||||
}
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
side_t side, \
|
||||
uplo_t uploa, \
|
||||
conj_t conja, \
|
||||
trans_t transb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
obj_t alphao, ao, bo, betao, co; \
|
||||
\
|
||||
dim_t mn_a; \
|
||||
dim_t m_b, n_b; \
|
||||
\
|
||||
bli_set_dim_with_side( side, m, n, mn_a ); \
|
||||
bli_set_dims_with_trans( transb, m, n, m_b, n_b ); \
|
||||
\
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \
|
||||
\
|
||||
bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \
|
||||
\
|
||||
bli_obj_set_uplo( uploa, ao ); \
|
||||
bli_obj_set_conj( conja, ao ); \
|
||||
bli_obj_set_conjtrans( transb, bo ); \
|
||||
\
|
||||
bli_obj_set_struc( BLIS_HERMITIAN, ao ); \
|
||||
\
|
||||
PASTEMAC0(opname)( side, \
|
||||
&alphao, \
|
||||
&ao, \
|
||||
&bo, \
|
||||
&betao, \
|
||||
&co ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( hemm4m, hemm4m )
|
||||
|
||||
67
frame/3/hemm/4m/bli_hemm4m.h
Normal file
67
frame/3/hemm/4m/bli_hemm4m.h
Normal file
@@ -0,0 +1,67 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
//
|
||||
void bli_hemm4m( side_t side,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c );
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
side_t side, \
|
||||
uplo_t uploa, \
|
||||
conj_t conja, \
|
||||
trans_t transb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( hemm4m )
|
||||
|
||||
@@ -46,6 +46,14 @@ void bli_hemm( side_t side,
|
||||
obj_t* beta,
|
||||
obj_t* c )
|
||||
{
|
||||
#ifdef BLIS_ENABLE_COMPLEX_VIA_4M
|
||||
if ( bli_obj_is_complex( *c ) )
|
||||
{
|
||||
bli_hemm4m( side, alpha, a, b, beta, c );
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
bli_hemm_front( side, alpha, a, b, beta, c,
|
||||
gemm_cntl );
|
||||
}
|
||||
|
||||
@@ -35,6 +35,9 @@
|
||||
#include "bli_hemm_check.h"
|
||||
#include "bli_hemm_front.h"
|
||||
|
||||
#include "bli_hemm4m.h"
|
||||
#include "bli_hemm3m.h"
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
|
||||
111
frame/3/her2k/3m/bli_her2k3m.c
Normal file
111
frame/3/her2k/3m/bli_her2k3m.c
Normal file
@@ -0,0 +1,111 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern herk_t* herk3m_cntl;
|
||||
extern herk_t* herk_cntl;
|
||||
|
||||
//
|
||||
// Define object-based interface.
|
||||
//
|
||||
// Object-based her2k3m entry point. Picks a herk control tree from the
// domain of c and forwards every operand, unchanged, to bli_her2k_front().
void bli_her2k3m
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c
     )
{
	// The 3m tree applies only when c is complex; real-domain problems
	// use the regular herk control tree.
	herk_t* tree = ( bli_obj_is_complex( *c ) ) ? herk3m_cntl
	                                            : herk_cntl;

	bli_her2k_front( alpha, a, b, beta, c, tree );
}
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNCR
|
||||
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
uplo_t uploc, \
|
||||
trans_t transa, \
|
||||
trans_t transb, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype_r* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt_r = PASTEMAC(chr,type); \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
obj_t alphao, ao, bo, betao, co; \
|
||||
\
|
||||
dim_t m_a, n_a; \
|
||||
dim_t m_b, n_b; \
|
||||
\
|
||||
bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \
|
||||
bli_set_dims_with_trans( transb, m, k, m_b, n_b ); \
|
||||
\
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
|
||||
bli_obj_create_1x1_with_attached_buffer( dt_r, beta, &betao ); \
|
||||
\
|
||||
bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \
|
||||
\
|
||||
bli_obj_set_uplo( uploc, co ); \
|
||||
bli_obj_set_conjtrans( transa, ao ); \
|
||||
bli_obj_set_conjtrans( transb, bo ); \
|
||||
\
|
||||
bli_obj_set_struc( BLIS_HERMITIAN, co ); \
|
||||
\
|
||||
PASTEMAC0(opname)( &alphao, \
|
||||
&ao, \
|
||||
&bo, \
|
||||
&betao, \
|
||||
&co ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCR_BASIC( her2k3m, her2k3m )
|
||||
|
||||
66
frame/3/her2k/3m/bli_her2k3m.h
Normal file
66
frame/3/her2k/3m/bli_her2k3m.h
Normal file
@@ -0,0 +1,66 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
//
|
||||
// Object-based her2k3m entry point; every operand is passed as a pointer
// to an obj_t.
void bli_her2k3m
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c
     );
|
||||
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTPROTR
|
||||
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
uplo_t uploc, \
|
||||
trans_t transa, \
|
||||
trans_t transb, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype_r* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTR_BASIC( her2k3m )
|
||||
|
||||
111
frame/3/her2k/4m/bli_her2k4m.c
Normal file
111
frame/3/her2k/4m/bli_her2k4m.c
Normal file
@@ -0,0 +1,111 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern herk_t* herk4m_cntl;
|
||||
extern herk_t* herk_cntl;
|
||||
|
||||
//
|
||||
// Define object-based interface.
|
||||
//
|
||||
// Object-based her2k4m entry point. Picks a herk control tree from the
// domain of c and forwards every operand, unchanged, to bli_her2k_front().
void bli_her2k4m
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c
     )
{
	// The 4m tree applies only when c is complex; real-domain problems
	// use the regular herk control tree.
	herk_t* tree = ( bli_obj_is_complex( *c ) ) ? herk4m_cntl
	                                            : herk_cntl;

	bli_her2k_front( alpha, a, b, beta, c, tree );
}
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNCR
|
||||
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
uplo_t uploc, \
|
||||
trans_t transa, \
|
||||
trans_t transb, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype_r* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt_r = PASTEMAC(chr,type); \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
obj_t alphao, ao, bo, betao, co; \
|
||||
\
|
||||
dim_t m_a, n_a; \
|
||||
dim_t m_b, n_b; \
|
||||
\
|
||||
bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \
|
||||
bli_set_dims_with_trans( transb, m, k, m_b, n_b ); \
|
||||
\
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
|
||||
bli_obj_create_1x1_with_attached_buffer( dt_r, beta, &betao ); \
|
||||
\
|
||||
bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \
|
||||
\
|
||||
bli_obj_set_uplo( uploc, co ); \
|
||||
bli_obj_set_conjtrans( transa, ao ); \
|
||||
bli_obj_set_conjtrans( transb, bo ); \
|
||||
\
|
||||
bli_obj_set_struc( BLIS_HERMITIAN, co ); \
|
||||
\
|
||||
PASTEMAC0(opname)( &alphao, \
|
||||
&ao, \
|
||||
&bo, \
|
||||
&betao, \
|
||||
&co ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCR_BASIC( her2k4m, her2k4m )
|
||||
|
||||
66
frame/3/her2k/4m/bli_her2k4m.h
Normal file
66
frame/3/her2k/4m/bli_her2k4m.h
Normal file
@@ -0,0 +1,66 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
//
|
||||
// Object-based her2k4m entry point; every operand is passed as a pointer
// to an obj_t.
void bli_her2k4m
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c
     );
|
||||
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTPROTR
|
||||
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
uplo_t uploc, \
|
||||
trans_t transa, \
|
||||
trans_t transb, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype_r* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTR_BASIC( her2k4m )
|
||||
|
||||
@@ -46,6 +46,14 @@ void bli_her2k( obj_t* alpha,
|
||||
obj_t* beta,
|
||||
obj_t* c )
|
||||
{
|
||||
#ifdef BLIS_ENABLE_COMPLEX_VIA_4M
|
||||
if ( bli_obj_is_complex( *c ) )
|
||||
{
|
||||
bli_her2k4m( alpha, a, b, beta, c );
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
bli_her2k_front( alpha, a, b, beta, c,
|
||||
herk_cntl );
|
||||
}
|
||||
|
||||
@@ -49,6 +49,9 @@
|
||||
#include "bli_her2k_u_ker_var2.h"
|
||||
*/
|
||||
|
||||
#include "bli_her2k4m.h"
|
||||
#include "bli_her2k3m.h"
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
|
||||
103
frame/3/herk/3m/bli_herk3m.c
Normal file
103
frame/3/herk/3m/bli_herk3m.c
Normal file
@@ -0,0 +1,103 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern herk_t* herk3m_cntl;
|
||||
extern herk_t* herk_cntl;
|
||||
|
||||
//
|
||||
// Define object-based interface.
|
||||
//
|
||||
// Object-based herk3m entry point. Picks a herk control tree from the
// domain of c and forwards every operand, unchanged, to bli_herk_front().
void bli_herk3m
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* beta,
       obj_t* c
     )
{
	// The 3m tree applies only when c is complex; real-domain problems
	// use the regular herk control tree.
	herk_t* tree = ( bli_obj_is_complex( *c ) ) ? herk3m_cntl
	                                            : herk_cntl;

	bli_herk_front( alpha, a, beta, c, tree );
}
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNCR
|
||||
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
uplo_t uploc, \
|
||||
trans_t transa, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
ctype_r* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype_r* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt_r = PASTEMAC(chr,type); \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
obj_t alphao, ao, betao, co; \
|
||||
\
|
||||
dim_t m_a, n_a; \
|
||||
\
|
||||
bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \
|
||||
\
|
||||
bli_obj_create_1x1_with_attached_buffer( dt_r, alpha, &alphao ); \
|
||||
bli_obj_create_1x1_with_attached_buffer( dt_r, beta, &betao ); \
|
||||
\
|
||||
bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \
|
||||
\
|
||||
bli_obj_set_uplo( uploc, co ); \
|
||||
bli_obj_set_conjtrans( transa, ao ); \
|
||||
\
|
||||
bli_obj_set_struc( BLIS_HERMITIAN, co ); \
|
||||
\
|
||||
PASTEMAC0(opname)( &alphao, \
|
||||
&ao, \
|
||||
&betao, \
|
||||
&co ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCR_BASIC( herk3m, herk3m )
|
||||
|
||||
65
frame/3/herk/3m/bli_herk3m.h
Normal file
65
frame/3/herk/3m/bli_herk3m.h
Normal file
@@ -0,0 +1,65 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "bli_herk3m_cntl.h"
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
//
|
||||
// Object-based herk3m entry point; every operand is passed as a pointer
// to an obj_t.
void bli_herk3m
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* beta,
       obj_t* c
     );
|
||||
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTPROTR
|
||||
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
uplo_t uploc, \
|
||||
trans_t transa, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
ctype_r* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype_r* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTR_BASIC( herk3m )
|
||||
|
||||
158
frame/3/herk/3m/bli_herk3m_cntl.c
Normal file
158
frame/3/herk/3m/bli_herk3m_cntl.c
Normal file
@@ -0,0 +1,158 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern scalm_t* scalm_cntl;
|
||||
|
||||
extern blksz_t* gemm3m_mc;
|
||||
extern blksz_t* gemm3m_nc;
|
||||
extern blksz_t* gemm3m_kc;
|
||||
extern blksz_t* gemm3m_mr;
|
||||
extern blksz_t* gemm3m_nr;
|
||||
extern blksz_t* gemm3m_kr;
|
||||
|
||||
extern func_t* gemm3m_ukrs;
|
||||
|
||||
packm_t* herk3m_packa_cntl;
|
||||
packm_t* herk3m_packb_cntl;
|
||||
|
||||
herk_t* herk3m_cntl_bp_ke;
|
||||
herk_t* herk3m_cntl_op_bp;
|
||||
herk_t* herk3m_cntl_mm_op;
|
||||
herk_t* herk3m_cntl_vl_mm;
|
||||
|
||||
herk_t* herk3m_cntl;
|
||||
|
||||
|
||||
void bli_herk3m_cntl_init()
|
||||
{
|
||||
// Create control tree objects for packm operations.
|
||||
herk3m_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
gemm3m_mr,
|
||||
gemm3m_kr,
|
||||
FALSE, // already dense; densify not necessary
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_ROW_PANELS_3M,
|
||||
BLIS_BUFFER_FOR_A_BLOCK );
|
||||
|
||||
herk3m_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
gemm3m_kr,
|
||||
gemm3m_nr,
|
||||
FALSE, // already dense; densify not necessary
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_COL_PANELS_3M,
|
||||
BLIS_BUFFER_FOR_B_PANEL );
|
||||
|
||||
|
||||
// Create control tree object for lowest-level block-panel kernel.
|
||||
herk3m_cntl_bp_ke
|
||||
=
|
||||
bli_herk_cntl_obj_create( BLIS_UNB_OPT,
|
||||
BLIS_VARIANT2,
|
||||
NULL,
|
||||
gemm3m_ukrs,
|
||||
NULL, NULL, NULL,
|
||||
NULL, NULL, NULL );
|
||||
|
||||
// Create control tree object for outer panel (to block-panel)
|
||||
// problem.
|
||||
herk3m_cntl_op_bp
|
||||
=
|
||||
bli_herk_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
gemm3m_mc,
|
||||
NULL,
|
||||
NULL,
|
||||
herk3m_packa_cntl,
|
||||
herk3m_packb_cntl,
|
||||
NULL,
|
||||
herk3m_cntl_bp_ke,
|
||||
NULL );
|
||||
|
||||
// Create control tree object for general problem via multiple
|
||||
// rank-k (outer panel) updates.
|
||||
herk3m_cntl_mm_op
|
||||
=
|
||||
bli_herk_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
gemm3m_kc,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
herk3m_cntl_op_bp,
|
||||
NULL );
|
||||
|
||||
// Create control tree object for very large problem via multiple
|
||||
// general problems.
|
||||
herk3m_cntl_vl_mm
|
||||
=
|
||||
bli_herk_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
gemm3m_nc,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
herk3m_cntl_mm_op,
|
||||
NULL );
|
||||
|
||||
// Alias the "master" herk control tree to a shorter name.
|
||||
herk3m_cntl = herk3m_cntl_vl_mm;
|
||||
}
|
||||
|
||||
void bli_herk3m_cntl_finalize()
|
||||
{
|
||||
bli_cntl_obj_free( herk3m_packa_cntl );
|
||||
bli_cntl_obj_free( herk3m_packb_cntl );
|
||||
|
||||
bli_cntl_obj_free( herk3m_cntl_bp_ke );
|
||||
bli_cntl_obj_free( herk3m_cntl_op_bp );
|
||||
bli_cntl_obj_free( herk3m_cntl_mm_op );
|
||||
bli_cntl_obj_free( herk3m_cntl_vl_mm );
|
||||
}
|
||||
|
||||
37
frame/3/herk/3m/bli_herk3m_cntl.h
Normal file
37
frame/3/herk/3m/bli_herk3m_cntl.h
Normal file
@@ -0,0 +1,37 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// Creates the packm and herk control tree objects used by the 3m-based herk.
void bli_herk3m_cntl_init( void );
|
||||
// Frees the control tree objects created by bli_herk3m_cntl_init().
void bli_herk3m_cntl_finalize( void );
|
||||
|
||||
103
frame/3/herk/4m/bli_herk4m.c
Normal file
103
frame/3/herk/4m/bli_herk4m.c
Normal file
@@ -0,0 +1,103 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern herk_t* herk4m_cntl;
|
||||
extern herk_t* herk_cntl;
|
||||
|
||||
//
|
||||
// Define object-based interface.
|
||||
//
|
||||
// Object-based herk that uses the 4m control tree for complex operands.
// The datatype of c decides which tree is handed to bli_herk_front().
void bli_herk4m( obj_t*  alpha,
                 obj_t*  a,
                 obj_t*  beta,
                 obj_t*  c )
{
	// 4m is only defined for the complex domain, so real-domain problems
	// are routed through the regular herk control tree instead.
	herk_t* const tree = ( bli_obj_is_complex( *c ) ? herk4m_cntl
	                                                : herk_cntl );

	bli_herk_front( alpha, a, beta, c, tree );
}
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNCR
|
||||
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
uplo_t uploc, \
|
||||
trans_t transa, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
ctype_r* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype_r* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt_r = PASTEMAC(chr,type); \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
obj_t alphao, ao, betao, co; \
|
||||
\
|
||||
dim_t m_a, n_a; \
|
||||
\
|
||||
bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \
|
||||
\
|
||||
bli_obj_create_1x1_with_attached_buffer( dt_r, alpha, &alphao ); \
|
||||
bli_obj_create_1x1_with_attached_buffer( dt_r, beta, &betao ); \
|
||||
\
|
||||
bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \
|
||||
\
|
||||
bli_obj_set_uplo( uploc, co ); \
|
||||
bli_obj_set_conjtrans( transa, ao ); \
|
||||
\
|
||||
bli_obj_set_struc( BLIS_HERMITIAN, co ); \
|
||||
\
|
||||
PASTEMAC0(opname)( &alphao, \
|
||||
&ao, \
|
||||
&betao, \
|
||||
&co ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNCR_BASIC( herk4m, herk4m )
|
||||
|
||||
65
frame/3/herk/4m/bli_herk4m.h
Normal file
65
frame/3/herk/4m/bli_herk4m.h
Normal file
@@ -0,0 +1,65 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "bli_herk4m_cntl.h"
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
//
|
||||
// Object-based herk using the 4m method. The implementation in bli_herk4m.c
// passes the 4m control tree to bli_herk_front() when c is complex and the
// regular herk control tree otherwise.
void bli_herk4m( obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* beta,
|
||||
obj_t* c );
|
||||
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
// Prototype template for the typed herk4m interfaces. alpha and beta are
// declared with the real type ctype_r, while a and c use ctype. The
// INSERT_GENTPROTR_BASIC line below instantiates the template for herk4m.
#undef GENTPROTR
|
||||
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
uplo_t uploc, \
|
||||
trans_t transa, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
ctype_r* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype_r* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROTR_BASIC( herk4m )
|
||||
|
||||
158
frame/3/herk/4m/bli_herk4m_cntl.c
Normal file
158
frame/3/herk/4m/bli_herk4m_cntl.c
Normal file
@@ -0,0 +1,158 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern scalm_t* scalm_cntl;
|
||||
|
||||
extern blksz_t* gemm4m_mc;
|
||||
extern blksz_t* gemm4m_nc;
|
||||
extern blksz_t* gemm4m_kc;
|
||||
extern blksz_t* gemm4m_mr;
|
||||
extern blksz_t* gemm4m_nr;
|
||||
extern blksz_t* gemm4m_kr;
|
||||
|
||||
extern func_t* gemm4m_ukrs;
|
||||
|
||||
packm_t* herk4m_packa_cntl;
|
||||
packm_t* herk4m_packb_cntl;
|
||||
|
||||
herk_t* herk4m_cntl_bp_ke;
|
||||
herk_t* herk4m_cntl_op_bp;
|
||||
herk_t* herk4m_cntl_mm_op;
|
||||
herk_t* herk4m_cntl_vl_mm;
|
||||
|
||||
herk_t* herk4m_cntl;
|
||||
|
||||
|
||||
void bli_herk4m_cntl_init()
|
||||
{
|
||||
// Create control tree objects for packm operations.
|
||||
herk4m_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT4,
|
||||
gemm4m_mr,
|
||||
gemm4m_kr,
|
||||
FALSE, // already dense; densify not necessary
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_ROW_PANELS_4M,
|
||||
BLIS_BUFFER_FOR_A_BLOCK );
|
||||
|
||||
herk4m_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT4,
|
||||
gemm4m_kr,
|
||||
gemm4m_nr,
|
||||
FALSE, // already dense; densify not necessary
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_COL_PANELS_4M,
|
||||
BLIS_BUFFER_FOR_B_PANEL );
|
||||
|
||||
|
||||
// Create control tree object for lowest-level block-panel kernel.
|
||||
herk4m_cntl_bp_ke
|
||||
=
|
||||
bli_herk_cntl_obj_create( BLIS_UNB_OPT,
|
||||
BLIS_VARIANT2,
|
||||
NULL,
|
||||
gemm4m_ukrs,
|
||||
NULL, NULL, NULL,
|
||||
NULL, NULL, NULL );
|
||||
|
||||
// Create control tree object for outer panel (to block-panel)
|
||||
// problem.
|
||||
herk4m_cntl_op_bp
|
||||
=
|
||||
bli_herk_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
gemm4m_mc,
|
||||
NULL,
|
||||
NULL,
|
||||
herk4m_packa_cntl,
|
||||
herk4m_packb_cntl,
|
||||
NULL,
|
||||
herk4m_cntl_bp_ke,
|
||||
NULL );
|
||||
|
||||
// Create control tree object for general problem via multiple
|
||||
// rank-k (outer panel) updates.
|
||||
herk4m_cntl_mm_op
|
||||
=
|
||||
bli_herk_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
gemm4m_kc,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
herk4m_cntl_op_bp,
|
||||
NULL );
|
||||
|
||||
// Create control tree object for very large problem via multiple
|
||||
// general problems.
|
||||
herk4m_cntl_vl_mm
|
||||
=
|
||||
bli_herk_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
gemm4m_nc,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
herk4m_cntl_mm_op,
|
||||
NULL );
|
||||
|
||||
// Alias the "master" herk control tree to a shorter name.
|
||||
herk4m_cntl = herk4m_cntl_vl_mm;
|
||||
}
|
||||
|
||||
void bli_herk4m_cntl_finalize()
|
||||
{
|
||||
bli_cntl_obj_free( herk4m_packa_cntl );
|
||||
bli_cntl_obj_free( herk4m_packb_cntl );
|
||||
|
||||
bli_cntl_obj_free( herk4m_cntl_bp_ke );
|
||||
bli_cntl_obj_free( herk4m_cntl_op_bp );
|
||||
bli_cntl_obj_free( herk4m_cntl_mm_op );
|
||||
bli_cntl_obj_free( herk4m_cntl_vl_mm );
|
||||
}
|
||||
|
||||
37
frame/3/herk/4m/bli_herk4m_cntl.h
Normal file
37
frame/3/herk/4m/bli_herk4m_cntl.h
Normal file
@@ -0,0 +1,37 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// Creates the packm and herk control tree objects used by the 4m-based herk.
void bli_herk4m_cntl_init( void );
|
||||
// Frees the control tree objects created by bli_herk4m_cntl_init().
void bli_herk4m_cntl_finalize( void );
|
||||
|
||||
@@ -44,6 +44,14 @@ void bli_herk( obj_t* alpha,
|
||||
obj_t* beta,
|
||||
obj_t* c )
|
||||
{
|
||||
#ifdef BLIS_ENABLE_COMPLEX_VIA_4M
|
||||
if ( bli_obj_is_complex( *c ) )
|
||||
{
|
||||
bli_herk4m( alpha, a, beta, c );
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
bli_herk_front( alpha, a, beta, c,
|
||||
herk_cntl );
|
||||
}
|
||||
|
||||
@@ -47,6 +47,9 @@
|
||||
#include "bli_herk_l_ker_var2.h"
|
||||
#include "bli_herk_u_ker_var2.h"
|
||||
|
||||
#include "bli_herk4m.h"
|
||||
#include "bli_herk3m.h"
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
|
||||
105
frame/3/symm/3m/bli_symm3m.c
Normal file
105
frame/3/symm/3m/bli_symm3m.c
Normal file
@@ -0,0 +1,105 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern gemm_t* gemm3m_cntl;
|
||||
|
||||
//
|
||||
// Define object-based interface.
|
||||
//
|
||||
void bli_symm3m( side_t side,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c )
|
||||
{
|
||||
bli_symm_front( side, alpha, a, b, beta, c,
|
||||
gemm3m_cntl );
|
||||
}
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
side_t side, \
|
||||
uplo_t uploa, \
|
||||
conj_t conja, \
|
||||
trans_t transb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
obj_t alphao, ao, bo, betao, co; \
|
||||
\
|
||||
dim_t mn_a; \
|
||||
dim_t m_b, n_b; \
|
||||
\
|
||||
bli_set_dim_with_side( side, m, n, mn_a ); \
|
||||
bli_set_dims_with_trans( transb, m, n, m_b, n_b ); \
|
||||
\
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \
|
||||
\
|
||||
bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \
|
||||
\
|
||||
bli_obj_set_uplo( uploa, ao ); \
|
||||
bli_obj_set_conj( conja, ao ); \
|
||||
bli_obj_set_conjtrans( transb, bo ); \
|
||||
\
|
||||
bli_obj_set_struc( BLIS_SYMMETRIC, ao ); \
|
||||
\
|
||||
PASTEMAC0(opname)( side, \
|
||||
&alphao, \
|
||||
&ao, \
|
||||
&bo, \
|
||||
&betao, \
|
||||
&co ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( symm3m, symm3m )
|
||||
|
||||
67
frame/3/symm/3m/bli_symm3m.h
Normal file
67
frame/3/symm/3m/bli_symm3m.h
Normal file
@@ -0,0 +1,67 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
//
|
||||
// Object-based symm using the 3m method. The implementation in bli_symm3m.c
// is a thin wrapper around bli_symm_front().
void bli_symm3m( side_t side,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c );
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
// Prototype template for the typed symm3m interfaces. All scalar and matrix
// operands share the single type ctype. The INSERT_GENTPROT_BASIC line below
// instantiates the template for symm3m.
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
side_t side, \
|
||||
uplo_t uploa, \
|
||||
conj_t conja, \
|
||||
trans_t transb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( symm3m )
|
||||
|
||||
105
frame/3/symm/4m/bli_symm4m.c
Normal file
105
frame/3/symm/4m/bli_symm4m.c
Normal file
@@ -0,0 +1,105 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern gemm_t* gemm4m_cntl;
|
||||
|
||||
//
|
||||
// Define object-based interface.
|
||||
//
|
||||
void bli_symm4m( side_t side,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c )
|
||||
{
|
||||
bli_symm_front( side, alpha, a, b, beta, c,
|
||||
gemm4m_cntl );
|
||||
}
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
side_t side, \
|
||||
uplo_t uploa, \
|
||||
conj_t conja, \
|
||||
trans_t transb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
obj_t alphao, ao, bo, betao, co; \
|
||||
\
|
||||
dim_t mn_a; \
|
||||
dim_t m_b, n_b; \
|
||||
\
|
||||
bli_set_dim_with_side( side, m, n, mn_a ); \
|
||||
bli_set_dims_with_trans( transb, m, n, m_b, n_b ); \
|
||||
\
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \
|
||||
\
|
||||
bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m, n, c, rs_c, cs_c, &co ); \
|
||||
\
|
||||
bli_obj_set_uplo( uploa, ao ); \
|
||||
bli_obj_set_conj( conja, ao ); \
|
||||
bli_obj_set_conjtrans( transb, bo ); \
|
||||
\
|
||||
bli_obj_set_struc( BLIS_SYMMETRIC, ao ); \
|
||||
\
|
||||
PASTEMAC0(opname)( side, \
|
||||
&alphao, \
|
||||
&ao, \
|
||||
&bo, \
|
||||
&betao, \
|
||||
&co ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( symm4m, symm4m )
|
||||
|
||||
67
frame/3/symm/4m/bli_symm4m.h
Normal file
67
frame/3/symm/4m/bli_symm4m.h
Normal file
@@ -0,0 +1,67 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
//
|
||||
// Object-based symm using the 4m method. The implementation in bli_symm4m.c
// is a thin wrapper around bli_symm_front().
void bli_symm4m( side_t side,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c );
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
// Prototype template for the typed symm4m interfaces. All scalar and matrix
// operands share the single type ctype. The INSERT_GENTPROT_BASIC line below
// instantiates the template for symm4m.
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
side_t side, \
|
||||
uplo_t uploa, \
|
||||
conj_t conja, \
|
||||
trans_t transb, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( symm4m )
|
||||
|
||||
@@ -46,6 +46,14 @@ void bli_symm( side_t side,
|
||||
obj_t* beta,
|
||||
obj_t* c )
|
||||
{
|
||||
#ifdef BLIS_ENABLE_COMPLEX_VIA_4M
|
||||
if ( bli_obj_is_complex( *c ) )
|
||||
{
|
||||
bli_symm4m( side, alpha, a, b, beta, c );
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
bli_symm_front( side, alpha, a, b, beta, c,
|
||||
gemm_cntl );
|
||||
}
|
||||
|
||||
@@ -35,6 +35,9 @@
|
||||
#include "bli_symm_check.h"
|
||||
#include "bli_symm_front.h"
|
||||
|
||||
#include "bli_symm4m.h"
|
||||
#include "bli_symm3m.h"
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
|
||||
110
frame/3/syr2k/3m/bli_syr2k3m.c
Normal file
110
frame/3/syr2k/3m/bli_syr2k3m.c
Normal file
@@ -0,0 +1,110 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern herk_t* herk3m_cntl;
|
||||
extern herk_t* herk_cntl;
|
||||
|
||||
//
|
||||
// Define object-based interface.
|
||||
//
|
||||
// Object-based syr2k using the 3m control tree.
//
// The 3m tree is only meaningful for complex operands, so a real-domain c
// falls back to the regular herk control tree. The chosen tree is handed to
// bli_syr2k_front() together with the unmodified operands.
void bli_syr2k3m( obj_t* alpha,
                  obj_t* a,
                  obj_t* b,
                  obj_t* beta,
                  obj_t* c )
{
	herk_t* cntl = bli_obj_is_complex( *c ) ? herk3m_cntl : herk_cntl;

	bli_syr2k_front( alpha, a, b, beta, c, cntl );
}
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
uplo_t uploc, \
|
||||
trans_t transa, \
|
||||
trans_t transb, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
obj_t alphao, ao, bo, betao, co; \
|
||||
\
|
||||
dim_t m_a, n_a; \
|
||||
dim_t m_b, n_b; \
|
||||
\
|
||||
bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \
|
||||
bli_set_dims_with_trans( transb, m, k, m_b, n_b ); \
|
||||
\
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \
|
||||
\
|
||||
bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \
|
||||
\
|
||||
bli_obj_set_uplo( uploc, co ); \
|
||||
bli_obj_set_conjtrans( transa, ao ); \
|
||||
bli_obj_set_conjtrans( transb, bo ); \
|
||||
\
|
||||
bli_obj_set_struc( BLIS_SYMMETRIC, co ); \
|
||||
\
|
||||
PASTEMAC0(opname)( &alphao, \
|
||||
&ao, \
|
||||
&bo, \
|
||||
&betao, \
|
||||
&co ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( syr2k3m, syr2k3m )
|
||||
|
||||
66
frame/3/syr2k/3m/bli_syr2k3m.h
Normal file
66
frame/3/syr2k/3m/bli_syr2k3m.h
Normal file
@@ -0,0 +1,66 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
//
|
||||
// Object-based syr2k3m interface: operands are passed as obj_t descriptors.
void bli_syr2k3m( obj_t* alpha, obj_t* a, obj_t* b,
                  obj_t* beta,  obj_t* c );
|
||||
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
uplo_t uploc, \
|
||||
trans_t transa, \
|
||||
trans_t transb, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( syr2k3m )
|
||||
|
||||
110
frame/3/syr2k/4m/bli_syr2k4m.c
Normal file
110
frame/3/syr2k/4m/bli_syr2k4m.c
Normal file
@@ -0,0 +1,110 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern herk_t* herk4m_cntl;
|
||||
extern herk_t* herk_cntl;
|
||||
|
||||
//
|
||||
// Define object-based interface.
|
||||
//
|
||||
// Object-based syr2k using the 4m control tree.
//
// The 4m tree is only meaningful for complex operands, so a real-domain c
// falls back to the regular herk control tree. The chosen tree is handed to
// bli_syr2k_front() together with the unmodified operands.
void bli_syr2k4m( obj_t* alpha,
                  obj_t* a,
                  obj_t* b,
                  obj_t* beta,
                  obj_t* c )
{
	herk_t* cntl = bli_obj_is_complex( *c ) ? herk4m_cntl : herk_cntl;

	bli_syr2k_front( alpha, a, b, beta, c, cntl );
}
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
uplo_t uploc, \
|
||||
trans_t transa, \
|
||||
trans_t transb, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
obj_t alphao, ao, bo, betao, co; \
|
||||
\
|
||||
dim_t m_a, n_a; \
|
||||
dim_t m_b, n_b; \
|
||||
\
|
||||
bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \
|
||||
bli_set_dims_with_trans( transb, m, k, m_b, n_b ); \
|
||||
\
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \
|
||||
\
|
||||
bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \
|
||||
\
|
||||
bli_obj_set_uplo( uploc, co ); \
|
||||
bli_obj_set_conjtrans( transa, ao ); \
|
||||
bli_obj_set_conjtrans( transb, bo ); \
|
||||
\
|
||||
bli_obj_set_struc( BLIS_SYMMETRIC, co ); \
|
||||
\
|
||||
PASTEMAC0(opname)( &alphao, \
|
||||
&ao, \
|
||||
&bo, \
|
||||
&betao, \
|
||||
&co ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( syr2k4m, syr2k4m )
|
||||
|
||||
66
frame/3/syr2k/4m/bli_syr2k4m.h
Normal file
66
frame/3/syr2k/4m/bli_syr2k4m.h
Normal file
@@ -0,0 +1,66 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
//
|
||||
// Object-based syr2k4m interface: operands are passed as obj_t descriptors.
void bli_syr2k4m( obj_t* alpha, obj_t* a, obj_t* b,
                  obj_t* beta,  obj_t* c );
|
||||
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
uplo_t uploc, \
|
||||
trans_t transa, \
|
||||
trans_t transb, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( syr2k4m )
|
||||
|
||||
@@ -46,6 +46,14 @@ void bli_syr2k( obj_t* alpha,
|
||||
obj_t* beta,
|
||||
obj_t* c )
|
||||
{
|
||||
#ifdef BLIS_ENABLE_COMPLEX_VIA_4M
|
||||
if ( bli_obj_is_complex( *c ) )
|
||||
{
|
||||
bli_syr2k4m( alpha, a, b, beta, c );
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
bli_syr2k_front( alpha, a, b, beta, c,
|
||||
herk_cntl );
|
||||
}
|
||||
|
||||
@@ -35,6 +35,9 @@
|
||||
#include "bli_syr2k_check.h"
|
||||
#include "bli_syr2k_front.h"
|
||||
|
||||
#include "bli_syr2k4m.h"
|
||||
#include "bli_syr2k3m.h"
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
|
||||
102
frame/3/syrk/3m/bli_syrk3m.c
Normal file
102
frame/3/syrk/3m/bli_syrk3m.c
Normal file
@@ -0,0 +1,102 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern herk_t* herk3m_cntl;
|
||||
extern herk_t* herk_cntl;
|
||||
|
||||
//
|
||||
// Define object-based interface.
|
||||
//
|
||||
// Object-based syrk using the 3m control tree.
//
// The 3m tree is only meaningful for complex operands, so a real-domain c
// falls back to the regular herk control tree. The chosen tree is handed to
// bli_syrk_front() together with the unmodified operands.
void bli_syrk3m( obj_t* alpha,
                 obj_t* a,
                 obj_t* beta,
                 obj_t* c )
{
	herk_t* cntl = bli_obj_is_complex( *c ) ? herk3m_cntl : herk_cntl;

	bli_syrk_front( alpha, a, beta, c, cntl );
}
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
uplo_t uploc, \
|
||||
trans_t transa, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
obj_t alphao, ao, betao, co; \
|
||||
\
|
||||
dim_t m_a, n_a; \
|
||||
\
|
||||
bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \
|
||||
\
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \
|
||||
\
|
||||
bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \
|
||||
\
|
||||
bli_obj_set_uplo( uploc, co ); \
|
||||
bli_obj_set_conjtrans( transa, ao ); \
|
||||
\
|
||||
bli_obj_set_struc( BLIS_SYMMETRIC, co ); \
|
||||
\
|
||||
PASTEMAC0(opname)( &alphao, \
|
||||
&ao, \
|
||||
&betao, \
|
||||
&co ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( syrk3m, syrk3m )
|
||||
|
||||
63
frame/3/syrk/3m/bli_syrk3m.h
Normal file
63
frame/3/syrk/3m/bli_syrk3m.h
Normal file
@@ -0,0 +1,63 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
//
|
||||
// Object-based syrk3m interface: operands are passed as obj_t descriptors.
void bli_syrk3m( obj_t* alpha, obj_t* a,
                 obj_t* beta,  obj_t* c );
|
||||
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
uplo_t uploc, \
|
||||
trans_t transa, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( syrk3m )
|
||||
|
||||
102
frame/3/syrk/4m/bli_syrk4m.c
Normal file
102
frame/3/syrk/4m/bli_syrk4m.c
Normal file
@@ -0,0 +1,102 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern herk_t* herk4m_cntl;
|
||||
extern herk_t* herk_cntl;
|
||||
|
||||
//
|
||||
// Define object-based interface.
|
||||
//
|
||||
// Object-based syrk using the 4m control tree.
//
// The 4m tree is only meaningful for complex operands, so a real-domain c
// falls back to the regular herk control tree. The chosen tree is handed to
// bli_syrk_front() together with the unmodified operands.
void bli_syrk4m( obj_t* alpha,
                 obj_t* a,
                 obj_t* beta,
                 obj_t* c )
{
	herk_t* cntl = bli_obj_is_complex( *c ) ? herk4m_cntl : herk_cntl;

	bli_syrk_front( alpha, a, beta, c, cntl );
}
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
uplo_t uploc, \
|
||||
trans_t transa, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
obj_t alphao, ao, betao, co; \
|
||||
\
|
||||
dim_t m_a, n_a; \
|
||||
\
|
||||
bli_set_dims_with_trans( transa, m, k, m_a, n_a ); \
|
||||
\
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \
|
||||
\
|
||||
bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \
|
||||
\
|
||||
bli_obj_set_uplo( uploc, co ); \
|
||||
bli_obj_set_conjtrans( transa, ao ); \
|
||||
\
|
||||
bli_obj_set_struc( BLIS_SYMMETRIC, co ); \
|
||||
\
|
||||
PASTEMAC0(opname)( &alphao, \
|
||||
&ao, \
|
||||
&betao, \
|
||||
&co ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( syrk4m, syrk4m )
|
||||
|
||||
63
frame/3/syrk/4m/bli_syrk4m.h
Normal file
63
frame/3/syrk/4m/bli_syrk4m.h
Normal file
@@ -0,0 +1,63 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
//
|
||||
// Object-based syrk4m interface: operands are passed as obj_t descriptors.
void bli_syrk4m( obj_t* alpha, obj_t* a,
                 obj_t* beta,  obj_t* c );
|
||||
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
uplo_t uploc, \
|
||||
trans_t transa, \
|
||||
dim_t m, \
|
||||
dim_t k, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* beta, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( syrk4m )
|
||||
|
||||
@@ -44,6 +44,14 @@ void bli_syrk( obj_t* alpha,
|
||||
obj_t* beta,
|
||||
obj_t* c )
|
||||
{
|
||||
#ifdef BLIS_ENABLE_COMPLEX_VIA_4M
|
||||
if ( bli_obj_is_complex( *c ) )
|
||||
{
|
||||
bli_syrk4m( alpha, a, beta, c );
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
bli_syrk_front( alpha, a, beta, c,
|
||||
herk_cntl );
|
||||
}
|
||||
|
||||
@@ -35,6 +35,9 @@
|
||||
#include "bli_syrk_check.h"
|
||||
#include "bli_syrk_front.h"
|
||||
|
||||
#include "bli_syrk4m.h"
|
||||
#include "bli_syrk3m.h"
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
|
||||
107
frame/3/trmm/3m/bli_trmm3m.c
Normal file
107
frame/3/trmm/3m/bli_trmm3m.c
Normal file
@@ -0,0 +1,107 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern trmm_t* trmm3m_l_cntl;
|
||||
extern trmm_t* trmm3m_r_cntl;
|
||||
extern trmm_t* trmm_l_cntl;
|
||||
extern trmm_t* trmm_r_cntl;
|
||||
|
||||
//
|
||||
// Define object-based interface.
|
||||
//
|
||||
void bli_trmm3m( side_t side,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b )
|
||||
{
|
||||
trmm_t* l_cntl;
|
||||
trmm_t* r_cntl;
|
||||
|
||||
// Since 3m only applies to the complex domain, we use the regular
|
||||
// control tree for real domain cases.
|
||||
if ( bli_obj_is_complex( *b ) ) { l_cntl = trmm3m_l_cntl; r_cntl = trmm3m_r_cntl; }
|
||||
else { l_cntl = trmm_l_cntl; r_cntl = trmm_r_cntl; }
|
||||
|
||||
bli_trmm_front( side, alpha, a, b,
|
||||
l_cntl,
|
||||
r_cntl );
|
||||
}
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
side_t side, \
|
||||
uplo_t uploa, \
|
||||
trans_t transa, \
|
||||
diag_t diaga, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
obj_t alphao, ao, bo; \
|
||||
\
|
||||
dim_t mn_a; \
|
||||
\
|
||||
bli_set_dim_with_side( side, m, n, mn_a ); \
|
||||
\
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
|
||||
\
|
||||
bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m, n, b, rs_b, cs_b, &bo ); \
|
||||
\
|
||||
bli_obj_set_uplo( uploa, ao ); \
|
||||
bli_obj_set_diag( diaga, ao ); \
|
||||
bli_obj_set_conjtrans( transa, ao ); \
|
||||
\
|
||||
bli_obj_set_struc( BLIS_TRIANGULAR, ao ); \
|
||||
\
|
||||
PASTEMAC0(opname)( side, \
|
||||
&alphao, \
|
||||
&ao, \
|
||||
&bo ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( trmm3m, trmm3m )
|
||||
|
||||
66
frame/3/trmm/3m/bli_trmm3m.h
Normal file
66
frame/3/trmm/3m/bli_trmm3m.h
Normal file
@@ -0,0 +1,66 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "bli_trmm3m_cntl.h"
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
//
|
||||
void bli_trmm3m( side_t side,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b );
|
||||
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
side_t side, \
|
||||
uplo_t uploa, \
|
||||
trans_t transa, \
|
||||
diag_t diaga, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trmm3m )
|
||||
|
||||
262
frame/3/trmm/3m/bli_trmm3m_cntl.c
Normal file
262
frame/3/trmm/3m/bli_trmm3m_cntl.c
Normal file
@@ -0,0 +1,262 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern scalm_t* scalm_cntl;
|
||||
|
||||
extern blksz_t* gemm3m_mc;
|
||||
extern blksz_t* gemm3m_nc;
|
||||
extern blksz_t* gemm3m_kc;
|
||||
extern blksz_t* gemm3m_mr;
|
||||
extern blksz_t* gemm3m_nr;
|
||||
extern blksz_t* gemm3m_kr;
|
||||
|
||||
extern func_t* gemm3m_ukrs;
|
||||
|
||||
extern gemm_t* gemm3m_cntl_bp_ke;
|
||||
|
||||
packm_t* trmm3m_l_packa_cntl;
|
||||
packm_t* trmm3m_l_packb_cntl;
|
||||
|
||||
packm_t* trmm3m_r_packa_cntl;
|
||||
packm_t* trmm3m_r_packb_cntl;
|
||||
|
||||
trmm_t* trmm3m_cntl_bp_ke;
|
||||
|
||||
trmm_t* trmm3m_l_cntl_op_bp;
|
||||
trmm_t* trmm3m_l_cntl_mm_op;
|
||||
trmm_t* trmm3m_l_cntl_vl_mm;
|
||||
|
||||
trmm_t* trmm3m_r_cntl_op_bp;
|
||||
trmm_t* trmm3m_r_cntl_mm_op;
|
||||
trmm_t* trmm3m_r_cntl_vl_mm;
|
||||
|
||||
trmm_t* trmm3m_l_cntl;
|
||||
trmm_t* trmm3m_r_cntl;
|
||||
|
||||
|
||||
void bli_trmm3m_cntl_init()
|
||||
{
|
||||
// Create control tree objects for packm operations (left side).
|
||||
trmm3m_l_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
// IMPORTANT: for consistency with trsm, "k" dim
|
||||
// multiple is set to mr.
|
||||
gemm3m_mr,
|
||||
gemm3m_mr,
|
||||
TRUE, // densify
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_ROW_PANELS_3M,
|
||||
BLIS_BUFFER_FOR_A_BLOCK );
|
||||
|
||||
trmm3m_l_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
// IMPORTANT: m dim multiple here must be mr
|
||||
// since "k" dim multiple is set to mr above.
|
||||
gemm3m_mr,
|
||||
gemm3m_nr,
|
||||
FALSE, // already dense
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_COL_PANELS_3M,
|
||||
BLIS_BUFFER_FOR_B_PANEL );
|
||||
|
||||
// Create control tree objects for packm operations (right side).
|
||||
trmm3m_r_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
// IMPORTANT: for consistency with trsm, "k" dim
|
||||
// multiple is set to nr.
|
||||
gemm3m_mr,
|
||||
gemm3m_nr,
|
||||
FALSE, // already dense
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_ROW_PANELS_3M,
|
||||
BLIS_BUFFER_FOR_A_BLOCK );
|
||||
|
||||
trmm3m_r_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
// IMPORTANT: m dim multiple here must be nr
|
||||
// since "k" dim multiple is set to nr above.
|
||||
gemm3m_nr,
|
||||
gemm3m_nr,
|
||||
TRUE, // densify
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_COL_PANELS_3M,
|
||||
BLIS_BUFFER_FOR_B_PANEL );
|
||||
|
||||
|
||||
// Create control tree object for lowest-level block-panel kernel.
|
||||
trmm3m_cntl_bp_ke
|
||||
=
|
||||
bli_trmm_cntl_obj_create( BLIS_UNB_OPT,
|
||||
BLIS_VARIANT2,
|
||||
NULL,
|
||||
gemm3m_ukrs,
|
||||
NULL, NULL, NULL, NULL,
|
||||
NULL, NULL, NULL );
|
||||
|
||||
// Create control tree object for outer panel (to block-panel)
|
||||
// problem (left side).
|
||||
trmm3m_l_cntl_op_bp
|
||||
=
|
||||
bli_trmm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
gemm3m_mc,
|
||||
NULL,
|
||||
NULL,
|
||||
trmm3m_l_packa_cntl,
|
||||
trmm3m_l_packb_cntl,
|
||||
NULL,
|
||||
trmm3m_cntl_bp_ke,
|
||||
gemm3m_cntl_bp_ke,
|
||||
NULL );
|
||||
|
||||
// Create control tree object for general problem via multiple
|
||||
// rank-k (outer panel) updates (left side).
|
||||
trmm3m_l_cntl_mm_op
|
||||
=
|
||||
bli_trmm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
gemm3m_kc,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
trmm3m_l_cntl_op_bp,
|
||||
NULL,
|
||||
NULL );
|
||||
|
||||
// Create control tree object for very large problem via multiple
|
||||
// general problems (left side).
|
||||
trmm3m_l_cntl_vl_mm
|
||||
=
|
||||
bli_trmm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
gemm3m_nc,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
trmm3m_l_cntl_mm_op,
|
||||
NULL,
|
||||
NULL );
|
||||
|
||||
// Create control tree object for outer panel (to block-panel)
|
||||
// problem (right side).
|
||||
trmm3m_r_cntl_op_bp
|
||||
=
|
||||
bli_trmm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
gemm3m_mc,
|
||||
NULL,
|
||||
NULL,
|
||||
trmm3m_r_packa_cntl,
|
||||
trmm3m_r_packb_cntl,
|
||||
NULL,
|
||||
trmm3m_cntl_bp_ke,
|
||||
gemm3m_cntl_bp_ke,
|
||||
NULL );
|
||||
|
||||
// Create control tree object for general problem via multiple
|
||||
// rank-k (outer panel) updates (right side).
|
||||
trmm3m_r_cntl_mm_op
|
||||
=
|
||||
bli_trmm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
gemm3m_kc,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
trmm3m_r_cntl_op_bp,
|
||||
NULL,
|
||||
NULL );
|
||||
|
||||
// Create control tree object for very large problem via multiple
|
||||
// general problems (right side).
|
||||
trmm3m_r_cntl_vl_mm
|
||||
=
|
||||
bli_trmm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
gemm3m_nc,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
trmm3m_r_cntl_mm_op,
|
||||
NULL,
|
||||
NULL );
|
||||
|
||||
// Alias the "master" trmm control trees to shorter names.
|
||||
trmm3m_l_cntl = trmm3m_l_cntl_vl_mm;
|
||||
trmm3m_r_cntl = trmm3m_r_cntl_vl_mm;
|
||||
}
|
||||
|
||||
void bli_trmm3m_cntl_finalize()
|
||||
{
|
||||
bli_cntl_obj_free( trmm3m_l_packa_cntl );
|
||||
bli_cntl_obj_free( trmm3m_l_packb_cntl );
|
||||
bli_cntl_obj_free( trmm3m_r_packa_cntl );
|
||||
bli_cntl_obj_free( trmm3m_r_packb_cntl );
|
||||
|
||||
bli_cntl_obj_free( trmm3m_cntl_bp_ke );
|
||||
|
||||
bli_cntl_obj_free( trmm3m_l_cntl_op_bp );
|
||||
bli_cntl_obj_free( trmm3m_l_cntl_mm_op );
|
||||
bli_cntl_obj_free( trmm3m_l_cntl_vl_mm );
|
||||
bli_cntl_obj_free( trmm3m_r_cntl_op_bp );
|
||||
bli_cntl_obj_free( trmm3m_r_cntl_mm_op );
|
||||
bli_cntl_obj_free( trmm3m_r_cntl_vl_mm );
|
||||
}
|
||||
|
||||
36
frame/3/trmm/3m/bli_trmm3m_cntl.h
Normal file
36
frame/3/trmm/3m/bli_trmm3m_cntl.h
Normal file
@@ -0,0 +1,36 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// Creates the control trees used by the 3m implementation of trmm.
void bli_trmm3m_cntl_init( void );

// Frees the control tree nodes created by bli_trmm3m_cntl_init().
void bli_trmm3m_cntl_finalize( void );
|
||||
107
frame/3/trmm/4m/bli_trmm4m.c
Normal file
107
frame/3/trmm/4m/bli_trmm4m.c
Normal file
@@ -0,0 +1,107 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern trmm_t* trmm4m_l_cntl;
|
||||
extern trmm_t* trmm4m_r_cntl;
|
||||
extern trmm_t* trmm_l_cntl;
|
||||
extern trmm_t* trmm_r_cntl;
|
||||
|
||||
//
|
||||
// Define object-based interface.
|
||||
//
|
||||
void bli_trmm4m( side_t side,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b )
|
||||
{
|
||||
trmm_t* l_cntl;
|
||||
trmm_t* r_cntl;
|
||||
|
||||
// Since 4m only applies to the complex domain, we use the regular
|
||||
// control tree for real domain cases.
|
||||
if ( bli_obj_is_complex( *b ) ) { l_cntl = trmm4m_l_cntl; r_cntl = trmm4m_r_cntl; }
|
||||
else { l_cntl = trmm_l_cntl; r_cntl = trmm_r_cntl; }
|
||||
|
||||
bli_trmm_front( side, alpha, a, b,
|
||||
l_cntl,
|
||||
r_cntl );
|
||||
}
|
||||
|
||||
//
|
||||
// Define BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
side_t side, \
|
||||
uplo_t uploa, \
|
||||
trans_t transa, \
|
||||
diag_t diaga, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b \
|
||||
) \
|
||||
{ \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
obj_t alphao, ao, bo; \
|
||||
\
|
||||
dim_t mn_a; \
|
||||
\
|
||||
bli_set_dim_with_side( side, m, n, mn_a ); \
|
||||
\
|
||||
bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
|
||||
\
|
||||
bli_obj_create_with_attached_buffer( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \
|
||||
bli_obj_create_with_attached_buffer( dt, m, n, b, rs_b, cs_b, &bo ); \
|
||||
\
|
||||
bli_obj_set_uplo( uploa, ao ); \
|
||||
bli_obj_set_diag( diaga, ao ); \
|
||||
bli_obj_set_conjtrans( transa, ao ); \
|
||||
\
|
||||
bli_obj_set_struc( BLIS_TRIANGULAR, ao ); \
|
||||
\
|
||||
PASTEMAC0(opname)( side, \
|
||||
&alphao, \
|
||||
&ao, \
|
||||
&bo ); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC( trmm4m, trmm4m )
|
||||
|
||||
66
frame/3/trmm/4m/bli_trmm4m.h
Normal file
66
frame/3/trmm/4m/bli_trmm4m.h
Normal file
@@ -0,0 +1,66 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "bli_trmm4m_cntl.h"
|
||||
|
||||
|
||||
//
|
||||
// Prototype object-based interface.
|
||||
//
|
||||
void bli_trmm4m( side_t side,
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b );
|
||||
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces with homogeneous-typed operands.
|
||||
//
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
\
|
||||
void PASTEMAC(ch,opname)( \
|
||||
side_t side, \
|
||||
uplo_t uploa, \
|
||||
trans_t transa, \
|
||||
diag_t diaga, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
ctype* alpha, \
|
||||
ctype* a, inc_t rs_a, inc_t cs_a, \
|
||||
ctype* b, inc_t rs_b, inc_t cs_b \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC( trmm4m )
|
||||
|
||||
262
frame/3/trmm/4m/bli_trmm4m_cntl.c
Normal file
262
frame/3/trmm/4m/bli_trmm4m_cntl.c
Normal file
@@ -0,0 +1,262 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
extern scalm_t* scalm_cntl;
|
||||
|
||||
extern blksz_t* gemm4m_mc;
|
||||
extern blksz_t* gemm4m_nc;
|
||||
extern blksz_t* gemm4m_kc;
|
||||
extern blksz_t* gemm4m_mr;
|
||||
extern blksz_t* gemm4m_nr;
|
||||
extern blksz_t* gemm4m_kr;
|
||||
|
||||
extern func_t* gemm4m_ukrs;
|
||||
|
||||
extern gemm_t* gemm4m_cntl_bp_ke;
|
||||
|
||||
packm_t* trmm4m_l_packa_cntl;
|
||||
packm_t* trmm4m_l_packb_cntl;
|
||||
|
||||
packm_t* trmm4m_r_packa_cntl;
|
||||
packm_t* trmm4m_r_packb_cntl;
|
||||
|
||||
trmm_t* trmm4m_cntl_bp_ke;
|
||||
|
||||
trmm_t* trmm4m_l_cntl_op_bp;
|
||||
trmm_t* trmm4m_l_cntl_mm_op;
|
||||
trmm_t* trmm4m_l_cntl_vl_mm;
|
||||
|
||||
trmm_t* trmm4m_r_cntl_op_bp;
|
||||
trmm_t* trmm4m_r_cntl_mm_op;
|
||||
trmm_t* trmm4m_r_cntl_vl_mm;
|
||||
|
||||
trmm_t* trmm4m_l_cntl;
|
||||
trmm_t* trmm4m_r_cntl;
|
||||
|
||||
|
||||
void bli_trmm4m_cntl_init()
|
||||
{
|
||||
// Create control tree objects for packm operations (left side).
|
||||
trmm4m_l_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT4,
|
||||
// IMPORTANT: for consistency with trsm, "k" dim
|
||||
// multiple is set to mr.
|
||||
gemm4m_mr,
|
||||
gemm4m_mr,
|
||||
TRUE, // densify
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_ROW_PANELS_4M,
|
||||
BLIS_BUFFER_FOR_A_BLOCK );
|
||||
|
||||
trmm4m_l_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT4,
|
||||
// IMPORTANT: m dim multiple here must be mr
|
||||
// since "k" dim multiple is set to mr above.
|
||||
gemm4m_mr,
|
||||
gemm4m_nr,
|
||||
FALSE, // already dense
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_COL_PANELS_4M,
|
||||
BLIS_BUFFER_FOR_B_PANEL );
|
||||
|
||||
// Create control tree objects for packm operations (right side).
|
||||
trmm4m_r_packa_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT4,
|
||||
// IMPORTANT: for consistency with trsm, "k" dim
|
||||
// multiple is set to nr.
|
||||
gemm4m_mr,
|
||||
gemm4m_nr,
|
||||
FALSE, // already dense
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_ROW_PANELS_4M,
|
||||
BLIS_BUFFER_FOR_A_BLOCK );
|
||||
|
||||
trmm4m_r_packb_cntl
|
||||
=
|
||||
bli_packm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT4,
|
||||
// IMPORTANT: m dim multiple here must be nr
|
||||
// since "k" dim multiple is set to nr above.
|
||||
gemm4m_nr,
|
||||
gemm4m_nr,
|
||||
TRUE, // densify
|
||||
FALSE, // do NOT invert diagonal
|
||||
FALSE, // reverse iteration if upper?
|
||||
FALSE, // reverse iteration if lower?
|
||||
BLIS_PACKED_COL_PANELS_4M,
|
||||
BLIS_BUFFER_FOR_B_PANEL );
|
||||
|
||||
|
||||
// Create control tree object for lowest-level block-panel kernel.
|
||||
trmm4m_cntl_bp_ke
|
||||
=
|
||||
bli_trmm_cntl_obj_create( BLIS_UNB_OPT,
|
||||
BLIS_VARIANT2,
|
||||
NULL,
|
||||
gemm4m_ukrs,
|
||||
NULL, NULL, NULL, NULL,
|
||||
NULL, NULL, NULL );
|
||||
|
||||
// Create control tree object for outer panel (to block-panel)
|
||||
// problem (left side).
|
||||
trmm4m_l_cntl_op_bp
|
||||
=
|
||||
bli_trmm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
gemm4m_mc,
|
||||
NULL,
|
||||
NULL,
|
||||
trmm4m_l_packa_cntl,
|
||||
trmm4m_l_packb_cntl,
|
||||
NULL,
|
||||
trmm4m_cntl_bp_ke,
|
||||
gemm4m_cntl_bp_ke,
|
||||
NULL );
|
||||
|
||||
// Create control tree object for general problem via multiple
|
||||
// rank-k (outer panel) updates (left side).
|
||||
trmm4m_l_cntl_mm_op
|
||||
=
|
||||
bli_trmm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
gemm4m_kc,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
trmm4m_l_cntl_op_bp,
|
||||
NULL,
|
||||
NULL );
|
||||
|
||||
// Create control tree object for very large problem via multiple
|
||||
// general problems (left side).
|
||||
trmm4m_l_cntl_vl_mm
|
||||
=
|
||||
bli_trmm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
gemm4m_nc,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
trmm4m_l_cntl_mm_op,
|
||||
NULL,
|
||||
NULL );
|
||||
|
||||
// Create control tree object for outer panel (to block-panel)
|
||||
// problem (right side).
|
||||
trmm4m_r_cntl_op_bp
|
||||
=
|
||||
bli_trmm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT1,
|
||||
gemm4m_mc,
|
||||
NULL,
|
||||
NULL,
|
||||
trmm4m_r_packa_cntl,
|
||||
trmm4m_r_packb_cntl,
|
||||
NULL,
|
||||
trmm4m_cntl_bp_ke,
|
||||
gemm4m_cntl_bp_ke,
|
||||
NULL );
|
||||
|
||||
// Create control tree object for general problem via multiple
|
||||
// rank-k (outer panel) updates (right side).
|
||||
trmm4m_r_cntl_mm_op
|
||||
=
|
||||
bli_trmm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT3,
|
||||
gemm4m_kc,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
trmm4m_r_cntl_op_bp,
|
||||
NULL,
|
||||
NULL );
|
||||
|
||||
// Create control tree object for very large problem via multiple
|
||||
// general problems (right side).
|
||||
trmm4m_r_cntl_vl_mm
|
||||
=
|
||||
bli_trmm_cntl_obj_create( BLIS_BLOCKED,
|
||||
BLIS_VARIANT2,
|
||||
gemm4m_nc,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
trmm4m_r_cntl_mm_op,
|
||||
NULL,
|
||||
NULL );
|
||||
|
||||
// Alias the "master" trmm control trees to shorter names.
|
||||
trmm4m_l_cntl = trmm4m_l_cntl_vl_mm;
|
||||
trmm4m_r_cntl = trmm4m_r_cntl_vl_mm;
|
||||
}
|
||||
|
||||
void bli_trmm4m_cntl_finalize()
|
||||
{
|
||||
bli_cntl_obj_free( trmm4m_l_packa_cntl );
|
||||
bli_cntl_obj_free( trmm4m_l_packb_cntl );
|
||||
bli_cntl_obj_free( trmm4m_r_packa_cntl );
|
||||
bli_cntl_obj_free( trmm4m_r_packb_cntl );
|
||||
|
||||
bli_cntl_obj_free( trmm4m_cntl_bp_ke );
|
||||
|
||||
bli_cntl_obj_free( trmm4m_l_cntl_op_bp );
|
||||
bli_cntl_obj_free( trmm4m_l_cntl_mm_op );
|
||||
bli_cntl_obj_free( trmm4m_l_cntl_vl_mm );
|
||||
bli_cntl_obj_free( trmm4m_r_cntl_op_bp );
|
||||
bli_cntl_obj_free( trmm4m_r_cntl_mm_op );
|
||||
bli_cntl_obj_free( trmm4m_r_cntl_vl_mm );
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user