mirror of
https://github.com/amd/blis.git
synced 2026-05-11 17:50:00 +00:00
Added framework support and interface APIs for GEMMT
Details: - Added a new API that computes a matrix-matrix product of general matrices but updates only the upper or lower triangular part of the result matrix: cblas_?gemmt() and ?gemmt_(). - These routines are similar to the ?gemm routines, but they access and update only a triangular part of the square result matrix. - Added DGEMMT functionality by reusing the GEMM kernels. - Created a new folder for GEMMT under l3 and added GEMMT-specific framework code. - Modified the cntl_create routine to choose a different macro kernel for GEMMT. - Added routines to copy the lower/upper triangular part of a block to the buffer. - Defined the BLIS, BLAS and CBLAS interface APIs for GEMMT. - Added test_gemmt.c to the test folder and updated the Makefile. - Added a macro 'CBLAS' in test_gemm.c to call the CBLAS APIs. Change-Id: Ie00c1a15b9c654b65c687a9ca781cbc6f9641791
This commit is contained in:
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -97,4 +97,4 @@
|
||||
#include "bli_trmm.h"
|
||||
#include "bli_trmm3.h"
|
||||
#include "bli_trsm.h"
|
||||
|
||||
#include "bli_gemmt.h"
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -55,7 +55,8 @@ void bli_l3_cntl_create_if
|
||||
{
|
||||
if ( family == BLIS_GEMM ||
|
||||
family == BLIS_HERK ||
|
||||
family == BLIS_TRMM )
|
||||
family == BLIS_TRMM ||
|
||||
family == BLIS_GEMMT)
|
||||
{
|
||||
*cntl_use = bli_gemm_cntl_create( rntm, family, schema_a, schema_b );
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -124,6 +124,68 @@ void PASTEMAC(opname,EX_SUF) \
|
||||
|
||||
GENFRONT( gemm )
|
||||
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname ) \
|
||||
\
|
||||
void PASTEMAC(opname,EX_SUF) \
|
||||
( \
|
||||
obj_t* alpha, \
|
||||
obj_t* a, \
|
||||
obj_t* b, \
|
||||
obj_t* beta, \
|
||||
obj_t* c \
|
||||
BLIS_OAPI_EX_PARAMS \
|
||||
) \
|
||||
{ \
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2) \
|
||||
bli_init_once(); \
|
||||
\
|
||||
BLIS_OAPI_EX_DECLS \
|
||||
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_2, alpha, a, b, beta, c); \
|
||||
\
|
||||
/* If C has a zero dimension, return early. */ \
|
||||
if ( bli_obj_has_zero_dim( c ) ) {\
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
|
||||
return; \
|
||||
}\
|
||||
\
|
||||
/* if alpha or A or B has a zero dimension, \
|
||||
scale C by beta and return early. */ \
|
||||
if ( bli_obj_equals( alpha, &BLIS_ZERO ) || \
|
||||
bli_obj_has_zero_dim( a ) || \
|
||||
bli_obj_has_zero_dim( b ) ) \
|
||||
{\
|
||||
bli_scalm( beta, c ); \
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2) \
|
||||
return;\
|
||||
}\
|
||||
\
|
||||
/* Only proceed with an induced method if each of the operands have a
|
||||
complex storage datatype. NOTE: Allowing precisions to vary while
|
||||
using 1m, which is what we do here, is unique to gemm; other level-3
|
||||
operations use 1m only if all storage datatypes are equal (and they
|
||||
ignore the computation precision). If any operands are real, skip the
|
||||
induced method chooser function and proceed directly with native
|
||||
execution. */ \
|
||||
if ( bli_obj_is_complex( c ) && \
|
||||
bli_obj_is_complex( a ) && \
|
||||
bli_obj_is_complex( b ) ) \
|
||||
{ \
|
||||
/* Invoke the operation's "ind" function--its induced method front-end.
|
||||
For complex problems, it calls the highest priority induced method
|
||||
that is available (ie: implemented and enabled), and if none are
|
||||
enabled, it calls native execution. (For real problems, it calls
|
||||
the operation's native execution interface.) */ \
|
||||
PASTEMAC(opname,ind)( alpha, a, b, beta, c, cntx, rntm ); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx, rntm ); \
|
||||
} \
|
||||
\
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \
|
||||
}
|
||||
GENFRONT( gemmt )
|
||||
|
||||
#undef GENFRONT
|
||||
#define GENFRONT( opname ) \
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -51,6 +52,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
|
||||
);
|
||||
|
||||
GENPROT( gemm )
|
||||
GENPROT( gemmt )
|
||||
GENPROT( her2k )
|
||||
GENPROT( syr2k )
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -57,6 +58,7 @@ typedef void (*PASTECH(opname,_oft)) \
|
||||
);
|
||||
|
||||
GENTDEF( gemm )
|
||||
GENTDEF( gemmt )
|
||||
GENTDEF( her2k )
|
||||
GENTDEF( syr2k )
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -100,7 +100,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( gemm )
|
||||
|
||||
INSERT_GENTFUNC_BASIC0( gemmt )
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, struca ) \
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -56,7 +57,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_BASIC0( gemm )
|
||||
|
||||
INSERT_GENTPROT_BASIC0( gemmt )
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname ) \
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -65,6 +65,7 @@ cntl_t* bli_gemmbp_cntl_create
|
||||
if ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2;
|
||||
else if ( family == BLIS_HERK ) macro_kernel_fp = bli_herk_x_ker_var2;
|
||||
else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2;
|
||||
else if ( family == BLIS_GEMMT ) macro_kernel_fp = bli_gemmt_ker_var2;
|
||||
else /* should never execute */ macro_kernel_fp = NULL;
|
||||
|
||||
packa_fp = bli_packm_blk_var1;
|
||||
|
||||
37
frame/3/gemmt/bli_gemmt.h
Normal file
37
frame/3/gemmt/bli_gemmt.h
Normal file
@@ -0,0 +1,37 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "bli_gemmt_front.h"
|
||||
|
||||
#include "bli_gemmt_var.h"
|
||||
365
frame/3/gemmt/bli_gemmt_front.c
Normal file
365
frame/3/gemmt/bli_gemmt_front.c
Normal file
@@ -0,0 +1,365 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
void bli_gemmt_front
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
|
||||
AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_3, alpha, a, b, beta, c);
|
||||
bli_init_once();
|
||||
|
||||
obj_t a_local;
|
||||
obj_t b_local;
|
||||
obj_t c_local;
|
||||
|
||||
|
||||
// Check parameters.
|
||||
if ( bli_error_checking_is_enabled() )
|
||||
bli_gemm_check( alpha, a, b, beta, c, cntx );
|
||||
|
||||
// Alias A, B, and C in case we need to apply transformations.
|
||||
bli_obj_alias_to( a, &a_local );
|
||||
bli_obj_alias_to( b, &b_local );
|
||||
bli_obj_alias_to( c, &c_local );
|
||||
|
||||
#ifdef BLIS_ENABLE_GEMM_MD
|
||||
cntx_t cntx_local;
|
||||
|
||||
// If any of the storage datatypes differ, or if the computation precision
|
||||
// differs from the storage precision of C, utilize the mixed datatype
|
||||
// code path.
|
||||
// NOTE: If we ever want to support the caller setting the computation
|
||||
// domain explicitly, we will need to check the computation dt against the
|
||||
// storage dt of C (instead of the computation precision against the
|
||||
// storage precision of C).
|
||||
if ( bli_obj_dt( &c_local ) != bli_obj_dt( &a_local ) ||
|
||||
bli_obj_dt( &c_local ) != bli_obj_dt( &b_local ) ||
|
||||
bli_obj_comp_prec( &c_local ) != bli_obj_prec( &c_local ) )
|
||||
{
|
||||
// Handle mixed datatype cases in bli_gemm_md(), which may modify
|
||||
// the objects or the context. (If the context is modified, cntx
|
||||
// is adjusted to point to cntx_local.)
|
||||
bli_gemm_md( &a_local, &b_local, beta, &c_local, &cntx_local, &cntx );
|
||||
}
|
||||
//else // homogeneous datatypes
|
||||
#endif
|
||||
|
||||
// Load the pack schemas from the context and embed them into the objects
|
||||
// for A and B. (Native contexts are initialized with the correct pack
|
||||
// schemas, as are contexts for 1m, and if necessary bli_gemm_md() would
|
||||
// have made a copy and modified the schemas, so reading them from the
|
||||
// context should be a safe bet at this point.) This is a sort of hack for
|
||||
// communicating the desired pack schemas to bli_gemm_cntl_create() (via
|
||||
// bli_l3_thread_decorator() and bli_l3_cntl_create_if()). This allows us
|
||||
// to subsequently access the schemas from the control tree, which
|
||||
// hopefully reduces some confusion, particularly in bli_packm_init().
|
||||
const pack_t schema_a = bli_cntx_schema_a_block( cntx );
|
||||
const pack_t schema_b = bli_cntx_schema_b_panel( cntx );
|
||||
|
||||
bli_obj_set_pack_schema( schema_a, &a_local );
|
||||
bli_obj_set_pack_schema( schema_b, &b_local );
|
||||
|
||||
// Next, we handle the possibility of needing to typecast alpha to the
|
||||
// computation datatype and/or beta to the storage datatype of C.
|
||||
|
||||
// Attach alpha to B, and in the process typecast alpha to the target
|
||||
// datatype of the matrix (which in this case is equal to the computation
|
||||
// datatype).
|
||||
bli_obj_scalar_attach( BLIS_NO_CONJUGATE, alpha, &b_local );
|
||||
|
||||
// Attach beta to C, and in the process typecast beta to the target
|
||||
// datatype of the matrix (which in this case is equal to the storage
|
||||
// datatype of C).
|
||||
bli_obj_scalar_attach( BLIS_NO_CONJUGATE, beta, &c_local );
|
||||
|
||||
// Change the alpha and beta pointers to BLIS_ONE since the values have
|
||||
// now been typecast and attached to the matrices above.
|
||||
alpha = &BLIS_ONE;
|
||||
beta = &BLIS_ONE;
|
||||
|
||||
#ifdef BLIS_ENABLE_GEMM_MD
|
||||
// Don't perform the following optimization for ccr or crc cases, as
|
||||
// those cases are sensitive to the ukernel storage preference (ie:
|
||||
// transposing the operation would break them).
|
||||
if ( !bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) &&
|
||||
!bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) )
|
||||
#endif
|
||||
// An optimization: If C is stored by rows and the micro-kernel prefers
|
||||
// contiguous columns, or if C is stored by columns and the micro-kernel
|
||||
// prefers contiguous rows, transpose the entire operation to allow the
|
||||
// micro-kernel to access elements of C in its preferred manner.
|
||||
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
|
||||
{
|
||||
bli_obj_swap( &a_local, &b_local );
|
||||
|
||||
bli_obj_induce_trans( &a_local );
|
||||
bli_obj_induce_trans( &b_local );
|
||||
bli_obj_induce_trans( &c_local );
|
||||
|
||||
// We must also swap the pack schemas, which were set by bli_gemm_md()
|
||||
// or the inlined code above.
|
||||
bli_obj_swap_pack_schemas( &a_local, &b_local );
|
||||
}
|
||||
|
||||
// Parse and interpret the contents of the rntm_t object to properly
|
||||
// set the ways of parallelism for each loop, and then make any
|
||||
// additional modifications necessary for the current operation.
|
||||
bli_rntm_set_ways_for_op
|
||||
(
|
||||
BLIS_GEMMT,
|
||||
BLIS_LEFT, // ignored for gemm/hemm/symm
|
||||
bli_obj_length( &c_local ),
|
||||
bli_obj_width( &c_local ),
|
||||
bli_obj_width( &a_local ),
|
||||
rntm
|
||||
);
|
||||
|
||||
obj_t* cp = &c_local;
|
||||
obj_t* betap = beta;
|
||||
|
||||
#ifdef BLIS_ENABLE_GEMM_MD
|
||||
#ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
|
||||
// If any of the following conditions are met, create a temporary matrix
|
||||
// conformal to C into which we will accumulate the matrix product:
|
||||
// - the storage precision of C differs from the computation precision;
|
||||
// - the domains are mixed as crr;
|
||||
// - the storage format of C does not match the preferred orientation
|
||||
// of the ccr or crc cases.
|
||||
// Then, after the computation is complete, this matrix will be copied
|
||||
// or accumulated back to C.
|
||||
const bool_t is_ccr_mismatch =
|
||||
( bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) &&
|
||||
!bli_obj_is_col_stored( &c_local ) );
|
||||
const bool_t is_crc_mismatch =
|
||||
( bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) &&
|
||||
!bli_obj_is_row_stored( &c_local ) );
|
||||
|
||||
obj_t ct;
|
||||
bool_t use_ct = FALSE;
|
||||
|
||||
// FGVZ: Consider adding another guard here that only creates and uses a
|
||||
// temporary matrix for accumulation if k < c * kc, where c is some small
|
||||
// constant like 2. And don't forget to use the same conditional for the
|
||||
// castm() and free() at the end.
|
||||
if (
|
||||
bli_obj_prec( &c_local ) != bli_obj_comp_prec( &c_local ) ||
|
||||
bli_gemm_md_is_crr( &a_local, &b_local, &c_local ) ||
|
||||
is_ccr_mismatch ||
|
||||
is_crc_mismatch
|
||||
)
|
||||
{
|
||||
use_ct = TRUE;
|
||||
}
|
||||
|
||||
// If we need a temporary matrix conformal to C for whatever reason,
|
||||
// we create it and prepare to use it now.
|
||||
if ( use_ct )
|
||||
{
|
||||
const dim_t m = bli_obj_length( &c_local );
|
||||
const dim_t n = bli_obj_width( &c_local );
|
||||
inc_t rs = bli_obj_row_stride( &c_local );
|
||||
inc_t cs = bli_obj_col_stride( &c_local );
|
||||
|
||||
num_t dt_ct = bli_obj_domain( &c_local ) |
|
||||
bli_obj_comp_prec( &c_local );
|
||||
|
||||
// When performing the crr case, accumulate to a contiguously-stored
|
||||
// real matrix so we do not have to repeatedly update C with general
|
||||
// stride.
|
||||
if ( bli_gemm_md_is_crr( &a_local, &b_local, &c_local ) )
|
||||
dt_ct = BLIS_REAL | bli_obj_comp_prec( &c_local );
|
||||
|
||||
// When performing the mismatched ccr or crc cases, now is the time
|
||||
// to specify the appropriate storage so the gemm_md_c2r_ref() virtual
|
||||
// microkernel can output directly to C (instead of using a temporary
|
||||
// microtile).
|
||||
if ( is_ccr_mismatch ) { rs = 1; cs = m; }
|
||||
else if ( is_crc_mismatch ) { rs = n; cs = 1; }
|
||||
|
||||
bli_obj_create( dt_ct, m, n, rs, cs, &ct );
|
||||
|
||||
const num_t dt_exec = bli_obj_exec_dt( &c_local );
|
||||
const num_t dt_comp = bli_obj_comp_dt( &c_local );
|
||||
|
||||
bli_obj_set_target_dt( dt_ct, &ct );
|
||||
bli_obj_set_exec_dt( dt_exec, &ct );
|
||||
bli_obj_set_comp_dt( dt_comp, &ct );
|
||||
|
||||
// A naive approach would cast C to the comptuation datatype,
|
||||
// compute with beta, and then cast the result back to the
|
||||
// user-provided output matrix. However, we employ a different
|
||||
// approach that halves the number of memops on C (or its
|
||||
// typecast temporary) by writing the A*B product directly to
|
||||
// temporary storage, and then using xpbym to scale the
|
||||
// output matrix by beta and accumulate/cast the A*B product.
|
||||
//bli_castm( &c_local, &ct );
|
||||
betap = &BLIS_ZERO;
|
||||
|
||||
cp = &ct;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Invoke the internal back-end via the thread handler.
|
||||
bli_l3_thread_decorator
|
||||
(
|
||||
bli_gemm_int,
|
||||
BLIS_GEMMT, // operation family id
|
||||
alpha,
|
||||
&a_local,
|
||||
&b_local,
|
||||
betap,
|
||||
cp,
|
||||
cntx,
|
||||
rntm,
|
||||
cntl
|
||||
);
|
||||
|
||||
#ifdef BLIS_ENABLE_GEMM_MD
|
||||
#ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
|
||||
// If we created a temporary matrix conformal to C for whatever reason,
|
||||
// we copy/accumulate the result back to C and then release the object.
|
||||
if ( use_ct )
|
||||
{
|
||||
obj_t beta_local;
|
||||
|
||||
bli_obj_scalar_detach( &c_local, &beta_local );
|
||||
|
||||
//bli_castnzm( &ct, &c_local );
|
||||
bli_xpbym( &ct, &beta_local, &c_local );
|
||||
|
||||
bli_obj_free( &ct );
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
#if 0
|
||||
if ( bli_obj_dt( a ) != bli_obj_dt( b ) ||
|
||||
bli_obj_dt( a ) != bli_obj_dt( c ) ||
|
||||
bli_obj_comp_prec( c ) != bli_obj_prec( c ) )
|
||||
{
|
||||
const bool_t a_is_real = bli_obj_is_real( a );
|
||||
const bool_t a_is_comp = bli_obj_is_complex( a );
|
||||
const bool_t b_is_real = bli_obj_is_real( b );
|
||||
const bool_t b_is_comp = bli_obj_is_complex( b );
|
||||
const bool_t c_is_real = bli_obj_is_real( c );
|
||||
const bool_t c_is_comp = bli_obj_is_complex( c );
|
||||
|
||||
const bool_t a_is_single = bli_obj_is_single_prec( a );
|
||||
const bool_t a_is_double = bli_obj_is_double_prec( a );
|
||||
const bool_t b_is_single = bli_obj_is_single_prec( b );
|
||||
const bool_t b_is_double = bli_obj_is_double_prec( b );
|
||||
const bool_t c_is_single = bli_obj_is_single_prec( c );
|
||||
const bool_t c_is_double = bli_obj_is_double_prec( c );
|
||||
|
||||
const bool_t comp_single = bli_obj_comp_prec( c ) == BLIS_SINGLE_PREC;
|
||||
const bool_t comp_double = bli_obj_comp_prec( c ) == BLIS_DOUBLE_PREC;
|
||||
|
||||
const bool_t mixeddomain = bli_obj_domain( c ) != bli_obj_domain( a ) ||
|
||||
bli_obj_domain( c ) != bli_obj_domain( b );
|
||||
|
||||
( void )a_is_real; ( void )a_is_comp;
|
||||
( void )b_is_real; ( void )b_is_comp;
|
||||
( void )c_is_real; ( void )c_is_comp;
|
||||
( void )a_is_single; ( void )a_is_double;
|
||||
( void )b_is_single; ( void )b_is_double;
|
||||
( void )c_is_single; ( void )c_is_double;
|
||||
( void )comp_single; ( void )comp_double;
|
||||
|
||||
if (
|
||||
//( c_is_comp && a_is_comp && b_is_real ) ||
|
||||
//( c_is_comp && a_is_real && b_is_comp ) ||
|
||||
//( c_is_real && a_is_comp && b_is_comp ) ||
|
||||
//( c_is_comp && a_is_real && b_is_real ) ||
|
||||
//( c_is_real && a_is_comp && b_is_real ) ||
|
||||
//( c_is_real && a_is_real && b_is_comp ) ||
|
||||
//FALSE
|
||||
TRUE
|
||||
)
|
||||
{
|
||||
if (
|
||||
( c_is_single && a_is_single && b_is_single && mixeddomain ) ||
|
||||
( c_is_single && a_is_single && b_is_single && comp_single ) ||
|
||||
( c_is_single && a_is_single && b_is_single && comp_double ) ||
|
||||
( c_is_single && a_is_single && b_is_double ) ||
|
||||
( c_is_single && a_is_double && b_is_single ) ||
|
||||
( c_is_double && a_is_single && b_is_single ) ||
|
||||
( c_is_single && a_is_double && b_is_double ) ||
|
||||
( c_is_double && a_is_single && b_is_double ) ||
|
||||
( c_is_double && a_is_double && b_is_single ) ||
|
||||
( c_is_double && a_is_double && b_is_double && comp_single ) ||
|
||||
( c_is_double && a_is_double && b_is_double && comp_double ) ||
|
||||
( c_is_double && a_is_double && b_is_double && mixeddomain ) ||
|
||||
FALSE
|
||||
)
|
||||
bli_gemm_md_front( alpha, a, b, beta, c, cntx, cntl );
|
||||
else
|
||||
bli_gemm_md_zgemm( alpha, a, b, beta, c, cntx, cntl );
|
||||
}
|
||||
else
|
||||
bli_gemm_md_zgemm( alpha, a, b, beta, c, cntx, cntl );
|
||||
return;
|
||||
}
|
||||
#else
|
||||
#if 0
|
||||
// If any of the storage datatypes differ, or if the execution precision
|
||||
// differs from the storage precision of C, utilize the mixed datatype
|
||||
// code path.
|
||||
// NOTE: We could check the exec dt against the storage dt of C, but for
|
||||
// now we don't support the caller setting the execution domain
|
||||
// explicitly.
|
||||
if ( bli_obj_dt( a ) != bli_obj_dt( b ) ||
|
||||
bli_obj_dt( a ) != bli_obj_dt( c ) ||
|
||||
bli_obj_comp_prec( c ) != bli_obj_prec( c ) )
|
||||
{
|
||||
bli_gemm_md_front( alpha, a, b, beta, c, cntx, cntl );
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
47
frame/3/gemmt/bli_gemmt_front.h
Normal file
47
frame/3/gemmt/bli_gemmt_front.h
Normal file
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
void bli_gemmt_front
|
||||
(
|
||||
obj_t* alpha,
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* beta,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl
|
||||
);
|
||||
|
||||
|
||||
821
frame/3/gemmt/bli_gemmt_ker_var2.c
Normal file
821
frame/3/gemmt/bli_gemmt_ker_var2.c
Normal file
@@ -0,0 +1,821 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#define FUNCPTR_T gemmt_fp
|
||||
|
||||
typedef void (*FUNCPTR_T)
|
||||
(
|
||||
pack_t schema_a,
|
||||
pack_t schema_b,
|
||||
dim_t m_off,
|
||||
dim_t n_off,
|
||||
dim_t m,
|
||||
dim_t n,
|
||||
dim_t k,
|
||||
void* alpha,
|
||||
void* a, inc_t cs_a, inc_t is_a,
|
||||
dim_t pd_a, inc_t ps_a,
|
||||
void* b, inc_t rs_b, inc_t is_b,
|
||||
dim_t pd_b, inc_t ps_b,
|
||||
void* beta,
|
||||
void* c, inc_t rs_c, inc_t cs_c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
thrinfo_t* thread
|
||||
);
|
||||
|
||||
static FUNCPTR_T GENARRAY_T(ftypes);
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname) \
|
||||
\
|
||||
void PASTEMAC(ch, varname) \
|
||||
( \
|
||||
dim_t m_off, \
|
||||
dim_t n_off, \
|
||||
dim_t m_cur, \
|
||||
dim_t n_cur, \
|
||||
ctype* ct, inc_t rs_ct, inc_t cs_ct, \
|
||||
ctype* beta_cast, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
dim_t start, end; \
|
||||
dim_t m, n, diag; \
|
||||
\
|
||||
double beta_val = *beta_cast; \
|
||||
\
|
||||
start = ((n_off < m_off) && (m_off < n_off + n_cur)) ? m_off: n_off; \
|
||||
end = ((n_off < m_off + m_cur) && (m_off + m_cur < n_off + n_cur))? (m_off + m_cur):(n_off + n_cur); \
|
||||
\
|
||||
if( beta_val != 0.0 ) \
|
||||
{ \
|
||||
for(diag = start, m= start-m_off; diag < end; diag++, m++) \
|
||||
for(n = 0; n <= diag-n_off; n++) \
|
||||
c[m*rs_c + n] = c[m * rs_c + n] * beta_val + ct[m*rs_ct + n]; \
|
||||
\
|
||||
for(; m < m_cur; m++) \
|
||||
for(n = 0; n < n_cur; n++) \
|
||||
c[m*rs_c + n] = c[m * rs_c + n] * beta_val + ct[m*rs_ct + n]; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for(diag = start, m= start-m_off; diag < end; diag++, m++) \
|
||||
for(n = 0; n <= diag-n_off; n++) \
|
||||
c[m*rs_c + n] = ct[m*rs_ct + n]; \
|
||||
\
|
||||
for(; m < m_cur; m++) \
|
||||
for(n = 0; n < n_cur; n++) \
|
||||
c[m*rs_c + n] = ct[m*rs_ct + n]; \
|
||||
} \
|
||||
\
|
||||
return; \
|
||||
}
|
||||
INSERT_GENTFUNC_BASIC0_SD( update_lower_triang )
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
\
|
||||
void PASTEMAC(ch, varname) \
|
||||
( \
|
||||
dim_t m_off, \
|
||||
dim_t n_off, \
|
||||
dim_t m_cur, \
|
||||
dim_t n_cur, \
|
||||
ctype* ct, inc_t rs_ct, inc_t cs_ct, \
|
||||
ctype* beta_cast, \
|
||||
ctype* c, inc_t rs_c, inc_t cs_c \
|
||||
) \
|
||||
{ \
|
||||
dim_t start, end; \
|
||||
dim_t m, n, diag; \
|
||||
\
|
||||
ctype beta_val = *beta_cast; \
|
||||
\
|
||||
start = ((n_off < m_off) && (m_off < n_off + n_cur)) ? m_off: n_off; \
|
||||
end = ((n_off < m_off + m_cur) && (m_off + m_cur < n_off + n_cur))? (m_off + m_cur):(n_off + n_cur); \
|
||||
\
|
||||
if( beta_val != 0.0 ) \
|
||||
{ \
|
||||
for(m = 0; m < start-m_off; m++) \
|
||||
for(n = 0; n < n_cur; n++) \
|
||||
c[m*rs_c + n] = c[m * rs_c + n] * beta_val + ct[m*rs_ct + n]; \
|
||||
\
|
||||
for(diag = start, m= start-m_off; diag < end; diag++, m++) \
|
||||
for(n = diag-n_off; n < n_cur; n++) \
|
||||
c[m*rs_c + n] = c[m * rs_c + n] * beta_val + ct[m*rs_ct + n]; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
for(m = 0; m < start-m_off; m++) \
|
||||
for(n = 0; n < n_cur; n++) \
|
||||
c[m*rs_c + n] = ct[m*rs_ct + n]; \
|
||||
\
|
||||
for(diag = start, m= start-m_off; diag < end; diag++, m++) \
|
||||
for(n = diag-n_off; n < n_cur; n++) \
|
||||
c[m*rs_c + n] = ct[m*rs_ct + n]; \
|
||||
} \
|
||||
\
|
||||
return; \
|
||||
}
|
||||
INSERT_GENTFUNC_BASIC0_SD( update_upper_triang )
|
||||
|
||||
void bli_gemmt_ker_var2
|
||||
(
|
||||
obj_t* a,
|
||||
obj_t* b,
|
||||
obj_t* c,
|
||||
cntx_t* cntx,
|
||||
rntm_t* rntm,
|
||||
cntl_t* cntl,
|
||||
thrinfo_t* thread
|
||||
)
|
||||
{
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_6);
|
||||
|
||||
#ifdef BLIS_ENABLE_GEMM_MD
|
||||
// By now, A and B have been packed and cast to the execution precision.
|
||||
// In most cases, such as when storage precision of C differs from the
|
||||
// execution precision, we utilize the mixed datatype code path. However,
|
||||
// a few cases still fall within this kernel, such as mixed domain with
|
||||
// equal precision (ccr, crc, rcc), hence those expressions being disabled
|
||||
// in the conditional below.
|
||||
if ( //( bli_obj_domain( c ) != bli_obj_domain( a ) ) ||
|
||||
//( bli_obj_domain( c ) != bli_obj_domain( b ) ) ||
|
||||
( bli_obj_dt( c ) != bli_obj_exec_dt( c ) ) )
|
||||
{
|
||||
bli_gemm_ker_var2_md( a, b, c, cntx, rntm, cntl, thread );
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
num_t dt_exec = bli_obj_exec_dt( c );
|
||||
|
||||
pack_t schema_a = bli_obj_pack_schema( a );
|
||||
pack_t schema_b = bli_obj_pack_schema( b );
|
||||
|
||||
dim_t m_off = bli_obj_row_off( c );
|
||||
dim_t n_off = bli_obj_col_off( c );
|
||||
dim_t m = bli_obj_length( c );
|
||||
dim_t n = bli_obj_width( c );
|
||||
dim_t k = bli_obj_width( a );
|
||||
|
||||
void* buf_a = bli_obj_buffer_at_off( a );
|
||||
inc_t cs_a = bli_obj_col_stride( a );
|
||||
inc_t is_a = bli_obj_imag_stride( a );
|
||||
dim_t pd_a = bli_obj_panel_dim( a );
|
||||
inc_t ps_a = bli_obj_panel_stride( a );
|
||||
|
||||
void* buf_b = bli_obj_buffer_at_off( b );
|
||||
inc_t rs_b = bli_obj_row_stride( b );
|
||||
inc_t is_b = bli_obj_imag_stride( b );
|
||||
dim_t pd_b = bli_obj_panel_dim( b );
|
||||
inc_t ps_b = bli_obj_panel_stride( b );
|
||||
|
||||
void* buf_c = bli_obj_buffer_at_off( c );
|
||||
inc_t rs_c = bli_obj_row_stride( c );
|
||||
inc_t cs_c = bli_obj_col_stride( c );
|
||||
|
||||
obj_t scalar_a;
|
||||
obj_t scalar_b;
|
||||
|
||||
void* buf_alpha;
|
||||
void* buf_beta;
|
||||
|
||||
FUNCPTR_T f;
|
||||
|
||||
bool_t uploc;
|
||||
if ( bli_obj_is_lower( c ) )
|
||||
{
|
||||
uploc = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
uploc = 1;
|
||||
}
|
||||
|
||||
// Detach and multiply the scalars attached to A and B.
|
||||
bli_obj_scalar_detach( a, &scalar_a );
|
||||
bli_obj_scalar_detach( b, &scalar_b );
|
||||
bli_mulsc( &scalar_a, &scalar_b );
|
||||
|
||||
// Grab the addresses of the internal scalar buffers for the scalar
|
||||
// merged above and the scalar attached to C.
|
||||
buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
|
||||
buf_beta = bli_obj_internal_scalar_buffer( c );
|
||||
|
||||
// If 1m is being employed on a column- or row-stored matrix with a
|
||||
// real-valued beta, we can use the real domain macro-kernel, which
|
||||
// eliminates a little overhead associated with the 1m virtual
|
||||
// micro-kernel.
|
||||
#if 1
|
||||
if ( bli_cntx_method( cntx ) == BLIS_1M )
|
||||
{
|
||||
bli_gemm_ind_recast_1m_params
|
||||
(
|
||||
&dt_exec,
|
||||
schema_a,
|
||||
c,
|
||||
&m, &n, &k,
|
||||
&pd_a, &ps_a,
|
||||
&pd_b, &ps_b,
|
||||
&rs_c, &cs_c
|
||||
);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_GEMM_MD
|
||||
// Tweak parameters in select mixed domain cases (rcc, crc, ccr).
|
||||
bli_gemm_md_ker_var2_recast
|
||||
(
|
||||
&dt_exec,
|
||||
bli_obj_dt( a ),
|
||||
bli_obj_dt( b ),
|
||||
bli_obj_dt( c ),
|
||||
&m, &n, &k,
|
||||
&pd_a, &ps_a,
|
||||
&pd_b, &ps_b,
|
||||
c,
|
||||
&rs_c, &cs_c
|
||||
);
|
||||
#endif
|
||||
|
||||
// Index into the type combination array to extract the correct
|
||||
// function pointer.
|
||||
f = ftypes[dt_exec][uploc];
|
||||
|
||||
// Invoke the function.
|
||||
f( schema_a,
|
||||
schema_b,
|
||||
m_off,
|
||||
n_off,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
buf_alpha,
|
||||
buf_a, cs_a, is_a,
|
||||
pd_a, ps_a,
|
||||
buf_b, rs_b, is_b,
|
||||
pd_b, ps_b,
|
||||
buf_beta,
|
||||
buf_c, rs_c, cs_c,
|
||||
cntx,
|
||||
rntm,
|
||||
thread );
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_6);
|
||||
}
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, uplo, varname ) \
|
||||
\
|
||||
void PASTEMACT(ch,opname,uplo,varname) \
|
||||
( \
|
||||
pack_t schema_a, \
|
||||
pack_t schema_b, \
|
||||
dim_t m_off, \
|
||||
dim_t n_off, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a, inc_t cs_a, inc_t is_a, \
|
||||
dim_t pd_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, inc_t is_b, \
|
||||
dim_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_6); \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
if(bli_gemmt_is_strictly_above_diag(m_off, n_off, m, n)) return; \
|
||||
/* Alias some constants to simpler names. */ \
|
||||
const dim_t MR = pd_a; \
|
||||
const dim_t NR = pd_b; \
|
||||
/*const dim_t PACKMR = cs_a;*/ \
|
||||
/*const dim_t PACKNR = rs_b;*/ \
|
||||
\
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict alpha_cast = alpha; \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
\
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t i, j; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == PACKMR
|
||||
pd_a == MR
|
||||
ps_a == stride to next micro-panel of A
|
||||
rs_b == PACKNR
|
||||
cs_b == 1
|
||||
pd_b == NR
|
||||
ps_b == stride to next micro-panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_schema_a( schema_a, &aux ); \
|
||||
bli_auxinfo_set_schema_b( schema_b, &aux ); \
|
||||
\
|
||||
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_a( is_a, &aux ); \
|
||||
bli_auxinfo_set_is_b( is_b, &aux ); \
|
||||
\
|
||||
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
|
||||
loop around the microkernel. Here we query the thrinfo_t node for the
|
||||
1st (ir) loop around the microkernel. */ \
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
|
||||
\
|
||||
/* Query the number of threads and thread ids for each loop. */ \
|
||||
dim_t jr_nt = bli_thread_n_way( thread ); \
|
||||
dim_t jr_tid = bli_thread_work_id( thread ); \
|
||||
dim_t ir_nt = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_tid = bli_thread_work_id( caucus ); \
|
||||
\
|
||||
dim_t jr_start, jr_end; \
|
||||
dim_t ir_start, ir_end; \
|
||||
dim_t jr_inc, ir_inc; \
|
||||
\
|
||||
dim_t m_off_cblock, n_off_cblock; \
|
||||
\
|
||||
/* Determine the thread range and increment for the 2nd and 1st loops.
|
||||
NOTE: The definition of bli_thread_range_jrir() will depend on whether
|
||||
slab or round-robin partitioning was requested at configure-time. */ \
|
||||
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = ir_start; i < ir_end; i += ir_inc ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
a1 = a_cast + i * rstep_a; \
|
||||
c11 = c1 + i * rstep_c; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
|
||||
if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
|
||||
if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
m_off_cblock = m_off + i * MR; \
|
||||
n_off_cblock = n_off + j * NR; \
|
||||
\
|
||||
if(!bli_gemmt_is_strictly_above_diag(m_off_cblock, n_off_cblock, m_cur, n_cur)) \
|
||||
{ \
|
||||
if(bli_gemmt_is_strictly_below_diag(m_off_cblock, n_off_cblock, m_cur, n_cur)) \
|
||||
{ \
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Scale the bottom edge of C and add the result from above. */ \
|
||||
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Scale the bottom edge of C and add the result from above. */ \
|
||||
PASTEMAC(ch,update_lower_triang)( m_off_cblock, n_off_cblock, \
|
||||
m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
|
||||
*/ \
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_6); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_L_SD( gemmt, ker_var2 )
|
||||
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, opname, uplo, varname ) \
|
||||
\
|
||||
void PASTEMACT(ch,opname,uplo,varname) \
|
||||
( \
|
||||
pack_t schema_a, \
|
||||
pack_t schema_b, \
|
||||
dim_t m_off, \
|
||||
dim_t n_off, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a, inc_t cs_a, inc_t is_a, \
|
||||
dim_t pd_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, inc_t is_b, \
|
||||
dim_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* thread \
|
||||
) \
|
||||
{ \
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_6); \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
if(bli_gemmt_is_strictly_below_diag(m_off, n_off, m, n)) return; \
|
||||
/* Alias some constants to simpler names. */ \
|
||||
const dim_t MR = pd_a; \
|
||||
const dim_t NR = pd_b; \
|
||||
/*const dim_t PACKMR = cs_a;*/ \
|
||||
/*const dim_t PACKNR = rs_b;*/ \
|
||||
\
|
||||
/* Query the context for the micro-kernel address and cast it to its
|
||||
function pointer type. */ \
|
||||
PASTECH(ch,gemm_ukr_ft) \
|
||||
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
\
|
||||
/* Temporary C buffer for edge cases. Note that the strides of this
|
||||
temporary buffer are set so that they match the storage of the
|
||||
original C matrix. For example, if C is column-stored, ct will be
|
||||
column-stored as well. */ \
|
||||
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
|
||||
/ sizeof( ctype ) ] \
|
||||
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
|
||||
const bool_t col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
|
||||
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
|
||||
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
|
||||
\
|
||||
ctype* restrict zero = PASTEMAC(ch,0); \
|
||||
ctype* restrict a_cast = a; \
|
||||
ctype* restrict b_cast = b; \
|
||||
ctype* restrict c_cast = c; \
|
||||
ctype* restrict alpha_cast = alpha; \
|
||||
ctype* restrict beta_cast = beta; \
|
||||
ctype* restrict b1; \
|
||||
ctype* restrict c1; \
|
||||
\
|
||||
dim_t m_iter, m_left; \
|
||||
dim_t n_iter, n_left; \
|
||||
dim_t i, j; \
|
||||
dim_t m_cur; \
|
||||
dim_t n_cur; \
|
||||
inc_t rstep_a; \
|
||||
inc_t cstep_b; \
|
||||
inc_t rstep_c, cstep_c; \
|
||||
auxinfo_t aux; \
|
||||
\
|
||||
/*
|
||||
Assumptions/assertions:
|
||||
rs_a == 1
|
||||
cs_a == PACKMR
|
||||
pd_a == MR
|
||||
ps_a == stride to next micro-panel of A
|
||||
rs_b == PACKNR
|
||||
cs_b == 1
|
||||
pd_b == NR
|
||||
ps_b == stride to next micro-panel of B
|
||||
rs_c == (no assumptions)
|
||||
cs_c == (no assumptions)
|
||||
*/ \
|
||||
\
|
||||
/* If any dimension is zero, return immediately. */ \
|
||||
if ( bli_zero_dim3( m, n, k ) ) return; \
|
||||
\
|
||||
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
|
||||
PASTEMAC(ch,set0s_mxn)( MR, NR, \
|
||||
ct, rs_ct, cs_ct ); \
|
||||
\
|
||||
/* Compute number of primary and leftover components of the m and n
|
||||
dimensions. */ \
|
||||
n_iter = n / NR; \
|
||||
n_left = n % NR; \
|
||||
\
|
||||
m_iter = m / MR; \
|
||||
m_left = m % MR; \
|
||||
\
|
||||
if ( n_left ) ++n_iter; \
|
||||
if ( m_left ) ++m_iter; \
|
||||
\
|
||||
/* Determine some increments used to step through A, B, and C. */ \
|
||||
rstep_a = ps_a; \
|
||||
\
|
||||
cstep_b = ps_b; \
|
||||
\
|
||||
rstep_c = rs_c * MR; \
|
||||
cstep_c = cs_c * NR; \
|
||||
\
|
||||
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_schema_a( schema_a, &aux ); \
|
||||
bli_auxinfo_set_schema_b( schema_b, &aux ); \
|
||||
\
|
||||
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
|
||||
bli_auxinfo_set_is_a( is_a, &aux ); \
|
||||
bli_auxinfo_set_is_b( is_b, &aux ); \
|
||||
\
|
||||
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
|
||||
loop around the microkernel. Here we query the thrinfo_t node for the
|
||||
1st (ir) loop around the microkernel. */ \
|
||||
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
|
||||
\
|
||||
/* Query the number of threads and thread ids for each loop. */ \
|
||||
dim_t jr_nt = bli_thread_n_way( thread ); \
|
||||
dim_t jr_tid = bli_thread_work_id( thread ); \
|
||||
dim_t ir_nt = bli_thread_n_way( caucus ); \
|
||||
dim_t ir_tid = bli_thread_work_id( caucus ); \
|
||||
\
|
||||
dim_t jr_start, jr_end; \
|
||||
dim_t ir_start, ir_end; \
|
||||
dim_t jr_inc, ir_inc; \
|
||||
\
|
||||
dim_t m_off_cblock, n_off_cblock; \
|
||||
\
|
||||
/* Determine the thread range and increment for the 2nd and 1st loops.
|
||||
NOTE: The definition of bli_thread_range_jrir() will depend on whether
|
||||
slab or round-robin partitioning was requested at configure-time. */ \
|
||||
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
|
||||
bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
|
||||
\
|
||||
/* Loop over the n dimension (NR columns at a time). */ \
|
||||
for ( j = jr_start; j < jr_end; j += jr_inc ) \
|
||||
{ \
|
||||
ctype* restrict a1; \
|
||||
ctype* restrict c11; \
|
||||
ctype* restrict b2; \
|
||||
\
|
||||
b1 = b_cast + j * cstep_b; \
|
||||
c1 = c_cast + j * cstep_c; \
|
||||
\
|
||||
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
|
||||
\
|
||||
/* Initialize our next panel of B to be the current panel of B. */ \
|
||||
b2 = b1; \
|
||||
\
|
||||
/* Loop over the m dimension (MR rows at a time). */ \
|
||||
for ( i = ir_start; i < ir_end; i += ir_inc ) \
|
||||
{ \
|
||||
ctype* restrict a2; \
|
||||
\
|
||||
a1 = a_cast + i * rstep_a; \
|
||||
c11 = c1 + i * rstep_c; \
|
||||
\
|
||||
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
|
||||
\
|
||||
/* Compute the addresses of the next panels of A and B. */ \
|
||||
a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
|
||||
if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \
|
||||
{ \
|
||||
a2 = a_cast; \
|
||||
b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
|
||||
if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \
|
||||
b2 = b_cast; \
|
||||
} \
|
||||
\
|
||||
/* Save addresses of next panels of A and B to the auxinfo_t
|
||||
object. */ \
|
||||
bli_auxinfo_set_next_a( a2, &aux ); \
|
||||
bli_auxinfo_set_next_b( b2, &aux ); \
|
||||
\
|
||||
m_off_cblock = m_off + i * MR; \
|
||||
n_off_cblock = n_off + j * NR; \
|
||||
\
|
||||
if(!bli_gemmt_is_strictly_below_diag(m_off_cblock, n_off_cblock, m_cur, n_cur)) \
|
||||
{ \
|
||||
if(bli_gemmt_is_strictly_above_diag(m_off_cblock, n_off_cblock, m_cur, n_cur)) \
|
||||
{ \
|
||||
/* Handle interior and edge cases separately. */ \
|
||||
if ( m_cur == MR && n_cur == NR ) \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Invoke the gemm micro-kernel. */ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Scale the bottom edge of C and add the result from above. */ \
|
||||
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
gemm_ukr \
|
||||
( \
|
||||
k, \
|
||||
alpha_cast, \
|
||||
a1, \
|
||||
b1, \
|
||||
zero, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
&aux, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
/* Scale the bottom edge of C and add the result from above. */ \
|
||||
PASTEMAC(ch,update_upper_triang)( m_off_cblock, n_off_cblock, \
|
||||
m_cur, n_cur, \
|
||||
ct, rs_ct, cs_ct, \
|
||||
beta_cast, \
|
||||
c11, rs_c, cs_c ); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/*
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
|
||||
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
|
||||
*/ \
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_6); \
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_U_SD( gemmt, ker_var2 )
|
||||
|
||||
81
frame/3/gemmt/bli_gemmt_var.h
Normal file
81
frame/3/gemmt/bli_gemmt_var.h
Normal file
@@ -0,0 +1,81 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#undef GENPROT
|
||||
#define GENPROT( opname ) \
|
||||
\
|
||||
void PASTEMAC0(opname) \
|
||||
( \
|
||||
obj_t* a, \
|
||||
obj_t* b, \
|
||||
obj_t* c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
cntl_t* cntl, \
|
||||
thrinfo_t* thread \
|
||||
);
|
||||
|
||||
GENPROT( gemmt_ker_var2 )
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Prototype BLAS-like interfaces with void pointer operands.
|
||||
//
|
||||
|
||||
#undef GENTPROT
|
||||
#define GENTPROT( ctype, ch, opname, uplo, varname ) \
|
||||
\
|
||||
void PASTEMACT(ch,opname,uplo,varname) \
|
||||
( \
|
||||
pack_t schema_a, \
|
||||
pack_t schema_b, \
|
||||
dim_t m_off, \
|
||||
dim_t n_off, \
|
||||
dim_t m, \
|
||||
dim_t n, \
|
||||
dim_t k, \
|
||||
void* alpha, \
|
||||
void* a, inc_t cs_a, inc_t is_a, \
|
||||
dim_t pd_a, inc_t ps_a, \
|
||||
void* b, inc_t rs_b, inc_t is_b, \
|
||||
dim_t pd_b, inc_t ps_b, \
|
||||
void* beta, \
|
||||
void* c, inc_t rs_c, inc_t cs_c, \
|
||||
cntx_t* cntx, \
|
||||
rntm_t* rntm, \
|
||||
thrinfo_t* thread \
|
||||
);
|
||||
|
||||
INSERT_GENTPROT_GEMMT_SD( gemmt, ker_var2 )
|
||||
235
frame/compat/bla_gemmt.c
Normal file
235
frame/compat/bla_gemmt.c
Normal file
@@ -0,0 +1,235 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
|
||||
//
|
||||
// Define BLAS-to-BLIS interfaces.
|
||||
//
|
||||
|
||||
#ifdef BLIS_BLAS3_CALLS_TAPI
|
||||
|
||||
/*
   BLAS-compatible ?gemmt wrapper built on the BLIS typed API.

   Checks the BLAS arguments, maps the uplo/trans characters to BLIS enums,
   converts n and k to BLIS dimensions, describes A, B and C as
   column-major (row stride 1, column stride = leading dimension), and calls
   the expert typed-API gemmt. When n is zero the computation is skipped,
   but initialization/finalization and tracing remain balanced.
*/
#undef  GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int*  n, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc  \
     ) \
{ \
    uplo_t  blis_uploc; \
    trans_t blis_transa; \
    trans_t blis_transb; \
    dim_t   n0, k0; \
    inc_t   rs_a, cs_a; \
    inc_t   rs_b, cs_b; \
    inc_t   rs_c, cs_c; \
\
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1); \
\
    /* Initialize BLIS. */ \
    bli_init_auto(); \
\
    /* Perform BLAS parameter checking. */ \
    PASTEBLACHK(blasname) \
    ( \
      MKSTR(ch), \
      MKSTR(blasname), \
      uploc, \
      transa, \
      transb, \
      n, \
      k, \
      lda, \
      ldb, \
      ldc  \
    ); \
\
    /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
    bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
    bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \
    bli_param_map_netlib_to_blis_uplo( *uploc, &blis_uploc ); \
\
    /* Typecast BLAS integers to BLIS integers. */ \
    bli_convert_blas_dim1( *n, n0 ); \
    bli_convert_blas_dim1( *k, k0 ); \
\
    /* Set the row and column strides of the matrix operands. */ \
    rs_a = 1; \
    cs_a = *lda; \
    rs_b = 1; \
    cs_b = *ldb; \
    rs_c = 1; \
    cs_c = *ldc; \
\
    /* Quick return: with n == 0 the matrix C is empty. Test the converted \
       dimension, not the pointer n. */ \
    if ( n0 != 0 ) \
    { \
        /* Call BLIS interface. */ \
        PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
        ( \
          blis_uploc, \
          blis_transa, \
          blis_transb, \
          n0, \
          k0, \
          (ftype*)alpha, \
          (ftype*)a, rs_a, cs_a, \
          (ftype*)b, rs_b, cs_b, \
          (ftype*)beta, \
          (ftype*)c, rs_c, cs_c, \
          NULL, \
          NULL  \
        ); \
    } \
\
    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
\
    /* Finalize BLIS. */ \
    bli_finalize_auto(); \
}
|
||||
|
||||
#else
|
||||
|
||||
/*
   BLAS-compatible ?gemmt wrapper built on the BLIS object API.

   Checks the BLAS arguments, maps the uplo/trans characters to BLIS enums,
   wraps alpha, A, B, beta and C in objects (column-major: row stride 1,
   column stride = leading dimension), tags A and B with their
   transposition and C with its uplo, and calls the expert object-API
   gemmt.
*/
#undef  GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int*  n, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc  \
     ) \
{ \
    trans_t blis_transa; \
    trans_t blis_transb; \
    uplo_t  blis_uploc; \
\
    dim_t   n0, k0; \
    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO) \
\
    /* Initialize BLIS. */ \
    bli_init_auto(); \
\
    /* Perform BLAS parameter checking. */ \
    PASTEBLACHK(blasname) \
    ( \
      MKSTR(ch), \
      MKSTR(blasname), \
      uploc, \
      transa, \
      transb, \
      n, \
      k, \
      lda, \
      ldb, \
      ldc  \
    ); \
\
    /* Translate the BLAS option characters into BLIS enums. */ \
    bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
    bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \
    bli_param_map_netlib_to_blis_uplo( *uploc, &blis_uploc ); \
\
    /* Convert the BLAS integer dimensions to BLIS dimensions. */ \
    bli_convert_blas_dim1( *n, n0 ); \
    bli_convert_blas_dim1( *k, k0 ); \
\
    /* Column-major operands: unit row stride, leading dimension as the \
       column stride. */ \
    const inc_t rs_a = 1, cs_a = *lda; \
    const inc_t rs_b = 1, cs_b = *ldb; \
    const inc_t rs_c = 1, cs_c = *ldc; \
\
    const num_t dt = PASTEMAC(ch,type); \
\
    obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \
    obj_t betao  = BLIS_OBJECT_INITIALIZER_1X1; \
    obj_t ao     = BLIS_OBJECT_INITIALIZER; \
    obj_t bo     = BLIS_OBJECT_INITIALIZER; \
    obj_t co     = BLIS_OBJECT_INITIALIZER; \
\
    /* Stored dimensions of A and B, derived from (n0, k0) and (k0, n0) \
       according to their transposition. */ \
    dim_t m0_a, n0_a; \
    dim_t m0_b, n0_b; \
\
    bli_set_dims_with_trans( blis_transa, n0, k0, &m0_a, &n0_a ); \
    bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b ); \
\
    /* Wrap the scalars and the matrices in objects; C is n0 x n0. */ \
    bli_obj_init_finish_1x1( dt, (ftype*)alpha, &alphao ); \
    bli_obj_init_finish_1x1( dt, (ftype*)beta,  &betao  ); \
\
    bli_obj_init_finish( dt, m0_a, n0_a, (ftype*)a, rs_a, cs_a, &ao ); \
    bli_obj_init_finish( dt, m0_b, n0_b, (ftype*)b, rs_b, cs_b, &bo ); \
    bli_obj_init_finish( dt, n0,   n0,   (ftype*)c, rs_c, cs_c, &co ); \
\
    /* Record the transposition of A and B and the uplo of C. */ \
    bli_obj_set_conjtrans( blis_transa, &ao ); \
    bli_obj_set_conjtrans( blis_transb, &bo ); \
    bli_obj_set_uplo( blis_uploc, &co ); \
\
    /* Call the BLIS object interface. */ \
    PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \
    ( \
      &alphao, \
      &ao, \
      &bo, \
      &betao, \
      &co, \
      NULL, \
      NULL  \
    ); \
\
    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \
    /* Finalize BLIS. */ \
    bli_finalize_auto(); \
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
INSERT_GENTFUNC_BLAS( gemmt, gemmt )
|
||||
#endif
|
||||
58
frame/compat/bla_gemmt.h
Normal file
58
frame/compat/bla_gemmt.h
Normal file
@@ -0,0 +1,58 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// Prototype BLAS-to-BLIS interfaces.
|
||||
//
|
||||
// Declares the exported Fortran-77 style entry point PASTEF77(ch,blasname)
// for one datatype. All arguments are passed by pointer; c is the only
// operand that is not const-qualified.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int*  n, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
       const ftype*    b, \
       const f77_int*  ldb, \
       const ftype*    beta, \
             ftype*    c, \
       const f77_int*  ldc  \
     );
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
INSERT_GENTPROT_BLAS( gemmt )
|
||||
#endif
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -185,6 +186,7 @@
|
||||
#include "bla_syr2k.h"
|
||||
#include "bla_trmm.h"
|
||||
#include "bla_trsm.h"
|
||||
#include "bla_gemmt.h"
|
||||
|
||||
#include "bla_gemm_check.h"
|
||||
#include "bla_hemm_check.h"
|
||||
@@ -195,6 +197,7 @@
|
||||
#include "bla_syr2k_check.h"
|
||||
#include "bla_trmm_check.h"
|
||||
#include "bla_trsm_check.h"
|
||||
#include "bla_gemmt_check.h"
|
||||
|
||||
// -- Fortran-compatible APIs to BLIS functions --
|
||||
|
||||
|
||||
@@ -448,6 +448,11 @@ void BLIS_EXPORT_BLAS cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
|
||||
enum CBLAS_DIAG Diag, f77_int M, f77_int N,
|
||||
float alpha, const float *A, f77_int lda,
|
||||
float *B, f77_int ldb);
|
||||
void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
|
||||
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
|
||||
f77_int N, f77_int K, float alpha, const float *A,
|
||||
f77_int lda, const float *B, f77_int ldb,
|
||||
float beta, float *C, f77_int ldc);
|
||||
|
||||
void BLIS_EXPORT_BLAS cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
|
||||
enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N,
|
||||
@@ -478,6 +483,11 @@ void BLIS_EXPORT_BLAS cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
|
||||
enum CBLAS_DIAG Diag, f77_int M, f77_int N,
|
||||
double alpha, const double *A, f77_int lda,
|
||||
double *B, f77_int ldb);
|
||||
void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
|
||||
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
|
||||
f77_int N, f77_int K, double alpha, const double *A,
|
||||
f77_int lda, const double *B, f77_int ldb,
|
||||
double beta, double *C, f77_int ldc);
|
||||
|
||||
void BLIS_EXPORT_BLAS cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
|
||||
enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N,
|
||||
@@ -508,6 +518,11 @@ void BLIS_EXPORT_BLAS cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
|
||||
enum CBLAS_DIAG Diag, f77_int M, f77_int N,
|
||||
const void *alpha, const void *A, f77_int lda,
|
||||
void *B, f77_int ldb);
|
||||
void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
|
||||
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
|
||||
f77_int N, f77_int K, const void *alpha, const void *A,
|
||||
f77_int lda, const void *B, f77_int ldb,
|
||||
const void *beta, void *C, f77_int ldc);
|
||||
|
||||
void BLIS_EXPORT_BLAS cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
|
||||
enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N,
|
||||
@@ -538,6 +553,11 @@ void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
|
||||
enum CBLAS_DIAG Diag, f77_int M, f77_int N,
|
||||
const void *alpha, const void *A, f77_int lda,
|
||||
void *B, f77_int ldb);
|
||||
void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
|
||||
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
|
||||
f77_int N, f77_int K, const void *alpha, const void *A,
|
||||
f77_int lda, const void *B, f77_int ldb,
|
||||
const void *beta, void *C, f77_int ldc);
|
||||
|
||||
|
||||
/*
|
||||
|
||||
130
frame/compat/cblas/src/cblas_cgemmt.c
Normal file
130
frame/compat/cblas/src/cblas_cgemmt.c
Normal file
@@ -0,0 +1,130 @@
|
||||
#include "blis.h"
|
||||
#ifdef BLIS_ENABLE_CBLAS
|
||||
/*
|
||||
*
|
||||
* cblas_cgemmt.c
|
||||
* This program is a C interface to cgemmt.
|
||||
* Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "cblas.h"
|
||||
#include "cblas_f77.h"
|
||||
/*
 * cblas_cgemmt: CBLAS entry point for single-precision complex GEMMT.
 *
 * Converts the CBLAS enum arguments to the character flags expected by
 * F77_cgemmt and forwards the call. An illegal Order, Uplo, TransA or
 * TransB value is reported through cblas_xerbla and F77_cgemmt is not
 * called. CBLAS_CallFromC and RowMajorStrg are reset to 0 on every exit.
 *
 * Row-major input is handled by running the column-major routine on the
 * transposed problem: A and B (with their leading dimensions and transpose
 * flags) trade places, and the triangle selector is flipped, because the
 * upper triangle of a row-major C is the lower triangle of the same buffer
 * read as column-major.
 */
void cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
                  enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
                  f77_int N, f77_int K,
                  const void *alpha, const void *A,
                  f77_int lda, const void *B, f77_int ldb,
                  const void *beta, void *C, f77_int ldc)
{
   char TA, TB, UL;
#ifdef F77_CHAR
   F77_CHAR F77_TA, F77_TB, F77_UL;
#else
   #define F77_TA &TA
   #define F77_TB &TB
   #define F77_UL &UL
#endif

#ifdef F77_INT
   F77_INT F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb;
   F77_INT F77_ldc=ldc;
#else
   #define F77_N N
   #define F77_K K
   #define F77_lda lda
   #define F77_ldb ldb
   #define F77_ldc ldc
#endif

   extern int CBLAS_CallFromC;
   extern int RowMajorStrg;
   RowMajorStrg = 0;
   CBLAS_CallFromC = 1;

   if( Order == CblasColMajor )
   {
      if( Uplo == CblasUpper ) UL='U';
      else if ( Uplo == CblasLower ) UL='L';
      else
      {
         cblas_xerbla(2, "cblas_cgemmt","Illegal Uplo setting, %d\n", Uplo);
         goto done;
      }

      if(TransA == CblasTrans) TA='T';
      else if ( TransA == CblasConjTrans ) TA='C';
      else if ( TransA == CblasNoTrans )   TA='N';
      else
      {
         cblas_xerbla(3, "cblas_cgemmt","Illegal TransA setting, %d\n", TransA);
         goto done;
      }

      if(TransB == CblasTrans) TB='T';
      else if ( TransB == CblasConjTrans ) TB='C';
      else if ( TransB == CblasNoTrans )   TB='N';
      else
      {
         cblas_xerbla(4, "cblas_cgemmt","Illegal TransB setting, %d\n", TransB);
         goto done;
      }

#ifdef F77_CHAR
      F77_TA = C2F_CHAR(&TA);
      F77_TB = C2F_CHAR(&TB);
      /* F77_UL must be converted too; it was previously left unset. */
      F77_UL = C2F_CHAR(&UL);
#endif

      F77_cgemmt(F77_UL, F77_TA, F77_TB, &F77_N, &F77_K, (scomplex*)alpha, (scomplex*)A,
                 &F77_lda, (scomplex*)B, &F77_ldb, (scomplex*)beta, (scomplex*)C, &F77_ldc);
   }
   else if (Order == CblasRowMajor)
   {
      RowMajorStrg = 1;

      /* Flip the triangle: row-major upper == column-major lower. */
      if( Uplo == CblasUpper ) UL='L';
      else if( Uplo == CblasLower ) UL='U';
      else
      {
         cblas_xerbla(2, "cblas_cgemmt","Illegal Uplo setting, %d\n", Uplo);
         goto done;
      }

      /* A and B swap roles, so TransA feeds TB and TransB feeds TA. */
      if(TransA == CblasTrans) TB='T';
      else if ( TransA == CblasConjTrans ) TB='C';
      else if ( TransA == CblasNoTrans )   TB='N';
      else
      {
         cblas_xerbla(3, "cblas_cgemmt","Illegal TransA setting, %d\n", TransA);
         goto done;
      }

      if(TransB == CblasTrans) TA='T';
      else if ( TransB == CblasConjTrans ) TA='C';
      else if ( TransB == CblasNoTrans )   TA='N';
      else
      {
         cblas_xerbla(4, "cblas_cgemmt","Illegal TransB setting, %d\n", TransB);
         goto done;
      }

#ifdef F77_CHAR
      F77_TA = C2F_CHAR(&TA);
      F77_TB = C2F_CHAR(&TB);
      F77_UL = C2F_CHAR(&UL);
#endif

      F77_cgemmt(F77_UL, F77_TA, F77_TB, &F77_N, &F77_K, (scomplex*)alpha, (scomplex*)B,
                 &F77_ldb, (scomplex*)A, &F77_lda, (scomplex*)beta, (scomplex*)C, &F77_ldc);
   }
   else cblas_xerbla(1, "cblas_cgemmt", "Illegal Order setting, %d\n", Order);

done:
   /* Common exit: restore the global CBLAS state flags. */
   CBLAS_CallFromC = 0;
   RowMajorStrg = 0;
   return;
}
|
||||
#endif
|
||||
135
frame/compat/cblas/src/cblas_dgemmt.c
Normal file
135
frame/compat/cblas/src/cblas_dgemmt.c
Normal file
@@ -0,0 +1,135 @@
|
||||
#include "blis.h"
|
||||
#ifdef BLIS_ENABLE_CBLAS
|
||||
/*
|
||||
*
|
||||
* cblas_dgemmt.c
|
||||
* This program is a C interface to dgemmt.
|
||||
*
|
||||
* Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "cblas.h"
|
||||
#include "cblas_f77.h"
|
||||
/*
 * cblas_dgemmt: CBLAS entry point for double-precision GEMMT.
 *
 * Converts the CBLAS enum arguments to the character flags expected by
 * F77_dgemmt and forwards the call. An illegal Order, Uplo, TransA or
 * TransB value is reported through cblas_xerbla and F77_dgemmt is not
 * called. Every exit path resets CBLAS_CallFromC and RowMajorStrg to 0 and
 * emits exactly one AOCL_DTL_TRACE_EXIT to match the entry trace.
 *
 * Row-major input is handled by running the column-major routine on the
 * transposed problem: A and B (with their leading dimensions and transpose
 * flags) trade places and the triangle selector is flipped.
 */
void cblas_dgemmt( enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
                   enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
                   f77_int N, f77_int K,
                   double alpha, const double *A,
                   f77_int lda, const double *B, f77_int ldb,
                   double beta, double *C, f77_int ldc)
{
   AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
   char TA, TB, UL;
#ifdef F77_CHAR
   F77_CHAR F77_TA, F77_TB, F77_UL;
#else
   #define F77_TA &TA
   #define F77_TB &TB
   #define F77_UL &UL
#endif

#ifdef F77_INT
   F77_INT F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb;
   F77_INT F77_ldc=ldc;
#else
   #define F77_N N
   #define F77_K K
   #define F77_lda lda
   #define F77_ldb ldb
   #define F77_ldc ldc
#endif

   extern int CBLAS_CallFromC;
   extern int RowMajorStrg;
   RowMajorStrg = 0;
   CBLAS_CallFromC = 1;

   if( Order == CblasColMajor )
   {
      if( Uplo == CblasUpper) UL = 'U';
      else if(Uplo == CblasLower) UL = 'L';
      else
      {
         cblas_xerbla(2, "cblas_dgemmt","Illegal Uplo setting, %d\n", Uplo);
         goto done;
      }

      if(TransA == CblasTrans) TA='T';
      else if ( TransA == CblasConjTrans ) TA='C';
      else if ( TransA == CblasNoTrans )   TA='N';
      else
      {
         cblas_xerbla(3, "cblas_dgemmt","Illegal TransA setting, %d\n", TransA);
         goto done;
      }

      if(TransB == CblasTrans) TB='T';
      else if ( TransB == CblasConjTrans ) TB='C';
      else if ( TransB == CblasNoTrans )   TB='N';
      else
      {
         cblas_xerbla(4, "cblas_dgemmt","Illegal TransB setting, %d\n", TransB);
         goto done;
      }

#ifdef F77_CHAR
      F77_TA = C2F_CHAR(&TA);
      F77_TB = C2F_CHAR(&TB);
      F77_UL = C2F_CHAR(&UL);
#endif

      F77_dgemmt(F77_UL,F77_TA, F77_TB, &F77_N, &F77_K, &alpha, A,
                 &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc);
   }
   else if (Order == CblasRowMajor)
   {
      RowMajorStrg = 1;

      /* Flip the triangle: row-major upper == column-major lower. */
      if(Uplo == CblasUpper) UL = 'L';
      else if(Uplo == CblasLower) UL = 'U';
      else
      {
         /* Bail out here; continuing would pass an uninitialized UL on. */
         cblas_xerbla(2, "cblas_dgemmt","Illegal Uplo setting, %d\n", Uplo);
         goto done;
      }

      /* A and B swap roles, so TransA feeds TB and TransB feeds TA. */
      if(TransA == CblasTrans) TB='T';
      else if ( TransA == CblasConjTrans ) TB='C';
      else if ( TransA == CblasNoTrans )   TB='N';
      else
      {
         cblas_xerbla(3, "cblas_dgemmt","Illegal TransA setting, %d\n", TransA);
         goto done;
      }

      if(TransB == CblasTrans) TA='T';
      else if ( TransB == CblasConjTrans ) TA='C';
      else if ( TransB == CblasNoTrans )   TA='N';
      else
      {
         cblas_xerbla(4, "cblas_dgemmt","Illegal TransB setting, %d\n", TransB);
         goto done;
      }

#ifdef F77_CHAR
      F77_TA = C2F_CHAR(&TA);
      F77_TB = C2F_CHAR(&TB);
      F77_UL = C2F_CHAR(&UL);
#endif

      F77_dgemmt(F77_UL,F77_TA, F77_TB, &F77_N, &F77_K, &alpha, B,
                 &F77_ldb, A, &F77_lda, &beta, C, &F77_ldc);
   }
   else cblas_xerbla(1, "cblas_dgemmt", "Illegal Order setting, %d\n", Order);

done:
   /* Common exit: restore the global CBLAS state flags and close the trace. */
   CBLAS_CallFromC = 0;
   RowMajorStrg = 0;
   AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
   return;
}
|
||||
#endif
|
||||
@@ -6,6 +6,9 @@
|
||||
* Merged cblas_f77.h and cblas_fortran_header.h
|
||||
*
|
||||
* (Heavily hacked down from the original)
|
||||
*
|
||||
* Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef CBLAS_F77_H
|
||||
@@ -163,5 +166,9 @@
|
||||
#define F77_zsyr2k zsyr2k_
|
||||
#define F77_ztrmm ztrmm_
|
||||
#define F77_ztrsm ztrsm_
|
||||
#define F77_dgemmt dgemmt_
|
||||
#define F77_sgemmt sgemmt_
|
||||
#define F77_cgemmt cgemmt_
|
||||
#define F77_zgemmt zgemmt_
|
||||
|
||||
#endif /* CBLAS_F77_H */
|
||||
|
||||
135
frame/compat/cblas/src/cblas_sgemmt.c
Normal file
135
frame/compat/cblas/src/cblas_sgemmt.c
Normal file
@@ -0,0 +1,135 @@
|
||||
#include "blis.h"
|
||||
#ifdef BLIS_ENABLE_CBLAS
|
||||
/*
|
||||
*
|
||||
* cblas_sgemmt.c
|
||||
* This program is a C interface to sgemmt.
|
||||
*
|
||||
* Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "cblas.h"
|
||||
#include "cblas_f77.h"
|
||||
/*
 * cblas_sgemmt: CBLAS entry point for single-precision GEMMT.
 *
 * Converts the CBLAS enum arguments to the character flags expected by
 * F77_sgemmt and forwards the call. An illegal Order, Uplo, TransA or
 * TransB value is reported through cblas_xerbla and F77_sgemmt is not
 * called. Every exit path resets CBLAS_CallFromC and RowMajorStrg to 0 and
 * emits exactly one AOCL_DTL_TRACE_EXIT to match the entry trace.
 *
 * Row-major input is handled by running the column-major routine on the
 * transposed problem: A and B (with their leading dimensions and transpose
 * flags) trade places and the triangle selector is flipped.
 */
void cblas_sgemmt( enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
                   enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
                   f77_int N, f77_int K,
                   float alpha, const float *A,
                   f77_int lda, const float *B, f77_int ldb,
                   float beta, float *C, f77_int ldc)
{
   AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
   char TA, TB, UL;
#ifdef F77_CHAR
   F77_CHAR F77_TA, F77_TB, F77_UL;
#else
   #define F77_TA &TA
   #define F77_TB &TB
   #define F77_UL &UL
#endif

#ifdef F77_INT
   F77_INT F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb;
   F77_INT F77_ldc=ldc;
#else
   #define F77_N N
   #define F77_K K
   #define F77_lda lda
   #define F77_ldb ldb
   #define F77_ldc ldc
#endif

   extern int CBLAS_CallFromC;
   extern int RowMajorStrg;
   RowMajorStrg = 0;
   CBLAS_CallFromC = 1;

   if( Order == CblasColMajor )
   {
      if( Uplo == CblasUpper) UL = 'U';
      else if(Uplo == CblasLower) UL = 'L';
      else
      {
         cblas_xerbla(2, "cblas_sgemmt","Illegal Uplo setting, %d\n", Uplo);
         goto done;
      }

      if(TransA == CblasTrans) TA='T';
      else if ( TransA == CblasConjTrans ) TA='C';
      else if ( TransA == CblasNoTrans )   TA='N';
      else
      {
         cblas_xerbla(3, "cblas_sgemmt","Illegal TransA setting, %d\n", TransA);
         goto done;
      }

      if(TransB == CblasTrans) TB='T';
      else if ( TransB == CblasConjTrans ) TB='C';
      else if ( TransB == CblasNoTrans )   TB='N';
      else
      {
         cblas_xerbla(4, "cblas_sgemmt","Illegal TransB setting, %d\n", TransB);
         goto done;
      }

#ifdef F77_CHAR
      F77_TA = C2F_CHAR(&TA);
      F77_TB = C2F_CHAR(&TB);
      F77_UL = C2F_CHAR(&UL);
#endif

      F77_sgemmt(F77_UL,F77_TA, F77_TB, &F77_N, &F77_K, &alpha, A,
                 &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc);
   }
   else if (Order == CblasRowMajor)
   {
      RowMajorStrg = 1;

      /* Flip the triangle: row-major upper == column-major lower. */
      if(Uplo == CblasUpper) UL = 'L';
      else if(Uplo == CblasLower) UL = 'U';
      else
      {
         /* Bail out here; continuing would pass an uninitialized UL on. */
         cblas_xerbla(2, "cblas_sgemmt","Illegal Uplo setting, %d\n", Uplo);
         goto done;
      }

      /* A and B swap roles, so TransA feeds TB and TransB feeds TA. */
      if(TransA == CblasTrans) TB='T';
      else if ( TransA == CblasConjTrans ) TB='C';
      else if ( TransA == CblasNoTrans )   TB='N';
      else
      {
         cblas_xerbla(3, "cblas_sgemmt","Illegal TransA setting, %d\n", TransA);
         goto done;
      }

      if(TransB == CblasTrans) TA='T';
      else if ( TransB == CblasConjTrans ) TA='C';
      else if ( TransB == CblasNoTrans )   TA='N';
      else
      {
         cblas_xerbla(4, "cblas_sgemmt","Illegal TransB setting, %d\n", TransB);
         goto done;
      }

#ifdef F77_CHAR
      F77_TA = C2F_CHAR(&TA);
      F77_TB = C2F_CHAR(&TB);
      F77_UL = C2F_CHAR(&UL);
#endif

      F77_sgemmt(F77_UL,F77_TA, F77_TB, &F77_N, &F77_K, &alpha, B,
                 &F77_ldb, A, &F77_lda, &beta, C, &F77_ldc);
   }
   else cblas_xerbla(1, "cblas_sgemmt", "Illegal Order setting, %d\n", Order);

done:
   /* Common exit: restore the global CBLAS state flags and close the trace. */
   CBLAS_CallFromC = 0;
   RowMajorStrg = 0;
   AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
   return;
}
|
||||
#endif
|
||||
131
frame/compat/cblas/src/cblas_zgemmt.c
Normal file
131
frame/compat/cblas/src/cblas_zgemmt.c
Normal file
@@ -0,0 +1,131 @@
|
||||
#include "blis.h"
|
||||
#ifdef BLIS_ENABLE_CBLAS
|
||||
/*
|
||||
*
|
||||
* cblas_zgemmt.c
|
||||
* This program is a C interface to zgemmt.
|
||||
*
|
||||
* Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "cblas.h"
|
||||
#include "cblas_f77.h"
|
||||
/*
 * cblas_zgemmt: CBLAS entry point for double-precision complex GEMMT.
 *
 * Converts the CBLAS enum arguments to the character flags expected by
 * F77_zgemmt and forwards the call. An illegal Order, Uplo, TransA or
 * TransB value is reported through cblas_xerbla and F77_zgemmt is not
 * called. CBLAS_CallFromC and RowMajorStrg are reset to 0 on every exit.
 *
 * Row-major input is handled by running the column-major routine on the
 * transposed problem: A and B (with their leading dimensions and transpose
 * flags) trade places, and the triangle selector is flipped, because the
 * upper triangle of a row-major C is the lower triangle of the same buffer
 * read as column-major.
 */
void cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
                  enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
                  f77_int N, f77_int K,
                  const void *alpha, const void *A,
                  f77_int lda, const void *B, f77_int ldb,
                  const void *beta, void *C, f77_int ldc)
{
   char TA, TB, UL;
#ifdef F77_CHAR
   F77_CHAR F77_TA, F77_TB, F77_UL;
#else
   #define F77_TA &TA
   #define F77_TB &TB
   #define F77_UL &UL
#endif

#ifdef F77_INT
   F77_INT F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb;
   F77_INT F77_ldc=ldc;
#else
   #define F77_N N
   #define F77_K K
   #define F77_lda lda
   #define F77_ldb ldb
   #define F77_ldc ldc
#endif

   extern int CBLAS_CallFromC;
   extern int RowMajorStrg;
   RowMajorStrg = 0;
   CBLAS_CallFromC = 1;

   if( Order == CblasColMajor )
   {
      if( Uplo == CblasUpper ) UL='U';
      else if ( Uplo == CblasLower ) UL='L';
      else
      {
         cblas_xerbla(2, "cblas_zgemmt","Illegal Uplo setting, %d\n", Uplo);
         goto done;
      }

      if(TransA == CblasTrans) TA='T';
      else if ( TransA == CblasConjTrans ) TA='C';
      else if ( TransA == CblasNoTrans )   TA='N';
      else
      {
         cblas_xerbla(3, "cblas_zgemmt","Illegal TransA setting, %d\n", TransA);
         goto done;
      }

      if(TransB == CblasTrans) TB='T';
      else if ( TransB == CblasConjTrans ) TB='C';
      else if ( TransB == CblasNoTrans )   TB='N';
      else
      {
         cblas_xerbla(4, "cblas_zgemmt","Illegal TransB setting, %d\n", TransB);
         goto done;
      }

#ifdef F77_CHAR
      F77_TA = C2F_CHAR(&TA);
      F77_TB = C2F_CHAR(&TB);
      /* F77_UL must be converted too; it was previously left unset. */
      F77_UL = C2F_CHAR(&UL);
#endif

      F77_zgemmt(F77_UL, F77_TA, F77_TB, &F77_N, &F77_K, (dcomplex*)alpha, (dcomplex*)A,
                 &F77_lda, (dcomplex*)B, &F77_ldb, (dcomplex*)beta, (dcomplex*)C, &F77_ldc);
   }
   else if (Order == CblasRowMajor)
   {
      RowMajorStrg = 1;

      /* Flip the triangle: row-major upper == column-major lower. */
      if( Uplo == CblasUpper ) UL='L';
      else if( Uplo == CblasLower ) UL='U';
      else
      {
         cblas_xerbla(2, "cblas_zgemmt","Illegal Uplo setting, %d\n", Uplo);
         goto done;
      }

      /* A and B swap roles, so TransA feeds TB and TransB feeds TA. */
      if(TransA == CblasTrans) TB='T';
      else if ( TransA == CblasConjTrans ) TB='C';
      else if ( TransA == CblasNoTrans )   TB='N';
      else
      {
         cblas_xerbla(3, "cblas_zgemmt","Illegal TransA setting, %d\n", TransA);
         goto done;
      }

      if(TransB == CblasTrans) TA='T';
      else if ( TransB == CblasConjTrans ) TA='C';
      else if ( TransB == CblasNoTrans )   TA='N';
      else
      {
         cblas_xerbla(4, "cblas_zgemmt","Illegal TransB setting, %d\n", TransB);
         goto done;
      }

#ifdef F77_CHAR
      F77_TA = C2F_CHAR(&TA);
      F77_TB = C2F_CHAR(&TB);
      F77_UL = C2F_CHAR(&UL);
#endif

      F77_zgemmt(F77_UL, F77_TA, F77_TB, &F77_N, &F77_K, (dcomplex*)alpha, (dcomplex*)B,
                 &F77_ldb, (dcomplex*)A, &F77_lda, (dcomplex*)beta, (dcomplex*)C, &F77_ldc);
   }
   else cblas_xerbla(1, "cblas_zgemmt", "Illegal Order setting, %d\n", Order);

done:
   /* Common exit: restore the global CBLAS state flags. */
   CBLAS_CallFromC = 0;
   RowMajorStrg = 0;
   return;
}
|
||||
#endif
|
||||
92
frame/compat/check/bla_gemmt_check.h
Normal file
92
frame/compat/check/bla_gemmt_check.h
Normal file
@@ -0,0 +1,92 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name(s) of the copyright holder(s) nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS
|
||||
|
||||
/*
   Argument check for the BLAS-compatible ?gemmt_ wrappers.

   Expands to a braced block that validates uploc, transa, transb, n, k,
   lda, ldb and ldc (all pointers). On the first violation it sets info,
   builds the upper-cased routine name from dt_str and op_str, calls
   xerbla and returns from the enclosing function. The info values used
   are 1-5 for uploc, transa, transb, n, k and 8, 10, 13 for lda, ldb, ldc.
*/
#define bla_gemmt_check( dt_str, op_str, uploc, transa, transb, n, k, lda, ldb, ldc ) \
{ \
	f77_int info = 0; \
\
	/* Classify the transpose characters. */ \
	const f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	const f77_int notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
	const f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	const f77_int conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
	const f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	const f77_int tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
	/* Classify the triangle character. */ \
	const f77_int lower = PASTEF770(lsame)( uploc, "L", (ftnlen)1, (ftnlen)1 ); \
	const f77_int upper = PASTEF770(lsame)( uploc, "U", (ftnlen)1, (ftnlen)1 ); \
\
	/* Row counts of a and b as stored: a is n x k and b is k x n when
	   not transposed, with the two dimensions swapped otherwise. */ \
	const f77_int nrowa = ( nota ? *n : *k ); \
	const f77_int nrowb = ( notb ? *k : *n ); \
\
	if      ( !( lower || upper ) ) \
	{ \
		info = 1; \
	} \
	else if ( !( nota || conja || ta ) ) \
	{ \
		info = 2; \
	} \
	else if ( !( notb || conjb || tb ) ) \
	{ \
		info = 3; \
	} \
	else if ( *n < 0 ) \
	{ \
		info = 4; \
	} \
	else if ( *k < 0 ) \
	{ \
		info = 5; \
	} \
	else if ( *lda < bli_max( 1, nrowa ) ) \
	{ \
		info = 8; \
	} \
	else if ( *ldb < bli_max( 1, nrowb ) ) \
	{ \
		info = 10; \
	} \
	else if ( *ldc < bli_max( 1, *n ) ) \
	{ \
		info = 13; \
	} \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		/* Routine name = datatype prefix + operation name, upper-cased. */ \
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}
|
||||
|
||||
#endif
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -103,6 +104,18 @@ arrayname[BLIS_NUM_FP_TYPES] = \
|
||||
PASTEMAC(z,op) \
|
||||
}
|
||||
|
||||
|
||||
// Defines arrayname as a BLIS_NUM_FP_TYPES x 2 table of gemmt ker_var2
// function names. Each row holds the lower ("l") variant first and the
// upper ("u") variant second; rows 0 and 2 carry the s and d functions,
// rows 1 and 3 are NULL placeholders.
#define GENARRAY_T(arrayname) \
\
arrayname[BLIS_NUM_FP_TYPES][2] = \
{ \
	{ \
		PASTEMACT(s,gemmt,l,ker_var2), \
		PASTEMACT(s,gemmt,u,ker_var2) \
	}, \
	{ NULL, NULL }, \
	{ \
		PASTEMACT(d,gemmt,l,ker_var2), \
		PASTEMACT(d,gemmt,u,ker_var2) \
	}, \
	{ NULL, NULL }, \
}
|
||||
|
||||
|
||||
#define GENARRAY_I(arrayname,op) \
|
||||
\
|
||||
arrayname[BLIS_NUM_FP_TYPES+1] = \
|
||||
|
||||
@@ -142,6 +142,16 @@ GENTFUNCSCAL( dcomplex, dcomplex, z, , blasname, blisname ) \
|
||||
GENTFUNCSCAL( scomplex, float, c, s, blasname, blisname ) \
|
||||
GENTFUNCSCAL( dcomplex, double, z, d, blasname, blisname )
|
||||
|
||||
// --GEMMT specific kernels ----------------------------------------------------
|
||||
// Instantiates GENTFUNC for float and double with "l" as the fourth
// argument.
#define INSERT_GENTFUNC_L_SD( opname, funcname ) \
\
GENTFUNC( float,  s, opname, l, funcname ) \
GENTFUNC( double, d, opname, l, funcname )
|
||||
|
||||
// Instantiates GENTFUNC for float and double with "u" as the fourth
// argument.
#define INSERT_GENTFUNC_U_SD( opname, funcname ) \
\
GENTFUNC( float,  s, opname, u, funcname ) \
GENTFUNC( double, d, opname, u, funcname )
|
||||
|
||||
// -- Macros for functions with one operand ------------------------------------
|
||||
|
||||
@@ -158,6 +168,12 @@ GENTFUNC( scomplex, c, tfuncname ) \
|
||||
GENTFUNC( dcomplex, z, tfuncname )
|
||||
|
||||
|
||||
// Real-only counterpart of the BASIC0 inserters: instantiates GENTFUNC for
// float (s) and double (d) and nothing else.
#define INSERT_GENTFUNC_BASIC0_SD( tfuncname ) \
\
GENTFUNC( float,  s, tfuncname ) \
GENTFUNC( double, d, tfuncname )
|
||||
|
||||
|
||||
#define INSERT_GENTFUNC_BASIC0_CZ( tfuncname ) \
|
||||
\
|
||||
GENTFUNC( scomplex, c, tfuncname ) \
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -121,7 +122,13 @@ GENTPROTSCAL( dcomplex, dcomplex, , z, blasname ) \
|
||||
GENTPROTSCAL( float, scomplex, s, c, blasname ) \
|
||||
GENTPROTSCAL( double, dcomplex, d, z, blasname )
|
||||
|
||||
|
||||
// -- GEMMT specific function --------------------------------------------------
|
||||
// Emits GENTPROT for the four gemmt combinations: {float, double} x {l, u}.
// NOTE(review): opname is accepted but never referenced; the operation name
// passed to GENTPROT is always the literal gemmt. Confirm this is intended.
#define INSERT_GENTPROT_GEMMT_SD(opname, funcname) \
\
GENTPROT( float,  s, gemmt, l, funcname ) \
GENTPROT( double, d, gemmt, l, funcname ) \
GENTPROT( float,  s, gemmt, u, funcname ) \
GENTPROT( double, d, gemmt, u, funcname )
|
||||
|
||||
|
||||
// -- Macros for functions with one operand ------------------------------------
|
||||
@@ -138,6 +145,8 @@ GENTPROT( double, d, tfuncname ) \
|
||||
GENTPROT( scomplex, c, tfuncname ) \
|
||||
GENTPROT( dcomplex, z, tfuncname )
|
||||
|
||||
|
||||
|
||||
// -- (one auxiliary argument) --
|
||||
|
||||
#define INSERT_GENTPROT_BASIC( tfuncname, varname ) \
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -155,6 +155,7 @@
|
||||
#define MKSTR(s1) #s1
|
||||
#define STRINGIFY_INT( s ) MKSTR( s )
|
||||
|
||||
// Four-part token paste: PASTEMACT(s,gemmt,l,ker_var2) expands to
// bli_sgemmt_l_ker_var2.
#define PASTEMACT(ch, opname, uplo, suffix) bli_##ch##opname##_##uplo##_##suffix
|
||||
// Fortran-77 name-mangling macros.
|
||||
#define PASTEF770(name) name ## _
|
||||
#define PASTEF77(ch1,name) ch1 ## name ## _
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -581,7 +581,19 @@ static bool_t bli_has_nonunit_inc3( inc_t s1, inc_t s2, inc_t s3 )
|
||||
( s1 != 1 || s2 != 1 || s3 != 1 );
|
||||
}
|
||||
|
||||
// offset-related
|
||||
|
||||
// Reports whether n_off + n - 1 < m_off, i.e. whether the last column index
// spanned by the block is smaller than its first row index (presumably m_off
// and n_off are the block's row/column offsets and n its column count --
// confirm against callers). The m argument does not take part in the test.
static bool_t bli_gemmt_is_strictly_below_diag( dim_t m_off, dim_t n_off, dim_t m, dim_t n )
{
	const dim_t last_col = n_off + n - 1;

	return ( bool_t )( last_col < m_off );
}
|
||||
|
||||
// Reports whether m_off + m - 1 < n_off, i.e. whether the last row index
// spanned by the block is smaller than its first column index (presumably
// m_off and n_off are the block's row/column offsets and m its row count --
// confirm against callers). The n argument does not take part in the test.
static bool_t bli_gemmt_is_strictly_above_diag( dim_t m_off, dim_t n_off, dim_t m, dim_t n )
{
	const dim_t last_row = m_off + m - 1;

	return ( bool_t )( last_row < n_off );
}
|
||||
// diag offset-related
|
||||
|
||||
static void bli_negate_diag_offset( doff_t* diagoff )
|
||||
|
||||
@@ -926,11 +926,11 @@ typedef enum
|
||||
BLIS_TRMM3,
|
||||
BLIS_TRMM,
|
||||
BLIS_TRSM,
|
||||
|
||||
BLIS_GEMMT,
|
||||
BLIS_NOID
|
||||
} opid_t;
|
||||
|
||||
#define BLIS_NUM_LEVEL3_OPS 10
|
||||
#define BLIS_NUM_LEVEL3_OPS 11
|
||||
|
||||
|
||||
// -- Blocksize ID type --
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -37,21 +37,21 @@
|
||||
|
||||
static void_fp bli_l3_ind_oper_fp[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] =
|
||||
{
|
||||
/* gemm hemm herk her2k symm syrk, syr2k trmm3 trmm trsm */
|
||||
/* gemm hemm herk her2k symm syrk, syr2k trmm3 trmm trsm gemmt*/
|
||||
/* 3mh */ { bli_gemm3mh, bli_hemm3mh, bli_herk3mh, bli_her2k3mh, bli_symm3mh,
|
||||
bli_syrk3mh, bli_syr2k3mh, bli_trmm33mh, NULL, NULL },
|
||||
bli_syrk3mh, bli_syr2k3mh, bli_trmm33mh, NULL, NULL , NULL },
|
||||
/* 3m1 */ { bli_gemm3m1, bli_hemm3m1, bli_herk3m1, bli_her2k3m1, bli_symm3m1,
|
||||
bli_syrk3m1, bli_syr2k3m1, bli_trmm33m1, bli_trmm3m1, bli_trsm3m1 },
|
||||
bli_syrk3m1, bli_syr2k3m1, bli_trmm33m1, bli_trmm3m1, bli_trsm3m1 , NULL },
|
||||
/* 4mh */ { bli_gemm4mh, bli_hemm4mh, bli_herk4mh, bli_her2k4mh, bli_symm4mh,
|
||||
bli_syrk4mh, bli_syr2k4mh, bli_trmm34mh, NULL, NULL },
|
||||
bli_syrk4mh, bli_syr2k4mh, bli_trmm34mh, NULL, NULL , NULL },
|
||||
/* 4mb */ { bli_gemm4mb, NULL, NULL, NULL, NULL,
|
||||
NULL, NULL, NULL, NULL, NULL },
|
||||
NULL, NULL, NULL, NULL, NULL , NULL },
|
||||
/* 4m1 */ { bli_gemm4m1, bli_hemm4m1, bli_herk4m1, bli_her2k4m1, bli_symm4m1,
|
||||
bli_syrk4m1, bli_syr2k4m1, bli_trmm34m1, bli_trmm4m1, bli_trsm4m1 },
|
||||
bli_syrk4m1, bli_syr2k4m1, bli_trmm34m1, bli_trmm4m1, bli_trsm4m1 , NULL },
|
||||
/* 1m */ { bli_gemm1m, bli_hemm1m, bli_herk1m, bli_her2k1m, bli_symm1m,
|
||||
bli_syrk1m, bli_syr2k1m, bli_trmm31m, bli_trmm1m, bli_trsm1m },
|
||||
bli_syrk1m, bli_syr2k1m, bli_trmm31m, bli_trmm1m, bli_trsm1m , NULL },
|
||||
/* nat */ { bli_gemmnat, bli_hemmnat, bli_herknat, bli_her2knat, bli_symmnat,
|
||||
bli_syrknat, bli_syr2knat, bli_trmm3nat, bli_trmmnat, bli_trsmnat },
|
||||
bli_syrknat, bli_syr2knat, bli_trmm3nat, bli_trmmnat, bli_trsmnat , bli_gemmtnat },
|
||||
};
|
||||
|
||||
//
|
||||
@@ -99,6 +99,7 @@ bool_t PASTEMAC(opname,ind_has_avail)( num_t dt )
|
||||
*/
|
||||
|
||||
GENFUNC( gemm, BLIS_GEMM )
|
||||
GENFUNC( gemmt, BLIS_GEMMT )
|
||||
GENFUNC( hemm, BLIS_HEMM )
|
||||
GENFUNC( herk, BLIS_HERK )
|
||||
GENFUNC( her2k, BLIS_HER2K )
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -44,6 +45,7 @@ void_fp PASTEMAC(opname,ind_get_avail)( num_t dt );
|
||||
/*bool_t PASTEMAC(opname,ind_has_avail)( num_t dt ); */
|
||||
|
||||
GENPROT( gemm )
|
||||
GENPROT( gemmt )
|
||||
GENPROT( hemm )
|
||||
GENPROT( herk )
|
||||
GENPROT( her2k )
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -67,6 +67,7 @@ void PASTEMAC(opname,imeth) \
|
||||
}
|
||||
|
||||
GENFRONT( gemm, ind )
|
||||
GENFRONT( gemmt, ind )
|
||||
GENFRONT( her2k, ind )
|
||||
GENFRONT( syr2k, ind )
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -49,7 +50,8 @@ BLIS_EXPORT_BLIS void PASTEMAC(syrk,imeth) ( obj_t* alpha, obj_t* a
|
||||
BLIS_EXPORT_BLIS void PASTEMAC(syr2k,imeth)( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \
|
||||
BLIS_EXPORT_BLIS void PASTEMAC(trmm3,imeth)( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \
|
||||
BLIS_EXPORT_BLIS void PASTEMAC(trmm,imeth) ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm ); \
|
||||
BLIS_EXPORT_BLIS void PASTEMAC(trsm,imeth) ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm );
|
||||
BLIS_EXPORT_BLIS void PASTEMAC(trsm,imeth) ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm ); \
|
||||
BLIS_EXPORT_BLIS void PASTEMAC(gemmt,imeth) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); \
|
||||
|
||||
GENPROT( nat )
|
||||
GENPROT( ind )
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
|
||||
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -81,6 +81,7 @@ void PASTEMAC(opname,imeth) \
|
||||
// defined in the sandbox environment.
|
||||
#ifndef BLIS_ENABLE_SANDBOX
|
||||
GENFRONT( gemm, gemm, nat )
|
||||
GENFRONT( gemmt, gemm, nat )
|
||||
#endif
|
||||
GENFRONT( her2k, gemm, nat )
|
||||
GENFRONT( syr2k, gemm, nat )
|
||||
|
||||
@@ -180,6 +180,7 @@ blis: \
|
||||
test_scalv_blis.x \
|
||||
\
|
||||
test_gemm_blis.x \
|
||||
test_gemmt_blis.x \
|
||||
test_hemm_blis.x \
|
||||
test_herk_blis.x \
|
||||
test_her2k_blis.x \
|
||||
@@ -242,6 +243,7 @@ mkl: test_dotv_mkl.x \
|
||||
test_scalv_mkl.x \
|
||||
\
|
||||
test_gemm_mkl.x \
|
||||
test_gemmt_mkl.x \
|
||||
test_hemm_mkl.x \
|
||||
test_herk_mkl.x \
|
||||
test_her2k_mkl.x \
|
||||
|
||||
529
test/test_gemmt.c
Normal file
529
test/test_gemmt.c
Normal file
@@ -0,0 +1,529 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2020, Advanced Micro Devices, Inc.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas nor the names of its
|
||||
contributors may be used to endorse or promote products derived
|
||||
from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include <unistd.h>
|
||||
#include "blis.h"
|
||||
#include "cblas.h"
|
||||
|
||||
//#define FILE_IN_OUT
|
||||
//#define CBLAS
|
||||
//#define PRINT
|
||||
#define MATRIX_INITIALISATION
|
||||
int main( int argc, char** argv )
|
||||
{
|
||||
obj_t a, b, c;
|
||||
obj_t c_save;
|
||||
obj_t alpha, beta;
|
||||
dim_t n, k;
|
||||
num_t dt;
|
||||
int r, n_repeats;
|
||||
trans_t transa;
|
||||
trans_t transb;
|
||||
uplo_t uploc;
|
||||
#ifndef FILE_IN_OUT
|
||||
dim_t p;
|
||||
dim_t p_begin, p_end, p_inc;
|
||||
int n_input, k_input;
|
||||
#endif
|
||||
|
||||
double dtime;
|
||||
double dtime_save;
|
||||
double gflops;
|
||||
#ifdef FILE_IN_OUT
|
||||
FILE* fin = NULL;
|
||||
FILE* fout = NULL;
|
||||
|
||||
#endif
|
||||
//bli_init();
|
||||
|
||||
//bli_error_checking_level_set( BLIS_NO_ERROR_CHECKING );
|
||||
|
||||
n_repeats = 3;
|
||||
|
||||
#ifndef FILE_IN_OUT
|
||||
#ifndef PRINT
|
||||
p_begin = 48;
|
||||
p_end = 10000;
|
||||
p_inc = 192;
|
||||
|
||||
n_input = -1;
|
||||
k_input = -1;
|
||||
#else
|
||||
p_begin = 16;
|
||||
p_end = 16;
|
||||
p_inc = 1;
|
||||
|
||||
k_input = 50;
|
||||
n_input = 50;
|
||||
#endif
|
||||
#endif
|
||||
#if 1
|
||||
//dt = BLIS_FLOAT;
|
||||
dt = BLIS_DOUBLE;
|
||||
#else
|
||||
//dt = BLIS_SCOMPLEX;
|
||||
dt = BLIS_DCOMPLEX;
|
||||
#endif
|
||||
|
||||
transa = BLIS_NO_TRANSPOSE;
|
||||
transb = BLIS_NO_TRANSPOSE;
|
||||
|
||||
uploc = BLIS_UPPER;
|
||||
|
||||
#ifdef FILE_IN_OUT
|
||||
if (argc < 3)
|
||||
{
|
||||
printf("Usage: ./test_gemmt_XX.x input.csv output.csv\n");
|
||||
exit(1);
|
||||
}
|
||||
fin = fopen(argv[1], "r");
|
||||
if (fin == NULL)
|
||||
{
|
||||
printf("Error opening the file %s\n", argv[1]);
|
||||
exit(1);
|
||||
}
|
||||
fout = fopen(argv[2], "w");
|
||||
if (fout == NULL)
|
||||
{
|
||||
printf("Error opening output file %s\n", argv[2]);
|
||||
exit(1);
|
||||
}
|
||||
fprintf(fout, "n\t k\t lda\t ldb\t ldc\t gflops\n");
|
||||
|
||||
|
||||
printf("~~~~~~~~~~_BLAS\t n\t k\t lda\t ldb\t ldc \t gflops\n");
|
||||
|
||||
inc_t cs_a;
|
||||
inc_t cs_b;
|
||||
inc_t cs_c;
|
||||
|
||||
while (fscanf(fin, "%ld %ld %ld %ld %ld\n", &k, &n, &cs_a, &cs_b, &cs_c) == 5)
|
||||
{
|
||||
if ((n > cs_a) || (k > cs_b) || (n > cs_c)) continue; // leading dimension should be greater than number of rows
|
||||
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &alpha);
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &beta );
|
||||
|
||||
bli_obj_create( dt, n, k, 1, cs_a, &a );
|
||||
bli_obj_create( dt, k, n, 1, cs_b, &b );
|
||||
bli_obj_create( dt, n, n, 1, cs_c, &c );
|
||||
bli_obj_create( dt, n, n, 1, cs_c, &c_save );
|
||||
#ifdef MATRIX_INITIALISATION
|
||||
bli_randm( &a );
|
||||
bli_randm( &b );
|
||||
bli_randm( &c );
|
||||
#endif
|
||||
bli_obj_set_struc( BLIS_TRIANGULAR, &c );
|
||||
bli_obj_set_uplo( uploc, &c );
|
||||
|
||||
bli_obj_set_conjtrans( transa, &a);
|
||||
bli_obj_set_conjtrans( transb, &b);
|
||||
|
||||
//Randomize C and zero the unstored triangle to ensure the
|
||||
//implementation reads only from the stored region.
|
||||
bli_randm( &c );
|
||||
bli_mktrim( &c );
|
||||
|
||||
//bli_setsc( 0.0, -1, &alpha );
|
||||
//bli_setsc( 0.0, 1, &beta );
|
||||
|
||||
bli_setsc( 1, 0.0, &alpha );
|
||||
bli_setsc( 1, 0.0, &beta );
|
||||
|
||||
#else
|
||||
for ( p = p_begin; p <= p_end; p += p_inc )
|
||||
{
|
||||
if ( n_input < 0 ) n = p * ( dim_t )abs(n_input);
|
||||
else n = ( dim_t ) n_input;
|
||||
if ( k_input < 0 ) k = p * ( dim_t )abs(k_input);
|
||||
else k = ( dim_t ) k_input;
|
||||
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &alpha );
|
||||
bli_obj_create( dt, 1, 1, 0, 0, &beta );
|
||||
#ifdef CBLAS
|
||||
bli_obj_create( dt, n, k, k, 1, &a );
|
||||
bli_obj_create( dt, k, n, n, 1, &b );
|
||||
bli_obj_create( dt, n, n, n, 1, &c );
|
||||
bli_obj_create( dt, n, n, n, 1, &c_save );
|
||||
#else
|
||||
bli_obj_create( dt, n, k, 1, n, &a );
|
||||
bli_obj_create( dt, k, n, 1, k, &b );
|
||||
bli_obj_create( dt, n, n, 1, n, &c );
|
||||
bli_obj_create( dt, n, n, 1, n, &c_save );
|
||||
|
||||
#endif
|
||||
|
||||
bli_randm( &a );
|
||||
bli_randm( &b );
|
||||
bli_randm( &c );
|
||||
bli_obj_set_struc( BLIS_TRIANGULAR, &c );
|
||||
bli_obj_set_uplo( uploc, &c );
|
||||
|
||||
bli_obj_set_conjtrans( transa, &a );
|
||||
bli_obj_set_conjtrans( transb, &b );
|
||||
|
||||
//Randomize C and zero the unstored triangle to ensure the
|
||||
//implementation reads only from the stored region.
|
||||
|
||||
bli_randm( &c );
|
||||
bli_mktrim( &c );
|
||||
|
||||
bli_setsc( (0.9/1.0), 0.2, &alpha );
|
||||
bli_setsc( -(1.1/1.0), 0.3, &beta );
|
||||
|
||||
#endif
|
||||
bli_copym( &c, &c_save );
|
||||
|
||||
dtime_save = DBL_MAX;
|
||||
|
||||
for ( r = 0; r < n_repeats; ++r )
|
||||
{
|
||||
bli_copym( &c_save, &c );
|
||||
|
||||
|
||||
dtime = bli_clock();
|
||||
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "a", &a, "%4.1f", "," );
|
||||
bli_printm( "b", &b, "%4.1f", "," );
|
||||
bli_printm( "c", &c, "%4.1f", "," );
|
||||
#endif
|
||||
|
||||
#ifdef BLIS
|
||||
bli_gemmt( &alpha,
|
||||
&a,
|
||||
&b,
|
||||
&beta,
|
||||
&c );
|
||||
|
||||
#else
|
||||
|
||||
#ifdef CBLAS
|
||||
enum CBLAS_ORDER cblas_order;
|
||||
enum CBLAS_UPLO cblas_uplo;
|
||||
enum CBLAS_TRANSPOSE cblas_transa;
|
||||
enum CBLAS_TRANSPOSE cblas_transb;
|
||||
|
||||
if ( bli_obj_row_stride( &c ) == 1 )
|
||||
cblas_order = CblasColMajor;
|
||||
else
|
||||
cblas_order = CblasRowMajor;
|
||||
if( bli_is_upper( uploc ) )
|
||||
cblas_uplo = CblasUpper;
|
||||
else
|
||||
cblas_uplo = CblasLower;
|
||||
|
||||
if( bli_is_trans( transa ) )
|
||||
cblas_transa = CblasTrans;
|
||||
else if( bli_is_conjtrans( transa ) )
|
||||
cblas_transa = CblasConjTrans;
|
||||
else
|
||||
cblas_transa = CblasNoTrans;
|
||||
|
||||
if( bli_is_trans( transb ) )
|
||||
cblas_transb = CblasTrans;
|
||||
else if( bli_is_conjtrans( transb ) )
|
||||
cblas_transb = CblasConjTrans;
|
||||
else
|
||||
cblas_transb = CblasNoTrans;
|
||||
#else
|
||||
|
||||
f77_char f77_transa;
|
||||
f77_char f77_transb;
|
||||
f77_char f77_uploc;
|
||||
|
||||
bli_param_map_blis_to_netlib_trans( transa, &f77_transa );
|
||||
bli_param_map_blis_to_netlib_trans( transb, &f77_transb );
|
||||
bli_param_map_blis_to_netlib_uplo( uploc, &f77_uploc );
|
||||
#endif
|
||||
|
||||
if ( bli_is_float( dt ) )
|
||||
{
|
||||
#ifdef CBLAS
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_row_stride( &a );
|
||||
f77_int ldb = bli_obj_row_stride( &b );
|
||||
f77_int ldc = bli_obj_row_stride( &c );
|
||||
float* alphap = bli_obj_buffer( &alpha );
|
||||
float* ap = bli_obj_buffer( &a );
|
||||
float* bp = bli_obj_buffer( &b );
|
||||
float* betap = bli_obj_buffer( &beta );
|
||||
float* cp = bli_obj_buffer( &c );
|
||||
|
||||
cblas_sgemmt( cblas_order,
|
||||
cblas_uplo,
|
||||
cblas_transa,
|
||||
cblas_transb,
|
||||
nn,
|
||||
kk,
|
||||
*alphap,
|
||||
ap, lda,
|
||||
bp, ldb,
|
||||
*betap,
|
||||
cp, ldc );
|
||||
|
||||
#else
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
float* alphap = bli_obj_buffer( &alpha );
|
||||
float* ap = bli_obj_buffer( &a );
|
||||
float* bp = bli_obj_buffer( &b );
|
||||
float* betap = bli_obj_buffer( &beta );
|
||||
float* cp = bli_obj_buffer( &c );
|
||||
|
||||
sgemmt_( &f77_uploc,
|
||||
&f77_transa,
|
||||
&f77_transb,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
|
||||
#endif
|
||||
}
|
||||
else if ( bli_is_double( dt ) )
|
||||
{
|
||||
#ifdef CBLAS
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_row_stride( &a );
|
||||
f77_int ldb = bli_obj_row_stride( &b );
|
||||
f77_int ldc = bli_obj_row_stride( &c );
|
||||
double* alphap = bli_obj_buffer( &alpha );
|
||||
double* ap = bli_obj_buffer( &a );
|
||||
double* bp = bli_obj_buffer( &b );
|
||||
double* betap = bli_obj_buffer( &beta );
|
||||
double* cp = bli_obj_buffer( &c );
|
||||
|
||||
cblas_dgemmt( cblas_order,
|
||||
cblas_uplo,
|
||||
cblas_transa,
|
||||
cblas_transb,
|
||||
nn,
|
||||
kk,
|
||||
*alphap,
|
||||
ap,lda,
|
||||
bp, ldb,
|
||||
*betap,
|
||||
cp, ldc
|
||||
);
|
||||
#else
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
double* alphap = bli_obj_buffer( &alpha );
|
||||
double* ap = bli_obj_buffer( &a );
|
||||
double* bp = bli_obj_buffer( &b );
|
||||
double* betap = bli_obj_buffer( &beta );
|
||||
double* cp = bli_obj_buffer( &c );
|
||||
|
||||
dgemmt_( &f77_uploc,
|
||||
&f77_transa,
|
||||
&f77_transb,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
#endif
|
||||
}
|
||||
else if ( bli_is_scomplex( dt ) )
|
||||
{
|
||||
#ifdef CBLAS
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_row_stride( &a );
|
||||
f77_int ldb = bli_obj_row_stride( &b );
|
||||
f77_int ldc = bli_obj_row_stride( &c );
|
||||
scomplex* alphap = bli_obj_buffer( &alpha );
|
||||
scomplex* ap = bli_obj_buffer( &a );
|
||||
scomplex* bp = bli_obj_buffer( &b );
|
||||
scomplex* betap = bli_obj_buffer( &beta );
|
||||
scomplex* cp = bli_obj_buffer( &c );
|
||||
|
||||
cblas_cgemmt( cblas_order,
|
||||
cblas_uplo,
|
||||
cblas_transa,
|
||||
cblas_transb,
|
||||
nn,
|
||||
kk,
|
||||
alphap,
|
||||
ap, lda,
|
||||
bp, ldb,
|
||||
betap,
|
||||
cp, ldc );
|
||||
|
||||
#else
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
scomplex* alphap = bli_obj_buffer( &alpha );
|
||||
scomplex* ap = bli_obj_buffer( &a );
|
||||
scomplex* bp = bli_obj_buffer( &b );
|
||||
scomplex* betap = bli_obj_buffer( &beta );
|
||||
scomplex* cp = bli_obj_buffer( &c );
|
||||
|
||||
cgemmt_( &f77_uploc,
|
||||
&f77_transa,
|
||||
&f77_transb,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
|
||||
#endif
|
||||
}
|
||||
else if ( bli_is_dcomplex( dt ) )
|
||||
{
|
||||
#ifdef CBLAS
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_row_stride( &a );
|
||||
f77_int ldb = bli_obj_row_stride( &b );
|
||||
f77_int ldc = bli_obj_row_stride( &c );
|
||||
dcomplex* alphap = bli_obj_buffer( &alpha );
|
||||
dcomplex* ap = bli_obj_buffer( &a );
|
||||
dcomplex* bp = bli_obj_buffer( &b );
|
||||
dcomplex* betap = bli_obj_buffer( &beta );
|
||||
dcomplex* cp = bli_obj_buffer( &c );
|
||||
|
||||
cblas_zgemmt( cblas_order,
|
||||
cblas_uplo,
|
||||
cblas_transa,
|
||||
cblas_transb,
|
||||
nn,
|
||||
kk,
|
||||
alphap,
|
||||
ap, lda,
|
||||
bp, ldb,
|
||||
betap,
|
||||
cp, ldc );
|
||||
|
||||
#else
|
||||
|
||||
f77_int kk = bli_obj_width_after_trans( &a );
|
||||
f77_int nn = bli_obj_width( &c );
|
||||
f77_int lda = bli_obj_col_stride( &a );
|
||||
f77_int ldb = bli_obj_col_stride( &b );
|
||||
f77_int ldc = bli_obj_col_stride( &c );
|
||||
dcomplex* alphap = bli_obj_buffer( &alpha );
|
||||
dcomplex* ap = bli_obj_buffer( &a );
|
||||
dcomplex* bp = bli_obj_buffer( &b );
|
||||
dcomplex* betap = bli_obj_buffer( &beta );
|
||||
dcomplex* cp = bli_obj_buffer( &c );
|
||||
|
||||
zgemmt_( &f77_uploc,
|
||||
&f77_transa,
|
||||
&f77_transb,
|
||||
&nn,
|
||||
&kk,
|
||||
alphap,
|
||||
ap, &lda,
|
||||
bp, &ldb,
|
||||
betap,
|
||||
cp, &ldc );
|
||||
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef PRINT
|
||||
bli_printm( "c after", &c, "%4.1f", "" );
|
||||
exit(1);
|
||||
#endif
|
||||
|
||||
|
||||
dtime_save = bli_clock_min_diff( dtime_save, dtime );
|
||||
}
|
||||
|
||||
gflops = ( n * k * n ) / ( dtime_save * 1.0e9 );
|
||||
|
||||
if ( bli_is_complex( dt ) ) gflops *= 4.0;
|
||||
|
||||
#ifdef BLIS
|
||||
printf( "data_gemmt_blis" );
|
||||
#else
|
||||
printf( "data_gemmt_%s", BLAS );
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef FILE_IN_OUT
|
||||
printf("%4lu \t %4lu \t %4lu \t %4lu \t %4lu \t %6.3f\n", \
|
||||
( unsigned long )n,
|
||||
( unsigned long )k, (unsigned long)cs_a, (unsigned long)cs_b, (unsigned long)cs_c, gflops );
|
||||
|
||||
|
||||
fprintf(fout, "%4lu \t %4lu \t %4lu \t %4lu \t %4lu \t %6.3f\n", \
|
||||
( unsigned long )n,
|
||||
( unsigned long )k, (unsigned long)cs_a, (unsigned long)cs_b, (unsigned long)cs_c, gflops );
|
||||
fflush(fout);
|
||||
|
||||
#else
|
||||
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %7.2f ];\n",
|
||||
( unsigned long )(p - p_begin)/p_inc + 1,
|
||||
( unsigned long )n,
|
||||
( unsigned long )k, gflops );
|
||||
#endif
|
||||
bli_obj_free( &alpha );
|
||||
bli_obj_free( &beta );
|
||||
|
||||
bli_obj_free( &a );
|
||||
bli_obj_free( &b );
|
||||
bli_obj_free( &c );
|
||||
bli_obj_free( &c_save );
|
||||
}
|
||||
|
||||
//bli_finalize();
|
||||
#ifdef FILE_IN_OUT
|
||||
fclose(fin);
|
||||
fclose(fout);
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
Reference in New Issue
Block a user