mirror of
https://github.com/amd/blis.git
synced 2026-05-12 01:59:59 +00:00
Merge branch 'master' into 1m
This commit is contained in:
@@ -120,6 +120,12 @@
|
||||
|
||||
#endif
|
||||
|
||||
#define BLIS_DEFAULT_M_THREAD_RATIO 4
|
||||
#define BLIS_DEFAULT_N_THREAD_RATIO 1
|
||||
|
||||
#define BLIS_DEFAULT_MR_THREAD_MAX 1
|
||||
#define BLIS_DEFAULT_NR_THREAD_MAX 1
|
||||
|
||||
/*
|
||||
#define BLIS_DEFAULT_MC_C 120
|
||||
#define BLIS_DEFAULT_KC_C 240
|
||||
|
||||
4
configure
vendored
4
configure
vendored
@@ -493,7 +493,7 @@ main()
|
||||
fi
|
||||
|
||||
|
||||
# Check the threading model flag.
|
||||
# Check the threading model flag and standardize its value, if needed.
|
||||
# NOTE: 'omp' is deprecated but still supported; 'openmp' is preferred.
|
||||
enable_openmp='no'
|
||||
enable_openmp_01=0
|
||||
@@ -506,12 +506,14 @@ main()
|
||||
echo "${script_name}: using OpenMP for threading."
|
||||
enable_openmp='yes'
|
||||
enable_openmp_01=1
|
||||
threading_model="openmp" # Standardize the value.
|
||||
elif [ "x${threading_model}" = "xpthreads" ] ||
|
||||
[ "x${threading_model}" = "xpthread" ] ||
|
||||
[ "x${threading_model}" = "xposix" ]; then
|
||||
echo "${script_name}: using Pthreads for threading."
|
||||
enable_pthreads='yes'
|
||||
enable_pthreads_01=1
|
||||
threading_model="pthreads" # Standardize the value.
|
||||
elif [ "x${threading_model}" = "xno" ] ||
|
||||
[ "x${threading_model}" = "xnone" ]; then
|
||||
echo "${script_name}: threading is disabled."
|
||||
|
||||
@@ -79,6 +79,11 @@
|
||||
#include "bla_xerbla.h"
|
||||
|
||||
|
||||
// -- Level-0 BLAS prototypes --
|
||||
|
||||
#include "bla_cabs1.h"
|
||||
|
||||
|
||||
// -- Level-1 BLAS prototypes --
|
||||
|
||||
#include "bla_amax.h"
|
||||
|
||||
62
frame/compat/f2c/bla_cabs1.c
Normal file
62
frame/compat/f2c/bla_cabs1.c
Normal file
@@ -0,0 +1,62 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS2BLIS
|
||||
|
||||
/* scabs1.f -- translated by f2c (version 19991025).
|
||||
You must link the resulting object file with the libraries:
|
||||
-lf2c -lm (in that order)
|
||||
*/
|
||||
|
||||
/* Single-precision complex "cabs1" compatibility routine (the trailing
   comment names it scabs1_). Dereferences z once per component and
   returns bli_fabs( bli_creal( *z ) ) + bli_fabs( bli_cimag( *z ) ).
   NOTE(review): presumably this is |Re(z)| + |Im(z)| as in the reference
   BLAS SCABS1 -- confirm against the bli_fabs/bli_creal/bli_cimag
   definitions, which are not visible here. z is not checked for NULL. */
/* Subroutine */ bla_real PASTEF77(s,cabs1)(bla_scomplex *z)
|
||||
{
|
||||
return bli_fabs( bli_creal( *z ) ) +
|
||||
bli_fabs( bli_cimag( *z ) );
|
||||
} /* scabs1_ */
|
||||
|
||||
/* dcabs1.f -- translated by f2c (version 19991025).
|
||||
You must link the resulting object file with the libraries:
|
||||
-lf2c -lm (in that order)
|
||||
*/
|
||||
|
||||
/* Double-precision complex "cabs1" compatibility routine (the trailing
   comment names it dcabs1_). Dereferences z once per component and
   returns bli_fabs( bli_zreal( *z ) ) + bli_fabs( bli_zimag( *z ) ).
   NOTE(review): presumably this is |Re(z)| + |Im(z)| as in the reference
   BLAS DCABS1 -- confirm against the bli_fabs/bli_zreal/bli_zimag
   definitions, which are not visible here. z is not checked for NULL. */
/* Subroutine */ bla_double PASTEF77(d,cabs1)(bla_dcomplex *z)
|
||||
{
|
||||
return bli_fabs( bli_zreal( *z ) ) +
|
||||
bli_fabs( bli_zimag( *z ) );
|
||||
} /* dcabs1_ */
|
||||
|
||||
#endif
|
||||
|
||||
40
frame/compat/f2c/bla_cabs1.h
Normal file
40
frame/compat/f2c/bla_cabs1.h
Normal file
@@ -0,0 +1,40 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifdef BLIS_ENABLE_BLAS2BLIS
|
||||
|
||||
/* Prototypes for the cabs1 compatibility routines; each takes a pointer
   to one complex value and returns a real of the matching precision.
   Only declared when BLIS_ENABLE_BLAS2BLIS is defined (see enclosing
   #ifdef). */
bla_real PASTEF77(s,cabs1)(bla_scomplex *z);
|
||||
bla_double PASTEF77(d,cabs1)(bla_dcomplex *z);
|
||||
|
||||
#endif
|
||||
@@ -57,7 +57,8 @@ void bli_sgemm_opt_8x12(
|
||||
float* restrict b,
|
||||
float* restrict beta,
|
||||
float* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
auxinfo_t* data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
void* a_next = bli_auxinfo_next_a( data );
|
||||
@@ -1106,7 +1107,8 @@ void bli_dgemm_opt_6x8(
|
||||
double* restrict b,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
auxinfo_t* data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
void* a_next = bli_auxinfo_next_a( data );
|
||||
@@ -2075,7 +2077,8 @@ void bli_cgemm_opt_4x4(
|
||||
scomplex* restrict b,
|
||||
scomplex* restrict beta,
|
||||
scomplex* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
auxinfo_t* data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
@@ -2085,7 +2088,8 @@ void bli_cgemm_opt_4x4(
|
||||
b,
|
||||
beta,
|
||||
c, rs_c, cs_c,
|
||||
data );
|
||||
data,
|
||||
cntx );
|
||||
}
|
||||
|
||||
void bli_zgemm_opt_4x4(
|
||||
@@ -2095,7 +2099,8 @@ void bli_zgemm_opt_4x4(
|
||||
dcomplex* restrict b,
|
||||
dcomplex* restrict beta,
|
||||
dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
auxinfo_t* data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
/* Just call the reference implementation. */
|
||||
@@ -2105,6 +2110,7 @@ void bli_zgemm_opt_4x4(
|
||||
b,
|
||||
beta,
|
||||
c, rs_c, cs_c,
|
||||
data );
|
||||
data,
|
||||
cntx );
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
@@ -105,16 +105,22 @@ extern int32_t offsets[24];
|
||||
void bli_dpackm_8xk_opt
|
||||
(
|
||||
conj_t conja,
|
||||
dim_t n,
|
||||
dim_t n_,
|
||||
void* restrict kappa_,
|
||||
void* restrict a_, inc_t inca, inc_t lda,
|
||||
void* restrict p_, inc_t ldp
|
||||
void* restrict a_, inc_t inca_, inc_t lda_,
|
||||
void* restrict p_, inc_t ldp_
|
||||
)
|
||||
{
|
||||
(void)conja;
|
||||
|
||||
const int32_t * offsetPtr = &offsets[0];
|
||||
double* a = (double*)a_;
|
||||
double* p = (double*)p_;
|
||||
double* kappa = (double*)kappa_;
|
||||
const int64_t n = n_;
|
||||
const int64_t inca = inca_;
|
||||
const int64_t lda = lda_;
|
||||
const int64_t ldp = ldp_;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
@@ -291,16 +297,22 @@ void bli_dpackm_8xk_opt
|
||||
void bli_dpackm_24xk_opt
|
||||
(
|
||||
conj_t conja,
|
||||
dim_t n,
|
||||
dim_t n_,
|
||||
void* restrict kappa_,
|
||||
void* restrict a_, inc_t inca, inc_t lda,
|
||||
void* restrict p_, inc_t ldp
|
||||
void* restrict a_, inc_t inca_, inc_t lda_,
|
||||
void* restrict p_, inc_t ldp_
|
||||
)
|
||||
{
|
||||
(void)conja;
|
||||
|
||||
const int32_t * offsetPtr = &offsets[0];
|
||||
double* a = (double*)a_;
|
||||
double* p = (double*)p_;
|
||||
double* kappa = (double*)kappa_;
|
||||
const int64_t n = n_;
|
||||
const int64_t inca = inca_;
|
||||
const int64_t lda = lda_;
|
||||
const int64_t ldp = ldp_;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
@@ -133,16 +133,22 @@ extern int32_t offsets[32];
|
||||
void bli_dpackm_30xk_opt
|
||||
(
|
||||
conj_t conja,
|
||||
dim_t n,
|
||||
dim_t n_,
|
||||
void* restrict kappa_,
|
||||
void* restrict a_, inc_t inca, inc_t lda,
|
||||
void* restrict p_, inc_t ldp
|
||||
void* restrict a_, inc_t inca_, inc_t lda_,
|
||||
void* restrict p_, inc_t ldp_
|
||||
)
|
||||
{
|
||||
(void)conja;
|
||||
|
||||
const int32_t * offsetPtr = &offsets[0];
|
||||
double* a = (double*)a_;
|
||||
double* p = (double*)p_;
|
||||
double* kappa = (double*)kappa_;
|
||||
const int64_t n = n_;
|
||||
const int64_t inca = inca_;
|
||||
const int64_t lda = lda_;
|
||||
const int64_t ldp = ldp_;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
@@ -181,22 +181,26 @@ extern int32_t offsets[24];
|
||||
//#define MONITORS
|
||||
//#define LOOPMON
|
||||
void bli_dgemm_opt_24x8(
|
||||
dim_t k,
|
||||
dim_t k_,
|
||||
double* restrict alpha,
|
||||
double* restrict a,
|
||||
double* restrict b,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
double* restrict c, inc_t rs_c_, inc_t cs_c_,
|
||||
auxinfo_t* data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
(void)data;
|
||||
(void)cntx;
|
||||
|
||||
const double * a_next = bli_auxinfo_next_a( data );
|
||||
const double * b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
const int32_t * offsetPtr = &offsets[0];
|
||||
|
||||
uint64_t k64 = k;
|
||||
const int64_t k = k_;
|
||||
const int64_t rs_c = rs_c_;
|
||||
const int64_t cs_c = cs_c_;
|
||||
|
||||
#ifdef MONITORS
|
||||
int toph, topl, both, botl, midl, midh, mid2l, mid2h;
|
||||
@@ -204,7 +208,7 @@ void bli_dgemm_opt_24x8(
|
||||
#ifdef LOOPMON
|
||||
int tlooph, tloopl, blooph, bloopl;
|
||||
#endif
|
||||
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
#ifdef MONITORS
|
||||
@@ -219,26 +223,26 @@ void bli_dgemm_opt_24x8(
|
||||
VMOVAPS(ZMM(11), ZMM(8)) MOV(RAX, VAR(a)) //load address of a
|
||||
VMOVAPS(ZMM(12), ZMM(8)) MOV(RBX, VAR(b)) //load address of b
|
||||
VMOVAPS(ZMM(13), ZMM(8)) MOV(RCX, VAR(c)) //load address of c
|
||||
VMOVAPS(ZMM(14), ZMM(8)) VMOVAPD(ZMM(0), MEM(RBX)) //pre-load b
|
||||
VMOVAPS(ZMM(14), ZMM(8))
|
||||
VMOVAPS(ZMM(15), ZMM(8)) MOV(RDI, VAR(offsetPtr))
|
||||
VMOVAPS(ZMM(16), ZMM(8)) VMOVAPS(ZMM(4), MEM(RDI))
|
||||
#if SCATTER_PREFETCH_C
|
||||
VMOVAPS(ZMM(17), ZMM(8))
|
||||
VMOVAPS(ZMM(18), ZMM(8))
|
||||
VMOVAPS(ZMM(17), ZMM(8))
|
||||
VMOVAPS(ZMM(18), ZMM(8))
|
||||
VMOVAPS(ZMM(19), ZMM(8)) VBROADCASTSS(ZMM(5), VAR(rs_c))
|
||||
VMOVAPS(ZMM(20), ZMM(8))
|
||||
VMOVAPS(ZMM(20), ZMM(8))
|
||||
VMOVAPS(ZMM(21), ZMM(8)) VPMULLD(ZMM(2), ZMM(4), ZMM(5))
|
||||
VMOVAPS(ZMM(22), ZMM(8)) VMOVAPS(YMM(3), MEM(RDI,64))
|
||||
VMOVAPS(ZMM(23), ZMM(8)) VPMULLD(YMM(3), YMM(3), YMM(5))
|
||||
#else
|
||||
VMOVAPS(ZMM(17), ZMM(8))
|
||||
VMOVAPS(ZMM(17), ZMM(8))
|
||||
VMOVAPS(ZMM(18), ZMM(8)) LEA(R13, MEM(R12,R12,2))
|
||||
VMOVAPS(ZMM(19), ZMM(8)) LEA(R14, MEM(R12,R12,4))
|
||||
VMOVAPS(ZMM(20), ZMM(8)) LEA(R15, MEM(R13,R12,4))
|
||||
VMOVAPS(ZMM(21), ZMM(8))
|
||||
VMOVAPS(ZMM(22), ZMM(8))
|
||||
VMOVAPS(ZMM(23), ZMM(8))
|
||||
#endif
|
||||
#endif
|
||||
VMOVAPS(ZMM(24), ZMM(8)) VPSLLD(ZMM(4), ZMM(4), IMM(3))
|
||||
VMOVAPS(ZMM(25), ZMM(8)) MOV(R8, IMM(4*24*8)) //offset for 4 iterations
|
||||
VMOVAPS(ZMM(26), ZMM(8)) LEA(R9, MEM(R8,R8,2)) //*3
|
||||
@@ -254,6 +258,11 @@ void bli_dgemm_opt_24x8(
|
||||
MOV(VAR(midh), EDX)
|
||||
#endif
|
||||
|
||||
TEST(RSI, RSI)
|
||||
JZ(POSTACCUM)
|
||||
|
||||
VMOVAPD(ZMM(0), MEM(RBX)) //pre-load b
|
||||
|
||||
SUB(RSI, IMM(32))
|
||||
JLE(TAIL)
|
||||
|
||||
@@ -670,7 +679,7 @@ void bli_dgemm_opt_24x8(
|
||||
[both] "=m" (both)
|
||||
#endif
|
||||
: // input operands
|
||||
[k] "m" (k64),
|
||||
[k] "m" (k),
|
||||
[a] "m" (a),
|
||||
[b] "m" (b),
|
||||
[alpha] "m" (alpha),
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
@@ -166,20 +166,26 @@ int32_t offsets[32] __attribute__((aligned(0x1000))) = { 0, 1, 2, 3, 4, 5,
|
||||
//#define MONITORS
|
||||
//#define LOOPMON
|
||||
void bli_sgemm_opt_30x16_knc(
|
||||
dim_t k,
|
||||
dim_t k_,
|
||||
float* restrict alpha,
|
||||
float* restrict a,
|
||||
float* restrict b,
|
||||
float* restrict beta,
|
||||
float* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
float* restrict c, inc_t rs_c_, inc_t cs_c_,
|
||||
auxinfo_t* data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
(void)data;
|
||||
(void)cntx;
|
||||
|
||||
const float * a_next = bli_auxinfo_next_a( data );
|
||||
const float * b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
const int32_t * offsetPtr = &offsets[0];
|
||||
const int64_t k = k_;
|
||||
const int64_t rs_c = rs_c_;
|
||||
const int64_t cs_c = cs_c_;
|
||||
|
||||
#ifdef MONITORS
|
||||
int toph, topl, both, botl, midl, midh, mid2l, mid2h;
|
||||
|
||||
Reference in New Issue
Block a user