Merge branch 'master' into 1m

This commit is contained in:
Field G. Van Zee
2017-02-21 17:06:16 -06:00
10 changed files with 188 additions and 34 deletions

View File

@@ -120,6 +120,12 @@
#endif
#define BLIS_DEFAULT_M_THREAD_RATIO 4
#define BLIS_DEFAULT_N_THREAD_RATIO 1
#define BLIS_DEFAULT_MR_THREAD_MAX 1
#define BLIS_DEFAULT_NR_THREAD_MAX 1
/*
#define BLIS_DEFAULT_MC_C 120
#define BLIS_DEFAULT_KC_C 240

4
configure vendored
View File

@@ -493,7 +493,7 @@ main()
fi
# Check the threading model flag.
# Check the threading model flag and standardize its value, if needed.
# NOTE: 'omp' is deprecated but still supported; 'openmp' is preferred.
enable_openmp='no'
enable_openmp_01=0
@@ -506,12 +506,14 @@ main()
echo "${script_name}: using OpenMP for threading."
enable_openmp='yes'
enable_openmp_01=1
threading_model="openmp" # Standardize the value.
elif [ "x${threading_model}" = "xpthreads" ] ||
[ "x${threading_model}" = "xpthread" ] ||
[ "x${threading_model}" = "xposix" ]; then
echo "${script_name}: using Pthreads for threading."
enable_pthreads='yes'
enable_pthreads_01=1
threading_model="pthreads" # Standardize the value.
elif [ "x${threading_model}" = "xno" ] ||
[ "x${threading_model}" = "xnone" ]; then
echo "${script_name}: threading is disabled."

View File

@@ -79,6 +79,11 @@
#include "bla_xerbla.h"
// -- Level-0 BLAS prototypes --
#include "bla_cabs1.h"
// -- Level-1 BLAS prototypes --
#include "bla_amax.h"

View File

@@ -0,0 +1,62 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifdef BLIS_ENABLE_BLAS2BLIS
/* scabs1.f -- translated by f2c (version 19991025).
You must link the resulting object file with the libraries:
-lf2c -lm (in that order)
*/
/*
   scabs1: BLAS-style cabs1 for a bla_scomplex argument.

   z  - pointer to the complex value to inspect; it is read, never written.

   Returns the sum of bli_fabs() applied to the real component (bli_creal)
   and to the imaginary component (bli_cimag) of *z.
*/
bla_real PASTEF77(s,cabs1)(bla_scomplex *z)
{
	/* Read the argument once; both component accessors use this copy. */
	bla_scomplex zval = *z;

	return bli_fabs( bli_creal( zval ) ) + bli_fabs( bli_cimag( zval ) );
} /* scabs1_ */
/* dcabs1.f -- translated by f2c (version 19991025).
You must link the resulting object file with the libraries:
-lf2c -lm (in that order)
*/
/*
   dcabs1: BLAS-style cabs1 for a bla_dcomplex argument.

   z  - pointer to the complex value to inspect; it is read, never written.

   Returns the sum of bli_fabs() applied to the real component (bli_zreal)
   and to the imaginary component (bli_zimag) of *z.
*/
bla_double PASTEF77(d,cabs1)(bla_dcomplex *z)
{
	/* Read the argument once; both component accessors use this copy. */
	bla_dcomplex zval = *z;

	return bli_fabs( bli_zreal( zval ) ) + bli_fabs( bli_zimag( zval ) );
} /* dcabs1_ */
#endif

View File

@@ -0,0 +1,40 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Prototypes for the cabs1 routines. They are declared only when
   BLIS_ENABLE_BLAS2BLIS is defined. The symbol names are produced by the
   PASTEF77 macro from the type prefix and the "cabs1" base name. */
#ifdef BLIS_ENABLE_BLAS2BLIS
/* Takes a pointer to a bla_scomplex and returns a bla_real. */
bla_real PASTEF77(s,cabs1)(bla_scomplex *z);
/* Takes a pointer to a bla_dcomplex and returns a bla_double. */
bla_double PASTEF77(d,cabs1)(bla_dcomplex *z);
#endif

View File

@@ -57,7 +57,8 @@ void bli_sgemm_opt_8x12(
float* restrict b,
float* restrict beta,
float* restrict c, inc_t rs_c, inc_t cs_c,
auxinfo_t* data
auxinfo_t* data,
cntx_t* restrict cntx
)
{
void* a_next = bli_auxinfo_next_a( data );
@@ -1106,7 +1107,8 @@ void bli_dgemm_opt_6x8(
double* restrict b,
double* restrict beta,
double* restrict c, inc_t rs_c, inc_t cs_c,
auxinfo_t* data
auxinfo_t* data,
cntx_t* restrict cntx
)
{
void* a_next = bli_auxinfo_next_a( data );
@@ -2075,7 +2077,8 @@ void bli_cgemm_opt_4x4(
scomplex* restrict b,
scomplex* restrict beta,
scomplex* restrict c, inc_t rs_c, inc_t cs_c,
auxinfo_t* data
auxinfo_t* data,
cntx_t* restrict cntx
)
{
/* Just call the reference implementation. */
@@ -2085,7 +2088,8 @@ void bli_cgemm_opt_4x4(
b,
beta,
c, rs_c, cs_c,
data );
data,
cntx );
}
void bli_zgemm_opt_4x4(
@@ -2095,7 +2099,8 @@ void bli_zgemm_opt_4x4(
dcomplex* restrict b,
dcomplex* restrict beta,
dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
auxinfo_t* data
auxinfo_t* data,
cntx_t* restrict cntx
)
{
/* Just call the reference implementation. */
@@ -2105,6 +2110,7 @@ void bli_zgemm_opt_4x4(
b,
beta,
c, rs_c, cs_c,
data );
data,
cntx );
}

View File

@@ -1,6 +1,6 @@
/*
BLIS
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
@@ -105,16 +105,22 @@ extern int32_t offsets[24];
void bli_dpackm_8xk_opt
(
conj_t conja,
dim_t n,
dim_t n_,
void* restrict kappa_,
void* restrict a_, inc_t inca, inc_t lda,
void* restrict p_, inc_t ldp
void* restrict a_, inc_t inca_, inc_t lda_,
void* restrict p_, inc_t ldp_
)
{
(void)conja;
const int32_t * offsetPtr = &offsets[0];
double* a = (double*)a_;
double* p = (double*)p_;
double* kappa = (double*)kappa_;
const int64_t n = n_;
const int64_t inca = inca_;
const int64_t lda = lda_;
const int64_t ldp = ldp_;
__asm__ volatile
(
@@ -291,16 +297,22 @@ void bli_dpackm_8xk_opt
void bli_dpackm_24xk_opt
(
conj_t conja,
dim_t n,
dim_t n_,
void* restrict kappa_,
void* restrict a_, inc_t inca, inc_t lda,
void* restrict p_, inc_t ldp
void* restrict a_, inc_t inca_, inc_t lda_,
void* restrict p_, inc_t ldp_
)
{
(void)conja;
const int32_t * offsetPtr = &offsets[0];
double* a = (double*)a_;
double* p = (double*)p_;
double* kappa = (double*)kappa_;
const int64_t n = n_;
const int64_t inca = inca_;
const int64_t lda = lda_;
const int64_t ldp = ldp_;
__asm__ volatile
(

View File

@@ -1,6 +1,6 @@
/*
BLIS
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
@@ -133,16 +133,22 @@ extern int32_t offsets[32];
void bli_dpackm_30xk_opt
(
conj_t conja,
dim_t n,
dim_t n_,
void* restrict kappa_,
void* restrict a_, inc_t inca, inc_t lda,
void* restrict p_, inc_t ldp
void* restrict a_, inc_t inca_, inc_t lda_,
void* restrict p_, inc_t ldp_
)
{
(void)conja;
const int32_t * offsetPtr = &offsets[0];
double* a = (double*)a_;
double* p = (double*)p_;
double* kappa = (double*)kappa_;
const int64_t n = n_;
const int64_t inca = inca_;
const int64_t lda = lda_;
const int64_t ldp = ldp_;
__asm__ volatile
(

View File

@@ -1,6 +1,6 @@
/*
BLIS
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
@@ -181,22 +181,26 @@ extern int32_t offsets[24];
//#define MONITORS
//#define LOOPMON
void bli_dgemm_opt_24x8(
dim_t k,
dim_t k_,
double* restrict alpha,
double* restrict a,
double* restrict b,
double* restrict beta,
double* restrict c, inc_t rs_c, inc_t cs_c,
double* restrict c, inc_t rs_c_, inc_t cs_c_,
auxinfo_t* data,
cntx_t* restrict cntx
)
{
(void)data;
(void)cntx;
const double * a_next = bli_auxinfo_next_a( data );
const double * b_next = bli_auxinfo_next_b( data );
const int32_t * offsetPtr = &offsets[0];
uint64_t k64 = k;
const int64_t k = k_;
const int64_t rs_c = rs_c_;
const int64_t cs_c = cs_c_;
#ifdef MONITORS
int toph, topl, both, botl, midl, midh, mid2l, mid2h;
@@ -204,7 +208,7 @@ void bli_dgemm_opt_24x8(
#ifdef LOOPMON
int tlooph, tloopl, blooph, bloopl;
#endif
__asm__ volatile
(
#ifdef MONITORS
@@ -219,26 +223,26 @@ void bli_dgemm_opt_24x8(
VMOVAPS(ZMM(11), ZMM(8)) MOV(RAX, VAR(a)) //load address of a
VMOVAPS(ZMM(12), ZMM(8)) MOV(RBX, VAR(b)) //load address of b
VMOVAPS(ZMM(13), ZMM(8)) MOV(RCX, VAR(c)) //load address of c
VMOVAPS(ZMM(14), ZMM(8)) VMOVAPD(ZMM(0), MEM(RBX)) //pre-load b
VMOVAPS(ZMM(14), ZMM(8))
VMOVAPS(ZMM(15), ZMM(8)) MOV(RDI, VAR(offsetPtr))
VMOVAPS(ZMM(16), ZMM(8)) VMOVAPS(ZMM(4), MEM(RDI))
#if SCATTER_PREFETCH_C
VMOVAPS(ZMM(17), ZMM(8))
VMOVAPS(ZMM(18), ZMM(8))
VMOVAPS(ZMM(17), ZMM(8))
VMOVAPS(ZMM(18), ZMM(8))
VMOVAPS(ZMM(19), ZMM(8)) VBROADCASTSS(ZMM(5), VAR(rs_c))
VMOVAPS(ZMM(20), ZMM(8))
VMOVAPS(ZMM(20), ZMM(8))
VMOVAPS(ZMM(21), ZMM(8)) VPMULLD(ZMM(2), ZMM(4), ZMM(5))
VMOVAPS(ZMM(22), ZMM(8)) VMOVAPS(YMM(3), MEM(RDI,64))
VMOVAPS(ZMM(23), ZMM(8)) VPMULLD(YMM(3), YMM(3), YMM(5))
#else
VMOVAPS(ZMM(17), ZMM(8))
VMOVAPS(ZMM(17), ZMM(8))
VMOVAPS(ZMM(18), ZMM(8)) LEA(R13, MEM(R12,R12,2))
VMOVAPS(ZMM(19), ZMM(8)) LEA(R14, MEM(R12,R12,4))
VMOVAPS(ZMM(20), ZMM(8)) LEA(R15, MEM(R13,R12,4))
VMOVAPS(ZMM(21), ZMM(8))
VMOVAPS(ZMM(22), ZMM(8))
VMOVAPS(ZMM(23), ZMM(8))
#endif
#endif
VMOVAPS(ZMM(24), ZMM(8)) VPSLLD(ZMM(4), ZMM(4), IMM(3))
VMOVAPS(ZMM(25), ZMM(8)) MOV(R8, IMM(4*24*8)) //offset for 4 iterations
VMOVAPS(ZMM(26), ZMM(8)) LEA(R9, MEM(R8,R8,2)) //*3
@@ -254,6 +258,11 @@ void bli_dgemm_opt_24x8(
MOV(VAR(midh), EDX)
#endif
TEST(RSI, RSI)
JZ(POSTACCUM)
VMOVAPD(ZMM(0), MEM(RBX)) //pre-load b
SUB(RSI, IMM(32))
JLE(TAIL)
@@ -670,7 +679,7 @@ void bli_dgemm_opt_24x8(
[both] "=m" (both)
#endif
: // input operands
[k] "m" (k64),
[k] "m" (k),
[a] "m" (a),
[b] "m" (b),
[alpha] "m" (alpha),

View File

@@ -1,6 +1,6 @@
/*
BLIS
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
@@ -166,20 +166,26 @@ int32_t offsets[32] __attribute__((aligned(0x1000))) = { 0, 1, 2, 3, 4, 5,
//#define MONITORS
//#define LOOPMON
void bli_sgemm_opt_30x16_knc(
dim_t k,
dim_t k_,
float* restrict alpha,
float* restrict a,
float* restrict b,
float* restrict beta,
float* restrict c, inc_t rs_c, inc_t cs_c,
float* restrict c, inc_t rs_c_, inc_t cs_c_,
auxinfo_t* data,
cntx_t* restrict cntx
)
{
(void)data;
(void)cntx;
const float * a_next = bli_auxinfo_next_a( data );
const float * b_next = bli_auxinfo_next_b( data );
const int32_t * offsetPtr = &offsets[0];
const int64_t k = k_;
const int64_t rs_c = rs_c_;
const int64_t cs_c = cs_c_;
#ifdef MONITORS
int toph, topl, both, botl, midl, midh, mid2l, mid2h;