From 7f31a6307b7bd35f913c895947552c3a176f789b Mon Sep 17 00:00:00 2001 From: Francisco Igual Date: Sun, 27 Nov 2016 14:40:47 +0100 Subject: [PATCH 1/6] Fixed missing cntx argument in ARMv8 microkernels. --- kernels/armv8a/3/bli_gemm_opt_4x4.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/kernels/armv8a/3/bli_gemm_opt_4x4.c b/kernels/armv8a/3/bli_gemm_opt_4x4.c index 6199e461c..992750b93 100644 --- a/kernels/armv8a/3/bli_gemm_opt_4x4.c +++ b/kernels/armv8a/3/bli_gemm_opt_4x4.c @@ -57,7 +57,8 @@ void bli_sgemm_opt_8x12( float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data + auxinfo_t* data, + cntx_t* restrict cntx ) { void* a_next = bli_auxinfo_next_a( data ); @@ -1106,7 +1107,8 @@ void bli_dgemm_opt_6x8( double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data + auxinfo_t* data, + cntx_t* restrict cntx ) { void* a_next = bli_auxinfo_next_a( data ); @@ -2075,7 +2077,8 @@ void bli_cgemm_opt_4x4( scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data + auxinfo_t* data, + cntx_t* restrict cntx ) { /* Just call the reference implementation. */ @@ -2085,7 +2088,8 @@ void bli_cgemm_opt_4x4( b, beta, c, rs_c, cs_c, - data ); + data, + cntx ); } void bli_zgemm_opt_4x4( @@ -2095,7 +2099,8 @@ void bli_zgemm_opt_4x4( dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* data + auxinfo_t* data, + cntx_t* restrict cntx ) { /* Just call the reference implementation. */ @@ -2105,6 +2110,7 @@ void bli_zgemm_opt_4x4( b, beta, c, rs_c, cs_c, - data ); + data, + cntx ); } From 78e1b16e16d589ed31b2e712115ee282097f114d Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 27 Jan 2017 14:22:20 -0600 Subject: [PATCH 2/6] Change default threading parameters for KNL. --- config/knl/bli_kernel.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/config/knl/bli_kernel.h b/config/knl/bli_kernel.h index 677c46a16..e32954973 100644 --- a/config/knl/bli_kernel.h +++ b/config/knl/bli_kernel.h @@ -120,6 +120,12 @@ #endif +#define BLIS_DEFAULT_M_THREAD_RATIO 4 +#define BLIS_DEFAULT_N_THREAD_RATIO 1 + +#define BLIS_DEFAULT_MR_THREAD_MAX 1 +#define BLIS_DEFAULT_NR_THREAD_MAX 1 + /* #define BLIS_DEFAULT_MC_C 120 #define BLIS_DEFAULT_KC_C 240 From 018180c938c32efbeaaf626ba71ec5b780664db1 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 8 Feb 2017 11:20:52 -0600 Subject: [PATCH 3/6] Fixed a minor bug in configure (issue #114). Details: - Fixed a bug in the configure script whereby a non-preferred value for --enable-threading would cause problems in common.mk vis-a-vis detecting which threading model was chosen. Thanks to heroxbd for reporting this issue. --- configure | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/configure b/configure index 3a1e296a7..2358575f6 100755 --- a/configure +++ b/configure @@ -493,7 +493,7 @@ main() fi - # Check the threading model flag. + # Check the threading model flag and standardize its value, if needed. # NOTE: 'omp' is deprecated but still supported; 'openmp' is preferred. enable_openmp='no' enable_openmp_01=0 @@ -506,12 +506,14 @@ main() echo "${script_name}: using OpenMP for threading." enable_openmp='yes' enable_openmp_01=1 + threading_model="openmp" # Standardize the value. elif [ "x${threading_model}" = "xpthreads" ] || [ "x${threading_model}" = "xpthread" ] || [ "x${threading_model}" = "xposix" ]; then echo "${script_name}: using Pthreads for threading." enable_pthreads='yes' enable_pthreads_01=1 + threading_model="pthreads" # Standardize the value. elif [ "x${threading_model}" = "xno" ] || [ "x${threading_model}" = "xnone" ]; then echo "${script_name}: threading is disabled." From c362afc525bab4050581d1b0fcea2fe4d582c608 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Thu, 9 Feb 2017 11:54:59 -0600 Subject: [PATCH 4/6] Added missing "level-0" BLAS [sd]cabs1_(). Details: - Fixed issue #115 by adding implementations for scabs1_() and dcabs1_() to the BLAS compatibility layer. Thanks to heroxbd for pointing out their absence. --- frame/compat/bli_blas.h | 5 +++ frame/compat/f2c/bla_cabs1.c | 62 ++++++++++++++++++++++++++++++++++++ frame/compat/f2c/bla_cabs1.h | 40 +++++++++++++++++++++++ 3 files changed, 107 insertions(+) create mode 100644 frame/compat/f2c/bla_cabs1.c create mode 100644 frame/compat/f2c/bla_cabs1.h diff --git a/frame/compat/bli_blas.h b/frame/compat/bli_blas.h index 5583e456b..242a03b7a 100644 --- a/frame/compat/bli_blas.h +++ b/frame/compat/bli_blas.h @@ -79,6 +79,11 @@ #include "bla_xerbla.h" +// -- Level-0 BLAS prototypes -- + +#include "bla_cabs1.h" + + // -- Level-1 BLAS prototypes -- #include "bla_amax.h" diff --git a/frame/compat/f2c/bla_cabs1.c b/frame/compat/f2c/bla_cabs1.c new file mode 100644 index 000000000..20db46262 --- /dev/null +++ b/frame/compat/f2c/bla_cabs1.c @@ -0,0 +1,62 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_BLAS2BLIS + +/* scabs1.f -- translated by f2c (version 19991025). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + +/* Subroutine */ bla_real PASTEF77(s,cabs1)(bla_scomplex *z) +{ + return bli_fabs( bli_creal( *z ) ) + + bli_fabs( bli_cimag( *z ) ); +} /* scabs1_ */ + +/* dcabs1.f -- translated by f2c (version 19991025). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + +/* Subroutine */ bla_double PASTEF77(d,cabs1)(bla_dcomplex *z) +{ + return bli_fabs( bli_zreal( *z ) ) + + bli_fabs( bli_zimag( *z ) ); +} /* dcabs1_ */ + +#endif + diff --git a/frame/compat/f2c/bla_cabs1.h b/frame/compat/f2c/bla_cabs1.h new file mode 100644 index 000000000..a8c980d72 --- /dev/null +++ b/frame/compat/f2c/bla_cabs1.h @@ -0,0 +1,40 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifdef BLIS_ENABLE_BLAS2BLIS + +bla_real PASTEF77(s,cabs1)(bla_scomplex *z); +bla_double PASTEF77(d,cabs1)(bla_dcomplex *z); + +#endif From 7d42fc0796ef0c010375fd8e59b1240ba41ce4d2 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sun, 19 Feb 2017 21:10:55 -0500 Subject: [PATCH 5/6] Cast dim_t and inc_t parameters to 64-bit in KNL microkernels. --- kernels/x86_64/knl/1m/bli_packm_opt_24x8.c | 26 ++++++++++++----- kernels/x86_64/knl/1m/bli_packm_opt_30x8.c | 14 +++++++--- kernels/x86_64/knl/3/bli_dgemm_opt_24x8.c | 28 +++++++++++-------- .../x86_64/knl/3/bli_sgemm_opt_30x16_knc.c | 12 ++++++-- 4 files changed, 54 insertions(+), 26 deletions(-) diff --git a/kernels/x86_64/knl/1m/bli_packm_opt_24x8.c b/kernels/x86_64/knl/1m/bli_packm_opt_24x8.c index dba0e88b9..15ee67a5b 100644 --- a/kernels/x86_64/knl/1m/bli_packm_opt_24x8.c +++ b/kernels/x86_64/knl/1m/bli_packm_opt_24x8.c @@ -1,6 +1,6 @@ /* - BLIS + BLIS An object-based framework for developing high-performance BLAS-like libraries. @@ -105,16 +105,22 @@ extern int32_t offsets[24]; void bli_dpackm_8xk_opt ( conj_t conja, - dim_t n, + dim_t n_, void* restrict kappa_, - void* restrict a_, inc_t inca, inc_t lda, - void* restrict p_, inc_t ldp + void* restrict a_, inc_t inca_, inc_t lda_, + void* restrict p_, inc_t ldp_ ) { + (void)conja; + const int32_t * offsetPtr = &offsets[0]; double* a = (double*)a_; double* p = (double*)p_; double* kappa = (double*)kappa_; + const int64_t n = n_; + const int64_t inca = inca_; + const int64_t lda = lda_; + const int64_t ldp = ldp_; __asm__ volatile ( @@ -291,16 +297,22 @@ void bli_dpackm_8xk_opt void bli_dpackm_24xk_opt ( conj_t conja, - dim_t n, + dim_t n_, void* restrict kappa_, - void* restrict a_, inc_t inca, inc_t lda, - void* restrict p_, inc_t ldp + void* restrict a_, inc_t inca_, inc_t lda_, + void* restrict p_, inc_t ldp_ ) { + (void)conja; + const int32_t * offsetPtr = &offsets[0]; double* a = (double*)a_; double* p = (double*)p_; double* kappa = (double*)kappa_; + const int64_t n = n_; + const int64_t inca = inca_; + const int64_t lda = lda_; + const int64_t ldp = ldp_; __asm__ volatile ( diff --git a/kernels/x86_64/knl/1m/bli_packm_opt_30x8.c b/kernels/x86_64/knl/1m/bli_packm_opt_30x8.c index eeab3c71d..181c5deab 100644 --- a/kernels/x86_64/knl/1m/bli_packm_opt_30x8.c +++ b/kernels/x86_64/knl/1m/bli_packm_opt_30x8.c @@ -1,6 +1,6 @@ /* - BLIS + BLIS An object-based framework for developing high-performance BLAS-like libraries. @@ -133,16 +133,22 @@ extern int32_t offsets[32]; void bli_dpackm_30xk_opt ( conj_t conja, - dim_t n, + dim_t n_, void* restrict kappa_, - void* restrict a_, inc_t inca, inc_t lda, - void* restrict p_, inc_t ldp + void* restrict a_, inc_t inca_, inc_t lda_, + void* restrict p_, inc_t ldp_ ) { + (void)conja; + const int32_t * offsetPtr = &offsets[0]; double* a = (double*)a_; double* p = (double*)p_; double* kappa = (double*)kappa_; + const int64_t n = n_; + const int64_t inca = inca_; + const int64_t lda = lda_; + const int64_t ldp = ldp_; __asm__ volatile ( diff --git a/kernels/x86_64/knl/3/bli_dgemm_opt_24x8.c b/kernels/x86_64/knl/3/bli_dgemm_opt_24x8.c index 492e2009e..71e6d9327 100644 --- a/kernels/x86_64/knl/3/bli_dgemm_opt_24x8.c +++ b/kernels/x86_64/knl/3/bli_dgemm_opt_24x8.c @@ -1,6 +1,6 @@ /* - BLIS + BLIS An object-based framework for developing high-performance BLAS-like libraries. @@ -181,22 +181,26 @@ extern int32_t offsets[24]; //#define MONITORS //#define LOOPMON void bli_dgemm_opt_24x8( - dim_t k, + dim_t k_, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, - double* restrict c, inc_t rs_c, inc_t cs_c, + double* restrict c, inc_t rs_c_, inc_t cs_c_, auxinfo_t* data, cntx_t* restrict cntx ) { + (void)data; + (void)cntx; + const double * a_next = bli_auxinfo_next_a( data ); const double * b_next = bli_auxinfo_next_b( data ); const int32_t * offsetPtr = &offsets[0]; - - uint64_t k64 = k; + const int64_t k = k_; + const int64_t rs_c = rs_c_; + const int64_t cs_c = cs_c_; #ifdef MONITORS int toph, topl, both, botl, midl, midh, mid2l, mid2h; @@ -204,7 +208,7 @@ void bli_dgemm_opt_24x8( #ifdef LOOPMON int tlooph, tloopl, blooph, bloopl; #endif - + __asm__ volatile ( #ifdef MONITORS @@ -223,22 +227,22 @@ void bli_dgemm_opt_24x8( VMOVAPS(ZMM(15), ZMM(8)) MOV(RDI, VAR(offsetPtr)) VMOVAPS(ZMM(16), ZMM(8)) VMOVAPS(ZMM(4), MEM(RDI)) #if SCATTER_PREFETCH_C - VMOVAPS(ZMM(17), ZMM(8)) - VMOVAPS(ZMM(18), ZMM(8)) + VMOVAPS(ZMM(17), ZMM(8)) + VMOVAPS(ZMM(18), ZMM(8)) VMOVAPS(ZMM(19), ZMM(8)) VBROADCASTSS(ZMM(5), VAR(rs_c)) - VMOVAPS(ZMM(20), ZMM(8)) + VMOVAPS(ZMM(20), ZMM(8)) VMOVAPS(ZMM(21), ZMM(8)) VPMULLD(ZMM(2), ZMM(4), ZMM(5)) VMOVAPS(ZMM(22), ZMM(8)) VMOVAPS(YMM(3), MEM(RDI,64)) VMOVAPS(ZMM(23), ZMM(8)) VPMULLD(YMM(3), YMM(3), YMM(5)) #else - VMOVAPS(ZMM(17), ZMM(8)) + VMOVAPS(ZMM(17), ZMM(8)) VMOVAPS(ZMM(18), ZMM(8)) LEA(R13, MEM(R12,R12,2)) VMOVAPS(ZMM(19), ZMM(8)) LEA(R14, MEM(R12,R12,4)) VMOVAPS(ZMM(20), ZMM(8)) LEA(R15, MEM(R13,R12,4)) VMOVAPS(ZMM(21), ZMM(8)) VMOVAPS(ZMM(22), ZMM(8)) VMOVAPS(ZMM(23), ZMM(8)) -#endif +#endif VMOVAPS(ZMM(24), ZMM(8)) VPSLLD(ZMM(4), ZMM(4), IMM(3)) VMOVAPS(ZMM(25), ZMM(8)) MOV(R8, IMM(4*24*8)) //offset for 4 iterations VMOVAPS(ZMM(26), ZMM(8)) LEA(R9, MEM(R8,R8,2)) //*3 @@ -670,7 +674,7 @@ void bli_dgemm_opt_24x8( [both] "=m" (both) #endif : // input operands - [k] "m" (k64), + [k] "m" (k), [a] "m" (a), [b] "m" (b), [alpha] "m" (alpha), diff --git a/kernels/x86_64/knl/3/bli_sgemm_opt_30x16_knc.c b/kernels/x86_64/knl/3/bli_sgemm_opt_30x16_knc.c index acc7b341d..889fd8d19 100644 --- a/kernels/x86_64/knl/3/bli_sgemm_opt_30x16_knc.c +++ b/kernels/x86_64/knl/3/bli_sgemm_opt_30x16_knc.c @@ -1,6 +1,6 @@ /* - BLIS + BLIS An object-based framework for developing high-performance BLAS-like libraries. @@ -166,20 +166,26 @@ int32_t offsets[32] __attribute__((aligned(0x1000))) = { 0, 1, 2, 3, 4, 5, //#define MONITORS //#define LOOPMON void bli_sgemm_opt_30x16_knc( - dim_t k, + dim_t k_, float* restrict alpha, float* restrict a, float* restrict b, float* restrict beta, - float* restrict c, inc_t rs_c, inc_t cs_c, + float* restrict c, inc_t rs_c_, inc_t cs_c_, auxinfo_t* data, cntx_t* restrict cntx ) { + (void)data; + (void)cntx; + const float * a_next = bli_auxinfo_next_a( data ); const float * b_next = bli_auxinfo_next_b( data ); const int32_t * offsetPtr = &offsets[0]; + const int64_t k = k_; + const int64_t rs_c = rs_c_; + const int64_t cs_c = cs_c_; #ifdef MONITORS int toph, topl, both, botl, midl, midh, mid2l, mid2h; From 0e18f68cf12eb9189ba901a20040b1cdae417670 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 20 Feb 2017 09:03:21 -0600 Subject: [PATCH 6/6] Handle k=0 correctly in KNL dgemm ukernel. --- kernels/x86_64/knl/3/bli_dgemm_opt_24x8.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernels/x86_64/knl/3/bli_dgemm_opt_24x8.c b/kernels/x86_64/knl/3/bli_dgemm_opt_24x8.c index 71e6d9327..5ca50ced4 100644 --- a/kernels/x86_64/knl/3/bli_dgemm_opt_24x8.c +++ b/kernels/x86_64/knl/3/bli_dgemm_opt_24x8.c @@ -223,7 +223,7 @@ void bli_dgemm_opt_24x8( VMOVAPS(ZMM(11), ZMM(8)) MOV(RAX, VAR(a)) //load address of a VMOVAPS(ZMM(12), ZMM(8)) MOV(RBX, VAR(b)) //load address of b VMOVAPS(ZMM(13), ZMM(8)) MOV(RCX, VAR(c)) //load address of c - VMOVAPS(ZMM(14), ZMM(8)) VMOVAPD(ZMM(0), MEM(RBX)) //pre-load b + VMOVAPS(ZMM(14), ZMM(8)) VMOVAPS(ZMM(15), ZMM(8)) MOV(RDI, VAR(offsetPtr)) VMOVAPS(ZMM(16), ZMM(8)) VMOVAPS(ZMM(4), MEM(RDI)) #if SCATTER_PREFETCH_C @@ -258,6 +258,11 @@ void bli_dgemm_opt_24x8( MOV(VAR(midh), EDX) #endif + TEST(RSI, RSI) + JZ(POSTACCUM) + + VMOVAPD(ZMM(0), MEM(RBX)) //pre-load b + SUB(RSI, IMM(32)) JLE(TAIL)