diff --git a/Makefile b/Makefile index 6234e6825..84111d364 100644 --- a/Makefile +++ b/Makefile @@ -537,7 +537,7 @@ $(BASE_OBJ_KERNELS_PATH)/$(1)/%.o: $(KERNELS_PATH)/$(1)/%.$(3) $(BLIS_H_FLAT) $( ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes) $(CC) $(call get-kernel-cflags-for,$(2)) -c $$< -o $$@ else - @echo "Compiling $$@" $(call get-kernel-text-for,$(1)) + @echo "Compiling $$@" $(call get-kernel-text-for,$(2)) @$(CC) $(call get-kernel-cflags-for,$(2)) -c $$< -o $$@ endif endef diff --git a/config/knl/bli_cntx_init_knl.c b/config/knl/bli_cntx_init_knl.c index 05ee4128e..7331a8e9a 100644 --- a/config/knl/bli_cntx_init_knl.c +++ b/config/knl/bli_cntx_init_knl.c @@ -47,8 +47,9 @@ void bli_cntx_init_knl( cntx_t* cntx ) // their storage preferences. bli_cntx_set_l3_nat_ukrs ( - 1, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knl_asm_24x8, FALSE, + 2, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_knl_asm_24x16, FALSE, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knl_asm_24x8, FALSE, cntx ); @@ -61,26 +62,77 @@ void bli_cntx_init_knl( cntx_t* cntx ) cntx ); + // Update the context with optimized level-1f kernels. + bli_cntx_set_l1f_kers + ( + 4, + // axpyf + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, + // dotxf + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, + cntx + ); + + // Update the context with optimized level-1v kernels. + bli_cntx_set_l1v_kers + ( + 10, + // amaxv + BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, + BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, + // axpyv +#if 0 + BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int, + BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int, +#else + BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, + BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, +#endif + // dotv + BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int, + BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int, + // dotxv + BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, + BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, + // scalv +#if 0 + BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int, + BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int, +#else + BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, + BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, +#endif + cntx + ); + // Initialize level-3 blocksize objects with architecture-specific values. // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 24, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 ); - bli_blksz_init ( &blkszs[ BLIS_MC ], -1, 120, -1, -1, - -1, 144, -1, -1 ); - bli_blksz_init ( &blkszs[ BLIS_KC ], -1, 336, -1, -1, - -1, 420, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 14400, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 24, 24, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 ); + bli_blksz_init ( &blkszs[ BLIS_MC ], 240, 120, -1, -1, + 288, 144, -1, -1 ); + bli_blksz_init ( &blkszs[ BLIS_KC ], 336, 336, -1, -1, + 408, 408, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 14400, 14400, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
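+	// Note: BLIS_AF and BLIS_DF hold the fusing factors consumed by the
+	// axpyf and dotxf kernels registered above; the value 8 matches the
+	// *_zen_int_8 kernels, and -1 marks the complex datatypes, for which
+	// no fused kernels are registered here, as unused. As a rough scalar
+	// reference (illustration only, unit strides, conjugation ignored),
+	// axpyf with fusing factor 8 computes:
+	//
+	//   for ( dim_t j = 0; j < 8; ++j )
+	//       for ( dim_t i = 0; i < m; ++i )
+	//           y[ i ] += alpha * x[ j ] * a[ i + j*lda ];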
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + BLIS_NAT, 7, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f + BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, + BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, cntx ); } diff --git a/config/skx/bli_cntx_init_skx.c b/config/skx/bli_cntx_init_skx.c index bc23295ac..013d491b0 100644 --- a/config/skx/bli_cntx_init_skx.c +++ b/config/skx/bli_cntx_init_skx.c @@ -48,8 +48,54 @@ void bli_cntx_init_skx( cntx_t* cntx ) bli_cntx_set_l3_nat_ukrs ( 2, - BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x12_l2, FALSE, + // gemm + BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x12_l2, FALSE, + cntx + ); + + // Update the context with optimized level-1f kernels. + bli_cntx_set_l1f_kers + ( + 4, + // axpyf + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, + // dotxf + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, + cntx + ); + + // Update the context with optimized level-1v kernels. + bli_cntx_set_l1v_kers + ( + 10, + // amaxv + BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, + BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, + // axpyv +#if 0 + BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int, + BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int, +#else + BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, + BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, +#endif + // dotv + BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int, + BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int, + // dotxv + BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, + BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, + // scalv +#if 0 + BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int, + BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int, +#else + BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, + BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, +#endif cntx ); @@ -59,19 +105,25 @@ void bli_cntx_init_skx( cntx_t* cntx ) bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 12, 8, 4 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 480, 240, 144, 72 ); bli_blksz_init ( &blkszs[ BLIS_KC ], 384, 384, 256, 256, - 480, 480, 256, 256 ); + 480, 480, 256, 256 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 3072, 4080, 4080 ); + bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
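+	// Note: the level-1f and level-1v kernels registered above are the
+	// AVX2 'zen' kernels; they become visible to this configuration via
+	// the config_registry change (skx: skx/skx/zen). A rough scalar
+	// reference for dotxf with fusing factor 8 (illustration only, single
+	// precision, unit strides, conjugation ignored):
+	//
+	//   for ( dim_t j = 0; j < 8; ++j )
+	//   {
+	//       float dot = 0.0F;
+	//       for ( dim_t i = 0; i < m; ++i ) dot += a[ i + j*lda ] * x[ i ];
+	//       y[ j ] = beta * y[ j ] + alpha * dot;
+	//   }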
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + BLIS_NAT, 7, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f + BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, + BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, cntx ); } diff --git a/config/skx/bli_family_skx.h b/config/skx/bli_family_skx.h index 2ef6042d5..abf85319a 100644 --- a/config/skx/bli_family_skx.h +++ b/config/skx/bli_family_skx.h @@ -50,10 +50,10 @@ #define BLIS_SIMD_SIZE 64 #define BLIS_SIMD_NUM_REGISTERS 32 -#include +//#include -#define BLIS_MALLOC_POOL malloc -#define BLIS_FREE_POOL free +//#define BLIS_MALLOC_POOL malloc +//#define BLIS_FREE_POOL free #if 0 diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c index d3c81f709..0898d7577 100644 --- a/config/zen/bli_cntx_init_zen.c +++ b/config/zen/bli_cntx_init_zen.c @@ -62,6 +62,7 @@ void bli_cntx_init_zen( cntx_t* cntx ) cntx ); + // Update the context with optimized level-1f kernels. bli_cntx_set_l1f_kers ( 4, @@ -115,8 +116,8 @@ void bli_cntx_init_zen( cntx_t* cntx ) bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 ); - bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 ); - bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, 8, 8 ); + bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. diff --git a/config_registry b/config_registry index 6ea7e21b5..8c5bbcc74 100644 --- a/config_registry +++ b/config_registry @@ -15,8 +15,8 @@ arm64: cortexa57 generic arm32: cortexa15 cortexa9 generic # Intel architectures. -skx: skx -knl: knl +skx: skx/skx/zen +knl: knl/knl/zen haswell: haswell/haswell/zen sandybridge: sandybridge penryn: penryn diff --git a/configure b/configure index f827cc5e9..763e69c87 100755 --- a/configure +++ b/configure @@ -614,7 +614,7 @@ build_kconfig_registry() for config in ${clist}; do - # Look up the kernel for the current sub-configuration. + # Look up the kernels for the current sub-configuration. #kernels="${kernel_registry[${config}]}" kernels=$(query_array "kernel_registry" ${config}) diff --git a/kernels/knl/3/bli_avx512_macros.h b/frame/include/bli_avx512_macros.h similarity index 100% rename from kernels/knl/3/bli_avx512_macros.h rename to frame/include/bli_avx512_macros.h diff --git a/kernels/knl/1m/bli_packm_knl_asm_24x8.c b/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c similarity index 98% rename from kernels/knl/1m/bli_packm_knl_asm_24x8.c rename to kernels/knl/1m/bli_dpackm_knl_asm_24x8.c index 3cf4bcc81..e88978000 100644 --- a/kernels/knl/1m/bli_packm_knl_asm_24x8.c +++ b/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c @@ -32,7 +32,7 @@ */ -#include "../3/bli_avx512_macros.h" +#include "bli_avx512_macros.h" #include "blis.h" #define LOADMUL8x8(a,o,s1,s3,s5,s7, \ @@ -100,7 +100,9 @@ VSHUFF64X2(ZMM(b7), ZMM(a3), ZMM(a7), IMM(0xDD)) //This is an array used for the scatter/gather instructions. 
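+//Note: this table was previously a single extern array defined in the gemm
+//kernel file. Making it static gives each kernel translation unit its own
+//copy, the 64-byte alignment allows it to be loaded with aligned ZMM moves
+//(e.g. VMOVAPS(ZMM(4), MEM(RDI)) below), and widening it from 24 to 32
+//entries covers the 16-lane index vectors needed by the new
+//single-precision kernels.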
-extern int32_t offsets[24]; +static int32_t offsets[32] __attribute__((aligned(64))) = + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, + 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}; void bli_dpackm_knl_asm_8xk ( diff --git a/kernels/knl/1m/bli_spackm_knl_asm_24x16.c b/kernels/knl/1m/bli_spackm_knl_asm_24x16.c new file mode 100644 index 000000000..2a797ab36 --- /dev/null +++ b/kernels/knl/1m/bli_spackm_knl_asm_24x16.c @@ -0,0 +1,563 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "bli_avx512_macros.h" +#include "blis.h" + +#include + +#define LOADMUL8x8(a,o,s1,s3,s5,s7, \ + z0,z1,z2,z3,z4,z5,z6,z7) \ + \ + VMULPS(YMM(z0), YMM(15), MEM(a, o)) \ + VMULPS(YMM(z1), YMM(15), MEM(a,s1,1,o)) \ + VMULPS(YMM(z2), YMM(15), MEM(a,s1,2,o)) \ + VMULPS(YMM(z3), YMM(15), MEM(a,s3,1,o)) \ + VMULPS(YMM(z4), YMM(15), MEM(a,s1,4,o)) \ + VMULPS(YMM(z5), YMM(15), MEM(a,s5,1,o)) \ + VMULPS(YMM(z6), YMM(15), MEM(a,s3,2,o)) \ + VMULPS(YMM(z7), YMM(15), MEM(a,s7,1,o)) + +#define STORE8x8(a,o,s, \ + z0,z1,z2,z3,z4,z5,z6,z7) \ + \ + VMOVUPS(MEM(a,(o)+0*(s)), YMM(z0)) \ + VMOVUPS(MEM(a,(o)+1*(s)), YMM(z1)) \ + VMOVUPS(MEM(a,(o)+2*(s)), YMM(z2)) \ + VMOVUPS(MEM(a,(o)+3*(s)), YMM(z3)) \ + VMOVUPS(MEM(a,(o)+4*(s)), YMM(z4)) \ + VMOVUPS(MEM(a,(o)+5*(s)), YMM(z5)) \ + VMOVUPS(MEM(a,(o)+6*(s)), YMM(z6)) \ + VMOVUPS(MEM(a,(o)+7*(s)), YMM(z7)) + +#define STORETRANS8x8(a,o,s, \ + a0,a1,a2,a3,a4,a5,a6,a7, \ + t0,t1,t2,t3,t4,t5) \ + \ + VUNPCKLPS(YMM(t0), YMM(a0), YMM(a1)) \ + VUNPCKLPS(YMM(t2), YMM(a2), YMM(a3)) \ + VUNPCKLPS(YMM(t1), YMM(a4), YMM(a5)) \ + VUNPCKLPS(YMM(t3), YMM(a6), YMM(a7)) \ + \ + VSHUFPS(YMM(t4), YMM(t0), YMM(t2), IMM(0x44)) \ + VSHUFPS(YMM(t5), YMM(t1), YMM(t3), IMM(0x44)) \ + VMOVUPS(MEM(a,(o )+0*(s)), XMM(t4)) \ + VMOVUPS(MEM(a,(o+16)+0*(s)), XMM(t5)) \ + VEXTRACTF128(MEM(a,(o )+4*(s)), YMM(t4), IMM(1)) \ + VEXTRACTF128(MEM(a,(o+16)+4*(s)), YMM(t5), IMM(1)) \ + \ + VSHUFPS(YMM(t4), YMM(t0), YMM(t2), IMM(0xEE)) \ + VSHUFPS(YMM(t5), YMM(t1), YMM(t3), IMM(0xEE)) \ + VMOVUPS(MEM(a,(o )+1*(s)), XMM(t4)) \ + VMOVUPS(MEM(a,(o+16)+1*(s)), XMM(t5)) \ + VEXTRACTF128(MEM(a,(o )+5*(s)), YMM(t4), IMM(1)) \ + VEXTRACTF128(MEM(a,(o+16)+5*(s)), YMM(t5), IMM(1)) \ + \ + VUNPCKHPS(YMM(t0), YMM(a0), YMM(a1)) \ + VUNPCKHPS(YMM(t2), YMM(a2), YMM(a3)) \ + VUNPCKHPS(YMM(t1), YMM(a4), YMM(a5)) \ + VUNPCKHPS(YMM(t3), YMM(a6), YMM(a7)) \ + \ + VSHUFPS(YMM(t4), YMM(t0), YMM(t2), IMM(0x44)) \ + VSHUFPS(YMM(t5), YMM(t1), YMM(t3), IMM(0x44)) \ + VMOVUPS(MEM(a,(o )+2*(s)), XMM(t4)) \ + VMOVUPS(MEM(a,(o+16)+2*(s)), XMM(t5)) \ + VEXTRACTF128(MEM(a,(o )+6*(s)), YMM(t4), IMM(1)) \ + VEXTRACTF128(MEM(a,(o+16)+6*(s)), YMM(t5), IMM(1)) \ + \ + VSHUFPS(YMM(t4), YMM(t0), YMM(t2), IMM(0xEE)) \ + VSHUFPS(YMM(t5), YMM(t1), YMM(t3), IMM(0xEE)) \ + VMOVUPS(MEM(a,(o )+3*(s)), XMM(t4)) \ + VMOVUPS(MEM(a,(o+16)+3*(s)), XMM(t5)) \ + VEXTRACTF128(MEM(a,(o )+7*(s)), YMM(t4), IMM(1)) \ + VEXTRACTF128(MEM(a,(o+16)+7*(s)), YMM(t5), IMM(1)) + +//This is an array used for the scatter/gather instructions. 
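+//Note: the gather fallback below (PACK16_G/PACK24_G) is taken when neither
+//inca == 1 nor lda == 1 holds; it scales these indices by inca with VPMULLD
+//to form per-lane offsets for VGATHERDPS. A scalar model of packing one
+//column (illustration only; float elements and ldp = 16 assumed):
+//
+//  for ( int i = 0; i < 16; i++ )
+//      p[ i ] = (*kappa) * a[ i * inca ];
+//  a += lda; p += ldp;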
+static int32_t offsets[32] __attribute__((aligned(64))) = + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, + 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}; + +void bli_spackm_knl_asm_16xk + ( + conj_t conja, + dim_t n_, + void* restrict kappa_, + void* restrict a_, inc_t inca_, inc_t lda_, + void* restrict p_, inc_t ldp_, + cntx_t* restrict ctnx + ) +{ + (void)conja; + + const int32_t * offsetPtr = &offsets[0]; + float* a = (float*)a_; + float* p = (float*)p_; + float* kappa = (float*)kappa_; + const int64_t n = n_; + const int64_t inca = inca_; + const int64_t lda = lda_; + const int64_t ldp = ldp_; + + __asm__ volatile + ( + MOV(RSI, VAR(n)) + MOV(RAX, VAR(a)) + MOV(RBX, VAR(inca)) + MOV(RCX, VAR(lda)) + MOV(R14, VAR(p)) + + TEST(RSI, RSI) + JZ(PACK16_DONE) + + LEA(RBX, MEM(,RBX,4)) //inca in bytes + LEA(RCX, MEM(,RCX,4)) //lda in bytes + + VBROADCASTSS(YMM(15), VAR(kappa)) + + CMP(RBX, IMM(4)) + JNE(PACK16_T) + + LABEL(PACK16_N) + + MOV(RDX, RSI) + AND(RDX, IMM(7)) + SAR(RSI, IMM(3)) + JZ(PACK16_N_TAIL) + + LEA(R8, MEM(RCX,RCX,2)) //lda*3 + LEA(R9, MEM(RCX,RCX,4)) //lda*5 + LEA(R10, MEM(R8 ,RCX,4)) //lda*7 + + LABEL(PACK16_N_LOOP) + + LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7) + STORE8x8(R14,0,16*4,0,1,2,3,4,5,6,7) + + LOADMUL8x8(RAX,32,RCX,R8,R9,R10,0,1,2,3,4,5,6,7) + STORE8x8(R14,32,16*4,0,1,2,3,4,5,6,7) + + LEA(RAX, MEM(RAX,RCX,8)) + LEA(R14, MEM(R14,16*8*4)) + + SUB(RSI, IMM(1)) + + JNZ(PACK16_N_LOOP) + + TEST(RDX, RDX) + JZ(PACK16_DONE) + + LABEL(PACK16_N_TAIL) + + VMULPS(YMM(0), YMM(15), MEM(RAX )) + VMULPS(YMM(1), YMM(15), MEM(RAX,32)) + VMOVUPS(MEM(R14 ), YMM(0)) + VMOVUPS(MEM(R14,32), YMM(1)) + + LEA(RAX, MEM(RAX,RCX,1)) + LEA(R14, MEM(R14, 16*4)) + + SUB(RDX, IMM(1)) + + JNZ(PACK16_N_TAIL) + + JMP(PACK16_DONE) + + LABEL(PACK16_T) + + CMP(RCX, IMM(4)) + JNE(PACK16_G) + + LEA(R8, MEM(RBX,RBX,2)) //inca*3 + LEA(R9, MEM(RBX,RBX,4)) //inca*5 + LEA(R10, MEM(R8 ,RBX,4)) //inca*7 + LEA(R11, MEM(RAX,RBX,8)) + + MOV(RDX, RSI) + AND(RDX, IMM(7)) + SAR(RSI, IMM(3)) + JZ(PACK16_T_TAIL) + + LABEL(PACK16_T_LOOP) + + LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7) + STORETRANS8x8(R14,0,16*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13) + + LOADMUL8x8(R11,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7) + STORETRANS8x8(R14,32,16*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13) + + LEA(RAX, MEM(RAX, 8*4)) + LEA(R11, MEM(R11, 8*4)) + LEA(R14, MEM(R14,16*8*4)) + + SUB(RSI, IMM(1)) + + JNZ(PACK16_T_LOOP) + + TEST(RDX, RDX) + JZ(PACK16_DONE) + + LABEL(PACK16_T_TAIL) + + VMULSS(XMM(0), XMM(15), MEM(RAX )) + VMULSS(XMM(1), XMM(15), MEM(RAX,RBX,1)) + VMULSS(XMM(2), XMM(15), MEM(RAX,RBX,2)) + VMULSS(XMM(3), XMM(15), MEM(RAX,R8 ,1)) + VMULSS(XMM(4), XMM(15), MEM(RAX,RBX,4)) + VMULSS(XMM(5), XMM(15), MEM(RAX,R9 ,1)) + VMULSS(XMM(6), XMM(15), MEM(RAX,R8 ,2)) + VMULSS(XMM(7), XMM(15), MEM(RAX,R10,1)) + VMOVSS(MEM(R14,0*4), XMM(0)) + VMOVSS(MEM(R14,1*4), XMM(1)) + VMOVSS(MEM(R14,2*4), XMM(2)) + VMOVSS(MEM(R14,3*4), XMM(3)) + VMOVSS(MEM(R14,4*4), XMM(4)) + VMOVSS(MEM(R14,5*4), XMM(5)) + VMOVSS(MEM(R14,6*4), XMM(6)) + VMOVSS(MEM(R14,7*4), XMM(7)) + + VMULSS(XMM(0), XMM(15), MEM(R11 )) + VMULSS(XMM(1), XMM(15), MEM(R11,RBX,1)) + VMULSS(XMM(2), XMM(15), MEM(R11,RBX,2)) + VMULSS(XMM(3), XMM(15), MEM(R11,R8 ,1)) + VMULSS(XMM(4), XMM(15), MEM(R11,RBX,4)) + VMULSS(XMM(5), XMM(15), MEM(R11,R9 ,1)) + VMULSS(XMM(6), XMM(15), MEM(R11,R8 ,2)) + VMULSS(XMM(7), XMM(15), MEM(R11,R10,1)) + VMOVSS(MEM(R14, 8*4), XMM(0)) + VMOVSS(MEM(R14, 9*4), XMM(1)) + VMOVSS(MEM(R14,10*4), XMM(2)) + VMOVSS(MEM(R14,11*4), XMM(3)) + VMOVSS(MEM(R14,12*4), XMM(4)) + 
VMOVSS(MEM(R14,13*4), XMM(5)) + VMOVSS(MEM(R14,14*4), XMM(6)) + VMOVSS(MEM(R14,15*4), XMM(7)) + + LEA(RAX, MEM(RAX, 4)) + LEA(R11, MEM(R11, 4)) + LEA(R14, MEM(R14,16*4)) + + SUB(RDX, IMM(1)) + + JNZ(PACK16_T_TAIL) + + JMP(PACK16_DONE) + + LABEL(PACK16_G) + + VPBROADCASTD(ZMM(3), VAR(inca)) + MOV(RBX, VAR(offsetPtr)) + VPMULLD(ZMM(0), ZMM(3), MEM(RBX)) + + LABEL(PACK16_G_LOOP) + + KXNORW(K(1), K(0), K(0)) + VGATHERDPS(ZMM(3) MASK_K(1), MEM(RAX,ZMM(0),8)) + VMULPS(ZMM(3), ZMM(3), ZMM(15)) + VMOVUPS(MEM(R14), ZMM(3)) + + LEA(RAX, MEM(RAX,RCX,1)) + LEA(R14, MEM(R14, 16*4)) + + SUB(RSI, IMM(1)) + + JNZ(PACK16_G_LOOP) + + LABEL(PACK16_DONE) + + : //output operands + : //input operands + [n] "m" (n), + [kappa] "m" (*kappa), + [a] "m" (a), + [inca] "m" (inca), + [lda] "m" (lda), + [p] "m" (p), + [ldp] "m" (ldp), + [offsetPtr] "m" (offsetPtr) + : //clobbers + "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", + "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", + "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", + "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", + "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", + "zmm30", "zmm31", + "rax", "rbx", "rcx", "rdx", "rdi", "rsi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory" + ); +} + +void bli_spackm_knl_asm_24xk + ( + conj_t conja, + dim_t n_, + void* restrict kappa_, + void* restrict a_, inc_t inca_, inc_t lda_, + void* restrict p_, inc_t ldp_, + cntx_t* restrict ctnx + ) +{ + (void)conja; + + const int32_t * offsetPtr = &offsets[0]; + float* a = (float*)a_; + float* p = (float*)p_; + float* kappa = (float*)kappa_; + const int64_t n = n_; + const int64_t inca = inca_; + const int64_t lda = lda_; + const int64_t ldp = ldp_; + + __asm__ volatile + ( + MOV(RSI, VAR(n)) + MOV(RAX, VAR(a)) + MOV(RBX, VAR(inca)) + MOV(RCX, VAR(lda)) + MOV(R14, VAR(p)) + MOV(RDI, VAR(ldp)) + + TEST(RSI, RSI) + JZ(PACK24_DONE) + + LEA(RBX, MEM(,RBX,4)) //inca in bytes + LEA(RCX, MEM(,RCX,4)) //lda in bytes + LEA(RDI, MEM(,RDI,4)) //ldp in bytes + + VBROADCASTSS(ZMM(15), VAR(kappa)) + + CMP(RBX, IMM(4)) + JNE(PACK24_T) + + LABEL(PACK24_N) + + MOV(RDX, RSI) + AND(RDX, IMM(7)) + SAR(RSI, IMM(3)) + JZ(PACK24_N_TAIL) + + LEA(R8, MEM(RCX,RCX,2)) //lda*3 + LEA(R9, MEM(RCX,RCX,4)) //lda*5 + LEA(R10, MEM(R8 ,RCX,4)) //lda*7 + + LABEL(PACK24_N_LOOP) + + LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7) + STORE8x8(R14,0,24*4,0,1,2,3,4,5,6,7) + + LOADMUL8x8(RAX,32,RCX,R8,R9,R10,0,1,2,3,4,5,6,7) + STORE8x8(R14,32,24*4,0,1,2,3,4,5,6,7) + + LOADMUL8x8(RAX,64,RCX,R8,R9,R10,0,1,2,3,4,5,6,7) + STORE8x8(R14,64,24*4,0,1,2,3,4,5,6,7) + + LEA(RAX, MEM(RAX,RCX,8)) + LEA(R14, MEM(R14,RDI,8)) + + SUB(RSI, IMM(1)) + + JNZ(PACK24_N_LOOP) + + TEST(RDX, RDX) + JZ(PACK24_DONE) + + LABEL(PACK24_N_TAIL) + + VMULPS(ZMM(0), ZMM(15), MEM(RAX)) + VMOVUPS(MEM(R14), ZMM(0)) + + VMULPS(YMM(1), YMM(15), MEM(RAX,64)) + VMOVUPS(MEM(R14,64), YMM(1)) + + LEA(RAX, MEM(RAX,RCX,1)) + LEA(R14, MEM(R14,RDI,1)) + + SUB(RDX, IMM(1)) + + JNZ(PACK24_N_TAIL) + + JMP(PACK24_DONE) + + LABEL(PACK24_T) + + CMP(RCX, IMM(4)) + JNE(PACK24_G) + + LEA(R8, MEM(RBX,RBX,2)) //inca*3 + LEA(R9, MEM(RBX,RBX,4)) //inca*5 + LEA(R10, MEM(R8 ,RBX,4)) //inca*7 + LEA(R11, MEM(RAX,RBX,8)) + LEA(R12, MEM(R11,RBX,8)) + + MOV(RDX, RSI) + AND(RDX, IMM(7)) + SAR(RSI, IMM(3)) + JZ(PACK24_T_TAIL) + + LABEL(PACK24_T_LOOP) + + LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7) + STORETRANS8x8(R14,0,24*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13) + + LOADMUL8x8(R11,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7) + 
STORETRANS8x8(R14,32,24*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13) + + LOADMUL8x8(R12,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7) + STORETRANS8x8(R14,64,24*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13) + + LEA(RAX, MEM(RAX,RCX,8)) + LEA(R11, MEM(R11,RCX,8)) + LEA(R12, MEM(R12,RCX,8)) + LEA(R14, MEM(R14,RDI,8)) + + SUB(RSI, IMM(1)) + + JNZ(PACK24_T_LOOP) + + TEST(RDX, RDX) + JZ(PACK24_DONE) + + LABEL(PACK24_T_TAIL) + + VMULSS(XMM(0), XMM(15), MEM(RAX)) + VMULSS(XMM(1), XMM(15), MEM(RAX,RBX,1)) + VMULSS(XMM(2), XMM(15), MEM(RAX,RBX,2)) + VMULSS(XMM(3), XMM(15), MEM(RAX,R8,1)) + VMULSS(XMM(4), XMM(15), MEM(RAX,RBX,4)) + VMULSS(XMM(5), XMM(15), MEM(RAX,R9,1)) + VMULSS(XMM(6), XMM(15), MEM(RAX,R8,2)) + VMULSS(XMM(7), XMM(15), MEM(RAX,R10,1)) + VMOVSS(MEM(R14,0*4), XMM(0)) + VMOVSS(MEM(R14,1*4), XMM(1)) + VMOVSS(MEM(R14,2*4), XMM(2)) + VMOVSS(MEM(R14,3*4), XMM(3)) + VMOVSS(MEM(R14,4*4), XMM(4)) + VMOVSS(MEM(R14,5*4), XMM(5)) + VMOVSS(MEM(R14,6*4), XMM(6)) + VMOVSS(MEM(R14,7*4), XMM(7)) + + VMULSS(XMM(0), XMM(15), MEM(R11)) + VMULSS(XMM(1), XMM(15), MEM(R11,RBX,1)) + VMULSS(XMM(2), XMM(15), MEM(R11,RBX,2)) + VMULSS(XMM(3), XMM(15), MEM(R11,R8,1)) + VMULSS(XMM(4), XMM(15), MEM(R11,RBX,4)) + VMULSS(XMM(5), XMM(15), MEM(R11,R9,1)) + VMULSS(XMM(6), XMM(15), MEM(R11,R8,2)) + VMULSS(XMM(7), XMM(15), MEM(R11,R10,1)) + VMOVSS(MEM(R14, 8*4), XMM(0)) + VMOVSS(MEM(R14, 9*4), XMM(1)) + VMOVSS(MEM(R14,10*4), XMM(2)) + VMOVSS(MEM(R14,11*4), XMM(3)) + VMOVSS(MEM(R14,12*4), XMM(4)) + VMOVSS(MEM(R14,13*4), XMM(5)) + VMOVSS(MEM(R14,14*4), XMM(6)) + VMOVSS(MEM(R14,15*4), XMM(7)) + + VMULSS(XMM(0), XMM(15), MEM(R12)) + VMULSS(XMM(1), XMM(15), MEM(R12,RBX,1)) + VMULSS(XMM(2), XMM(15), MEM(R12,RBX,2)) + VMULSS(XMM(3), XMM(15), MEM(R12,R8,1)) + VMULSS(XMM(4), XMM(15), MEM(R12,RBX,4)) + VMULSS(XMM(5), XMM(15), MEM(R12,R9,1)) + VMULSS(XMM(6), XMM(15), MEM(R12,R8,2)) + VMULSS(XMM(7), XMM(15), MEM(R12,R10,1)) + VMOVSS(MEM(R14,16*4), XMM(0)) + VMOVSS(MEM(R14,17*4), XMM(1)) + VMOVSS(MEM(R14,18*4), XMM(2)) + VMOVSS(MEM(R14,19*4), XMM(3)) + VMOVSS(MEM(R14,20*4), XMM(4)) + VMOVSS(MEM(R14,21*4), XMM(5)) + VMOVSS(MEM(R14,22*4), XMM(6)) + VMOVSS(MEM(R14,23*4), XMM(7)) + + LEA(RAX, MEM(RAX,RCX,1)) + LEA(R11, MEM(R11,RCX,1)) + LEA(R12, MEM(R12,RCX,1)) + LEA(R14, MEM(R14,RDI,1)) + + SUB(RDX, IMM(1)) + + JNZ(PACK24_T_TAIL) + + JMP(PACK24_DONE) + + LABEL(PACK24_G) + + VPBROADCASTD(ZMM(3), VAR(inca)) + MOV(RBX, VAR(offsetPtr)) + VPMULLD(ZMM(0), ZMM(3), MEM(RBX)) + + LEA(R11, MEM(RAX,RBX,8)) + LEA(R11, MEM(R11,RBX,8)) + + LABEL(PACK24_G_LOOP) + + KXNORW(K(1), K(0), K(0)) + KSHIFTRW(K(2), K(1), IMM(8)) + VGATHERDPS(ZMM(3) MASK_K(1), MEM(RAX,ZMM(0),8)) + VGATHERDPS(ZMM(4) MASK_K(2), MEM(R11,ZMM(0),8)) + VMULPS(ZMM(3), ZMM(3), ZMM(15)) + VMULPS(YMM(4), YMM(4), YMM(15)) + VMOVUPS(MEM(R14), ZMM(3)) + VMOVUPS(MEM(R14,64), YMM(4)) + + LEA(RAX, MEM(RAX,RCX,1)) + LEA(R14, MEM(R14,RDI,1)) + + SUB(RSI, IMM(1)) + + JNZ(PACK24_G_LOOP) + + LABEL(PACK24_DONE) + + : //output operands + : //input operands + [n] "m" (n), + [kappa] "m" (*kappa), + [a] "m" (a), + [inca] "m" (inca), + [lda] "m" (lda), + [p] "m" (p), + [ldp] "m" (ldp), + [offsetPtr] "m" (offsetPtr) + : //clobbers + "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", + "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", + "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", + "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", + "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", + "zmm30", "zmm31", + "rax", "rbx", "rcx", "rdx", "rdi", "rsi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory" + ); +} diff --git 
a/kernels/knl/1m/bli_packm_knl_asm_30x8.c b/kernels/knl/1m/old/bli_packm_knl_asm_30x8.c
similarity index 99%
rename from kernels/knl/1m/bli_packm_knl_asm_30x8.c
rename to kernels/knl/1m/old/bli_packm_knl_asm_30x8.c
index c2284c479..d1815be5a 100644
--- a/kernels/knl/1m/bli_packm_knl_asm_30x8.c
+++ b/kernels/knl/1m/old/bli_packm_knl_asm_30x8.c
@@ -32,7 +32,7 @@
 */
-#include "../3/bli_avx512_macros.h"
+#include "bli_avx512_macros.h"
 #include "blis.h"
 #define LOADMUL8x8(a,o,s1,s3,s5,s7, \
diff --git a/kernels/knl/3/bli_dgemm_knl_asm_24x8.c b/kernels/knl/3/bli_dgemm_knl_asm_24x8.c
index 3d5adb84d..6fa5a29c1 100644
--- a/kernels/knl/3/bli_dgemm_knl_asm_24x8.c
+++ b/kernels/knl/3/bli_dgemm_knl_asm_24x8.c
@@ -176,7 +176,9 @@ PREFETCH_B_L2(n)
 //This is an array used for the scatter/gather instructions.
-extern int32_t offsets[24];
+static int32_t offsets[32] __attribute__((aligned(64))) =
+    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
+     16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};

 //#define MONITORS
 //#define LOOPMON
@@ -224,7 +226,7 @@ void bli_dgemm_knl_asm_24x8
     VMOVAPS(ZMM(11), ZMM(8)) MOV(RAX, VAR(a)) //load address of a
     VMOVAPS(ZMM(12), ZMM(8)) MOV(RBX, VAR(b)) //load address of b
     VMOVAPS(ZMM(13), ZMM(8)) MOV(RCX, VAR(c)) //load address of c
-    VMOVAPS(ZMM(14), ZMM(8))
+    VMOVAPS(ZMM(14), ZMM(8)) VMOVAPD(ZMM(0), MEM(RBX)) //pre-load b
     VMOVAPS(ZMM(15), ZMM(8)) MOV(RDI, VAR(offsetPtr))
     VMOVAPS(ZMM(16), ZMM(8)) VMOVAPS(ZMM(4), MEM(RDI))
 #if SCATTER_PREFETCH_C
@@ -259,11 +261,6 @@ void bli_dgemm_knl_asm_24x8
     MOV(VAR(midh), EDX)
 #endif

-    TEST(RSI, RSI)
-    JZ(POSTACCUM)
-
-    VMOVAPD(ZMM(0), MEM(RBX)) //pre-load b
-
     SUB(RSI, IMM(32))
     JLE(TAIL)

@@ -535,6 +532,7 @@ void bli_dgemm_knl_asm_24x8

     MOV(RDX, RCX)
     ADD(RSI, IMM(32))
+    JZ(POSTACCUM)

     LABEL(TAIL_LOOP)

diff --git a/kernels/knl/3/bli_sgemm_knl_asm_24x16.c b/kernels/knl/3/bli_sgemm_knl_asm_24x16.c
new file mode 100644
index 000000000..060579f02
--- /dev/null
+++ b/kernels/knl/3/bli_sgemm_knl_asm_24x16.c
@@ -0,0 +1,708 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.

IN NO EVENT SHALL THE UNIVERSITY + OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" +#include + +#include "bli_avx512_macros.h" + +#define UNROLL_K 32 + +#define SCATTER_PREFETCH_C 1 + +#define PREFETCH_A_L2 0 +#define PREFETCH_B_L2 0 +#define L2_PREFETCH_DIST 64 + +#define A_L1_PREFETCH_DIST 36 +#define B_L1_PREFETCH_DIST 18 + +#define LOOP_ALIGN ALIGN16 + +#define UPDATE_C_FOUR_ROWS(R1,R2,R3,R4) \ +\ + VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \ + VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \ + VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \ + VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \ + VFMADD231PS(ZMM(R1), ZMM(1), MEM(RCX )) \ + VFMADD231PS(ZMM(R2), ZMM(1), MEM(RCX,RAX,1)) \ + VFMADD231PS(ZMM(R3), ZMM(1), MEM(RCX,RAX,2)) \ + VFMADD231PS(ZMM(R4), ZMM(1), MEM(RCX,RDI,1)) \ + VMOVUPS(MEM(RCX ), ZMM(R1)) \ + VMOVUPS(MEM(RCX,RAX,1), ZMM(R2)) \ + VMOVUPS(MEM(RCX,RAX,2), ZMM(R3)) \ + VMOVUPS(MEM(RCX,RDI,1), ZMM(R4)) \ + LEA(RCX, MEM(RCX,RAX,4)) + +#define UPDATE_C_BZ_FOUR_ROWS(R1,R2,R3,R4) \ +\ + VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \ + VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \ + VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \ + VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \ + VMOVUPS(MEM(RCX ), ZMM(R1)) \ + VMOVUPS(MEM(RCX,RAX,1), ZMM(R2)) \ + VMOVUPS(MEM(RCX,RAX,2), ZMM(R3)) \ + VMOVUPS(MEM(RCX,RDI,1), ZMM(R4)) \ + LEA(RCX, MEM(RCX,RAX,4)) + +#define UPDATE_C_ROW_SCATTERED(NUM) \ +\ + KXNORW(K(1), K(0), K(0)) \ + KXNORW(K(2), K(0), K(0)) \ + VMULPS(ZMM(NUM), ZMM(NUM), ZMM(0)) \ + VGATHERDPS(ZMM(3) MASK_K(1), MEM(RCX,ZMM(2),4)) \ + VFMADD231PS(ZMM(NUM), ZMM(3), ZMM(1)) \ + VSCATTERDPS(MEM(RCX,ZMM(2),4) MASK_K(2), ZMM(NUM)) \ + ADD(RCX, RAX) + +#define UPDATE_C_BZ_ROW_SCATTERED(NUM) \ +\ + KXNORW(K(1), K(0), K(0)) \ + VMULPS(ZMM(NUM), ZMM(NUM), ZMM(0)) \ + VSCATTERDPS(MEM(RCX,ZMM(2),4) MASK_K(1), ZMM(NUM)) \ + ADD(RCX, RAX) + +#define PREFETCH_A_L1_1(n) PREFETCH(0, MEM(RAX,(A_L1_PREFETCH_DIST+n)*24*4)) +#define PREFETCH_A_L1_2(n) PREFETCH(0, MEM(RAX,(A_L1_PREFETCH_DIST+n)*24*4+64)) + +#if PREFETCH_A_L2 +#undef PREFETCH_A_L2 + +#define PREFETCH_A_L2(n) \ +\ + PREFETCH(1, MEM(RAX,(L2_PREFETCH_DIST+n)*24*4)) \ + PREFETCH(1, MEM(RAX,(L2_PREFETCH_DIST+n)*24*4+64)) + +#else +#undef PREFETCH_A_L2 +#define PREFETCH_A_L2(...) +#endif + +#define PREFETCH_B_L1(n) PREFETCH(0, MEM(RBX,(B_L1_PREFETCH_DIST+n)*16*4)) + +#if PREFETCH_B_L2 +#undef PREFETCH_B_L2 + +#define PREFETCH_B_L2(n) PREFETCH(1, MEM(RBX,(L2_PREFETCH_DIST+n)*16*4)) + +#else +#undef PREFETCH_B_L2 +#define PREFETCH_B_L2(...) +#endif + +#define PREFETCH_C_L1_1 +#define PREFETCH_C_L1_2 +#define PREFETCH_C_L1_3 + +// +// n: index in unrolled loop +// +// a: ZMM register to load into +// b: ZMM register to read from +// +// ...: addressing for A, except for offset +// +#define SUBITER(n,a,b,...) 
\ +\ + PREFETCH_A_L2(n) \ +\ + VMOVAPS(ZMM(a), MEM(RBX,(n+1)*64)) \ + VFMADD231PS(ZMM( 8), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 0)*4)) \ + VFMADD231PS(ZMM( 9), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 1)*4)) \ + VFMADD231PS(ZMM(10), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 2)*4)) \ + PREFETCH_A_L1_1(n) \ + VFMADD231PS(ZMM(11), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 3)*4)) \ + VFMADD231PS(ZMM(12), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 4)*4)) \ + VFMADD231PS(ZMM(13), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 5)*4)) \ + PREFETCH_C_L1_1 \ + VFMADD231PS(ZMM(14), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 6)*4)) \ + VFMADD231PS(ZMM(15), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 7)*4)) \ + VFMADD231PS(ZMM(16), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 8)*4)) \ + PREFETCH_A_L1_2(n) \ + VFMADD231PS(ZMM(17), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 9)*4)) \ + VFMADD231PS(ZMM(18), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+10)*4)) \ + VFMADD231PS(ZMM(19), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+11)*4)) \ + PREFETCH_C_L1_2 \ + VFMADD231PS(ZMM(20), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+12)*4)) \ + VFMADD231PS(ZMM(21), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+13)*4)) \ + VFMADD231PS(ZMM(22), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+14)*4)) \ + PREFETCH_C_L1_3 \ + VFMADD231PS(ZMM(23), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+15)*4)) \ + VFMADD231PS(ZMM(24), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+16)*4)) \ + VFMADD231PS(ZMM(25), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+17)*4)) \ + PREFETCH_B_L1(n) \ + VFMADD231PS(ZMM(26), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+18)*4)) \ + VFMADD231PS(ZMM(27), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+19)*4)) \ + VFMADD231PS(ZMM(28), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+20)*4)) \ + PREFETCH_B_L2(n) \ + VFMADD231PS(ZMM(29), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+21)*4)) \ + VFMADD231PS(ZMM(30), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+22)*4)) \ + VFMADD231PS(ZMM(31), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+23)*4)) + +//This is an array used for the scatter/gather instructions. 
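+//Note: each SUBITER(n) expansion performs one rank-1 update of the 24x16
+//accumulator held in ZMM8..ZMM31: one ZMM register holds 16 floats of B
+//(the next B vector is preloaded from MEM(RBX,(n+1)*64)), and each of the
+//24 FMAs broadcasts a single float of A through a {1to16} embedded-
+//broadcast memory operand. Conceptually, for one k-iteration:
+//
+//  for ( int i = 0; i < 24; i++ )
+//      for ( int j = 0; j < 16; j++ )
+//          c_acc[ i ][ j ] += a[ (n%4)*24 + i ] * b[ n ][ j ];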
+static int32_t offsets[32] __attribute__((aligned(64))) =
+    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
+     16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
+
+//#define MONITORS
+//#define LOOPMON
+void bli_sgemm_knl_asm_24x16
+     (
+       dim_t               k_,
+       float*     restrict alpha,
+       float*     restrict a,
+       float*     restrict b,
+       float*     restrict beta,
+       float*     restrict c, inc_t rs_c_, inc_t cs_c_,
+       auxinfo_t*          data,
+       cntx_t*    restrict cntx
+     )
+{
+    (void)data;
+    (void)cntx;
+
+    const float * a_next = bli_auxinfo_next_a( data );
+    const float * b_next = bli_auxinfo_next_b( data );
+
+    const int32_t * offsetPtr = &offsets[0];
+    const int64_t k = k_;
+    const int64_t rs_c = rs_c_;
+    const int64_t cs_c = cs_c_;
+
+#ifdef MONITORS
+    int toph, topl, both, botl, midl, midh, mid2l, mid2h;
+#endif
+#ifdef LOOPMON
+    int tlooph, tloopl, blooph, bloopl;
+#endif
+
+    __asm__ volatile
+    (
+#ifdef MONITORS
+        RDTSC
+        MOV(VAR(topl), EAX)
+        MOV(VAR(toph), EDX)
+#endif
+
+        VPXORD(ZMM(8), ZMM(8), ZMM(8)) //clear out registers
+        VMOVAPS(ZMM( 9), ZMM(8))   MOV(R12, VAR(rs_c))
+        VMOVAPS(ZMM(10), ZMM(8))   MOV(RSI, VAR(k)) //loop index
+        VMOVAPS(ZMM(11), ZMM(8))   MOV(RAX, VAR(a)) //load address of a
+        VMOVAPS(ZMM(12), ZMM(8))   MOV(RBX, VAR(b)) //load address of b
+        VMOVAPS(ZMM(13), ZMM(8))   MOV(RCX, VAR(c)) //load address of c
+        VMOVAPS(ZMM(14), ZMM(8))   VMOVAPD(ZMM(0), MEM(RBX)) //pre-load b
+        VMOVAPS(ZMM(15), ZMM(8))   MOV(RDI, VAR(offsetPtr))
+        VMOVAPS(ZMM(16), ZMM(8))   VMOVAPS(ZMM(4), MEM(RDI))
+#if SCATTER_PREFETCH_C
+        VMOVAPS(ZMM(17), ZMM(8))
+        VMOVAPS(ZMM(18), ZMM(8))
+        VMOVAPS(ZMM(19), ZMM(8))   VBROADCASTSS(ZMM(5), VAR(rs_c))
+        VMOVAPS(ZMM(20), ZMM(8))
+        VMOVAPS(ZMM(21), ZMM(8))   VPMULLD(ZMM(2), ZMM(4), ZMM(5))
+        VMOVAPS(ZMM(22), ZMM(8))   VMOVAPS(YMM(3), MEM(RDI,64))
+        VMOVAPS(ZMM(23), ZMM(8))   VPMULLD(YMM(3), YMM(3), YMM(5))
+#else
+        VMOVAPS(ZMM(17), ZMM(8))
+        VMOVAPS(ZMM(18), ZMM(8))   LEA(R13, MEM(R12,R12,2))
+        VMOVAPS(ZMM(19), ZMM(8))   LEA(R14, MEM(R12,R12,4))
+        VMOVAPS(ZMM(20), ZMM(8))   LEA(R15, MEM(R13,R12,4))
+        VMOVAPS(ZMM(21), ZMM(8))
+        VMOVAPS(ZMM(22), ZMM(8))
+        VMOVAPS(ZMM(23), ZMM(8))
+#endif
+        VMOVAPS(ZMM(24), ZMM(8))   VPSLLD(ZMM(4), ZMM(4), IMM(2))
+        VMOVAPS(ZMM(25), ZMM(8))   MOV(R8, IMM(4*24*4)) //offset for 4 iterations
+        VMOVAPS(ZMM(26), ZMM(8))   LEA(R9, MEM(R8,R8,2)) //*3
+        VMOVAPS(ZMM(27), ZMM(8))   LEA(R10, MEM(R8,R8,4)) //*5
+        VMOVAPS(ZMM(28), ZMM(8))   LEA(R11, MEM(R9,R8,4)) //*7
+        VMOVAPS(ZMM(29), ZMM(8))
+        VMOVAPS(ZMM(30), ZMM(8))
+        VMOVAPS(ZMM(31), ZMM(8))
+
+#ifdef MONITORS
+        RDTSC
+        MOV(VAR(midl), EAX)
+        MOV(VAR(midh), EDX)
+#endif
+
+        SUB(RSI, IMM(32))
+        JLE(TAIL)
+
+        //prefetch C into L2
+#if SCATTER_PREFETCH_C
+        ADD(RSI, IMM(24))
+        KXNORW(K(1), K(0), K(0))
+        KXNORW(K(2), K(0), K(0))
+        VSCATTERPFDPS(1, MEM(RCX,ZMM(2),8) MASK_K(1))
+        VSCATTERPFDPD(1, MEM(RCX,YMM(3),8) MASK_K(2))
+#else
+        PREFETCHW1(MEM(RCX      ))
+        SUBITER( 0,1,0,RAX      )
+        PREFETCHW1(MEM(RCX,R12,1))
+        SUBITER( 1,0,1,RAX      )
+        PREFETCHW1(MEM(RCX,R12,2))
+        SUBITER( 2,1,0,RAX      )
+        PREFETCHW1(MEM(RCX,R13,1))
+        SUBITER( 3,0,1,RAX      )
+        PREFETCHW1(MEM(RCX,R12,4))
+        SUBITER( 4,1,0,RAX,R8, 1)
+        PREFETCHW1(MEM(RCX,R14,1))
+        SUBITER( 5,0,1,RAX,R8, 1)
+        PREFETCHW1(MEM(RCX,R13,2))
+        SUBITER( 6,1,0,RAX,R8, 1)
+        PREFETCHW1(MEM(RCX,R15,1))
+        SUBITER( 7,0,1,RAX,R8, 1)
+
+        LEA(RDX, MEM(RCX,R12,8))
+
+        PREFETCHW1(MEM(RDX      ))
+        SUBITER( 8,1,0,RAX,R8, 2)
+        PREFETCHW1(MEM(RDX,R12,1))
+        SUBITER( 9,0,1,RAX,R8, 2)
+        PREFETCHW1(MEM(RDX,R12,2))
+        SUBITER(10,1,0,RAX,R8, 2)
+        PREFETCHW1(MEM(RDX,R13,1))
+        SUBITER(11,0,1,RAX,R8, 2)
+        PREFETCHW1(MEM(RDX,R12,4))
+        SUBITER(12,1,0,RAX,R9,
1) + PREFETCHW1(MEM(RDX,R14,1)) + SUBITER(13,0,1,RAX,R9, 1) + PREFETCHW1(MEM(RDX,R13,2)) + SUBITER(14,1,0,RAX,R9, 1) + PREFETCHW1(MEM(RDX,R15,1)) + SUBITER(15,0,1,RAX,R9, 1) + + LEA(RDI, MEM(RDX,R12,8)) + + PREFETCHW1(MEM(RDI )) + SUBITER(16,1,0,RAX,R8, 4) + PREFETCHW1(MEM(RDI,R12,1)) + SUBITER(17,0,1,RAX,R8, 4) + PREFETCHW1(MEM(RDI,R12,2)) + SUBITER(18,1,0,RAX,R8, 4) + PREFETCHW1(MEM(RDI,R13,1)) + SUBITER(19,0,1,RAX,R8, 4) + PREFETCHW1(MEM(RDI,R12,4)) + SUBITER(20,1,0,RAX,R10,1) + PREFETCHW1(MEM(RDI,R14,1)) + SUBITER(21,0,1,RAX,R10,1) + PREFETCHW1(MEM(RDI,R13,2)) + SUBITER(22,1,0,RAX,R10,1) + PREFETCHW1(MEM(RDI,R15,1)) + SUBITER(23,0,1,RAX,R10,1) + + ADD(RAX, IMM(24*24*4)) + ADD(RBX, IMM(24*16*4)) +#endif + + MOV(RDI, RSI) + AND(RDI, IMM(31)) + SAR(RSI, IMM(5)) + JZ(REM_1) + + LOOP_ALIGN + LABEL(MAIN_LOOP) + + SUBITER( 0,1,0,RAX ) + SUBITER( 1,0,1,RAX ) + SUBITER( 2,1,0,RAX ) + SUBITER( 3,0,1,RAX ) + SUBITER( 4,1,0,RAX,R8, 1) + SUBITER( 5,0,1,RAX,R8, 1) + SUBITER( 6,1,0,RAX,R8, 1) + SUBITER( 7,0,1,RAX,R8, 1) + SUBITER( 8,1,0,RAX,R8, 2) + SUBITER( 9,0,1,RAX,R8, 2) + SUBITER(10,1,0,RAX,R8, 2) + SUBITER(11,0,1,RAX,R8, 2) + SUBITER(12,1,0,RAX,R9, 1) + SUBITER(13,0,1,RAX,R9, 1) + SUBITER(14,1,0,RAX,R9, 1) + SUBITER(15,0,1,RAX,R9, 1) + SUBITER(16,1,0,RAX,R8, 4) + SUBITER(17,0,1,RAX,R8, 4) + SUBITER(18,1,0,RAX,R8, 4) + SUBITER(19,0,1,RAX,R8, 4) + SUBITER(20,1,0,RAX,R10,1) + SUBITER(21,0,1,RAX,R10,1) + SUBITER(22,1,0,RAX,R10,1) + SUBITER(23,0,1,RAX,R10,1) + SUBITER(24,1,0,RAX,R9, 2) + SUBITER(25,0,1,RAX,R9, 2) + SUBITER(26,1,0,RAX,R9, 2) + SUBITER(27,0,1,RAX,R9, 2) + SUBITER(28,1,0,RAX,R11,1) + SUBITER(29,0,1,RAX,R11,1) + SUBITER(30,1,0,RAX,R11,1) + SUBITER(31,0,1,RAX,R11,1) + + ADD(RAX, IMM(32*24*4)) + ADD(RBX, IMM(32*16*4)) + + SUB(RSI, IMM(1)) + + JNZ(MAIN_LOOP) + + LABEL(REM_1) + SAR1(RDI) + JNC(REM_2) + + SUBITER(0,1,0,RAX) + VMOVAPD(ZMM(0), ZMM(1)) + ADD(RAX, IMM(24*4)) + ADD(RBX, IMM(16*4)) + + LABEL(REM_2) + SAR1(RDI) + JNC(REM_4) + + SUBITER(0,1,0,RAX) + SUBITER(1,0,1,RAX) + ADD(RAX, IMM(2*24*4)) + ADD(RBX, IMM(2*16*4)) + + LABEL(REM_4) + SAR1(RDI) + JNC(REM_8) + + SUBITER(0,1,0,RAX) + SUBITER(1,0,1,RAX) + SUBITER(2,1,0,RAX) + SUBITER(3,0,1,RAX) + ADD(RAX, IMM(4*24*4)) + ADD(RBX, IMM(4*16*4)) + + LABEL(REM_8) + SAR1(RDI) + JNC(REM_16) + + SUBITER(0,1,0,RAX ) + SUBITER(1,0,1,RAX ) + SUBITER(2,1,0,RAX ) + SUBITER(3,0,1,RAX ) + SUBITER(4,1,0,RAX,R8,1) + SUBITER(5,0,1,RAX,R8,1) + SUBITER(6,1,0,RAX,R8,1) + SUBITER(7,0,1,RAX,R8,1) + ADD(RAX, IMM(8*24*4)) + ADD(RBX, IMM(8*16*4)) + + LABEL(REM_16) + SAR1(RDI) + JNC(AFTER_LOOP) + + SUBITER( 0,1,0,RAX ) + SUBITER( 1,0,1,RAX ) + SUBITER( 2,1,0,RAX ) + SUBITER( 3,0,1,RAX ) + SUBITER( 4,1,0,RAX,R8, 1) + SUBITER( 5,0,1,RAX,R8, 1) + SUBITER( 6,1,0,RAX,R8, 1) + SUBITER( 7,0,1,RAX,R8, 1) + SUBITER( 8,1,0,RAX,R8, 2) + SUBITER( 9,0,1,RAX,R8, 2) + SUBITER(10,1,0,RAX,R8, 2) + SUBITER(11,0,1,RAX,R8, 2) + SUBITER(12,1,0,RAX,R9, 1) + SUBITER(13,0,1,RAX,R9, 1) + SUBITER(14,1,0,RAX,R9, 1) + SUBITER(15,0,1,RAX,R9, 1) + ADD(RAX, IMM(16*24*4)) + ADD(RBX, IMM(16*16*4)) + + LABEL(AFTER_LOOP) + + //prefetch C into L1 +#if SCATTER_PREFETCH_C + KXNORW(K(1), K(0), K(0)) + KXNORW(K(2), K(0), K(0)) + VSCATTERPFDPS(0, MEM(RCX,ZMM(2),8) MASK_K(1)) + VSCATTERPFDPD(0, MEM(RCX,YMM(3),8) MASK_K(2)) + + SUBITER(0,1,0,RAX ) + SUBITER(1,0,1,RAX ) + SUBITER(2,1,0,RAX ) + SUBITER(3,0,1,RAX ) + SUBITER(4,1,0,RAX,R8,1) + SUBITER(5,0,1,RAX,R8,1) + SUBITER(6,1,0,RAX,R8,1) + SUBITER(7,0,1,RAX,R8,1) +#else + + LEA(RDX, MEM(RCX,R12,8)) + LEA(RDI, MEM(RDX,R12,8)) + +#undef PREFETCH_C_L1_1 
+#undef PREFETCH_C_L1_2 +#undef PREFETCH_C_L1_3 +#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX )) +#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R12,1)) +#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RCX,R12,2)) + SUBITER(0,1,0,RAX ) +#undef PREFETCH_C_L1_1 +#undef PREFETCH_C_L1_2 +#undef PREFETCH_C_L1_3 +#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX,R13,1)) +#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R12,4)) +#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RCX,R14,1)) + SUBITER(1,0,1,RAX ) +#undef PREFETCH_C_L1_1 +#undef PREFETCH_C_L1_2 +#undef PREFETCH_C_L1_3 +#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX,R13,2)) +#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R15,1)) +#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX )) + SUBITER(2,1,0,RAX ) +#undef PREFETCH_C_L1_1 +#undef PREFETCH_C_L1_2 +#undef PREFETCH_C_L1_3 +#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R12,1)) +#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R12,2)) +#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX,R13,1)) + SUBITER(3,0,1,RAX ) +#undef PREFETCH_C_L1_1 +#undef PREFETCH_C_L1_2 +#undef PREFETCH_C_L1_3 +#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R12,4)) +#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R14,1)) +#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX,R13,2)) + SUBITER(4,1,0,RAX,R8,1) +#undef PREFETCH_C_L1_1 +#undef PREFETCH_C_L1_2 +#undef PREFETCH_C_L1_3 +#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R15,1)) +#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI )) +#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R12,1)) + SUBITER(5,0,1,RAX,R8,1) +#undef PREFETCH_C_L1_1 +#undef PREFETCH_C_L1_2 +#undef PREFETCH_C_L1_3 +#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDI,R12,2)) +#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI,R13,1)) +#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R12,4)) + SUBITER(6,1,0,RAX,R8,1) +#undef PREFETCH_C_L1_1 +#undef PREFETCH_C_L1_2 +#undef PREFETCH_C_L1_3 +#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDI,R14,1)) +#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI,R13,2)) +#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R15,1)) + SUBITER(7,0,1,RAX,R8,1) +#endif + + JMP(POSTACCUM) + + LABEL(TAIL) + + MOV(RDX, RCX) + ADD(RSI, IMM(32)) + JZ(POSTACCUM) + + LABEL(TAIL_LOOP) + + PREFETCHW0(MEM(RDX)) + ADD(RDX, R12) + + SUBITER(0,1,0,RAX) + VMOVAPD(ZMM(0), ZMM(1)) + ADD(RAX, IMM(24*4)) + ADD(RBX, IMM(16*4)) + + SUB(RSI, IMM(1)) + + JNZ(TAIL_LOOP) + + LABEL(POSTACCUM) + +#ifdef MONITORS + RDTSC + MOV(VAR(mid2l), EAX) + MOV(VAR(mid2h), EDX) +#endif + + MOV(RAX, VAR(alpha)) + MOV(RBX, VAR(beta)) + VBROADCASTSS(ZMM(0), MEM(RAX)) + VBROADCASTSS(ZMM(1), MEM(RBX)) + + // Check if C is row stride. 
If not, jump to the slow scattered update + MOV(RAX, VAR(rs_c)) + LEA(RAX, MEM(,RAX,4)) + MOV(RBX, VAR(cs_c)) + LEA(RDI, MEM(RAX,RAX,2)) + CMP(RBX, IMM(1)) + JNE(SCATTEREDUPDATE) + + VMOVD(EDX, XMM(1)) + SAL1(EDX) //shift out sign bit + JZ(COLSTORBZ) + + UPDATE_C_FOUR_ROWS( 8, 9,10,11) + UPDATE_C_FOUR_ROWS(12,13,14,15) + UPDATE_C_FOUR_ROWS(16,17,18,19) + UPDATE_C_FOUR_ROWS(20,21,22,23) + UPDATE_C_FOUR_ROWS(24,25,26,27) + UPDATE_C_FOUR_ROWS(28,29,30,31) + + JMP(END) + + LABEL(COLSTORBZ) + + UPDATE_C_BZ_FOUR_ROWS( 8, 9,10,11) + UPDATE_C_BZ_FOUR_ROWS(12,13,14,15) + UPDATE_C_BZ_FOUR_ROWS(16,17,18,19) + UPDATE_C_BZ_FOUR_ROWS(20,21,22,23) + UPDATE_C_BZ_FOUR_ROWS(24,25,26,27) + UPDATE_C_BZ_FOUR_ROWS(28,29,30,31) + + JMP(END) + + LABEL(SCATTEREDUPDATE) + + MOV(RDI, VAR(offsetPtr)) + VMOVAPS(ZMM(2), MEM(RDI)) + /* Note that this ignores the upper 32 bits in cs_c */ + VPBROADCASTD(ZMM(3), EBX) + VPMULLD(ZMM(2), ZMM(3), ZMM(2)) + + VMOVD(EDX, XMM(1)) + SAL1(EDX) //shift out sign bit + JZ(SCATTERBZ) + + UPDATE_C_ROW_SCATTERED( 8) + UPDATE_C_ROW_SCATTERED( 9) + UPDATE_C_ROW_SCATTERED(10) + UPDATE_C_ROW_SCATTERED(11) + UPDATE_C_ROW_SCATTERED(12) + UPDATE_C_ROW_SCATTERED(13) + UPDATE_C_ROW_SCATTERED(14) + UPDATE_C_ROW_SCATTERED(15) + UPDATE_C_ROW_SCATTERED(16) + UPDATE_C_ROW_SCATTERED(17) + UPDATE_C_ROW_SCATTERED(18) + UPDATE_C_ROW_SCATTERED(19) + UPDATE_C_ROW_SCATTERED(20) + UPDATE_C_ROW_SCATTERED(21) + UPDATE_C_ROW_SCATTERED(22) + UPDATE_C_ROW_SCATTERED(23) + UPDATE_C_ROW_SCATTERED(24) + UPDATE_C_ROW_SCATTERED(25) + UPDATE_C_ROW_SCATTERED(26) + UPDATE_C_ROW_SCATTERED(27) + UPDATE_C_ROW_SCATTERED(28) + UPDATE_C_ROW_SCATTERED(29) + UPDATE_C_ROW_SCATTERED(30) + UPDATE_C_ROW_SCATTERED(31) + + JMP(END) + + LABEL(SCATTERBZ) + + UPDATE_C_BZ_ROW_SCATTERED( 8) + UPDATE_C_BZ_ROW_SCATTERED( 9) + UPDATE_C_BZ_ROW_SCATTERED(10) + UPDATE_C_BZ_ROW_SCATTERED(11) + UPDATE_C_BZ_ROW_SCATTERED(12) + UPDATE_C_BZ_ROW_SCATTERED(13) + UPDATE_C_BZ_ROW_SCATTERED(14) + UPDATE_C_BZ_ROW_SCATTERED(15) + UPDATE_C_BZ_ROW_SCATTERED(16) + UPDATE_C_BZ_ROW_SCATTERED(17) + UPDATE_C_BZ_ROW_SCATTERED(18) + UPDATE_C_BZ_ROW_SCATTERED(19) + UPDATE_C_BZ_ROW_SCATTERED(20) + UPDATE_C_BZ_ROW_SCATTERED(21) + UPDATE_C_BZ_ROW_SCATTERED(22) + UPDATE_C_BZ_ROW_SCATTERED(23) + UPDATE_C_BZ_ROW_SCATTERED(24) + UPDATE_C_BZ_ROW_SCATTERED(25) + UPDATE_C_BZ_ROW_SCATTERED(26) + UPDATE_C_BZ_ROW_SCATTERED(27) + UPDATE_C_BZ_ROW_SCATTERED(28) + UPDATE_C_BZ_ROW_SCATTERED(29) + UPDATE_C_BZ_ROW_SCATTERED(30) + UPDATE_C_BZ_ROW_SCATTERED(31) + + LABEL(END) + +#ifdef MONITORS + RDTSC + MOV(VAR(botl), EAX) + MOV(VAR(both), EDX) +#endif + : // output operands +#ifdef MONITORS + [topl] "=m" (topl), + [toph] "=m" (toph), + [midl] "=m" (midl), + [midh] "=m" (midh), + [mid2l] "=m" (mid2l), + [mid2h] "=m" (mid2h), + [botl] "=m" (botl), + [both] "=m" (both) +#endif + : // input operands + [k] "m" (k), + [a] "m" (a), + [b] "m" (b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [a_next] "m" (a_next), + [b_next] "m" (b_next), + [offsetPtr] "m" (offsetPtr) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", + "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", + "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", + "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", + "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", + "zmm30", "zmm31", "memory" + ); + +#ifdef LOOPMON + printf("looptime = \t%d\n", 
bloopl - tloopl);
+#endif
+#ifdef MONITORS
+    dim_t top = ((dim_t)toph << 32) | topl;
+    dim_t mid = ((dim_t)midh << 32) | midl;
+    dim_t mid2 = ((dim_t)mid2h << 32) | mid2l;
+    dim_t bot = ((dim_t)both << 32) | botl;
+    printf("setup =\t%u\tmain loop =\t%u\tcleanup=\t%u\ttotal=\t%u\n", mid - top, mid2 - mid, bot - mid2, bot - top);
+#endif
+}
diff --git a/kernels/knl/3/bli_dgemm_knl_asm_12x16.c b/kernels/knl/3/other/bli_dgemm_knl_asm_12x16.c
similarity index 100%
rename from kernels/knl/3/bli_dgemm_knl_asm_12x16.c
rename to kernels/knl/3/other/bli_dgemm_knl_asm_12x16.c
diff --git a/kernels/knl/3/bli_dgemm_knl_asm_30x8.c b/kernels/knl/3/other/bli_dgemm_knl_asm_30x8.c
similarity index 100%
rename from kernels/knl/3/bli_dgemm_knl_asm_30x8.c
rename to kernels/knl/3/other/bli_dgemm_knl_asm_30x8.c
diff --git a/kernels/knl/3/bli_dgemm_knl_asm_30x8_knc.c b/kernels/knl/3/other/bli_dgemm_knl_asm_30x8_knc.c
similarity index 100%
rename from kernels/knl/3/bli_dgemm_knl_asm_30x8_knc.c
rename to kernels/knl/3/other/bli_dgemm_knl_asm_30x8_knc.c
diff --git a/kernels/knl/3/bli_dgemm_knl_asm_8x24.c b/kernels/knl/3/other/bli_dgemm_knl_asm_8x24.c
similarity index 100%
rename from kernels/knl/3/bli_dgemm_knl_asm_8x24.c
rename to kernels/knl/3/other/bli_dgemm_knl_asm_8x24.c
diff --git a/kernels/knl/3/bli_sgemm_knl_asm_30x16_knc.c b/kernels/knl/3/other/bli_sgemm_knl_asm_30x16_knc.c
similarity index 100%
rename from kernels/knl/3/bli_sgemm_knl_asm_30x16_knc.c
rename to kernels/knl/3/other/bli_sgemm_knl_asm_30x16_knc.c
diff --git a/kernels/knl/bli_kernels_knl.h b/kernels/knl/bli_kernels_knl.h
index 15f34af77..457d59fab 100644
--- a/kernels/knl/bli_kernels_knl.h
+++ b/kernels/knl/bli_kernels_knl.h
@@ -32,13 +32,19 @@
 */
-GEMM_UKR_PROT( double, d, gemm_knl_asm_12x16 )
+GEMM_UKR_PROT( float,  s, gemm_knl_asm_24x16 )
 GEMM_UKR_PROT( double, d, gemm_knl_asm_24x8 )
-GEMM_UKR_PROT( double, d, gemm_knl_asm_30x8 )
-GEMM_UKR_PROT( double, d, gemm_knl_asm_8x24 )
+
+PACKM_KER_PROT( float,  s, packm_knl_asm_24xk )
+PACKM_KER_PROT( float,  s, packm_knl_asm_16xk )
 PACKM_KER_PROT( double, d, packm_knl_asm_24xk )
 PACKM_KER_PROT( double, d, packm_knl_asm_8xk )
+
+// unused:
+GEMM_UKR_PROT( double, d, gemm_knl_asm_12x16 )
+GEMM_UKR_PROT( double, d, gemm_knl_asm_30x8 )
+GEMM_UKR_PROT( double, d, gemm_knl_asm_8x24 )
+
 PACKM_KER_PROT( double, d, packm_knl_asm_30xk )
diff --git a/kernels/skx/3/bli_avx512_macros.h b/kernels/skx/3/bli_avx512_macros.h
deleted file mode 100644
index 5cc45200a..000000000
--- a/kernels/skx/3/bli_avx512_macros.h
+++ /dev/null
@@ -1,173 +0,0 @@
-#ifndef BLIS_AVX512_MACROS_H
-#define BLIS_AVX512_MACROS_H
-
-//
-// Assembly macros to make AVX-512 with AT&T syntax somewhat less painful
-//
-
-#define COMMENT_BEGIN "#"
-#define COMMENT_END
-
-#define STRINGIFY(...) #__VA_ARGS__
-#define ASM(...)
STRINGIFY(__VA_ARGS__) "\n\t" -#define LABEL(label) STRINGIFY(label) ":\n\t" - -#define XMM(x) %%xmm##x -#define YMM(x) %%ymm##x -#define ZMM(x) %%zmm##x -#define EAX %%eax -#define EBX %%ebx -#define ECX %%ecx -#define EDX %%edx -#define EBP %%ebp -#define EDI %%edi -#define ESI %%esi -#define RAX %%rax -#define RBX %%rbx -#define RCX %%rcx -#define RDX %%rdx -#define RBP %%rbp -#define RDI %%rdi -#define RSI %%rsi -#define K(x) %%k##x -#define R(x) %%r##x -#define R8 %%r8 -#define R9 %%r9 -#define R10 %%r10 -#define R11 %%r11 -#define R12 %%r12 -#define R13 %%r13 -#define R14 %%r14 -#define R15 %%r15 -#define RD(x) %%r##x##d -#define R8D %%r8d -#define R9D %%r9d -#define R10D %%r10d -#define R11D %%r11d -#define R12D %%r12d -#define R13D %%r13d -#define R14D %%r14d -#define R15D %%r15d -#define IMM(x) $##x -#define VAR(x) %[x] - -#define MEM_4(reg,off,scale,disp) disp(reg,off,scale) -#define MEM_3(reg,off,scale) (reg,off,scale) -#define MEM_2(reg,disp) disp(reg) -#define MEM_1(reg) (reg) - -#define MEM_1TO8_4(reg,off,scale,disp) MEM(reg,off,scale,disp) %{1to8%} -#define MEM_1TO8_3(reg,off,scale) MEM(reg,off,scale) %{1to8%} -#define MEM_1TO8_2(reg,disp) MEM(reg,disp) %{1to8%} -#define MEM_1TO8_1(reg) MEM(reg) %{1to8%} - -#define MEM_1TO16_4(reg,off,scale,disp) MEM(reg,off,scale,disp) %{1to16%} -#define MEM_1TO16_3(reg,off,scale) MEM(reg,off,scale) %{1to16%} -#define MEM_1TO16_2(reg,disp) MEM(reg,disp) %{1to16%} -#define MEM_1TO16_1(reg) MEM(reg) %{1to16%} - -#define GET_MACRO(_1,_2,_3,_4,NAME,...) NAME -#define MEM(...) GET_MACRO(__VA_ARGS__,MEM_4,MEM_3,MEM_2,MEM_1)(__VA_ARGS__) -#define MEM_1TO8(...) GET_MACRO(__VA_ARGS__,MEM_1TO8_4,MEM_1TO8_3,MEM_1TO8_2,MEM_1TO8_1)(__VA_ARGS__) -#define MEM_1TO16(...) GET_MACRO(__VA_ARGS__,MEM_1TO16_4,MEM_1TO16_3,MEM_1TO16_2,MEM_1TO16_1)(__VA_ARGS__) - -#define MASK_K(n) %{%%k##n%} -#define MASK_KZ(n) %{%%k##n%}%{z%} -#define KMOV(to,from) ASM(kmovw from, to) -#define JKNZD(kreg,label) \ - ASM(kortestw kreg, kreg) \ - ASM(jnz label) -#define KXNORW(_0, _1, _2) ASM(kxnorw _2, _1, _0) -#define KSHIFTRW(_0, _1, _2) ASM(kshiftrw _2, _1, _0) - -#define ALIGN16 ASM(.p2align 4) -#define ALIGN32 ASM(.p2align 5) -#define RDTSC ASM(rdstc) -#define MOV(_0, _1) ASM(mov _1, _0) -#define MOVD(_0, _1) ASM(movd _1, _0) -#define MOVL(_0, _1) ASM(movl _1, _0) -#define MOVQ(_0, _1) ASM(movq _1, _0) -#define VMOVD(_0, _1) ASM(vmovd _1, _0) -#define VMOVQ(_0, _1) ASM(vmovq _1, _0) -#define CMP(_0, _1) ASM(cmp _1, _0) -#define AND(_0, _1) ASM(and _1, _0) -#define ADD(_0, _1) ASM(add _1, _0) -#define SUB(_0, _1) ASM(sub _1, _0) -#define SAL(_0, _1) ASM(sal _1, _0) -#define SHLX(_0, _1, _2) ASM(shlx _2, _1, _0) -#define SAR(_0, _1) ASM(sar _1, _0) -#define SAL1(_0) ASM(sal _0) -#define SAR1(_0) ASM(sar _0) -#define LEA(_0, _1) ASM(lea _1, _0) -#define TEST(_0, _1) ASM(test _1, _0) -#define DEC(_0) ASM(dec _0) -#define JLE(_0) ASM(jle _0) -#define JL(_0) ASM(jl _0) -#define JNZ(_0) ASM(jnz _0) -#define JZ(_0) ASM(jz _0) -#define JNE(_0) ASM(jne _0) -#define JE(_0) ASM(je _0) -#define JNC(_0) ASM(jnc _0) -#define JC(_0) ASM(jc _0) -#define JMP(_0) ASM(jmp _0) -#define VCOMISS(_0, _1) ASM(vcomiss _1, _0) -#define VCOMISD(_0, _1) ASM(vcomisd _1, _0) -#define VGATHERDPS(_0, _1) ASM(vgatherdps _1, _0) -#define VSCATTERDPS(_0, _1) ASM(vscatterdps _1, _0) -#define VGATHERDPD(_0, _1) ASM(vgatherdpd _1, _0) -#define VSCATTERDPD(_0, _1) ASM(vscatterdpd _1, _0) -#define VGATHERQPS(_0, _1) ASM(vgatherqps _1, _0) -#define VSCATTERQPS(_0, _1) ASM(vscatterqps _1, _0) -#define VGATHERQPD(_0, 
_1) ASM(vgatherqpd _1, _0) -#define VSCATTERQPD(_0, _1) ASM(vscatterqpd _1, _0) -#define VMULSS(_0, _1, _2) ASM(vmulss _2, _1, _0) -#define VMULSD(_0, _1, _2) ASM(vmulsd _2, _1, _0) -#define VMULPS(_0, _1, _2) ASM(vmulps _2, _1, _0) -#define VMULPD(_0, _1, _2) ASM(vmulpd _2, _1, _0) -#define VPMULLD(_0, _1, _2) ASM(vpmulld _2, _1, _0) -#define VPMULLQ(_0, _1, _2) ASM(vpmullq _2, _1, _0) -#define VPADDD(_0, _1, _2) ASM(vpaddd _2, _1, _0) -#define VPSLLD(_0, _1, _2) ASM(vpslld _2, _1, _0) -#define VPXORD(_0, _1, _2) ASM(vpxord _2, _1, _0) -#define VXORPD(_0, _1, _2) ASM(vxorpd _2, _1, _0) -#define VFMADD132PS(_0, _1, _2) ASM(vfmadd132ps _2, _1, _0) -#define VFMADD213PS(_0, _1, _2) ASM(vfmadd213ps _2, _1, _0) -#define VFMADD231PS(_0, _1, _2) ASM(vfmadd231ps _2, _1, _0) -#define VFMADD132PD(_0, _1, _2) ASM(vfmadd132pd _2, _1, _0) -#define VFMADD213PD(_0, _1, _2) ASM(vfmadd213pd _2, _1, _0) -#define VFMADD231PD(_0, _1, _2) ASM(vfmadd231pd _2, _1, _0) -#define VMOVDQA(_0, _1) ASM(vmovdqa _1, _0) -#define VMOVDQA32(_0, _1) ASM(vmovdqa32 _1, _0) -#define VMOVDQA64(_0, _1) ASM(vmovdqa64 _1, _0) -#define VMOVSS(_0, _1) ASM(vmovss _1, _0) -#define VMOVSD(_0, _1) ASM(vmovsd _1, _0) -#define VMOVAPS(_0, _1) ASM(vmovaps _1, _0) -#define VMOVUPS(_0, _1) ASM(vmovups _1, _0) -#define VMOVAPD(_0, _1) ASM(vmovapd _1, _0) -#define VMOVUPD(_0, _1) ASM(vmovupd _1, _0) -#define VBROADCASTSS(_0, _1) ASM(vbroadcastss _1, _0) -#define VBROADCASTSD(_0, _1) ASM(vbroadcastsd _1, _0) -#define VPBROADCASTD(_0, _1) ASM(vpbroadcastd _1, _0) -#define VPBROADCASTQ(_0, _1) ASM(vpbroadcastq _1, _0) -#define VBROADCASTF64X4(_0, _1) ASM(vbroadcastf64x4 _1, _0) -#define VINSERTF64X4(_0, _1, _2, _3) ASM(vinsertf64x4 _3, _2, _1, _0) -#define VEXTRACTF64X4(_0, _1, _2) ASM(vextractf64x4 _2, _1, _0) -#define VINSERTF128(_0, _1, _2) ASM(vinsertf128 _2, _1, _0) -#define VEXTRACTF128(_0, _1, _2) ASM(vextractf128 _2, _1, _0) -#define VUNPCKLPD(_0, _1, _2) ASM(vunpcklpd _2, _1, _0) -#define VUNPCKHPD(_0, _1, _2) ASM(vunpckhpd _2, _1, _0) -#define VSHUFF64X2(_0, _1, _2, _3) ASM(vshuff64x2 _3, _2, _1, _0) -#define VUNPCKLPS(_0, _1, _2) ASM(vunpcklps _2, _1, _0) -#define VUNPCKHPS(_0, _1, _2) ASM(vunpckhps _2, _1, _0) -#define VSHUFPS(_0, _1, _2, _3) ASM(vshufps _3, _2, _1, _0) -#define VPERM2F128(_0, _1, _2, _3) ASM(vperm2f128 _3, _2, _1, _0) -#define PREFETCH(LEVEL,ADDRESS) ASM(prefetcht##LEVEL ADDRESS) -#define PREFETCHW0(ADDRESS) ASM(prefetchw ADDRESS) -#define PREFETCHW1(ADDRESS) ASM(prefetchwt1 ADDRESS) -#define VGATHERPFDPS(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dps ADDRESS) -#define VSCATTERPFDPS(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dps ADDRESS) -#define VGATHERPFDPD(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dpd ADDRESS) -#define VSCATTERPFDPD(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dpd ADDRESS) -#define VZEROUPPER() ASM(vzeroupper) - -#endif diff --git a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c index b219e8c33..d9c524059 100644 --- a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c +++ b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c @@ -33,7 +33,6 @@ */ #include "blis.h" -#include #include "bli_avx512_macros.h" diff --git a/test/3m4m/Makefile b/test/3m4m/Makefile index 1367e2334..e5aa84dfa 100644 --- a/test/3m4m/Makefile +++ b/test/3m4m/Makefile @@ -214,23 +214,17 @@ PDEF_MT := -DP_BEGIN=200 \ # --- Targets/rules ------------------------------------------------------------ # -all: blis-all openblas-all +all-st: blis-st openblas-st mkl-st +all-mt: blis-mt openblas-mt mkl-mt -intel: blis-all openblas-all 
mkl-all +blis-st: blis-gemm-st +blis-mt: blis-gemm-mt -amd: blis-all openblas-all acml-all +openblas-st: openblas-gemm-st +openblas-mt: openblas-gemm-mt -blis-all: blis-gemm-st \ - blis-gemm-mt - -openblas-all: openblas-gemm-st \ - openblas-gemm-mt - -mkl-all: mkl-gemm-st \ - mkl-gemm-mt - -acml-all: acml-gemm-st \ - acml-gemm-mt +mkl-st: mkl-gemm-st +mkl-mt: mkl-gemm-mt blis-gemm-st: \ test_sgemm_asm_blis_st.x \ @@ -294,18 +288,6 @@ mkl-gemm-mt: \ test_cgemm_mkl_mt.x \ test_zgemm_mkl_mt.x -acml-gemm-st: \ - test_sgemm_acml_st.x \ - test_dgemm_acml_st.x \ - test_cgemm_acml_st.x \ - test_zgemm_acml_st.x - -acml-gemm-mt: \ - test_sgemm_acml_mt.x \ - test_dgemm_acml_mt.x \ - test_cgemm_acml_mt.x \ - test_zgemm_acml_mt.x - # --Object file rules -- @@ -466,31 +448,6 @@ test_z%_mkl_mt.o: test_%.c test_c%_mkl_mt.o: test_%.c $(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_MT) -c $< -o $@ -# acml -test_d%_acml_st.o: test_%.c - $(CC) $(CFLAGS) $(PDEF_ST) $(DT_D) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_ST) -c $< -o $@ - -test_s%_acml_st.o: test_%.c - $(CC) $(CFLAGS) $(PDEF_ST) $(DT_S) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_ST) -c $< -o $@ - -test_z%_acml_st.o: test_%.c - $(CC) $(CFLAGS) $(PDEF_ST) $(DT_Z) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_ST) -c $< -o $@ - -test_c%_acml_st.o: test_%.c - $(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_ST) -c $< -o $@ - -test_d%_acml_mt.o: test_%.c - $(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_MT) -c $< -o $@ - -test_s%_acml_mt.o: test_%.c - $(CC) $(CFLAGS) $(PDEF_MT) $(DT_S) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_MT) -c $< -o $@ - -test_z%_acml_mt.o: test_%.c - $(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_MT) -c $< -o $@ - -test_c%_acml_mt.o: test_%.c - $(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_MT) -c $< -o $@ - # -- Executable file rules -- @@ -511,12 +468,6 @@ test_%_mkl_st.x: test_%_mkl_st.o $(LIBBLIS_LINK) test_%_mkl_mt.x: test_%_mkl_mt.o $(LIBBLIS_LINK) $(LINKER) $< $(MKLP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@ -test_%_acml_st.x: test_%_acml_st.o $(LIBBLIS_LINK) - $(LINKER) $< $(ACML_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@ - -test_%_acml_mt.x: test_%_acml_mt.o $(LIBBLIS_LINK) - $(LINKER) $< $(ACMLP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@ - test_%_blis_st.x: test_%_blis_st.o $(LIBBLIS_LINK) $(LINKER) $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@
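
# Note: with the ACML targets removed, the aggregate targets are now split
# by threading rather than grouped per vendor; for example:
#
#   make all-st    # single-threaded BLIS, OpenBLAS, and MKL gemm drivers
#   make blis-mt   # only the multithreaded BLIS gemm drivers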