Updates to knl kernels and related code.

Details:
- Imported the 24x16 knl sgemm microkernel (and its corresponding spackm
  kernel) from TBLIS and enabled its use in the knl sub-config. Also
  added the sgemm microkernel prototype to bli_kernels_knl.h.
- Updated dgemm and dpackm microkernels from TBLIS, which included an
  important change regarding the offsets array (changed from extern
  declaration to static declaration/definition).
- Activated use of level-1v and -1f zen kernels in skx and knl
  sub-configs.
- Removed some old macros no longer needed in bli_family_skx.h now that
  libmemkind support exists in configure.
- Moved bli_avx512_macros.h to frame/include and adjusted #includes in
  skx and knl kernels accordingly.
- Moved unused kernels in kernels/knl/3 to kernels/knl/3/other
  directory.
- Fixed a minor bug in the 'make' output per compile when verboseness
  is not turned on. The rule-generating function 'make-kernel-rule' was
  previously passing in the name of the config, rather than the name of
  the kernel set returned by get-config-for-kset, which could give
  misleading information to the user when the kconfig_map mapped a
  kernel set to a sub-configuration that did not share the same name.
  (This didn't affect the CFLAGS that were actually used.)
- Updated test/3m4m/Makefile, removing acml targets and renaming the
  remaining targets.
This commit is contained in:
Field G. Van Zee
2018-04-16 18:46:21 -05:00
parent 2b7108a8ef
commit 60366a3fab
22 changed files with 1426 additions and 267 deletions

View File

@@ -537,7 +537,7 @@ $(BASE_OBJ_KERNELS_PATH)/$(1)/%.o: $(KERNELS_PATH)/$(1)/%.$(3) $(BLIS_H_FLAT) $(
ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
$(CC) $(call get-kernel-cflags-for,$(2)) -c $$< -o $$@
else
@echo "Compiling $$@" $(call get-kernel-text-for,$(1))
@echo "Compiling $$@" $(call get-kernel-text-for,$(2))
@$(CC) $(call get-kernel-cflags-for,$(2)) -c $$< -o $$@
endif
endef

View File

@@ -47,8 +47,9 @@ void bli_cntx_init_knl( cntx_t* cntx )
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
(
1,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knl_asm_24x8, FALSE,
2,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_knl_asm_24x16, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knl_asm_24x8, FALSE,
cntx
);
@@ -61,26 +62,77 @@ void bli_cntx_init_knl( cntx_t* cntx )
cntx
);
// Update the context with optimized level-1f kernels.
bli_cntx_set_l1f_kers
(
4,
// axpyf
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
// dotxf
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
cntx
);
// Update the context with optimized level-1v kernels.
bli_cntx_set_l1v_kers
(
10,
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
// axpyv
#if 0
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
#else
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
#endif
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
// dotxv
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
// scalv
#if 0
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
#else
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
#endif
cntx
);
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 24, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 );
bli_blksz_init ( &blkszs[ BLIS_MC ], -1, 120, -1, -1,
-1, 144, -1, -1 );
bli_blksz_init ( &blkszs[ BLIS_KC ], -1, 336, -1, -1,
-1, 420, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 14400, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 24, 24, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 );
bli_blksz_init ( &blkszs[ BLIS_MC ], 240, 120, -1, -1,
288, 144, -1, -1 );
bli_blksz_init ( &blkszs[ BLIS_KC ], 336, 336, -1, -1,
408, 408, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 14400, 14400, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
BLIS_NAT, 7,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
// level-1f
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
cntx
);
}

View File

@@ -48,8 +48,54 @@ void bli_cntx_init_skx( cntx_t* cntx )
bli_cntx_set_l3_nat_ukrs
(
2,
BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x12_l2, FALSE,
// gemm
BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x12_l2, FALSE,
cntx
);
// Update the context with optimized level-1f kernels.
bli_cntx_set_l1f_kers
(
4,
// axpyf
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
// dotxf
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
cntx
);
// Update the context with optimized level-1v kernels.
bli_cntx_set_l1v_kers
(
10,
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
// axpyv
#if 0
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
#else
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
#endif
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
// dotxv
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
// scalv
#if 0
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
#else
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
#endif
cntx
);
@@ -59,19 +105,25 @@ void bli_cntx_init_skx( cntx_t* cntx )
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 12, 8, 4 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 480, 240, 144, 72 );
bli_blksz_init ( &blkszs[ BLIS_KC ], 384, 384, 256, 256,
480, 480, 256, 256 );
480, 480, 256, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 3072, 4080, 4080 );
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
BLIS_NAT, 7,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
// level-1f
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
cntx
);
}

View File

@@ -50,10 +50,10 @@
#define BLIS_SIMD_SIZE 64
#define BLIS_SIMD_NUM_REGISTERS 32
#include <stdlib.h>
//#include <stdlib.h>
#define BLIS_MALLOC_POOL malloc
#define BLIS_FREE_POOL free
//#define BLIS_MALLOC_POOL malloc
//#define BLIS_FREE_POOL free
#if 0

View File

@@ -62,6 +62,7 @@ void bli_cntx_init_zen( cntx_t* cntx )
cntx
);
// Update the context with optimized level-1f kernels.
bli_cntx_set_l1f_kers
(
4,
@@ -115,8 +116,8 @@ void bli_cntx_init_zen( cntx_t* cntx )
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 );
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, 8, 8 );
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.

View File

@@ -15,8 +15,8 @@ arm64: cortexa57 generic
arm32: cortexa15 cortexa9 generic
# Intel architectures.
skx: skx
knl: knl
skx: skx/skx/zen
knl: knl/knl/zen
haswell: haswell/haswell/zen
sandybridge: sandybridge
penryn: penryn

2
configure vendored
View File

@@ -614,7 +614,7 @@ build_kconfig_registry()
for config in ${clist}; do
# Look up the kernel for the current sub-configuration.
# Look up the kernels for the current sub-configuration.
#kernels="${kernel_registry[${config}]}"
kernels=$(query_array "kernel_registry" ${config})

View File

@@ -32,7 +32,7 @@
*/
#include "../3/bli_avx512_macros.h"
#include "bli_avx512_macros.h"
#include "blis.h"
#define LOADMUL8x8(a,o,s1,s3,s5,s7, \
@@ -100,7 +100,9 @@
VSHUFF64X2(ZMM(b7), ZMM(a3), ZMM(a7), IMM(0xDD))
//This is an array used for the scatter/gather instructions.
extern int32_t offsets[24];
static int32_t offsets[32] __attribute__((aligned(64))) =
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
void bli_dpackm_knl_asm_8xk
(

View File

@@ -0,0 +1,563 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_avx512_macros.h"
#include "blis.h"
#include <stdio.h>
// LOADMUL8x8: load eight 8-float (YMM) vectors from memory and multiply each
// one by YMM15, leaving the products in YMM(z0)..YMM(z7).
//   a           - base address register
//   o           - constant offset applied to every load
//   s1,s3,s5,s7 - index registers; the packing routines in this file pass
//                 1x, 3x, 5x and 7x the same byte stride, so the eight loads
//                 address a + k*stride + o for k = 0..7
//   z0..z7      - destination YMM register numbers
// NOTE(review): MEM() is defined in bli_avx512_macros.h; its operand order is
// presumably (base, index, scale, displacement) -- confirm there.
#define LOADMUL8x8(a,o,s1,s3,s5,s7, \
z0,z1,z2,z3,z4,z5,z6,z7) \
\
VMULPS(YMM(z0), YMM(15), MEM(a, o)) \
VMULPS(YMM(z1), YMM(15), MEM(a,s1,1,o)) \
VMULPS(YMM(z2), YMM(15), MEM(a,s1,2,o)) \
VMULPS(YMM(z3), YMM(15), MEM(a,s3,1,o)) \
VMULPS(YMM(z4), YMM(15), MEM(a,s1,4,o)) \
VMULPS(YMM(z5), YMM(15), MEM(a,s5,1,o)) \
VMULPS(YMM(z6), YMM(15), MEM(a,s3,2,o)) \
VMULPS(YMM(z7), YMM(15), MEM(a,s7,1,o))
// STORE8x8: write YMM(z0)..YMM(z7) with unaligned stores to eight
// destinations a + o + k*s, k = 0..7.
//   a      - base address register
//   o      - constant offset of the first store
//   s      - constant distance between consecutive stores
//   z0..z7 - source YMM register numbers
#define STORE8x8(a,o,s, \
z0,z1,z2,z3,z4,z5,z6,z7) \
\
VMOVUPS(MEM(a,(o)+0*(s)), YMM(z0)) \
VMOVUPS(MEM(a,(o)+1*(s)), YMM(z1)) \
VMOVUPS(MEM(a,(o)+2*(s)), YMM(z2)) \
VMOVUPS(MEM(a,(o)+3*(s)), YMM(z3)) \
VMOVUPS(MEM(a,(o)+4*(s)), YMM(z4)) \
VMOVUPS(MEM(a,(o)+5*(s)), YMM(z5)) \
VMOVUPS(MEM(a,(o)+6*(s)), YMM(z6)) \
VMOVUPS(MEM(a,(o)+7*(s)), YMM(z7))
// STORETRANS8x8: interleave the eight YMM registers a0..a7 with
// unpack/shuffle steps and write the result in 16-byte pieces to eight
// destinations a + o + k*s, k = 0..7. Each destination receives two pieces,
// at offset o and at offset o+16. The low 128-bit halves of the shuffled
// registers go to k = 0..3 (VMOVUPS from XMM); the high halves go to
// k = 4..7 (VEXTRACTF128 with immediate 1).
//   a      - base address register
//   o      - constant offset of the first destination
//   s      - constant distance between consecutive destinations
//   a0..a7 - source YMM register numbers (left unmodified)
//   t0..t5 - scratch YMM register numbers (clobbered)
// The callers in this file use it to store a loaded 8x8 float block
// transposed.
// NOTE(review): the "(o+16)" terms do not parenthesize o on its own; all
// callers visible here pass plain integer literals, so this is harmless today.
#define STORETRANS8x8(a,o,s, \
a0,a1,a2,a3,a4,a5,a6,a7, \
t0,t1,t2,t3,t4,t5) \
\
VUNPCKLPS(YMM(t0), YMM(a0), YMM(a1)) \
VUNPCKLPS(YMM(t2), YMM(a2), YMM(a3)) \
VUNPCKLPS(YMM(t1), YMM(a4), YMM(a5)) \
VUNPCKLPS(YMM(t3), YMM(a6), YMM(a7)) \
\
VSHUFPS(YMM(t4), YMM(t0), YMM(t2), IMM(0x44)) \
VSHUFPS(YMM(t5), YMM(t1), YMM(t3), IMM(0x44)) \
VMOVUPS(MEM(a,(o )+0*(s)), XMM(t4)) \
VMOVUPS(MEM(a,(o+16)+0*(s)), XMM(t5)) \
VEXTRACTF128(MEM(a,(o )+4*(s)), YMM(t4), IMM(1)) \
VEXTRACTF128(MEM(a,(o+16)+4*(s)), YMM(t5), IMM(1)) \
\
VSHUFPS(YMM(t4), YMM(t0), YMM(t2), IMM(0xEE)) \
VSHUFPS(YMM(t5), YMM(t1), YMM(t3), IMM(0xEE)) \
VMOVUPS(MEM(a,(o )+1*(s)), XMM(t4)) \
VMOVUPS(MEM(a,(o+16)+1*(s)), XMM(t5)) \
VEXTRACTF128(MEM(a,(o )+5*(s)), YMM(t4), IMM(1)) \
VEXTRACTF128(MEM(a,(o+16)+5*(s)), YMM(t5), IMM(1)) \
\
VUNPCKHPS(YMM(t0), YMM(a0), YMM(a1)) \
VUNPCKHPS(YMM(t2), YMM(a2), YMM(a3)) \
VUNPCKHPS(YMM(t1), YMM(a4), YMM(a5)) \
VUNPCKHPS(YMM(t3), YMM(a6), YMM(a7)) \
\
VSHUFPS(YMM(t4), YMM(t0), YMM(t2), IMM(0x44)) \
VSHUFPS(YMM(t5), YMM(t1), YMM(t3), IMM(0x44)) \
VMOVUPS(MEM(a,(o )+2*(s)), XMM(t4)) \
VMOVUPS(MEM(a,(o+16)+2*(s)), XMM(t5)) \
VEXTRACTF128(MEM(a,(o )+6*(s)), YMM(t4), IMM(1)) \
VEXTRACTF128(MEM(a,(o+16)+6*(s)), YMM(t5), IMM(1)) \
\
VSHUFPS(YMM(t4), YMM(t0), YMM(t2), IMM(0xEE)) \
VSHUFPS(YMM(t5), YMM(t1), YMM(t3), IMM(0xEE)) \
VMOVUPS(MEM(a,(o )+3*(s)), XMM(t4)) \
VMOVUPS(MEM(a,(o+16)+3*(s)), XMM(t5)) \
VEXTRACTF128(MEM(a,(o )+7*(s)), YMM(t4), IMM(1)) \
VEXTRACTF128(MEM(a,(o+16)+7*(s)), YMM(t5), IMM(1))
// Lane indices 0..31 used to build index vectors for the gather
// instructions: the packing routines below multiply these by inca to obtain
// per-lane element offsets. The array is file-local (static) and 64-byte
// aligned.
static int32_t offsets[32] __attribute__((aligned(64))) =
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
/*
 * Pack a 16 x n single-precision micro-panel, scaling every element by kappa.
 *
 *   conja  - ignored (the data is real).
 *   n_     - number of columns to pack; nothing is written when it is zero.
 *   kappa_ - pointer to the float scaling factor.
 *   a_     - source matrix; inca_ / lda_ are its row / column strides in
 *            elements.
 *   p_     - destination panel. Each packed column occupies 16 consecutive
 *            floats and columns are written back to back.
 *   ldp_   - accepted for interface compatibility but NOT consulted: the
 *            destination column stride is hard-coded to 16 floats.
 *   ctnx   - unused.
 *
 * Three code paths are selected from the strides:
 *   PACK16_N  inca == 1 : each column is contiguous, copied with vector loads.
 *   PACK16_T  lda  == 1 : each row is contiguous, packed via 8x8 transposes.
 *   PACK16_G  otherwise : each column is fetched with a gather.
 *
 * Fixes relative to the imported version:
 *   - kappa is broadcast to all 16 lanes of ZMM15. The previous YMM-wide
 *     broadcast zeroed the upper half of ZMM15, so the general-stride path
 *     (which multiplies a full ZMM by ZMM15) packed zeros into elements 8..15.
 *   - the gather in the general-stride path now uses scale 4 (sizeof(float));
 *     scale 8 read every element from twice the intended offset.
 */
void bli_spackm_knl_asm_16xk
(
conj_t conja,
dim_t n_,
void* restrict kappa_,
void* restrict a_, inc_t inca_, inc_t lda_,
void* restrict p_, inc_t ldp_,
cntx_t* restrict ctnx
)
{
    (void)conja;

    const int32_t * offsetPtr = &offsets[0];
    float* a = (float*)a_;
    float* p = (float*)p_;
    float* kappa = (float*)kappa_;
    const int64_t n = n_;
    const int64_t inca = inca_;
    const int64_t lda = lda_;
    const int64_t ldp = ldp_;

    __asm__ volatile
    (
        MOV(RSI, VAR(n))
        MOV(RAX, VAR(a))
        MOV(RBX, VAR(inca))
        MOV(RCX, VAR(lda))
        MOV(R14, VAR(p))

        TEST(RSI, RSI)
        JZ(PACK16_DONE)

        LEA(RBX, MEM(,RBX,4)) //inca in bytes
        LEA(RCX, MEM(,RCX,4)) //lda in bytes

        // Broadcast kappa to the full ZMM register: the N and T paths only
        // read the low YMM/XMM part, the G path needs all 16 lanes.
        VBROADCASTSS(ZMM(15), VAR(kappa))

        CMP(RBX, IMM(4))
        JNE(PACK16_T)

    LABEL(PACK16_N)

        // RSI = number of 8-column groups, RDX = leftover columns.
        MOV(RDX, RSI)
        AND(RDX, IMM(7))
        SAR(RSI, IMM(3))
        JZ(PACK16_N_TAIL)

        LEA(R8, MEM(RCX,RCX,2)) //lda*3
        LEA(R9, MEM(RCX,RCX,4)) //lda*5
        LEA(R10, MEM(R8 ,RCX,4)) //lda*7

        LABEL(PACK16_N_LOOP)

            // Rows 0..7 of eight columns, then rows 8..15 of the same columns.
            LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
            STORE8x8(R14,0,16*4,0,1,2,3,4,5,6,7)

            LOADMUL8x8(RAX,32,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
            STORE8x8(R14,32,16*4,0,1,2,3,4,5,6,7)

            LEA(RAX, MEM(RAX,RCX,8))
            LEA(R14, MEM(R14,16*8*4))

            SUB(RSI, IMM(1))

        JNZ(PACK16_N_LOOP)

        TEST(RDX, RDX)
        JZ(PACK16_DONE)

        LABEL(PACK16_N_TAIL)

            // One column (16 floats) per iteration.
            VMULPS(YMM(0), YMM(15), MEM(RAX ))
            VMULPS(YMM(1), YMM(15), MEM(RAX,32))
            VMOVUPS(MEM(R14 ), YMM(0))
            VMOVUPS(MEM(R14,32), YMM(1))

            LEA(RAX, MEM(RAX,RCX,1))
            LEA(R14, MEM(R14, 16*4))

            SUB(RDX, IMM(1))

        JNZ(PACK16_N_TAIL)

        JMP(PACK16_DONE)

    LABEL(PACK16_T)

        CMP(RCX, IMM(4))
        JNE(PACK16_G)

        LEA(R8, MEM(RBX,RBX,2)) //inca*3
        LEA(R9, MEM(RBX,RBX,4)) //inca*5
        LEA(R10, MEM(R8 ,RBX,4)) //inca*7
        LEA(R11, MEM(RAX,RBX,8)) //address of row 8

        MOV(RDX, RSI)
        AND(RDX, IMM(7))
        SAR(RSI, IMM(3))
        JZ(PACK16_T_TAIL)

        LABEL(PACK16_T_LOOP)

            // Rows 0..7 (from RAX) and rows 8..15 (from R11), eight columns
            // at a time, transposed on the way out.
            LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
            STORETRANS8x8(R14,0,16*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)

            LOADMUL8x8(R11,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
            STORETRANS8x8(R14,32,16*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)

            LEA(RAX, MEM(RAX, 8*4))
            LEA(R11, MEM(R11, 8*4))
            LEA(R14, MEM(R14,16*8*4))

            SUB(RSI, IMM(1))

        JNZ(PACK16_T_LOOP)

        TEST(RDX, RDX)
        JZ(PACK16_DONE)

        LABEL(PACK16_T_TAIL)

            // One column per iteration, element by element.
            VMULSS(XMM(0), XMM(15), MEM(RAX ))
            VMULSS(XMM(1), XMM(15), MEM(RAX,RBX,1))
            VMULSS(XMM(2), XMM(15), MEM(RAX,RBX,2))
            VMULSS(XMM(3), XMM(15), MEM(RAX,R8 ,1))
            VMULSS(XMM(4), XMM(15), MEM(RAX,RBX,4))
            VMULSS(XMM(5), XMM(15), MEM(RAX,R9 ,1))
            VMULSS(XMM(6), XMM(15), MEM(RAX,R8 ,2))
            VMULSS(XMM(7), XMM(15), MEM(RAX,R10,1))
            VMOVSS(MEM(R14,0*4), XMM(0))
            VMOVSS(MEM(R14,1*4), XMM(1))
            VMOVSS(MEM(R14,2*4), XMM(2))
            VMOVSS(MEM(R14,3*4), XMM(3))
            VMOVSS(MEM(R14,4*4), XMM(4))
            VMOVSS(MEM(R14,5*4), XMM(5))
            VMOVSS(MEM(R14,6*4), XMM(6))
            VMOVSS(MEM(R14,7*4), XMM(7))

            VMULSS(XMM(0), XMM(15), MEM(R11 ))
            VMULSS(XMM(1), XMM(15), MEM(R11,RBX,1))
            VMULSS(XMM(2), XMM(15), MEM(R11,RBX,2))
            VMULSS(XMM(3), XMM(15), MEM(R11,R8 ,1))
            VMULSS(XMM(4), XMM(15), MEM(R11,RBX,4))
            VMULSS(XMM(5), XMM(15), MEM(R11,R9 ,1))
            VMULSS(XMM(6), XMM(15), MEM(R11,R8 ,2))
            VMULSS(XMM(7), XMM(15), MEM(R11,R10,1))
            VMOVSS(MEM(R14, 8*4), XMM(0))
            VMOVSS(MEM(R14, 9*4), XMM(1))
            VMOVSS(MEM(R14,10*4), XMM(2))
            VMOVSS(MEM(R14,11*4), XMM(3))
            VMOVSS(MEM(R14,12*4), XMM(4))
            VMOVSS(MEM(R14,13*4), XMM(5))
            VMOVSS(MEM(R14,14*4), XMM(6))
            VMOVSS(MEM(R14,15*4), XMM(7))

            LEA(RAX, MEM(RAX, 4))
            LEA(R11, MEM(R11, 4))
            LEA(R14, MEM(R14,16*4))

            SUB(RDX, IMM(1))

        JNZ(PACK16_T_TAIL)

        JMP(PACK16_DONE)

    LABEL(PACK16_G)

        // ZMM0[i] = i*inca : per-lane element offsets of one column.
        VPBROADCASTD(ZMM(3), VAR(inca))
        MOV(RBX, VAR(offsetPtr))
        VPMULLD(ZMM(0), ZMM(3), MEM(RBX))

        LABEL(PACK16_G_LOOP)

            KXNORW(K(1), K(0), K(0)) //all 16 lanes active
            // Indices are in elements, so the scale is sizeof(float) = 4.
            VGATHERDPS(ZMM(3) MASK_K(1), MEM(RAX,ZMM(0),4))
            VMULPS(ZMM(3), ZMM(3), ZMM(15))
            VMOVUPS(MEM(R14), ZMM(3))

            LEA(RAX, MEM(RAX,RCX,1))
            LEA(R14, MEM(R14, 16*4))

            SUB(RSI, IMM(1))

        JNZ(PACK16_G_LOOP)

    LABEL(PACK16_DONE)

        : //output operands
        : //input operands
          [n] "m" (n),
          [kappa] "m" (*kappa),
          [a] "m" (a),
          [inca] "m" (inca),
          [lda] "m" (lda),
          [p] "m" (p),
          [ldp] "m" (ldp),
          [offsetPtr] "m" (offsetPtr)
        : //clobbers
          "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
          "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11",
          "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17",
          "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23",
          "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
          "zmm30", "zmm31",
          "rax", "rbx", "rcx", "rdx", "rdi", "rsi",
          "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"
    );
}
/*
 * Pack a 24 x n single-precision micro-panel, scaling every element by kappa.
 *
 *   conja  - ignored (the data is real).
 *   n_     - number of columns to pack; nothing is written when it is zero.
 *   kappa_ - pointer to the float scaling factor.
 *   a_     - source matrix; inca_ / lda_ are its row / column strides in
 *            elements.
 *   p_     - destination panel; ldp_ is the distance in elements between
 *            consecutive packed columns.
 *   ctnx   - unused.
 *
 * Three code paths are selected from the strides:
 *   PACK24_N  inca == 1 : each column is contiguous, copied with vector loads.
 *   PACK24_T  lda  == 1 : each row is contiguous, packed via 8x8 transposes.
 *   PACK24_G  otherwise : each column is fetched with two gathers (16 + 8).
 *
 * NOTE(review): inside the unrolled N and T loops the stores use a constant
 * column stride of 24 floats (the 24*4 argument to STORE8x8/STORETRANS8x8)
 * while the panel pointer advances by ldp; the two agree only when ldp == 24.
 *
 * Fixes relative to the imported version (all in the general-stride path):
 *   - the offsets pointer is loaded into R13 instead of RBX, so RBX still
 *     holds inca in bytes when the address of row 16 is derived from it;
 *   - the row-16 pointer (R11) is advanced by lda each iteration together
 *     with RAX, instead of being left at the first column;
 *   - the gathers use scale 4 (sizeof(float)) instead of 8.
 */
void bli_spackm_knl_asm_24xk
(
conj_t conja,
dim_t n_,
void* restrict kappa_,
void* restrict a_, inc_t inca_, inc_t lda_,
void* restrict p_, inc_t ldp_,
cntx_t* restrict ctnx
)
{
    (void)conja;

    const int32_t * offsetPtr = &offsets[0];
    float* a = (float*)a_;
    float* p = (float*)p_;
    float* kappa = (float*)kappa_;
    const int64_t n = n_;
    const int64_t inca = inca_;
    const int64_t lda = lda_;
    const int64_t ldp = ldp_;

    __asm__ volatile
    (
        MOV(RSI, VAR(n))
        MOV(RAX, VAR(a))
        MOV(RBX, VAR(inca))
        MOV(RCX, VAR(lda))
        MOV(R14, VAR(p))
        MOV(RDI, VAR(ldp))

        TEST(RSI, RSI)
        JZ(PACK24_DONE)

        LEA(RBX, MEM(,RBX,4)) //inca in bytes
        LEA(RCX, MEM(,RCX,4)) //lda in bytes
        LEA(RDI, MEM(,RDI,4)) //ldp in bytes

        VBROADCASTSS(ZMM(15), VAR(kappa))

        CMP(RBX, IMM(4))
        JNE(PACK24_T)

    LABEL(PACK24_N)

        // RSI = number of 8-column groups, RDX = leftover columns.
        MOV(RDX, RSI)
        AND(RDX, IMM(7))
        SAR(RSI, IMM(3))
        JZ(PACK24_N_TAIL)

        LEA(R8, MEM(RCX,RCX,2)) //lda*3
        LEA(R9, MEM(RCX,RCX,4)) //lda*5
        LEA(R10, MEM(R8 ,RCX,4)) //lda*7

        LABEL(PACK24_N_LOOP)

            // Rows 0..7, 8..15 and 16..23 of eight columns.
            LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
            STORE8x8(R14,0,24*4,0,1,2,3,4,5,6,7)

            LOADMUL8x8(RAX,32,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
            STORE8x8(R14,32,24*4,0,1,2,3,4,5,6,7)

            LOADMUL8x8(RAX,64,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
            STORE8x8(R14,64,24*4,0,1,2,3,4,5,6,7)

            LEA(RAX, MEM(RAX,RCX,8))
            LEA(R14, MEM(R14,RDI,8))

            SUB(RSI, IMM(1))

        JNZ(PACK24_N_LOOP)

        TEST(RDX, RDX)
        JZ(PACK24_DONE)

        LABEL(PACK24_N_TAIL)

            // One column per iteration: 16 floats via ZMM, 8 more via YMM.
            VMULPS(ZMM(0), ZMM(15), MEM(RAX))
            VMOVUPS(MEM(R14), ZMM(0))

            VMULPS(YMM(1), YMM(15), MEM(RAX,64))
            VMOVUPS(MEM(R14,64), YMM(1))

            LEA(RAX, MEM(RAX,RCX,1))
            LEA(R14, MEM(R14,RDI,1))

            SUB(RDX, IMM(1))

        JNZ(PACK24_N_TAIL)

        JMP(PACK24_DONE)

    LABEL(PACK24_T)

        CMP(RCX, IMM(4))
        JNE(PACK24_G)

        LEA(R8, MEM(RBX,RBX,2)) //inca*3
        LEA(R9, MEM(RBX,RBX,4)) //inca*5
        LEA(R10, MEM(R8 ,RBX,4)) //inca*7
        LEA(R11, MEM(RAX,RBX,8)) //address of row 8
        LEA(R12, MEM(R11,RBX,8)) //address of row 16

        MOV(RDX, RSI)
        AND(RDX, IMM(7))
        SAR(RSI, IMM(3))
        JZ(PACK24_T_TAIL)

        LABEL(PACK24_T_LOOP)

            LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
            STORETRANS8x8(R14,0,24*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)

            LOADMUL8x8(R11,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
            STORETRANS8x8(R14,32,24*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)

            LOADMUL8x8(R12,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
            STORETRANS8x8(R14,64,24*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)

            LEA(RAX, MEM(RAX,RCX,8))
            LEA(R11, MEM(R11,RCX,8))
            LEA(R12, MEM(R12,RCX,8))
            LEA(R14, MEM(R14,RDI,8))

            SUB(RSI, IMM(1))

        JNZ(PACK24_T_LOOP)

        TEST(RDX, RDX)
        JZ(PACK24_DONE)

        LABEL(PACK24_T_TAIL)

            // One column per iteration, element by element.
            VMULSS(XMM(0), XMM(15), MEM(RAX))
            VMULSS(XMM(1), XMM(15), MEM(RAX,RBX,1))
            VMULSS(XMM(2), XMM(15), MEM(RAX,RBX,2))
            VMULSS(XMM(3), XMM(15), MEM(RAX,R8,1))
            VMULSS(XMM(4), XMM(15), MEM(RAX,RBX,4))
            VMULSS(XMM(5), XMM(15), MEM(RAX,R9,1))
            VMULSS(XMM(6), XMM(15), MEM(RAX,R8,2))
            VMULSS(XMM(7), XMM(15), MEM(RAX,R10,1))
            VMOVSS(MEM(R14,0*4), XMM(0))
            VMOVSS(MEM(R14,1*4), XMM(1))
            VMOVSS(MEM(R14,2*4), XMM(2))
            VMOVSS(MEM(R14,3*4), XMM(3))
            VMOVSS(MEM(R14,4*4), XMM(4))
            VMOVSS(MEM(R14,5*4), XMM(5))
            VMOVSS(MEM(R14,6*4), XMM(6))
            VMOVSS(MEM(R14,7*4), XMM(7))

            VMULSS(XMM(0), XMM(15), MEM(R11))
            VMULSS(XMM(1), XMM(15), MEM(R11,RBX,1))
            VMULSS(XMM(2), XMM(15), MEM(R11,RBX,2))
            VMULSS(XMM(3), XMM(15), MEM(R11,R8,1))
            VMULSS(XMM(4), XMM(15), MEM(R11,RBX,4))
            VMULSS(XMM(5), XMM(15), MEM(R11,R9,1))
            VMULSS(XMM(6), XMM(15), MEM(R11,R8,2))
            VMULSS(XMM(7), XMM(15), MEM(R11,R10,1))
            VMOVSS(MEM(R14, 8*4), XMM(0))
            VMOVSS(MEM(R14, 9*4), XMM(1))
            VMOVSS(MEM(R14,10*4), XMM(2))
            VMOVSS(MEM(R14,11*4), XMM(3))
            VMOVSS(MEM(R14,12*4), XMM(4))
            VMOVSS(MEM(R14,13*4), XMM(5))
            VMOVSS(MEM(R14,14*4), XMM(6))
            VMOVSS(MEM(R14,15*4), XMM(7))

            VMULSS(XMM(0), XMM(15), MEM(R12))
            VMULSS(XMM(1), XMM(15), MEM(R12,RBX,1))
            VMULSS(XMM(2), XMM(15), MEM(R12,RBX,2))
            VMULSS(XMM(3), XMM(15), MEM(R12,R8,1))
            VMULSS(XMM(4), XMM(15), MEM(R12,RBX,4))
            VMULSS(XMM(5), XMM(15), MEM(R12,R9,1))
            VMULSS(XMM(6), XMM(15), MEM(R12,R8,2))
            VMULSS(XMM(7), XMM(15), MEM(R12,R10,1))
            VMOVSS(MEM(R14,16*4), XMM(0))
            VMOVSS(MEM(R14,17*4), XMM(1))
            VMOVSS(MEM(R14,18*4), XMM(2))
            VMOVSS(MEM(R14,19*4), XMM(3))
            VMOVSS(MEM(R14,20*4), XMM(4))
            VMOVSS(MEM(R14,21*4), XMM(5))
            VMOVSS(MEM(R14,22*4), XMM(6))
            VMOVSS(MEM(R14,23*4), XMM(7))

            LEA(RAX, MEM(RAX,RCX,1))
            LEA(R11, MEM(R11,RCX,1))
            LEA(R12, MEM(R12,RCX,1))
            LEA(R14, MEM(R14,RDI,1))

            SUB(RDX, IMM(1))

        JNZ(PACK24_T_TAIL)

        JMP(PACK24_DONE)

    LABEL(PACK24_G)

        // ZMM0[i] = i*inca : per-lane element offsets within one column.
        // The offsets pointer goes into R13 so that RBX keeps inca in bytes.
        VPBROADCASTD(ZMM(3), VAR(inca))
        MOV(R13, VAR(offsetPtr))
        VPMULLD(ZMM(0), ZMM(3), MEM(R13))

        // R11 = address of row 16 of the current column (RAX + 16*inca bytes).
        LEA(R11, MEM(RAX,RBX,8))
        LEA(R11, MEM(R11,RBX,8))

        LABEL(PACK24_G_LOOP)

            KXNORW(K(1), K(0), K(0)) //all 16 lanes: rows 0..15
            KSHIFTRW(K(2), K(1), IMM(8)) //low 8 lanes: rows 16..23
            // Indices are in elements, so the scale is sizeof(float) = 4.
            VGATHERDPS(ZMM(3) MASK_K(1), MEM(RAX,ZMM(0),4))
            VGATHERDPS(ZMM(4) MASK_K(2), MEM(R11,ZMM(0),4))
            VMULPS(ZMM(3), ZMM(3), ZMM(15))
            VMULPS(YMM(4), YMM(4), YMM(15))
            VMOVUPS(MEM(R14), ZMM(3))
            VMOVUPS(MEM(R14,64), YMM(4))

            // Advance both source pointers to the next column.
            LEA(RAX, MEM(RAX,RCX,1))
            LEA(R11, MEM(R11,RCX,1))
            LEA(R14, MEM(R14,RDI,1))

            SUB(RSI, IMM(1))

        JNZ(PACK24_G_LOOP)

    LABEL(PACK24_DONE)

        : //output operands
        : //input operands
          [n] "m" (n),
          [kappa] "m" (*kappa),
          [a] "m" (a),
          [inca] "m" (inca),
          [lda] "m" (lda),
          [p] "m" (p),
          [ldp] "m" (ldp),
          [offsetPtr] "m" (offsetPtr)
        : //clobbers
          "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
          "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11",
          "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17",
          "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23",
          "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
          "zmm30", "zmm31",
          "rax", "rbx", "rcx", "rdx", "rdi", "rsi",
          "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"
    );
}

View File

@@ -32,7 +32,7 @@
*/
#include "../3/bli_avx512_macros.h"
#include "bli_avx512_macros.h"
#include "blis.h"
#define LOADMUL8x8(a,o,s1,s3,s5,s7, \

View File

@@ -176,7 +176,9 @@
PREFETCH_B_L2(n)
//This is an array used for the scatter/gather instructions.
extern int32_t offsets[24];
static int32_t offsets[32] __attribute__((aligned(64))) =
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
//#define MONITORS
//#define LOOPMON
@@ -224,7 +226,7 @@ void bli_dgemm_knl_asm_24x8
VMOVAPS(ZMM(11), ZMM(8)) MOV(RAX, VAR(a)) //load address of a
VMOVAPS(ZMM(12), ZMM(8)) MOV(RBX, VAR(b)) //load address of b
VMOVAPS(ZMM(13), ZMM(8)) MOV(RCX, VAR(c)) //load address of c
VMOVAPS(ZMM(14), ZMM(8))
VMOVAPS(ZMM(14), ZMM(8)) VMOVAPD(ZMM(0), MEM(RBX)) //pre-load b
VMOVAPS(ZMM(15), ZMM(8)) MOV(RDI, VAR(offsetPtr))
VMOVAPS(ZMM(16), ZMM(8)) VMOVAPS(ZMM(4), MEM(RDI))
#if SCATTER_PREFETCH_C
@@ -259,11 +261,6 @@ void bli_dgemm_knl_asm_24x8
MOV(VAR(midh), EDX)
#endif
TEST(RSI, RSI)
JZ(POSTACCUM)
VMOVAPD(ZMM(0), MEM(RBX)) //pre-load b
SUB(RSI, IMM(32))
JLE(TAIL)
@@ -535,6 +532,7 @@ void bli_dgemm_knl_asm_24x8
MOV(RDX, RCX)
ADD(RSI, IMM(32))
JZ(POSTACCUM)
LABEL(TAIL_LOOP)

View File

@@ -0,0 +1,708 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include <assert.h>
#include "bli_avx512_macros.h"
// Build-time tuning knobs for the sgemm microkernel.
// NOTE(review): UNROLL_K and LOOP_ALIGN are not referenced in the part of the
// file visible here; confirm their use further down.
#define UNROLL_K 32
// Non-zero selects the scattered-index setup and scatter-prefetch code in the
// "#if SCATTER_PREFETCH_C" branches of the kernel body.
#define SCATTER_PREFETCH_C 1
// 0/1 switches: each is #undef'd below and redefined as a function-like
// prefetch macro, which expands to nothing when the switch is 0.
#define PREFETCH_A_L2 0
#define PREFETCH_B_L2 0
// Look-ahead distances used by the PREFETCH_* macros below; each is added to
// the sub-iteration index n before being scaled to a byte offset.
#define L2_PREFETCH_DIST 64
#define A_L1_PREFETCH_DIST 36
#define B_L1_PREFETCH_DIST 18
#define LOOP_ALIGN ALIGN16
// UPDATE_C_FOUR_ROWS: update four vectors of C in place.
// For each accumulator register R in {R1,R2,R3,R4}:
//   R = R * ZMM0 + ZMM1 * [C row],   then the result is stored back.
// The four rows are addressed as RCX, RCX+RAX, RCX+2*RAX and RCX+RDI, after
// which RCX is advanced by 4*RAX.
// NOTE(review): presumably ZMM0 holds alpha, ZMM1 holds beta, RAX is the row
// stride of C in bytes and RDI is 3*RAX -- the code that sets these up is
// outside this excerpt; confirm at the use site.
#define UPDATE_C_FOUR_ROWS(R1,R2,R3,R4) \
\
VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \
VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \
VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \
VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \
VFMADD231PS(ZMM(R1), ZMM(1), MEM(RCX )) \
VFMADD231PS(ZMM(R2), ZMM(1), MEM(RCX,RAX,1)) \
VFMADD231PS(ZMM(R3), ZMM(1), MEM(RCX,RAX,2)) \
VFMADD231PS(ZMM(R4), ZMM(1), MEM(RCX,RDI,1)) \
VMOVUPS(MEM(RCX ), ZMM(R1)) \
VMOVUPS(MEM(RCX,RAX,1), ZMM(R2)) \
VMOVUPS(MEM(RCX,RAX,2), ZMM(R3)) \
VMOVUPS(MEM(RCX,RDI,1), ZMM(R4)) \
LEA(RCX, MEM(RCX,RAX,4))
// UPDATE_C_BZ_FOUR_ROWS: like UPDATE_C_FOUR_ROWS but without reading C.
// Each accumulator R1..R4 is multiplied by ZMM0 and stored to RCX, RCX+RAX,
// RCX+2*RAX and RCX+RDI; RCX is then advanced by 4*RAX.
// NOTE(review): "BZ" presumably stands for "beta zero" -- confirm at the use
// site, which is outside this excerpt.
#define UPDATE_C_BZ_FOUR_ROWS(R1,R2,R3,R4) \
\
VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \
VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \
VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \
VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \
VMOVUPS(MEM(RCX ), ZMM(R1)) \
VMOVUPS(MEM(RCX,RAX,1), ZMM(R2)) \
VMOVUPS(MEM(RCX,RAX,2), ZMM(R3)) \
VMOVUPS(MEM(RCX,RDI,1), ZMM(R4)) \
LEA(RCX, MEM(RCX,RAX,4))
// UPDATE_C_ROW_SCATTERED: update one vector of C whose elements are not
// contiguous.
//   - K1 and K2 are set to all ones (the gather and the scatter each consume
//     their mask register).
//   - ZMM(NUM) is scaled by ZMM0.
//   - The current C elements are gathered into ZMM3 from RCX + ZMM2[i]*4.
//   - ZMM(NUM) += ZMM3 * ZMM1, and the result is scattered back to the same
//     addresses.
//   - RCX is advanced by RAX.
// Clobbers ZMM3, K1 and K2.
// NOTE(review): presumably ZMM0 = alpha, ZMM1 = beta and ZMM2 holds per-lane
// element offsets into C; their setup is outside this macro -- confirm.
#define UPDATE_C_ROW_SCATTERED(NUM) \
\
KXNORW(K(1), K(0), K(0)) \
KXNORW(K(2), K(0), K(0)) \
VMULPS(ZMM(NUM), ZMM(NUM), ZMM(0)) \
VGATHERDPS(ZMM(3) MASK_K(1), MEM(RCX,ZMM(2),4)) \
VFMADD231PS(ZMM(NUM), ZMM(3), ZMM(1)) \
VSCATTERDPS(MEM(RCX,ZMM(2),4) MASK_K(2), ZMM(NUM)) \
ADD(RCX, RAX)
// UPDATE_C_BZ_ROW_SCATTERED: like UPDATE_C_ROW_SCATTERED but without reading
// C. ZMM(NUM) is scaled by ZMM0 and scattered to RCX + ZMM2[i]*4 under an
// all-ones mask in K1; RCX is then advanced by RAX. Clobbers K1.
#define UPDATE_C_BZ_ROW_SCATTERED(NUM) \
\
KXNORW(K(1), K(0), K(0)) \
VMULPS(ZMM(NUM), ZMM(NUM), ZMM(0)) \
VSCATTERDPS(MEM(RCX,ZMM(2),4) MASK_K(1), ZMM(NUM)) \
ADD(RCX, RAX)
// Prefetch helpers for sub-iteration n of the unrolled loop.
//
// A is addressed through RAX in steps of 24*4 bytes per sub-iteration; the
// _1 and _2 variants prefetch at that address and 64 bytes past it.
#define PREFETCH_A_L1_1(n) PREFETCH(0, MEM(RAX,(A_L1_PREFETCH_DIST+n)*24*4))
#define PREFETCH_A_L1_2(n) PREFETCH(0, MEM(RAX,(A_L1_PREFETCH_DIST+n)*24*4+64))
// PREFETCH_A_L2 starts life as a 0/1 switch and is redefined here as a
// function-like macro: two prefetches with hint 1 when enabled, nothing
// otherwise.
#if PREFETCH_A_L2
#undef PREFETCH_A_L2
#define PREFETCH_A_L2(n) \
\
PREFETCH(1, MEM(RAX,(L2_PREFETCH_DIST+n)*24*4)) \
PREFETCH(1, MEM(RAX,(L2_PREFETCH_DIST+n)*24*4+64))
#else
#undef PREFETCH_A_L2
#define PREFETCH_A_L2(...)
#endif
// B is addressed through RBX in steps of 16*4 bytes per sub-iteration.
#define PREFETCH_B_L1(n) PREFETCH(0, MEM(RBX,(B_L1_PREFETCH_DIST+n)*16*4))
// Same switch-to-macro redefinition as PREFETCH_A_L2 above.
#if PREFETCH_B_L2
#undef PREFETCH_B_L2
#define PREFETCH_B_L2(n) PREFETCH(1, MEM(RBX,(L2_PREFETCH_DIST+n)*16*4))
#else
#undef PREFETCH_B_L2
#define PREFETCH_B_L2(...)
#endif
// Placeholders expanded inside SUBITER; currently empty.
#define PREFETCH_C_L1_1
#define PREFETCH_C_L1_2
#define PREFETCH_C_L1_3
//
// SUBITER: one sub-iteration of the unrolled loop. It loads the next
// 64-byte vector from RBX + (n+1)*64 into ZMM(a), then issues 24 FMAs that
// accumulate ZMM(b) times one broadcast element of A into ZMM8..ZMM31
// (one accumulator per element), with prefetches interleaved.
//
// n: index in unrolled loop
//
// a: ZMM register to load into
// b: ZMM register to read from
//
// ...: addressing for A, except for offset
//
// The offset of the broadcast element is ((n%4)*24 + r)*4 bytes for
// r = 0..23, so it wraps every four sub-iterations; the visible callers
// compensate by passing a different index register/scale in "...".
// NOTE(review): "%%" presumably reaches the assembler as a single '%'
// (modulo) after extended-asm escaping -- confirm against
// bli_avx512_macros.h.
//
#define SUBITER(n,a,b,...) \
\
PREFETCH_A_L2(n) \
\
VMOVAPS(ZMM(a), MEM(RBX,(n+1)*64)) \
VFMADD231PS(ZMM( 8), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 0)*4)) \
VFMADD231PS(ZMM( 9), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 1)*4)) \
VFMADD231PS(ZMM(10), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 2)*4)) \
PREFETCH_A_L1_1(n) \
VFMADD231PS(ZMM(11), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 3)*4)) \
VFMADD231PS(ZMM(12), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 4)*4)) \
VFMADD231PS(ZMM(13), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 5)*4)) \
PREFETCH_C_L1_1 \
VFMADD231PS(ZMM(14), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 6)*4)) \
VFMADD231PS(ZMM(15), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 7)*4)) \
VFMADD231PS(ZMM(16), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 8)*4)) \
PREFETCH_A_L1_2(n) \
VFMADD231PS(ZMM(17), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 9)*4)) \
VFMADD231PS(ZMM(18), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+10)*4)) \
VFMADD231PS(ZMM(19), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+11)*4)) \
PREFETCH_C_L1_2 \
VFMADD231PS(ZMM(20), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+12)*4)) \
VFMADD231PS(ZMM(21), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+13)*4)) \
VFMADD231PS(ZMM(22), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+14)*4)) \
PREFETCH_C_L1_3 \
VFMADD231PS(ZMM(23), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+15)*4)) \
VFMADD231PS(ZMM(24), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+16)*4)) \
VFMADD231PS(ZMM(25), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+17)*4)) \
PREFETCH_B_L1(n) \
VFMADD231PS(ZMM(26), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+18)*4)) \
VFMADD231PS(ZMM(27), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+19)*4)) \
VFMADD231PS(ZMM(28), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+20)*4)) \
PREFETCH_B_L2(n) \
VFMADD231PS(ZMM(29), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+21)*4)) \
VFMADD231PS(ZMM(30), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+22)*4)) \
VFMADD231PS(ZMM(31), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+23)*4))
// Lane indices 0..31 from which the kernel builds index vectors for its
// scatter/gather instructions (the first 16 are loaded into ZMM4 and the
// next 8 into YMM3 in the kernel prologue). The array is file-local (static)
// and 64-byte aligned, which the aligned VMOVAPS loads from it rely on.
static int32_t offsets[32] __attribute__((aligned(64))) =
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
// Optional instrumentation switches; both are commented out here.
//   MONITORS - records RDTSC timestamps at four points of the kernel
//              (entry, after setup, after accumulation, exit) and prints
//              the differences once the asm block has finished.
//   LOOPMON  - prints bloopl - tloopl once the asm block has finished.
//#define MONITORS
//#define LOOPMON
//
// bli_sgemm_knl_asm_24x16
//
// GEMM microkernel written as one extended-asm block. ZMM8-ZMM31 are zeroed
// and used as the 24 accumulators; after k_ iterations they are combined
// with alpha, beta and C by the UPDATE_C_* macros (defined earlier in this
// file, outside this excerpt).
//
// Parameters:
//   k_      - iteration count of the accumulation loop.
//   alpha   - pointer to the scalar broadcast into ZMM0 before the update.
//   a       - packed A panel; the kernel advances 24*4 bytes per iteration.
//   b       - packed B panel; the kernel advances 16*4 bytes per iteration.
//   beta    - pointer to the scalar broadcast into ZMM1; when its bits are
//             all zero apart from the sign, the *_BZ_* update macros are
//             used instead of the regular ones.
//   c       - output tile.
//   rs_c_   - row stride of C (scaled by 4 before the update phase).
//   cs_c_   - column stride of C; 1 selects the contiguous-row update,
//             anything else selects the scattered update.
//   data    - auxinfo_t queried for the next A/B panel addresses.
//   cntx    - unused.
//
// NOTE(review): alpha/a/b/beta/c are declared double* although the kernel
// uses single-precision forms (VBROADCASTSS, VFMADD231PS) and 4-byte element
// offsets. They presumably should be float*; the prototype in
// bli_kernels_knl.h would have to change together with this definition.
void bli_sgemm_knl_asm_24x16
(
dim_t k_,
double* restrict alpha,
double* restrict a,
double* restrict b,
double* restrict beta,
double* restrict c, inc_t rs_c_, inc_t cs_c_,
auxinfo_t* data,
cntx_t* restrict cntx
)
{
// NOTE(review): data is cast to void here yet is read by the two
// bli_auxinfo_next_* calls below; only cntx is really unused.
(void)data;
(void)cntx;
// Addresses of the next A and B panels. They are passed to the asm block
// as inputs; no instruction visible in this function body names them
// directly (they may be consumed by macros defined earlier - confirm).
const double * a_next = bli_auxinfo_next_a( data );
const double * b_next = bli_auxinfo_next_b( data );
const int32_t * offsetPtr = &offsets[0];
// 64-bit copies so the asm can load them with plain 64-bit MOVs.
const int64_t k = k_;
const int64_t rs_c = rs_c_;
const int64_t cs_c = cs_c_;
#ifdef MONITORS
int toph, topl, both, botl, midl, midh, mid2l, mid2h;
#endif
#ifdef LOOPMON
// NOTE(review): none of these four variables is written anywhere in the
// visible code, so the LOOPMON printf below reads indeterminate values.
int tlooph, tloopl, blooph, bloopl;
#endif
__asm__ volatile
(
#ifdef MONITORS
RDTSC
MOV(VAR(topl), EAX)
MOV(VAR(toph), EDX)
#endif
// ---- Setup: zero the accumulators ZMM8-ZMM31 (left column) while the
// ---- scalar registers are initialised (right column).
VPXORD(ZMM(8), ZMM(8), ZMM(8)) //clear out registers
VMOVAPS(ZMM( 9), ZMM(8)) MOV(R12, VAR(rs_c))
VMOVAPS(ZMM(10), ZMM(8)) MOV(RSI, VAR(k)) //loop index
VMOVAPS(ZMM(11), ZMM(8)) MOV(RAX, VAR(a)) //load address of a
VMOVAPS(ZMM(12), ZMM(8)) MOV(RBX, VAR(b)) //load address of b
VMOVAPS(ZMM(13), ZMM(8)) MOV(RCX, VAR(c)) //load address of c
VMOVAPS(ZMM(14), ZMM(8)) VMOVAPD(ZMM(0), MEM(RBX)) //pre-load b
VMOVAPS(ZMM(15), ZMM(8)) MOV(RDI, VAR(offsetPtr))
VMOVAPS(ZMM(16), ZMM(8)) VMOVAPS(ZMM(4), MEM(RDI))
#if SCATTER_PREFETCH_C
// Index vectors for the scatter prefetches of C:
//   ZMM2 = offsets[0..15]  * (low 32 bits of rs_c)
//   YMM3 = offsets[16..23] * (low 32 bits of rs_c)
VMOVAPS(ZMM(17), ZMM(8))
VMOVAPS(ZMM(18), ZMM(8))
VMOVAPS(ZMM(19), ZMM(8)) VBROADCASTSS(ZMM(5), VAR(rs_c))
VMOVAPS(ZMM(20), ZMM(8))
VMOVAPS(ZMM(21), ZMM(8)) VPMULLD(ZMM(2), ZMM(4), ZMM(5))
VMOVAPS(ZMM(22), ZMM(8)) VMOVAPS(YMM(3), MEM(RDI,64))
VMOVAPS(ZMM(23), ZMM(8)) VPMULLD(YMM(3), YMM(3), YMM(5))
#else
// Row-stride multiples for the explicit prefetches of C:
//   R13 = 3*R12, R14 = 5*R12, R15 = 7*R12  (R12 = rs_c)
VMOVAPS(ZMM(17), ZMM(8))
VMOVAPS(ZMM(18), ZMM(8)) LEA(R13, MEM(R12,R12,2))
VMOVAPS(ZMM(19), ZMM(8)) LEA(R14, MEM(R12,R12,4))
VMOVAPS(ZMM(20), ZMM(8)) LEA(R15, MEM(R13,R12,4))
VMOVAPS(ZMM(21), ZMM(8))
VMOVAPS(ZMM(22), ZMM(8))
VMOVAPS(ZMM(23), ZMM(8))
#endif
// ZMM4 = offsets[0..15] * 4 (shift left by 2).
// R8/R9/R10/R11 = 1x/3x/5x/7x the byte size of four iterations of A; they
// are passed to SUBITER as index register (with a scale factor) so that
// iterations beyond the first four can be addressed from RAX.
VMOVAPS(ZMM(24), ZMM(8)) VPSLLD(ZMM(4), ZMM(4), IMM(2))
VMOVAPS(ZMM(25), ZMM(8)) MOV(R8, IMM(4*24*4)) //offset for 4 iterations
VMOVAPS(ZMM(26), ZMM(8)) LEA(R9, MEM(R8,R8,2)) //*3
VMOVAPS(ZMM(27), ZMM(8)) LEA(R10, MEM(R8,R8,4)) //*5
VMOVAPS(ZMM(28), ZMM(8)) LEA(R11, MEM(R9,R8,4)) //*7
VMOVAPS(ZMM(29), ZMM(8))
VMOVAPS(ZMM(30), ZMM(8))
VMOVAPS(ZMM(31), ZMM(8))
#ifdef MONITORS
RDTSC
MOV(VAR(midl), EAX)
MOV(VAR(midh), EDX)
#endif
// ---- Accumulation. 32 iterations are reserved for the two prefetch
// ---- phases; when k <= 32 the simple one-iteration-per-pass TAIL loop is
// ---- used instead.
SUB(RSI, IMM(32))
JLE(TAIL)
//prefetch C into L2
#if SCATTER_PREFETCH_C
// No iterations are executed in this variant of the L2 prefetch phase,
// so 24 of the 32 reserved iterations are handed back to the counter.
// NOTE(review): the scale factor 8 and the DPD form on the second
// prefetch look inherited from a double-precision kernel - confirm.
ADD(RSI, IMM(24))
KXNORW(K(1), K(0), K(0))
KXNORW(K(2), K(0), K(0))
VSCATTERPFDPS(1, MEM(RCX,ZMM(2),8) MASK_K(1))
VSCATTERPFDPD(1, MEM(RCX,YMM(3),8) MASK_K(2))
#else
// 24 iterations, each preceded by a prefetchwt1 of one of 24 addresses
// RCX + i*R12 (i = 0..23): RCX covers i = 0..7, RDX = RCX + 8*R12 covers
// i = 8..15 and RDI = RDX + 8*R12 covers i = 16..23.
// NOTE(review): R12 holds rs_c as loaded from memory and is not scaled
// by the element size anywhere in the visible code, so these addresses
// advance by rs_c bytes per step - confirm that this is intended.
// The alternating (1,0)/(0,1) SUBITER arguments presumably ping-pong the
// current/next row of B between ZMM0 and ZMM1; SUBITER is defined above
// this excerpt - confirm there.
PREFETCHW1(MEM(RCX ))
SUBITER( 0,1,0,RAX )
PREFETCHW1(MEM(RCX,R12,1))
SUBITER( 1,0,1,RAX )
PREFETCHW1(MEM(RCX,R12,2))
SUBITER( 2,1,0,RAX )
PREFETCHW1(MEM(RCX,R13,1))
SUBITER( 3,0,1,RAX )
PREFETCHW1(MEM(RCX,R12,4))
SUBITER( 4,1,0,RAX,R8, 1)
PREFETCHW1(MEM(RCX,R14,1))
SUBITER( 5,0,1,RAX,R8, 1)
PREFETCHW1(MEM(RCX,R13,2))
SUBITER( 6,1,0,RAX,R8, 1)
PREFETCHW1(MEM(RCX,R15,1))
SUBITER( 7,0,1,RAX,R8, 1)
LEA(RDX, MEM(RCX,R12,8))
PREFETCHW1(MEM(RDX ))
SUBITER( 8,1,0,RAX,R8, 2)
PREFETCHW1(MEM(RDX,R12,1))
SUBITER( 9,0,1,RAX,R8, 2)
PREFETCHW1(MEM(RDX,R12,2))
SUBITER(10,1,0,RAX,R8, 2)
PREFETCHW1(MEM(RDX,R13,1))
SUBITER(11,0,1,RAX,R8, 2)
PREFETCHW1(MEM(RDX,R12,4))
SUBITER(12,1,0,RAX,R9, 1)
PREFETCHW1(MEM(RDX,R14,1))
SUBITER(13,0,1,RAX,R9, 1)
PREFETCHW1(MEM(RDX,R13,2))
SUBITER(14,1,0,RAX,R9, 1)
PREFETCHW1(MEM(RDX,R15,1))
SUBITER(15,0,1,RAX,R9, 1)
LEA(RDI, MEM(RDX,R12,8))
PREFETCHW1(MEM(RDI ))
SUBITER(16,1,0,RAX,R8, 4)
PREFETCHW1(MEM(RDI,R12,1))
SUBITER(17,0,1,RAX,R8, 4)
PREFETCHW1(MEM(RDI,R12,2))
SUBITER(18,1,0,RAX,R8, 4)
PREFETCHW1(MEM(RDI,R13,1))
SUBITER(19,0,1,RAX,R8, 4)
PREFETCHW1(MEM(RDI,R12,4))
SUBITER(20,1,0,RAX,R10,1)
PREFETCHW1(MEM(RDI,R14,1))
SUBITER(21,0,1,RAX,R10,1)
PREFETCHW1(MEM(RDI,R13,2))
SUBITER(22,1,0,RAX,R10,1)
PREFETCHW1(MEM(RDI,R15,1))
SUBITER(23,0,1,RAX,R10,1)
ADD(RAX, IMM(24*24*4))
ADD(RBX, IMM(24*16*4))
#endif
// Split the remaining count: RDI = count mod 32 (consumed bit by bit in
// REM_1..REM_16), RSI = number of full 32-iteration passes of MAIN_LOOP.
MOV(RDI, RSI)
AND(RDI, IMM(31))
SAR(RSI, IMM(5))
JZ(REM_1)
LOOP_ALIGN
LABEL(MAIN_LOOP)
// 32 iterations per pass; A and B pointers advance once at the end.
SUBITER( 0,1,0,RAX )
SUBITER( 1,0,1,RAX )
SUBITER( 2,1,0,RAX )
SUBITER( 3,0,1,RAX )
SUBITER( 4,1,0,RAX,R8, 1)
SUBITER( 5,0,1,RAX,R8, 1)
SUBITER( 6,1,0,RAX,R8, 1)
SUBITER( 7,0,1,RAX,R8, 1)
SUBITER( 8,1,0,RAX,R8, 2)
SUBITER( 9,0,1,RAX,R8, 2)
SUBITER(10,1,0,RAX,R8, 2)
SUBITER(11,0,1,RAX,R8, 2)
SUBITER(12,1,0,RAX,R9, 1)
SUBITER(13,0,1,RAX,R9, 1)
SUBITER(14,1,0,RAX,R9, 1)
SUBITER(15,0,1,RAX,R9, 1)
SUBITER(16,1,0,RAX,R8, 4)
SUBITER(17,0,1,RAX,R8, 4)
SUBITER(18,1,0,RAX,R8, 4)
SUBITER(19,0,1,RAX,R8, 4)
SUBITER(20,1,0,RAX,R10,1)
SUBITER(21,0,1,RAX,R10,1)
SUBITER(22,1,0,RAX,R10,1)
SUBITER(23,0,1,RAX,R10,1)
SUBITER(24,1,0,RAX,R9, 2)
SUBITER(25,0,1,RAX,R9, 2)
SUBITER(26,1,0,RAX,R9, 2)
SUBITER(27,0,1,RAX,R9, 2)
SUBITER(28,1,0,RAX,R11,1)
SUBITER(29,0,1,RAX,R11,1)
SUBITER(30,1,0,RAX,R11,1)
SUBITER(31,0,1,RAX,R11,1)
ADD(RAX, IMM(32*24*4))
ADD(RBX, IMM(32*16*4))
SUB(RSI, IMM(1))
JNZ(MAIN_LOOP)
// Remainder handling: each SAR1(RDI) shifts the next low bit of the
// remainder into the carry flag, and JNC skips the chunk when that bit is
// clear. Chunks of 1, 2, 4, 8 and 16 iterations follow.
LABEL(REM_1)
SAR1(RDI)
JNC(REM_2)
SUBITER(0,1,0,RAX)
// Copy ZMM1 into ZMM0 after the single iteration - presumably the B row
// preloaded for the following iteration sits in ZMM1 at this point.
VMOVAPD(ZMM(0), ZMM(1))
ADD(RAX, IMM(24*4))
ADD(RBX, IMM(16*4))
LABEL(REM_2)
SAR1(RDI)
JNC(REM_4)
SUBITER(0,1,0,RAX)
SUBITER(1,0,1,RAX)
ADD(RAX, IMM(2*24*4))
ADD(RBX, IMM(2*16*4))
LABEL(REM_4)
SAR1(RDI)
JNC(REM_8)
SUBITER(0,1,0,RAX)
SUBITER(1,0,1,RAX)
SUBITER(2,1,0,RAX)
SUBITER(3,0,1,RAX)
ADD(RAX, IMM(4*24*4))
ADD(RBX, IMM(4*16*4))
LABEL(REM_8)
SAR1(RDI)
JNC(REM_16)
SUBITER(0,1,0,RAX )
SUBITER(1,0,1,RAX )
SUBITER(2,1,0,RAX )
SUBITER(3,0,1,RAX )
SUBITER(4,1,0,RAX,R8,1)
SUBITER(5,0,1,RAX,R8,1)
SUBITER(6,1,0,RAX,R8,1)
SUBITER(7,0,1,RAX,R8,1)
ADD(RAX, IMM(8*24*4))
ADD(RBX, IMM(8*16*4))
LABEL(REM_16)
SAR1(RDI)
JNC(AFTER_LOOP)
SUBITER( 0,1,0,RAX )
SUBITER( 1,0,1,RAX )
SUBITER( 2,1,0,RAX )
SUBITER( 3,0,1,RAX )
SUBITER( 4,1,0,RAX,R8, 1)
SUBITER( 5,0,1,RAX,R8, 1)
SUBITER( 6,1,0,RAX,R8, 1)
SUBITER( 7,0,1,RAX,R8, 1)
SUBITER( 8,1,0,RAX,R8, 2)
SUBITER( 9,0,1,RAX,R8, 2)
SUBITER(10,1,0,RAX,R8, 2)
SUBITER(11,0,1,RAX,R8, 2)
SUBITER(12,1,0,RAX,R9, 1)
SUBITER(13,0,1,RAX,R9, 1)
SUBITER(14,1,0,RAX,R9, 1)
SUBITER(15,0,1,RAX,R9, 1)
ADD(RAX, IMM(16*24*4))
ADD(RBX, IMM(16*16*4))
LABEL(AFTER_LOOP)
// Final 8 iterations (the rest of the 32 reserved at the top), combined
// with prefetching C for the update phase.
//prefetch C into L1
#if SCATTER_PREFETCH_C
KXNORW(K(1), K(0), K(0))
KXNORW(K(2), K(0), K(0))
VSCATTERPFDPS(0, MEM(RCX,ZMM(2),8) MASK_K(1))
VSCATTERPFDPD(0, MEM(RCX,YMM(3),8) MASK_K(2))
SUBITER(0,1,0,RAX )
SUBITER(1,0,1,RAX )
SUBITER(2,1,0,RAX )
SUBITER(3,0,1,RAX )
SUBITER(4,1,0,RAX,R8,1)
SUBITER(5,0,1,RAX,R8,1)
SUBITER(6,1,0,RAX,R8,1)
SUBITER(7,0,1,RAX,R8,1)
#else
// The PREFETCH_C_L1_1/2/3 hooks are redefined before each SUBITER so
// that every iteration issues three prefetchw instructions (presumably
// from inside SUBITER - confirm in its definition), covering the same
// 24 addresses RCX + i*R12 as the L2 phase above.
LEA(RDX, MEM(RCX,R12,8))
LEA(RDI, MEM(RDX,R12,8))
#undef PREFETCH_C_L1_1
#undef PREFETCH_C_L1_2
#undef PREFETCH_C_L1_3
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX ))
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R12,1))
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RCX,R12,2))
SUBITER(0,1,0,RAX )
#undef PREFETCH_C_L1_1
#undef PREFETCH_C_L1_2
#undef PREFETCH_C_L1_3
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX,R13,1))
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R12,4))
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RCX,R14,1))
SUBITER(1,0,1,RAX )
#undef PREFETCH_C_L1_1
#undef PREFETCH_C_L1_2
#undef PREFETCH_C_L1_3
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX,R13,2))
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R15,1))
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX ))
SUBITER(2,1,0,RAX )
#undef PREFETCH_C_L1_1
#undef PREFETCH_C_L1_2
#undef PREFETCH_C_L1_3
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R12,1))
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R12,2))
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX,R13,1))
SUBITER(3,0,1,RAX )
#undef PREFETCH_C_L1_1
#undef PREFETCH_C_L1_2
#undef PREFETCH_C_L1_3
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R12,4))
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R14,1))
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX,R13,2))
SUBITER(4,1,0,RAX,R8,1)
#undef PREFETCH_C_L1_1
#undef PREFETCH_C_L1_2
#undef PREFETCH_C_L1_3
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R15,1))
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI ))
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R12,1))
SUBITER(5,0,1,RAX,R8,1)
#undef PREFETCH_C_L1_1
#undef PREFETCH_C_L1_2
#undef PREFETCH_C_L1_3
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDI,R12,2))
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI,R13,1))
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R12,4))
SUBITER(6,1,0,RAX,R8,1)
#undef PREFETCH_C_L1_1
#undef PREFETCH_C_L1_2
#undef PREFETCH_C_L1_3
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDI,R14,1))
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI,R13,2))
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R15,1))
SUBITER(7,0,1,RAX,R8,1)
#endif
JMP(POSTACCUM)
// ---- Short-k path (k <= 32): undo the subtraction of 32, then run one
// ---- iteration per pass, prefetching one address of C each time.
LABEL(TAIL)
MOV(RDX, RCX)
ADD(RSI, IMM(32))
JZ(POSTACCUM)
LABEL(TAIL_LOOP)
PREFETCHW0(MEM(RDX))
ADD(RDX, R12)
SUBITER(0,1,0,RAX)
VMOVAPD(ZMM(0), ZMM(1))
ADD(RAX, IMM(24*4))
ADD(RBX, IMM(16*4))
SUB(RSI, IMM(1))
JNZ(TAIL_LOOP)
// ---- Update phase: ZMM0 = broadcast alpha, ZMM1 = broadcast beta,
// ---- RAX = rs_c * 4, RDI = 3 * RAX, RBX = cs_c.
LABEL(POSTACCUM)
#ifdef MONITORS
RDTSC
MOV(VAR(mid2l), EAX)
MOV(VAR(mid2h), EDX)
#endif
MOV(RAX, VAR(alpha))
MOV(RBX, VAR(beta))
VBROADCASTSS(ZMM(0), MEM(RAX))
VBROADCASTSS(ZMM(1), MEM(RBX))
// Check if C is row stride (cs_c == 1). If not, jump to the slow scattered update
MOV(RAX, VAR(rs_c))
LEA(RAX, MEM(,RAX,4))
MOV(RBX, VAR(cs_c))
LEA(RDI, MEM(RAX,RAX,2))
CMP(RBX, IMM(1))
JNE(SCATTEREDUPDATE)
// beta == 0 test: take the raw bits of beta, shift the sign bit out and
// branch when nothing is left, i.e. when beta is +0.0 or -0.0.
VMOVD(EDX, XMM(1))
SAL1(EDX) //shift out sign bit
JZ(COLSTORBZ)
UPDATE_C_FOUR_ROWS( 8, 9,10,11)
UPDATE_C_FOUR_ROWS(12,13,14,15)
UPDATE_C_FOUR_ROWS(16,17,18,19)
UPDATE_C_FOUR_ROWS(20,21,22,23)
UPDATE_C_FOUR_ROWS(24,25,26,27)
UPDATE_C_FOUR_ROWS(28,29,30,31)
JMP(END)
// Contiguous rows, beta == 0.
LABEL(COLSTORBZ)
UPDATE_C_BZ_FOUR_ROWS( 8, 9,10,11)
UPDATE_C_BZ_FOUR_ROWS(12,13,14,15)
UPDATE_C_BZ_FOUR_ROWS(16,17,18,19)
UPDATE_C_BZ_FOUR_ROWS(20,21,22,23)
UPDATE_C_BZ_FOUR_ROWS(24,25,26,27)
UPDATE_C_BZ_FOUR_ROWS(28,29,30,31)
JMP(END)
// General strides: ZMM2 = offsets[0..15] * cs_c is the index vector handed
// to the row-wise scattered update macros.
LABEL(SCATTEREDUPDATE)
MOV(RDI, VAR(offsetPtr))
VMOVAPS(ZMM(2), MEM(RDI))
/* Note that this ignores the upper 32 bits in cs_c */
VPBROADCASTD(ZMM(3), EBX)
VPMULLD(ZMM(2), ZMM(3), ZMM(2))
// Same beta == 0 test as in the contiguous path.
VMOVD(EDX, XMM(1))
SAL1(EDX) //shift out sign bit
JZ(SCATTERBZ)
UPDATE_C_ROW_SCATTERED( 8)
UPDATE_C_ROW_SCATTERED( 9)
UPDATE_C_ROW_SCATTERED(10)
UPDATE_C_ROW_SCATTERED(11)
UPDATE_C_ROW_SCATTERED(12)
UPDATE_C_ROW_SCATTERED(13)
UPDATE_C_ROW_SCATTERED(14)
UPDATE_C_ROW_SCATTERED(15)
UPDATE_C_ROW_SCATTERED(16)
UPDATE_C_ROW_SCATTERED(17)
UPDATE_C_ROW_SCATTERED(18)
UPDATE_C_ROW_SCATTERED(19)
UPDATE_C_ROW_SCATTERED(20)
UPDATE_C_ROW_SCATTERED(21)
UPDATE_C_ROW_SCATTERED(22)
UPDATE_C_ROW_SCATTERED(23)
UPDATE_C_ROW_SCATTERED(24)
UPDATE_C_ROW_SCATTERED(25)
UPDATE_C_ROW_SCATTERED(26)
UPDATE_C_ROW_SCATTERED(27)
UPDATE_C_ROW_SCATTERED(28)
UPDATE_C_ROW_SCATTERED(29)
UPDATE_C_ROW_SCATTERED(30)
UPDATE_C_ROW_SCATTERED(31)
JMP(END)
// General strides, beta == 0.
LABEL(SCATTERBZ)
UPDATE_C_BZ_ROW_SCATTERED( 8)
UPDATE_C_BZ_ROW_SCATTERED( 9)
UPDATE_C_BZ_ROW_SCATTERED(10)
UPDATE_C_BZ_ROW_SCATTERED(11)
UPDATE_C_BZ_ROW_SCATTERED(12)
UPDATE_C_BZ_ROW_SCATTERED(13)
UPDATE_C_BZ_ROW_SCATTERED(14)
UPDATE_C_BZ_ROW_SCATTERED(15)
UPDATE_C_BZ_ROW_SCATTERED(16)
UPDATE_C_BZ_ROW_SCATTERED(17)
UPDATE_C_BZ_ROW_SCATTERED(18)
UPDATE_C_BZ_ROW_SCATTERED(19)
UPDATE_C_BZ_ROW_SCATTERED(20)
UPDATE_C_BZ_ROW_SCATTERED(21)
UPDATE_C_BZ_ROW_SCATTERED(22)
UPDATE_C_BZ_ROW_SCATTERED(23)
UPDATE_C_BZ_ROW_SCATTERED(24)
UPDATE_C_BZ_ROW_SCATTERED(25)
UPDATE_C_BZ_ROW_SCATTERED(26)
UPDATE_C_BZ_ROW_SCATTERED(27)
UPDATE_C_BZ_ROW_SCATTERED(28)
UPDATE_C_BZ_ROW_SCATTERED(29)
UPDATE_C_BZ_ROW_SCATTERED(30)
UPDATE_C_BZ_ROW_SCATTERED(31)
LABEL(END)
#ifdef MONITORS
RDTSC
MOV(VAR(botl), EAX)
MOV(VAR(both), EDX)
#endif
: // output operands
#ifdef MONITORS
[topl] "=m" (topl),
[toph] "=m" (toph),
[midl] "=m" (midl),
[midh] "=m" (midh),
[mid2l] "=m" (mid2l),
[mid2h] "=m" (mid2h),
[botl] "=m" (botl),
[both] "=m" (both)
#endif
// All inputs are memory operands; the asm loads them itself via VAR(...).
: // input operands
[k] "m" (k),
[a] "m" (a),
[b] "m" (b),
[alpha] "m" (alpha),
[beta] "m" (beta),
[c] "m" (c),
[rs_c] "m" (rs_c),
[cs_c] "m" (cs_c),
[a_next] "m" (a_next),
[b_next] "m" (b_next),
[offsetPtr] "m" (offsetPtr)
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
"r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
"zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13",
"zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
"zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
"zmm30", "zmm31", "memory"
);
#ifdef LOOPMON
printf("looptime = \t%d\n", bloopl - tloopl);
#endif
#ifdef MONITORS
// Reassemble the 64-bit timestamps from their EDX:EAX halves.
// NOTE(review): the low halves are plain int, so a value with bit 31 set
// is sign-extended before the OR, and the printf below passes dim_t
// differences to %u conversions - confirm before relying on this output.
dim_t top = ((dim_t)toph << 32) | topl;
dim_t mid = ((dim_t)midh << 32) | midl;
dim_t mid2 = ((dim_t)mid2h << 32) | mid2l;
dim_t bot = ((dim_t)both << 32) | botl;
printf("setup =\t%u\tmain loop =\t%u\tcleanup=\t%u\ttotal=\t%u\n", mid - top, mid2 - mid, bot - mid2, bot - top);
#endif
}

View File

@@ -32,13 +32,19 @@
*/
// Prototypes for the KNL assembly kernels. Each line names an element type,
// a one-letter datatype prefix and the kernel name suffix; the *_PROT macros
// that turn these into declarations are defined elsewhere.
// NOTE(review): the entries with prefix `s` (gemm_knl_asm_24x16,
// packm_knl_asm_24xk, packm_knl_asm_16xk) are declared with element type
// `double`; for single-precision kernels this presumably should be `float`.
// Changing it requires the matching kernel definitions to change as well.
GEMM_UKR_PROT( double, d, gemm_knl_asm_12x16 )
GEMM_UKR_PROT( double, s, gemm_knl_asm_24x16 )
GEMM_UKR_PROT( double, d, gemm_knl_asm_24x8 )
GEMM_UKR_PROT( double, d, gemm_knl_asm_30x8 )
GEMM_UKR_PROT( double, d, gemm_knl_asm_8x24 )
PACKM_KER_PROT( double, s, packm_knl_asm_24xk )
PACKM_KER_PROT( double, s, packm_knl_asm_16xk )
PACKM_KER_PROT( double, d, packm_knl_asm_24xk )
PACKM_KER_PROT( double, d, packm_knl_asm_8xk )
// unused:
GEMM_UKR_PROT( double, d, gemm_knl_asm_12x16 )
GEMM_UKR_PROT( double, d, gemm_knl_asm_30x8 )
GEMM_UKR_PROT( double, d, gemm_knl_asm_8x24 )
PACKM_KER_PROT( double, d, packm_knl_asm_30xk )

View File

@@ -1,173 +0,0 @@
#ifndef BLIS_AVX512_MACROS_H
#define BLIS_AVX512_MACROS_H
//
// Assembly macros to make AVX-512 with AT&T syntax somewhat less painful
//
// Every instruction macro in this header expands to a string literal, so a
// sequence of macro invocations concatenates into one extended-asm template.
//
#define COMMENT_BEGIN "#"
#define COMMENT_END
// Turns its argument list into a single string literal.
#define STRINGIFY(...) #__VA_ARGS__
// One assembly statement: the stringified arguments followed by a newline
// and a tab. Arguments are macro-expanded before they reach STRINGIFY,
// which is what allows register macros to be used as operands.
#define ASM(...) STRINGIFY(__VA_ARGS__) "\n\t"
// Defines an assembler label: `label:` followed by a newline and a tab.
#define LABEL(label) STRINGIFY(label) ":\n\t"
// Register operands. `%%` is the extended-asm escape that emits a single
// literal `%`, so XMM(3) ends up as %xmm3 in the assembler input.
// Vector registers, selected by number.
#define XMM(x) %%xmm##x
#define YMM(x) %%ymm##x
#define ZMM(x) %%zmm##x
// 32-bit general-purpose registers.
#define EAX %%eax
#define EBX %%ebx
#define ECX %%ecx
#define EDX %%edx
#define EBP %%ebp
#define EDI %%edi
#define ESI %%esi
// 64-bit general-purpose registers.
#define RAX %%rax
#define RBX %%rbx
#define RCX %%rcx
#define RDX %%rdx
#define RBP %%rbp
#define RDI %%rdi
#define RSI %%rsi
// Mask register k<x> and numbered 64-bit register r<x>.
#define K(x) %%k##x
#define R(x) %%r##x
#define R8 %%r8
#define R9 %%r9
#define R10 %%r10
#define R11 %%r11
#define R12 %%r12
#define R13 %%r13
#define R14 %%r14
#define R15 %%r15
// Numbered registers with the `d` suffix (r<x>d).
#define RD(x) %%r##x##d
#define R8D %%r8d
#define R9D %%r9d
#define R10D %%r10d
#define R11D %%r11d
#define R12D %%r12d
#define R13D %%r13d
#define R14D %%r14d
#define R15D %%r15d
// IMM(x) prefixes x with `$`; VAR(x) refers to the named asm operand [x].
#define IMM(x) $##x
#define VAR(x) %[x]
// Memory operands. MEM(...) accepts one to four arguments and dispatches on
// the argument count through GET_MACRO:
//   MEM(reg)                -> (reg)
//   MEM(reg,disp)           -> disp(reg)
//   MEM(reg,off,scale)      -> (reg,off,scale)
//   MEM(reg,off,scale,disp) -> disp(reg,off,scale)
#define MEM_4(reg,off,scale,disp) disp(reg,off,scale)
#define MEM_3(reg,off,scale) (reg,off,scale)
#define MEM_2(reg,disp) disp(reg)
#define MEM_1(reg) (reg)
// Same operand forms followed by a {1to8} or {1to16} suffix. `%{` and `%}`
// are the extended-asm escapes for literal braces.
#define MEM_1TO8_4(reg,off,scale,disp) MEM(reg,off,scale,disp) %{1to8%}
#define MEM_1TO8_3(reg,off,scale) MEM(reg,off,scale) %{1to8%}
#define MEM_1TO8_2(reg,disp) MEM(reg,disp) %{1to8%}
#define MEM_1TO8_1(reg) MEM(reg) %{1to8%}
#define MEM_1TO16_4(reg,off,scale,disp) MEM(reg,off,scale,disp) %{1to16%}
#define MEM_1TO16_3(reg,off,scale) MEM(reg,off,scale) %{1to16%}
#define MEM_1TO16_2(reg,disp) MEM(reg,disp) %{1to16%}
#define MEM_1TO16_1(reg) MEM(reg) %{1to16%}
// GET_MACRO returns its fifth argument; appending the candidate names in
// descending arity after __VA_ARGS__ makes that fifth argument the variant
// matching the number of arguments supplied.
#define GET_MACRO(_1,_2,_3,_4,NAME,...) NAME
#define MEM(...) GET_MACRO(__VA_ARGS__,MEM_4,MEM_3,MEM_2,MEM_1)(__VA_ARGS__)
#define MEM_1TO8(...) GET_MACRO(__VA_ARGS__,MEM_1TO8_4,MEM_1TO8_3,MEM_1TO8_2,MEM_1TO8_1)(__VA_ARGS__)
#define MEM_1TO16(...) GET_MACRO(__VA_ARGS__,MEM_1TO16_4,MEM_1TO16_3,MEM_1TO16_2,MEM_1TO16_1)(__VA_ARGS__)
// Mask decorations appended to an operand: {%k<n>} and {%k<n>}{z}.
#define MASK_K(n) %{%%k##n%}
#define MASK_KZ(n) %{%%k##n%}%{z%}
// Mask-register helpers. Arguments are given destination first; the emitted
// AT&T text lists the destination last.
#define KMOV(to,from) ASM(kmovw from, to)
// Jump to `label` when mask register `kreg` has any bit set.
#define JKNZD(kreg,label) \
ASM(kortestw kreg, kreg) \
ASM(jnz label)
#define KXNORW(_0, _1, _2) ASM(kxnorw _2, _1, _0)
#define KSHIFTRW(_0, _1, _2) ASM(kshiftrw _2, _1, _0)
// Alignment directives: .p2align 4 and .p2align 5.
#define ALIGN16 ASM(.p2align 4)
#define ALIGN32 ASM(.p2align 5)
// Read the time-stamp counter into EDX:EAX. The mnemonic is `rdtsc`; the
// previous spelling `rdstc` is not an instruction and failed to assemble
// whenever a kernel was built with its timing instrumentation enabled.
#define RDTSC ASM(rdtsc)
// Scalar moves. Arguments are written destination first; the generated
// AT&T statement places the destination last.
#define MOV(dst, src) ASM(mov src, dst)
#define MOVD(dst, src) ASM(movd src, dst)
#define MOVL(dst, src) ASM(movl src, dst)
#define MOVQ(dst, src) ASM(movq src, dst)
#define VMOVD(dst, src) ASM(vmovd src, dst)
#define VMOVQ(dst, src) ASM(vmovq src, dst)
// Integer arithmetic, comparison and shifts.
#define CMP(lhs, rhs) ASM(cmp rhs, lhs)
#define AND(dst, src) ASM(and src, dst)
#define ADD(dst, src) ASM(add src, dst)
#define SUB(dst, src) ASM(sub src, dst)
#define SAL(dst, cnt) ASM(sal cnt, dst)
#define SHLX(dst, src, cnt) ASM(shlx cnt, src, dst)
#define SAR(dst, cnt) ASM(sar cnt, dst)
// Single-operand shift forms.
#define SAL1(dst) ASM(sal dst)
#define SAR1(dst) ASM(sar dst)
#define LEA(dst, addr) ASM(lea addr, dst)
#define TEST(lhs, rhs) ASM(test rhs, lhs)
#define DEC(dst) ASM(dec dst)
// Conditional and unconditional jumps to a label.
#define JLE(target) ASM(jle target)
#define JL(target) ASM(jl target)
#define JNZ(target) ASM(jnz target)
#define JZ(target) ASM(jz target)
#define JNE(target) ASM(jne target)
#define JE(target) ASM(je target)
#define JNC(target) ASM(jnc target)
#define JC(target) ASM(jc target)
#define JMP(target) ASM(jmp target)
// Vector instructions. As with the scalar macros, arguments are written
// destination first and emitted in reverse (AT&T) order.
// Scalar compares.
#define VCOMISS(lhs, rhs) ASM(vcomiss rhs, lhs)
#define VCOMISD(lhs, rhs) ASM(vcomisd rhs, lhs)
// Gathers and scatters.
#define VGATHERDPS(dst, src) ASM(vgatherdps src, dst)
#define VSCATTERDPS(dst, src) ASM(vscatterdps src, dst)
#define VGATHERDPD(dst, src) ASM(vgatherdpd src, dst)
#define VSCATTERDPD(dst, src) ASM(vscatterdpd src, dst)
#define VGATHERQPS(dst, src) ASM(vgatherqps src, dst)
#define VSCATTERQPS(dst, src) ASM(vscatterqps src, dst)
#define VGATHERQPD(dst, src) ASM(vgatherqpd src, dst)
#define VSCATTERQPD(dst, src) ASM(vscatterqpd src, dst)
// Floating-point multiplies.
#define VMULSS(dst, src1, src2) ASM(vmulss src2, src1, dst)
#define VMULSD(dst, src1, src2) ASM(vmulsd src2, src1, dst)
#define VMULPS(dst, src1, src2) ASM(vmulps src2, src1, dst)
#define VMULPD(dst, src1, src2) ASM(vmulpd src2, src1, dst)
// Integer and bitwise vector operations.
#define VPMULLD(dst, src1, src2) ASM(vpmulld src2, src1, dst)
#define VPMULLQ(dst, src1, src2) ASM(vpmullq src2, src1, dst)
#define VPADDD(dst, src1, src2) ASM(vpaddd src2, src1, dst)
#define VPSLLD(dst, src, cnt) ASM(vpslld cnt, src, dst)
#define VPXORD(dst, src1, src2) ASM(vpxord src2, src1, dst)
#define VXORPD(dst, src1, src2) ASM(vxorpd src2, src1, dst)
// Fused multiply-add.
#define VFMADD132PS(dst, src1, src2) ASM(vfmadd132ps src2, src1, dst)
#define VFMADD213PS(dst, src1, src2) ASM(vfmadd213ps src2, src1, dst)
#define VFMADD231PS(dst, src1, src2) ASM(vfmadd231ps src2, src1, dst)
#define VFMADD132PD(dst, src1, src2) ASM(vfmadd132pd src2, src1, dst)
#define VFMADD213PD(dst, src1, src2) ASM(vfmadd213pd src2, src1, dst)
#define VFMADD231PD(dst, src1, src2) ASM(vfmadd231pd src2, src1, dst)
// Vector moves.
#define VMOVDQA(dst, src) ASM(vmovdqa src, dst)
#define VMOVDQA32(dst, src) ASM(vmovdqa32 src, dst)
#define VMOVDQA64(dst, src) ASM(vmovdqa64 src, dst)
#define VMOVSS(dst, src) ASM(vmovss src, dst)
#define VMOVSD(dst, src) ASM(vmovsd src, dst)
#define VMOVAPS(dst, src) ASM(vmovaps src, dst)
#define VMOVUPS(dst, src) ASM(vmovups src, dst)
#define VMOVAPD(dst, src) ASM(vmovapd src, dst)
#define VMOVUPD(dst, src) ASM(vmovupd src, dst)
// Broadcasts.
#define VBROADCASTSS(dst, src) ASM(vbroadcastss src, dst)
#define VBROADCASTSD(dst, src) ASM(vbroadcastsd src, dst)
#define VPBROADCASTD(dst, src) ASM(vpbroadcastd src, dst)
#define VPBROADCASTQ(dst, src) ASM(vpbroadcastq src, dst)
#define VBROADCASTF64X4(dst, src) ASM(vbroadcastf64x4 src, dst)
// Lane insert/extract, unpack, shuffle and permute.
// NOTE(review): VINSERTF128 accepts three operands here while the other
// insert macro, VINSERTF64X4, accepts four - confirm the intended arity.
#define VINSERTF64X4(dst, src1, src2, imm) ASM(vinsertf64x4 imm, src2, src1, dst)
#define VEXTRACTF64X4(dst, src, imm) ASM(vextractf64x4 imm, src, dst)
#define VINSERTF128(dst, src1, src2) ASM(vinsertf128 src2, src1, dst)
#define VEXTRACTF128(dst, src, imm) ASM(vextractf128 imm, src, dst)
#define VUNPCKLPD(dst, src1, src2) ASM(vunpcklpd src2, src1, dst)
#define VUNPCKHPD(dst, src1, src2) ASM(vunpckhpd src2, src1, dst)
#define VSHUFF64X2(dst, src1, src2, imm) ASM(vshuff64x2 imm, src2, src1, dst)
#define VUNPCKLPS(dst, src1, src2) ASM(vunpcklps src2, src1, dst)
#define VUNPCKHPS(dst, src1, src2) ASM(vunpckhps src2, src1, dst)
#define VSHUFPS(dst, src1, src2, imm) ASM(vshufps imm, src2, src1, dst)
#define VPERM2F128(dst, src1, src2, imm) ASM(vperm2f128 imm, src2, src1, dst)
// Prefetch helpers. `level` is pasted straight into the mnemonic, e.g.
// PREFETCH(0, addr) emits `prefetcht0 addr`.
#define PREFETCH(level, addr) ASM(prefetcht##level addr)
// Write-intent prefetches: `prefetchw` and `prefetchwt1`.
#define PREFETCHW0(addr) ASM(prefetchw addr)
#define PREFETCHW1(addr) ASM(prefetchwt1 addr)
// Gather/scatter prefetches; `level` again becomes part of the mnemonic.
#define VGATHERPFDPS(level, addr) ASM(vgatherpf##level##dps addr)
#define VSCATTERPFDPS(level, addr) ASM(vscatterpf##level##dps addr)
#define VGATHERPFDPD(level, addr) ASM(vgatherpf##level##dpd addr)
#define VSCATTERPFDPD(level, addr) ASM(vscatterpf##level##dpd addr)
#define VZEROUPPER() ASM(vzeroupper)
#endif

View File

@@ -33,7 +33,6 @@
*/
#include "blis.h"
#include <assert.h>
#include "bli_avx512_macros.h"

View File

@@ -214,23 +214,17 @@ PDEF_MT := -DP_BEGIN=200 \
# --- Targets/rules ------------------------------------------------------------
#
# Aggregate targets. None of the rules in this group has a recipe; each one
# only names other targets as prerequisites, so building it builds those.
# The -st / -mt suffixes presumably stand for single- and multi-threaded
# builds (the object rules select PDEF_ST or PDEF_MT accordingly).
all: blis-all openblas-all
all-st: blis-st openblas-st mkl-st
all-mt: blis-mt openblas-mt mkl-mt
intel: blis-all openblas-all mkl-all
blis-st: blis-gemm-st
blis-mt: blis-gemm-mt
amd: blis-all openblas-all acml-all
openblas-st: openblas-gemm-st
openblas-mt: openblas-gemm-mt
# Per-library groups covering both the -st and the -mt gemm targets.
blis-all: blis-gemm-st \
blis-gemm-mt
openblas-all: openblas-gemm-st \
openblas-gemm-mt
mkl-all: mkl-gemm-st \
mkl-gemm-mt
acml-all: acml-gemm-st \
acml-gemm-mt
mkl-st: mkl-gemm-st
mkl-mt: mkl-gemm-mt
blis-gemm-st: \
test_sgemm_asm_blis_st.x \
@@ -294,18 +288,6 @@ mkl-gemm-mt: \
test_cgemm_mkl_mt.x \
test_zgemm_mkl_mt.x
# ACML gemm test executables, one per datatype prefix (s, d, c, z), for the
# -st and the -mt variant respectively.
acml-gemm-st: \
test_sgemm_acml_st.x \
test_dgemm_acml_st.x \
test_cgemm_acml_st.x \
test_zgemm_acml_st.x
acml-gemm-mt: \
test_sgemm_acml_mt.x \
test_dgemm_acml_mt.x \
test_cgemm_acml_mt.x \
test_zgemm_acml_mt.x
# --Object file rules --
@@ -466,31 +448,6 @@ test_z%_mkl_mt.o: test_%.c
# Object pattern rules: the same test_%.c source is compiled once for every
# combination of datatype (DT_S/DT_D/DT_C/DT_Z), library tag (STR_MKL,
# STR_ACML) and threading variant (PDEF_ST + STR_ST or PDEF_MT + STR_MT).
# In each recipe $< is the source file and $@ is the object being built.
# NOTE(review): recipe lines must start with a hard tab; leading whitespace
# is not visible in this listing - confirm it is present in the real file.
test_c%_mkl_mt.o: test_%.c
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_MT) -c $< -o $@
# acml
test_d%_acml_st.o: test_%.c
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_D) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_ST) -c $< -o $@
test_s%_acml_st.o: test_%.c
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_S) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_ST) -c $< -o $@
test_z%_acml_st.o: test_%.c
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_Z) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_ST) -c $< -o $@
test_c%_acml_st.o: test_%.c
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_ST) -c $< -o $@
test_d%_acml_mt.o: test_%.c
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_MT) -c $< -o $@
test_s%_acml_mt.o: test_%.c
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_S) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_MT) -c $< -o $@
test_z%_acml_mt.o: test_%.c
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_MT) -c $< -o $@
test_c%_acml_mt.o: test_%.c
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_MT) -c $< -o $@
# -- Executable file rules --
@@ -511,12 +468,6 @@ test_%_mkl_st.x: test_%_mkl_st.o $(LIBBLIS_LINK)
# Executable pattern rules: link one object ($<) into the matching .x file
# ($@). The vendor library variable, where there is one, precedes
# $(LIBBLIS_LINK) on the link line; the blis rule links $(LIBBLIS_LINK) only.
# $(LIBBLIS_LINK) is also a prerequisite, so a rebuilt library triggers a
# relink.
# NOTE(review): recipe lines must start with a hard tab; leading whitespace
# is not visible in this listing - confirm it is present in the real file.
test_%_mkl_mt.x: test_%_mkl_mt.o $(LIBBLIS_LINK)
$(LINKER) $< $(MKLP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
test_%_acml_st.x: test_%_acml_st.o $(LIBBLIS_LINK)
$(LINKER) $< $(ACML_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
test_%_acml_mt.x: test_%_acml_mt.o $(LIBBLIS_LINK)
$(LINKER) $< $(ACMLP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
test_%_blis_st.x: test_%_blis_st.o $(LIBBLIS_LINK)
$(LINKER) $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@