mirror of
https://github.com/amd/blis.git
synced 2026-06-07 13:13:59 +00:00
Updates to knl kernels and related code.
Details: - Imported the 24x16 knl sgemm microkernel (and its corresponding spackm kernel) from TBLIS and enabled its use in the knl sub-config. Also added sgemm microkernel prototype to bli_kernels_knl.h. - Updated dgemm and dpackm microkernels from TBLIS, which included an important change regarding the offsets array (changed from extern declaration to static declaration/definition). - Activated use of level-1v and -1f zen kernels in skx and knl sub-configs. - Removed some old macros no longer needed in bli_family_skx.h now that libmemkind support exists in configure. - Moved bli_avx512_macros.h to frame/include and adjusted #includes in skx and knl kernels accordingly. - Moved unused kernels in kernels/knl/3 to kernels/knl/3/other directory. - Fixed a minor bug in the 'make' output per compile when verboseness is not turned on. The rule-generating function 'make-kernel-rule' was previously passing in the name of the config, rather than the name of the kernel set returned by get-config-for-kset, which could give misleading information to the user when the kconfig_map mapped a kernel set to a sub-configuration that did not share the same name. (This didn't affect the CFLAGS that were actually used.) - Updated test/3m4m/Makefile, removing acml targets and renaming the remaining targets.
This commit is contained in:
2
Makefile
2
Makefile
@@ -537,7 +537,7 @@ $(BASE_OBJ_KERNELS_PATH)/$(1)/%.o: $(KERNELS_PATH)/$(1)/%.$(3) $(BLIS_H_FLAT) $(
|
||||
ifeq ($(BLIS_ENABLE_VERBOSE_MAKE_OUTPUT),yes)
|
||||
$(CC) $(call get-kernel-cflags-for,$(2)) -c $$< -o $$@
|
||||
else
|
||||
@echo "Compiling $$@" $(call get-kernel-text-for,$(1))
|
||||
@echo "Compiling $$@" $(call get-kernel-text-for,$(2))
|
||||
@$(CC) $(call get-kernel-cflags-for,$(2)) -c $$< -o $$@
|
||||
endif
|
||||
endef
|
||||
|
||||
@@ -47,8 +47,9 @@ void bli_cntx_init_knl( cntx_t* cntx )
|
||||
// their storage preferences.
|
||||
bli_cntx_set_l3_nat_ukrs
|
||||
(
|
||||
1,
|
||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knl_asm_24x8, FALSE,
|
||||
2,
|
||||
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_knl_asm_24x16, FALSE,
|
||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knl_asm_24x8, FALSE,
|
||||
cntx
|
||||
);
|
||||
|
||||
@@ -61,26 +62,77 @@ void bli_cntx_init_knl( cntx_t* cntx )
|
||||
cntx
|
||||
);
|
||||
|
||||
// Update the context with optimized level-1f kernels.
|
||||
bli_cntx_set_l1f_kers
|
||||
(
|
||||
4,
|
||||
// axpyf
|
||||
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
|
||||
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
|
||||
// dotxf
|
||||
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
|
||||
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Update the context with optimized level-1v kernels.
|
||||
bli_cntx_set_l1v_kers
|
||||
(
|
||||
10,
|
||||
// amaxv
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
||||
// axpyv
|
||||
#if 0
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
|
||||
#else
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
|
||||
#endif
|
||||
// dotv
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
|
||||
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
|
||||
// dotxv
|
||||
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
|
||||
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
|
||||
// scalv
|
||||
#if 0
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
|
||||
#else
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
||||
#endif
|
||||
cntx
|
||||
);
|
||||
|
||||
// Initialize level-3 blocksize objects with architecture-specific values.
|
||||
// s d c z
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 24, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 );
|
||||
bli_blksz_init ( &blkszs[ BLIS_MC ], -1, 120, -1, -1,
|
||||
-1, 144, -1, -1 );
|
||||
bli_blksz_init ( &blkszs[ BLIS_KC ], -1, 336, -1, -1,
|
||||
-1, 420, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 14400, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 24, 24, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 );
|
||||
bli_blksz_init ( &blkszs[ BLIS_MC ], 240, 120, -1, -1,
|
||||
288, 144, -1, -1 );
|
||||
bli_blksz_init ( &blkszs[ BLIS_KC ], 336, 336, -1, -1,
|
||||
408, 408, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 14400, 14400, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes (and multiples) for native execution.
|
||||
bli_cntx_set_blkszs
|
||||
(
|
||||
BLIS_NAT, 5,
|
||||
BLIS_NAT, 7,
|
||||
// level-3
|
||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||
// level-1f
|
||||
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
|
||||
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
@@ -48,8 +48,54 @@ void bli_cntx_init_skx( cntx_t* cntx )
|
||||
bli_cntx_set_l3_nat_ukrs
|
||||
(
|
||||
2,
|
||||
BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE,
|
||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x12_l2, FALSE,
|
||||
// gemm
|
||||
BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE,
|
||||
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x12_l2, FALSE,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Update the context with optimized level-1f kernels.
|
||||
bli_cntx_set_l1f_kers
|
||||
(
|
||||
4,
|
||||
// axpyf
|
||||
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
|
||||
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
|
||||
// dotxf
|
||||
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
|
||||
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
|
||||
cntx
|
||||
);
|
||||
|
||||
// Update the context with optimized level-1v kernels.
|
||||
bli_cntx_set_l1v_kers
|
||||
(
|
||||
10,
|
||||
// amaxv
|
||||
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
|
||||
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
|
||||
// axpyv
|
||||
#if 0
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
|
||||
#else
|
||||
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
|
||||
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
|
||||
#endif
|
||||
// dotv
|
||||
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
|
||||
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
|
||||
// dotxv
|
||||
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
|
||||
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
|
||||
// scalv
|
||||
#if 0
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
|
||||
#else
|
||||
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
|
||||
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
|
||||
#endif
|
||||
cntx
|
||||
);
|
||||
|
||||
@@ -59,19 +105,25 @@ void bli_cntx_init_skx( cntx_t* cntx )
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 12, 8, 4 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 480, 240, 144, 72 );
|
||||
bli_blksz_init ( &blkszs[ BLIS_KC ], 384, 384, 256, 256,
|
||||
480, 480, 256, 256 );
|
||||
480, 480, 256, 256 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 3072, 4080, 4080 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes (and multiples) for native execution.
|
||||
bli_cntx_set_blkszs
|
||||
(
|
||||
BLIS_NAT, 5,
|
||||
BLIS_NAT, 7,
|
||||
// level-3
|
||||
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
|
||||
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
|
||||
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
|
||||
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
|
||||
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
|
||||
// level-1f
|
||||
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
|
||||
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
|
||||
cntx
|
||||
);
|
||||
}
|
||||
|
||||
@@ -50,10 +50,10 @@
|
||||
#define BLIS_SIMD_SIZE 64
|
||||
#define BLIS_SIMD_NUM_REGISTERS 32
|
||||
|
||||
#include <stdlib.h>
|
||||
//#include <stdlib.h>
|
||||
|
||||
#define BLIS_MALLOC_POOL malloc
|
||||
#define BLIS_FREE_POOL free
|
||||
//#define BLIS_MALLOC_POOL malloc
|
||||
//#define BLIS_FREE_POOL free
|
||||
|
||||
|
||||
#if 0
|
||||
|
||||
@@ -62,6 +62,7 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
cntx
|
||||
);
|
||||
|
||||
// Update the context with optimized level-1f kernels.
|
||||
bli_cntx_set_l1f_kers
|
||||
(
|
||||
4,
|
||||
@@ -115,8 +116,8 @@ void bli_cntx_init_zen( cntx_t* cntx )
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, 8, 8 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
|
||||
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
|
||||
|
||||
// Update the context with the current architecture's register and cache
|
||||
// blocksizes (and multiples) for native execution.
|
||||
|
||||
@@ -15,8 +15,8 @@ arm64: cortexa57 generic
|
||||
arm32: cortexa15 cortexa9 generic
|
||||
|
||||
# Intel architectures.
|
||||
skx: skx
|
||||
knl: knl
|
||||
skx: skx/skx/zen
|
||||
knl: knl/knl/zen
|
||||
haswell: haswell/haswell/zen
|
||||
sandybridge: sandybridge
|
||||
penryn: penryn
|
||||
|
||||
2
configure
vendored
2
configure
vendored
@@ -614,7 +614,7 @@ build_kconfig_registry()
|
||||
|
||||
for config in ${clist}; do
|
||||
|
||||
# Look up the kernel for the current sub-configuration.
|
||||
# Look up the kernels for the current sub-configuration.
|
||||
#kernels="${kernel_registry[${config}]}"
|
||||
kernels=$(query_array "kernel_registry" ${config})
|
||||
|
||||
|
||||
@@ -32,7 +32,7 @@
|
||||
|
||||
*/
|
||||
|
||||
#include "../3/bli_avx512_macros.h"
|
||||
#include "bli_avx512_macros.h"
|
||||
#include "blis.h"
|
||||
|
||||
#define LOADMUL8x8(a,o,s1,s3,s5,s7, \
|
||||
@@ -100,7 +100,9 @@
|
||||
VSHUFF64X2(ZMM(b7), ZMM(a3), ZMM(a7), IMM(0xDD))
|
||||
|
||||
//This is an array used for the scatter/gather instructions.
|
||||
extern int32_t offsets[24];
|
||||
static int32_t offsets[32] __attribute__((aligned(64))) =
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
|
||||
16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
|
||||
|
||||
void bli_dpackm_knl_asm_8xk
|
||||
(
|
||||
563
kernels/knl/1m/bli_spackm_knl_asm_24x16.c
Normal file
563
kernels/knl/1m/bli_spackm_knl_asm_24x16.c
Normal file
@@ -0,0 +1,563 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "bli_avx512_macros.h"
|
||||
#include "blis.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
/*
   LOADMUL8x8: emit eight 256-bit VMULPS instructions, each combining ymm15
   with one 8-float memory operand, into registers ymm<r0>..ymm<r7>.

     base           register holding the base address
     disp           constant byte displacement added to every address
     ld1            register holding the byte stride between consecutive rows
     ld3, ld5, ld7  registers holding 3x, 5x and 7x that stride

   Row k is read from base + k*ld1 + disp; the 3x/5x/7x registers exist
   because an x86 address can only scale an index by 1, 2, 4 or 8.
   The callers in this file load ymm15 with the broadcast scaling factor
   before invoking this macro.
*/
#define LOADMUL8x8(base,disp,ld1,ld3,ld5,ld7, \
                   r0,r1,r2,r3,r4,r5,r6,r7) \
    \
    VMULPS(YMM(r0), YMM(15), MEM(base,      disp)) \
    VMULPS(YMM(r1), YMM(15), MEM(base,ld1,1,disp)) \
    VMULPS(YMM(r2), YMM(15), MEM(base,ld1,2,disp)) \
    VMULPS(YMM(r3), YMM(15), MEM(base,ld3,1,disp)) \
    VMULPS(YMM(r4), YMM(15), MEM(base,ld1,4,disp)) \
    VMULPS(YMM(r5), YMM(15), MEM(base,ld5,1,disp)) \
    VMULPS(YMM(r6), YMM(15), MEM(base,ld3,2,disp)) \
    VMULPS(YMM(r7), YMM(15), MEM(base,ld7,1,disp))
|
||||
|
||||
/*
   STORE8x8: emit eight unaligned 256-bit moves between ymm<r0>..ymm<r7>
   and memory (used as stores by the packing kernels in this file).

     dst   register holding the destination base address
     off   constant byte offset of the first row
     ld    constant byte stride between consecutive rows

   Register k is paired with address dst + off + k*ld. Both off and ld must
   be compile-time constants because they are folded into the displacement.
*/
#define STORE8x8(dst,off,ld, \
                 r0,r1,r2,r3,r4,r5,r6,r7) \
    \
    VMOVUPS(MEM(dst,(off)+0*(ld)), YMM(r0)) \
    VMOVUPS(MEM(dst,(off)+1*(ld)), YMM(r1)) \
    VMOVUPS(MEM(dst,(off)+2*(ld)), YMM(r2)) \
    VMOVUPS(MEM(dst,(off)+3*(ld)), YMM(r3)) \
    VMOVUPS(MEM(dst,(off)+4*(ld)), YMM(r4)) \
    VMOVUPS(MEM(dst,(off)+5*(ld)), YMM(r5)) \
    VMOVUPS(MEM(dst,(off)+6*(ld)), YMM(r6)) \
    VMOVUPS(MEM(dst,(off)+7*(ld)), YMM(r7))
|
||||
|
||||
/*
   STORETRANS8x8: write the 8x8 tile of floats held in ymm<r0>..ymm<r7> to
   memory in transposed order.

     dst      register holding the destination base address
     off      constant byte offset of the tile inside each destination row
     ld       constant byte stride between destination rows
     r0..r7   ymm numbers holding the eight source vectors
     u0..u5   ymm numbers used as scratch (overwritten)

   The source vectors are interleaved pairwise with VUNPCKLPS/VUNPCKHPS and
   then combined with VSHUFPS so that every 128-bit half holds four values
   destined for one output row. Each output row k (0..7) receives two
   16-byte pieces, at dst + off + k*ld and dst + (off+16) + k*ld; rows 0..3
   come from the low halves (VMOVUPS on XMM) and rows 4..7 from the high
   halves (VEXTRACTF128 with immediate 1).
*/
#define STORETRANS8x8(dst,off,ld, \
                      r0,r1,r2,r3,r4,r5,r6,r7, \
                      u0,u1,u2,u3,u4,u5) \
    \
    VUNPCKLPS(YMM(u0), YMM(r0), YMM(r1)) \
    VUNPCKLPS(YMM(u2), YMM(r2), YMM(r3)) \
    VUNPCKLPS(YMM(u1), YMM(r4), YMM(r5)) \
    VUNPCKLPS(YMM(u3), YMM(r6), YMM(r7)) \
    \
    VSHUFPS(YMM(u4), YMM(u0), YMM(u2), IMM(0x44)) \
    VSHUFPS(YMM(u5), YMM(u1), YMM(u3), IMM(0x44)) \
    VMOVUPS(MEM(dst,(off   )+0*(ld)), XMM(u4)) \
    VMOVUPS(MEM(dst,(off+16)+0*(ld)), XMM(u5)) \
    VEXTRACTF128(MEM(dst,(off   )+4*(ld)), YMM(u4), IMM(1)) \
    VEXTRACTF128(MEM(dst,(off+16)+4*(ld)), YMM(u5), IMM(1)) \
    \
    VSHUFPS(YMM(u4), YMM(u0), YMM(u2), IMM(0xEE)) \
    VSHUFPS(YMM(u5), YMM(u1), YMM(u3), IMM(0xEE)) \
    VMOVUPS(MEM(dst,(off   )+1*(ld)), XMM(u4)) \
    VMOVUPS(MEM(dst,(off+16)+1*(ld)), XMM(u5)) \
    VEXTRACTF128(MEM(dst,(off   )+5*(ld)), YMM(u4), IMM(1)) \
    VEXTRACTF128(MEM(dst,(off+16)+5*(ld)), YMM(u5), IMM(1)) \
    \
    VUNPCKHPS(YMM(u0), YMM(r0), YMM(r1)) \
    VUNPCKHPS(YMM(u2), YMM(r2), YMM(r3)) \
    VUNPCKHPS(YMM(u1), YMM(r4), YMM(r5)) \
    VUNPCKHPS(YMM(u3), YMM(r6), YMM(r7)) \
    \
    VSHUFPS(YMM(u4), YMM(u0), YMM(u2), IMM(0x44)) \
    VSHUFPS(YMM(u5), YMM(u1), YMM(u3), IMM(0x44)) \
    VMOVUPS(MEM(dst,(off   )+2*(ld)), XMM(u4)) \
    VMOVUPS(MEM(dst,(off+16)+2*(ld)), XMM(u5)) \
    VEXTRACTF128(MEM(dst,(off   )+6*(ld)), YMM(u4), IMM(1)) \
    VEXTRACTF128(MEM(dst,(off+16)+6*(ld)), YMM(u5), IMM(1)) \
    \
    VSHUFPS(YMM(u4), YMM(u0), YMM(u2), IMM(0xEE)) \
    VSHUFPS(YMM(u5), YMM(u1), YMM(u3), IMM(0xEE)) \
    VMOVUPS(MEM(dst,(off   )+3*(ld)), XMM(u4)) \
    VMOVUPS(MEM(dst,(off+16)+3*(ld)), XMM(u5)) \
    VEXTRACTF128(MEM(dst,(off   )+7*(ld)), YMM(u4), IMM(1)) \
    VEXTRACTF128(MEM(dst,(off+16)+7*(ld)), YMM(u5), IMM(1))
|
||||
|
||||
// Table of the lane indices 0..31, 64-byte aligned so that a full vector of
// 32-bit indices can be loaded from it. The packing kernels in this file
// multiply it by a stride to build the index vector for their gather
// instructions.
static int32_t offsets[32] __attribute__((aligned(64))) =
{
     0,  1,  2,  3,  4,  5,  6,  7,
     8,  9, 10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23,
    24, 25, 26, 27, 28, 29, 30, 31
};
|
||||
|
||||
void bli_spackm_knl_asm_16xk
|
||||
(
|
||||
conj_t conja,
|
||||
dim_t n_,
|
||||
void* restrict kappa_,
|
||||
void* restrict a_, inc_t inca_, inc_t lda_,
|
||||
void* restrict p_, inc_t ldp_,
|
||||
cntx_t* restrict ctnx
|
||||
)
|
||||
{
|
||||
(void)conja;
|
||||
|
||||
const int32_t * offsetPtr = &offsets[0];
|
||||
float* a = (float*)a_;
|
||||
float* p = (float*)p_;
|
||||
float* kappa = (float*)kappa_;
|
||||
const int64_t n = n_;
|
||||
const int64_t inca = inca_;
|
||||
const int64_t lda = lda_;
|
||||
const int64_t ldp = ldp_;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
MOV(RSI, VAR(n))
|
||||
MOV(RAX, VAR(a))
|
||||
MOV(RBX, VAR(inca))
|
||||
MOV(RCX, VAR(lda))
|
||||
MOV(R14, VAR(p))
|
||||
|
||||
TEST(RSI, RSI)
|
||||
JZ(PACK16_DONE)
|
||||
|
||||
LEA(RBX, MEM(,RBX,4)) //inca in bytes
|
||||
LEA(RCX, MEM(,RCX,4)) //lda in bytes
|
||||
|
||||
VBROADCASTSS(YMM(15), VAR(kappa))
|
||||
|
||||
CMP(RBX, IMM(4))
|
||||
JNE(PACK16_T)
|
||||
|
||||
LABEL(PACK16_N)
|
||||
|
||||
MOV(RDX, RSI)
|
||||
AND(RDX, IMM(7))
|
||||
SAR(RSI, IMM(3))
|
||||
JZ(PACK16_N_TAIL)
|
||||
|
||||
LEA(R8, MEM(RCX,RCX,2)) //lda*3
|
||||
LEA(R9, MEM(RCX,RCX,4)) //lda*5
|
||||
LEA(R10, MEM(R8 ,RCX,4)) //lda*7
|
||||
|
||||
LABEL(PACK16_N_LOOP)
|
||||
|
||||
LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
|
||||
STORE8x8(R14,0,16*4,0,1,2,3,4,5,6,7)
|
||||
|
||||
LOADMUL8x8(RAX,32,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
|
||||
STORE8x8(R14,32,16*4,0,1,2,3,4,5,6,7)
|
||||
|
||||
LEA(RAX, MEM(RAX,RCX,8))
|
||||
LEA(R14, MEM(R14,16*8*4))
|
||||
|
||||
SUB(RSI, IMM(1))
|
||||
|
||||
JNZ(PACK16_N_LOOP)
|
||||
|
||||
TEST(RDX, RDX)
|
||||
JZ(PACK16_DONE)
|
||||
|
||||
LABEL(PACK16_N_TAIL)
|
||||
|
||||
VMULPS(YMM(0), YMM(15), MEM(RAX ))
|
||||
VMULPS(YMM(1), YMM(15), MEM(RAX,32))
|
||||
VMOVUPS(MEM(R14 ), YMM(0))
|
||||
VMOVUPS(MEM(R14,32), YMM(1))
|
||||
|
||||
LEA(RAX, MEM(RAX,RCX,1))
|
||||
LEA(R14, MEM(R14, 16*4))
|
||||
|
||||
SUB(RDX, IMM(1))
|
||||
|
||||
JNZ(PACK16_N_TAIL)
|
||||
|
||||
JMP(PACK16_DONE)
|
||||
|
||||
LABEL(PACK16_T)
|
||||
|
||||
CMP(RCX, IMM(4))
|
||||
JNE(PACK16_G)
|
||||
|
||||
LEA(R8, MEM(RBX,RBX,2)) //inca*3
|
||||
LEA(R9, MEM(RBX,RBX,4)) //inca*5
|
||||
LEA(R10, MEM(R8 ,RBX,4)) //inca*7
|
||||
LEA(R11, MEM(RAX,RBX,8))
|
||||
|
||||
MOV(RDX, RSI)
|
||||
AND(RDX, IMM(7))
|
||||
SAR(RSI, IMM(3))
|
||||
JZ(PACK16_T_TAIL)
|
||||
|
||||
LABEL(PACK16_T_LOOP)
|
||||
|
||||
LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
|
||||
STORETRANS8x8(R14,0,16*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)
|
||||
|
||||
LOADMUL8x8(R11,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
|
||||
STORETRANS8x8(R14,32,16*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)
|
||||
|
||||
LEA(RAX, MEM(RAX, 8*4))
|
||||
LEA(R11, MEM(R11, 8*4))
|
||||
LEA(R14, MEM(R14,16*8*4))
|
||||
|
||||
SUB(RSI, IMM(1))
|
||||
|
||||
JNZ(PACK16_T_LOOP)
|
||||
|
||||
TEST(RDX, RDX)
|
||||
JZ(PACK16_DONE)
|
||||
|
||||
LABEL(PACK16_T_TAIL)
|
||||
|
||||
VMULSS(XMM(0), XMM(15), MEM(RAX ))
|
||||
VMULSS(XMM(1), XMM(15), MEM(RAX,RBX,1))
|
||||
VMULSS(XMM(2), XMM(15), MEM(RAX,RBX,2))
|
||||
VMULSS(XMM(3), XMM(15), MEM(RAX,R8 ,1))
|
||||
VMULSS(XMM(4), XMM(15), MEM(RAX,RBX,4))
|
||||
VMULSS(XMM(5), XMM(15), MEM(RAX,R9 ,1))
|
||||
VMULSS(XMM(6), XMM(15), MEM(RAX,R8 ,2))
|
||||
VMULSS(XMM(7), XMM(15), MEM(RAX,R10,1))
|
||||
VMOVSS(MEM(R14,0*4), XMM(0))
|
||||
VMOVSS(MEM(R14,1*4), XMM(1))
|
||||
VMOVSS(MEM(R14,2*4), XMM(2))
|
||||
VMOVSS(MEM(R14,3*4), XMM(3))
|
||||
VMOVSS(MEM(R14,4*4), XMM(4))
|
||||
VMOVSS(MEM(R14,5*4), XMM(5))
|
||||
VMOVSS(MEM(R14,6*4), XMM(6))
|
||||
VMOVSS(MEM(R14,7*4), XMM(7))
|
||||
|
||||
VMULSS(XMM(0), XMM(15), MEM(R11 ))
|
||||
VMULSS(XMM(1), XMM(15), MEM(R11,RBX,1))
|
||||
VMULSS(XMM(2), XMM(15), MEM(R11,RBX,2))
|
||||
VMULSS(XMM(3), XMM(15), MEM(R11,R8 ,1))
|
||||
VMULSS(XMM(4), XMM(15), MEM(R11,RBX,4))
|
||||
VMULSS(XMM(5), XMM(15), MEM(R11,R9 ,1))
|
||||
VMULSS(XMM(6), XMM(15), MEM(R11,R8 ,2))
|
||||
VMULSS(XMM(7), XMM(15), MEM(R11,R10,1))
|
||||
VMOVSS(MEM(R14, 8*4), XMM(0))
|
||||
VMOVSS(MEM(R14, 9*4), XMM(1))
|
||||
VMOVSS(MEM(R14,10*4), XMM(2))
|
||||
VMOVSS(MEM(R14,11*4), XMM(3))
|
||||
VMOVSS(MEM(R14,12*4), XMM(4))
|
||||
VMOVSS(MEM(R14,13*4), XMM(5))
|
||||
VMOVSS(MEM(R14,14*4), XMM(6))
|
||||
VMOVSS(MEM(R14,15*4), XMM(7))
|
||||
|
||||
LEA(RAX, MEM(RAX, 4))
|
||||
LEA(R11, MEM(R11, 4))
|
||||
LEA(R14, MEM(R14,16*4))
|
||||
|
||||
SUB(RDX, IMM(1))
|
||||
|
||||
JNZ(PACK16_T_TAIL)
|
||||
|
||||
JMP(PACK16_DONE)
|
||||
|
||||
LABEL(PACK16_G)
|
||||
|
||||
VPBROADCASTD(ZMM(3), VAR(inca))
|
||||
MOV(RBX, VAR(offsetPtr))
|
||||
VPMULLD(ZMM(0), ZMM(3), MEM(RBX))
|
||||
|
||||
LABEL(PACK16_G_LOOP)
|
||||
|
||||
KXNORW(K(1), K(0), K(0))
|
||||
VGATHERDPS(ZMM(3) MASK_K(1), MEM(RAX,ZMM(0),8))
|
||||
VMULPS(ZMM(3), ZMM(3), ZMM(15))
|
||||
VMOVUPS(MEM(R14), ZMM(3))
|
||||
|
||||
LEA(RAX, MEM(RAX,RCX,1))
|
||||
LEA(R14, MEM(R14, 16*4))
|
||||
|
||||
SUB(RSI, IMM(1))
|
||||
|
||||
JNZ(PACK16_G_LOOP)
|
||||
|
||||
LABEL(PACK16_DONE)
|
||||
|
||||
: //output operands
|
||||
: //input operands
|
||||
[n] "m" (n),
|
||||
[kappa] "m" (*kappa),
|
||||
[a] "m" (a),
|
||||
[inca] "m" (inca),
|
||||
[lda] "m" (lda),
|
||||
[p] "m" (p),
|
||||
[ldp] "m" (ldp),
|
||||
[offsetPtr] "m" (offsetPtr)
|
||||
: //clobbers
|
||||
"zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
|
||||
"zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11",
|
||||
"zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17",
|
||||
"zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23",
|
||||
"zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
|
||||
"zmm30", "zmm31",
|
||||
"rax", "rbx", "rcx", "rdx", "rdi", "rsi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"
|
||||
);
|
||||
}
|
||||
|
||||
void bli_spackm_knl_asm_24xk
|
||||
(
|
||||
conj_t conja,
|
||||
dim_t n_,
|
||||
void* restrict kappa_,
|
||||
void* restrict a_, inc_t inca_, inc_t lda_,
|
||||
void* restrict p_, inc_t ldp_,
|
||||
cntx_t* restrict ctnx
|
||||
)
|
||||
{
|
||||
(void)conja;
|
||||
|
||||
const int32_t * offsetPtr = &offsets[0];
|
||||
float* a = (float*)a_;
|
||||
float* p = (float*)p_;
|
||||
float* kappa = (float*)kappa_;
|
||||
const int64_t n = n_;
|
||||
const int64_t inca = inca_;
|
||||
const int64_t lda = lda_;
|
||||
const int64_t ldp = ldp_;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
MOV(RSI, VAR(n))
|
||||
MOV(RAX, VAR(a))
|
||||
MOV(RBX, VAR(inca))
|
||||
MOV(RCX, VAR(lda))
|
||||
MOV(R14, VAR(p))
|
||||
MOV(RDI, VAR(ldp))
|
||||
|
||||
TEST(RSI, RSI)
|
||||
JZ(PACK24_DONE)
|
||||
|
||||
LEA(RBX, MEM(,RBX,4)) //inca in bytes
|
||||
LEA(RCX, MEM(,RCX,4)) //lda in bytes
|
||||
LEA(RDI, MEM(,RDI,4)) //ldp in bytes
|
||||
|
||||
VBROADCASTSS(ZMM(15), VAR(kappa))
|
||||
|
||||
CMP(RBX, IMM(4))
|
||||
JNE(PACK24_T)
|
||||
|
||||
LABEL(PACK24_N)
|
||||
|
||||
MOV(RDX, RSI)
|
||||
AND(RDX, IMM(7))
|
||||
SAR(RSI, IMM(3))
|
||||
JZ(PACK24_N_TAIL)
|
||||
|
||||
LEA(R8, MEM(RCX,RCX,2)) //lda*3
|
||||
LEA(R9, MEM(RCX,RCX,4)) //lda*5
|
||||
LEA(R10, MEM(R8 ,RCX,4)) //lda*7
|
||||
|
||||
LABEL(PACK24_N_LOOP)
|
||||
|
||||
LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
|
||||
STORE8x8(R14,0,24*4,0,1,2,3,4,5,6,7)
|
||||
|
||||
LOADMUL8x8(RAX,32,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
|
||||
STORE8x8(R14,32,24*4,0,1,2,3,4,5,6,7)
|
||||
|
||||
LOADMUL8x8(RAX,64,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
|
||||
STORE8x8(R14,64,24*4,0,1,2,3,4,5,6,7)
|
||||
|
||||
LEA(RAX, MEM(RAX,RCX,8))
|
||||
LEA(R14, MEM(R14,RDI,8))
|
||||
|
||||
SUB(RSI, IMM(1))
|
||||
|
||||
JNZ(PACK24_N_LOOP)
|
||||
|
||||
TEST(RDX, RDX)
|
||||
JZ(PACK24_DONE)
|
||||
|
||||
LABEL(PACK24_N_TAIL)
|
||||
|
||||
VMULPS(ZMM(0), ZMM(15), MEM(RAX))
|
||||
VMOVUPS(MEM(R14), ZMM(0))
|
||||
|
||||
VMULPS(YMM(1), YMM(15), MEM(RAX,64))
|
||||
VMOVUPS(MEM(R14,64), YMM(1))
|
||||
|
||||
LEA(RAX, MEM(RAX,RCX,1))
|
||||
LEA(R14, MEM(R14,RDI,1))
|
||||
|
||||
SUB(RDX, IMM(1))
|
||||
|
||||
JNZ(PACK24_N_TAIL)
|
||||
|
||||
JMP(PACK24_DONE)
|
||||
|
||||
LABEL(PACK24_T)
|
||||
|
||||
CMP(RCX, IMM(4))
|
||||
JNE(PACK24_G)
|
||||
|
||||
LEA(R8, MEM(RBX,RBX,2)) //inca*3
|
||||
LEA(R9, MEM(RBX,RBX,4)) //inca*5
|
||||
LEA(R10, MEM(R8 ,RBX,4)) //inca*7
|
||||
LEA(R11, MEM(RAX,RBX,8))
|
||||
LEA(R12, MEM(R11,RBX,8))
|
||||
|
||||
MOV(RDX, RSI)
|
||||
AND(RDX, IMM(7))
|
||||
SAR(RSI, IMM(3))
|
||||
JZ(PACK24_T_TAIL)
|
||||
|
||||
LABEL(PACK24_T_LOOP)
|
||||
|
||||
LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
|
||||
STORETRANS8x8(R14,0,24*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)
|
||||
|
||||
LOADMUL8x8(R11,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
|
||||
STORETRANS8x8(R14,32,24*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)
|
||||
|
||||
LOADMUL8x8(R12,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
|
||||
STORETRANS8x8(R14,64,24*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)
|
||||
|
||||
LEA(RAX, MEM(RAX,RCX,8))
|
||||
LEA(R11, MEM(R11,RCX,8))
|
||||
LEA(R12, MEM(R12,RCX,8))
|
||||
LEA(R14, MEM(R14,RDI,8))
|
||||
|
||||
SUB(RSI, IMM(1))
|
||||
|
||||
JNZ(PACK24_T_LOOP)
|
||||
|
||||
TEST(RDX, RDX)
|
||||
JZ(PACK24_DONE)
|
||||
|
||||
LABEL(PACK24_T_TAIL)
|
||||
|
||||
VMULSS(XMM(0), XMM(15), MEM(RAX))
|
||||
VMULSS(XMM(1), XMM(15), MEM(RAX,RBX,1))
|
||||
VMULSS(XMM(2), XMM(15), MEM(RAX,RBX,2))
|
||||
VMULSS(XMM(3), XMM(15), MEM(RAX,R8,1))
|
||||
VMULSS(XMM(4), XMM(15), MEM(RAX,RBX,4))
|
||||
VMULSS(XMM(5), XMM(15), MEM(RAX,R9,1))
|
||||
VMULSS(XMM(6), XMM(15), MEM(RAX,R8,2))
|
||||
VMULSS(XMM(7), XMM(15), MEM(RAX,R10,1))
|
||||
VMOVSS(MEM(R14,0*4), XMM(0))
|
||||
VMOVSS(MEM(R14,1*4), XMM(1))
|
||||
VMOVSS(MEM(R14,2*4), XMM(2))
|
||||
VMOVSS(MEM(R14,3*4), XMM(3))
|
||||
VMOVSS(MEM(R14,4*4), XMM(4))
|
||||
VMOVSS(MEM(R14,5*4), XMM(5))
|
||||
VMOVSS(MEM(R14,6*4), XMM(6))
|
||||
VMOVSS(MEM(R14,7*4), XMM(7))
|
||||
|
||||
VMULSS(XMM(0), XMM(15), MEM(R11))
|
||||
VMULSS(XMM(1), XMM(15), MEM(R11,RBX,1))
|
||||
VMULSS(XMM(2), XMM(15), MEM(R11,RBX,2))
|
||||
VMULSS(XMM(3), XMM(15), MEM(R11,R8,1))
|
||||
VMULSS(XMM(4), XMM(15), MEM(R11,RBX,4))
|
||||
VMULSS(XMM(5), XMM(15), MEM(R11,R9,1))
|
||||
VMULSS(XMM(6), XMM(15), MEM(R11,R8,2))
|
||||
VMULSS(XMM(7), XMM(15), MEM(R11,R10,1))
|
||||
VMOVSS(MEM(R14, 8*4), XMM(0))
|
||||
VMOVSS(MEM(R14, 9*4), XMM(1))
|
||||
VMOVSS(MEM(R14,10*4), XMM(2))
|
||||
VMOVSS(MEM(R14,11*4), XMM(3))
|
||||
VMOVSS(MEM(R14,12*4), XMM(4))
|
||||
VMOVSS(MEM(R14,13*4), XMM(5))
|
||||
VMOVSS(MEM(R14,14*4), XMM(6))
|
||||
VMOVSS(MEM(R14,15*4), XMM(7))
|
||||
|
||||
VMULSS(XMM(0), XMM(15), MEM(R12))
|
||||
VMULSS(XMM(1), XMM(15), MEM(R12,RBX,1))
|
||||
VMULSS(XMM(2), XMM(15), MEM(R12,RBX,2))
|
||||
VMULSS(XMM(3), XMM(15), MEM(R12,R8,1))
|
||||
VMULSS(XMM(4), XMM(15), MEM(R12,RBX,4))
|
||||
VMULSS(XMM(5), XMM(15), MEM(R12,R9,1))
|
||||
VMULSS(XMM(6), XMM(15), MEM(R12,R8,2))
|
||||
VMULSS(XMM(7), XMM(15), MEM(R12,R10,1))
|
||||
VMOVSS(MEM(R14,16*4), XMM(0))
|
||||
VMOVSS(MEM(R14,17*4), XMM(1))
|
||||
VMOVSS(MEM(R14,18*4), XMM(2))
|
||||
VMOVSS(MEM(R14,19*4), XMM(3))
|
||||
VMOVSS(MEM(R14,20*4), XMM(4))
|
||||
VMOVSS(MEM(R14,21*4), XMM(5))
|
||||
VMOVSS(MEM(R14,22*4), XMM(6))
|
||||
VMOVSS(MEM(R14,23*4), XMM(7))
|
||||
|
||||
LEA(RAX, MEM(RAX,RCX,1))
|
||||
LEA(R11, MEM(R11,RCX,1))
|
||||
LEA(R12, MEM(R12,RCX,1))
|
||||
LEA(R14, MEM(R14,RDI,1))
|
||||
|
||||
SUB(RDX, IMM(1))
|
||||
|
||||
JNZ(PACK24_T_TAIL)
|
||||
|
||||
JMP(PACK24_DONE)
|
||||
|
||||
LABEL(PACK24_G)
|
||||
|
||||
VPBROADCASTD(ZMM(3), VAR(inca))
|
||||
MOV(RBX, VAR(offsetPtr))
|
||||
VPMULLD(ZMM(0), ZMM(3), MEM(RBX))
|
||||
|
||||
LEA(R11, MEM(RAX,RBX,8))
|
||||
LEA(R11, MEM(R11,RBX,8))
|
||||
|
||||
LABEL(PACK24_G_LOOP)
|
||||
|
||||
KXNORW(K(1), K(0), K(0))
|
||||
KSHIFTRW(K(2), K(1), IMM(8))
|
||||
VGATHERDPS(ZMM(3) MASK_K(1), MEM(RAX,ZMM(0),8))
|
||||
VGATHERDPS(ZMM(4) MASK_K(2), MEM(R11,ZMM(0),8))
|
||||
VMULPS(ZMM(3), ZMM(3), ZMM(15))
|
||||
VMULPS(YMM(4), YMM(4), YMM(15))
|
||||
VMOVUPS(MEM(R14), ZMM(3))
|
||||
VMOVUPS(MEM(R14,64), YMM(4))
|
||||
|
||||
LEA(RAX, MEM(RAX,RCX,1))
|
||||
LEA(R14, MEM(R14,RDI,1))
|
||||
|
||||
SUB(RSI, IMM(1))
|
||||
|
||||
JNZ(PACK24_G_LOOP)
|
||||
|
||||
LABEL(PACK24_DONE)
|
||||
|
||||
: //output operands
|
||||
: //input operands
|
||||
[n] "m" (n),
|
||||
[kappa] "m" (*kappa),
|
||||
[a] "m" (a),
|
||||
[inca] "m" (inca),
|
||||
[lda] "m" (lda),
|
||||
[p] "m" (p),
|
||||
[ldp] "m" (ldp),
|
||||
[offsetPtr] "m" (offsetPtr)
|
||||
: //clobbers
|
||||
"zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
|
||||
"zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11",
|
||||
"zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17",
|
||||
"zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23",
|
||||
"zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
|
||||
"zmm30", "zmm31",
|
||||
"rax", "rbx", "rcx", "rdx", "rdi", "rsi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"
|
||||
);
|
||||
}
|
||||
@@ -32,7 +32,7 @@
|
||||
|
||||
*/
|
||||
|
||||
#include "../3/bli_avx512_macros.h"
|
||||
#include "bli_avx512_macros.h"
|
||||
#include "blis.h"
|
||||
|
||||
#define LOADMUL8x8(a,o,s1,s3,s5,s7, \
|
||||
@@ -176,7 +176,9 @@
|
||||
PREFETCH_B_L2(n)
|
||||
|
||||
//This is an array used for the scatter/gather instructions.
|
||||
extern int32_t offsets[24];
|
||||
static int32_t offsets[32] __attribute__((aligned(64))) =
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
|
||||
16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
|
||||
|
||||
//#define MONITORS
|
||||
//#define LOOPMON
|
||||
@@ -224,7 +226,7 @@ void bli_dgemm_knl_asm_24x8
|
||||
VMOVAPS(ZMM(11), ZMM(8)) MOV(RAX, VAR(a)) //load address of a
|
||||
VMOVAPS(ZMM(12), ZMM(8)) MOV(RBX, VAR(b)) //load address of b
|
||||
VMOVAPS(ZMM(13), ZMM(8)) MOV(RCX, VAR(c)) //load address of c
|
||||
VMOVAPS(ZMM(14), ZMM(8))
|
||||
VMOVAPS(ZMM(14), ZMM(8)) VMOVAPD(ZMM(0), MEM(RBX)) //pre-load b
|
||||
VMOVAPS(ZMM(15), ZMM(8)) MOV(RDI, VAR(offsetPtr))
|
||||
VMOVAPS(ZMM(16), ZMM(8)) VMOVAPS(ZMM(4), MEM(RDI))
|
||||
#if SCATTER_PREFETCH_C
|
||||
@@ -259,11 +261,6 @@ void bli_dgemm_knl_asm_24x8
|
||||
MOV(VAR(midh), EDX)
|
||||
#endif
|
||||
|
||||
TEST(RSI, RSI)
|
||||
JZ(POSTACCUM)
|
||||
|
||||
VMOVAPD(ZMM(0), MEM(RBX)) //pre-load b
|
||||
|
||||
SUB(RSI, IMM(32))
|
||||
JLE(TAIL)
|
||||
|
||||
@@ -535,6 +532,7 @@ void bli_dgemm_knl_asm_24x8
|
||||
|
||||
MOV(RDX, RCX)
|
||||
ADD(RSI, IMM(32))
|
||||
JZ(POSTACCUM)
|
||||
|
||||
LABEL(TAIL_LOOP)
|
||||
|
||||
|
||||
708
kernels/knl/3/bli_sgemm_knl_asm_24x16.c
Normal file
708
kernels/knl/3/bli_sgemm_knl_asm_24x16.c
Normal file
@@ -0,0 +1,708 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
|
||||
OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include <assert.h>
|
||||
|
||||
#include "bli_avx512_macros.h"
|
||||
|
||||
#define UNROLL_K 32
|
||||
|
||||
#define SCATTER_PREFETCH_C 1
|
||||
|
||||
#define PREFETCH_A_L2 0
|
||||
#define PREFETCH_B_L2 0
|
||||
#define L2_PREFETCH_DIST 64
|
||||
|
||||
#define A_L1_PREFETCH_DIST 36
|
||||
#define B_L1_PREFETCH_DIST 18
|
||||
|
||||
#define LOOP_ALIGN ALIGN16
|
||||
|
||||
#define UPDATE_C_FOUR_ROWS(R1,R2,R3,R4) \
|
||||
\
|
||||
VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \
|
||||
VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \
|
||||
VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \
|
||||
VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \
|
||||
VFMADD231PS(ZMM(R1), ZMM(1), MEM(RCX )) \
|
||||
VFMADD231PS(ZMM(R2), ZMM(1), MEM(RCX,RAX,1)) \
|
||||
VFMADD231PS(ZMM(R3), ZMM(1), MEM(RCX,RAX,2)) \
|
||||
VFMADD231PS(ZMM(R4), ZMM(1), MEM(RCX,RDI,1)) \
|
||||
VMOVUPS(MEM(RCX ), ZMM(R1)) \
|
||||
VMOVUPS(MEM(RCX,RAX,1), ZMM(R2)) \
|
||||
VMOVUPS(MEM(RCX,RAX,2), ZMM(R3)) \
|
||||
VMOVUPS(MEM(RCX,RDI,1), ZMM(R4)) \
|
||||
LEA(RCX, MEM(RCX,RAX,4))
|
||||
|
||||
#define UPDATE_C_BZ_FOUR_ROWS(R1,R2,R3,R4) \
|
||||
\
|
||||
VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \
|
||||
VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \
|
||||
VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \
|
||||
VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \
|
||||
VMOVUPS(MEM(RCX ), ZMM(R1)) \
|
||||
VMOVUPS(MEM(RCX,RAX,1), ZMM(R2)) \
|
||||
VMOVUPS(MEM(RCX,RAX,2), ZMM(R3)) \
|
||||
VMOVUPS(MEM(RCX,RDI,1), ZMM(R4)) \
|
||||
LEA(RCX, MEM(RCX,RAX,4))
|
||||
|
||||
#define UPDATE_C_ROW_SCATTERED(NUM) \
|
||||
\
|
||||
KXNORW(K(1), K(0), K(0)) \
|
||||
KXNORW(K(2), K(0), K(0)) \
|
||||
VMULPS(ZMM(NUM), ZMM(NUM), ZMM(0)) \
|
||||
VGATHERDPS(ZMM(3) MASK_K(1), MEM(RCX,ZMM(2),4)) \
|
||||
VFMADD231PS(ZMM(NUM), ZMM(3), ZMM(1)) \
|
||||
VSCATTERDPS(MEM(RCX,ZMM(2),4) MASK_K(2), ZMM(NUM)) \
|
||||
ADD(RCX, RAX)
|
||||
|
||||
#define UPDATE_C_BZ_ROW_SCATTERED(NUM) \
|
||||
\
|
||||
KXNORW(K(1), K(0), K(0)) \
|
||||
VMULPS(ZMM(NUM), ZMM(NUM), ZMM(0)) \
|
||||
VSCATTERDPS(MEM(RCX,ZMM(2),4) MASK_K(1), ZMM(NUM)) \
|
||||
ADD(RCX, RAX)
|
||||
|
||||
#define PREFETCH_A_L1_1(n) PREFETCH(0, MEM(RAX,(A_L1_PREFETCH_DIST+n)*24*4))
|
||||
#define PREFETCH_A_L1_2(n) PREFETCH(0, MEM(RAX,(A_L1_PREFETCH_DIST+n)*24*4+64))
|
||||
|
||||
#if PREFETCH_A_L2
|
||||
#undef PREFETCH_A_L2
|
||||
|
||||
#define PREFETCH_A_L2(n) \
|
||||
\
|
||||
PREFETCH(1, MEM(RAX,(L2_PREFETCH_DIST+n)*24*4)) \
|
||||
PREFETCH(1, MEM(RAX,(L2_PREFETCH_DIST+n)*24*4+64))
|
||||
|
||||
#else
|
||||
#undef PREFETCH_A_L2
|
||||
#define PREFETCH_A_L2(...)
|
||||
#endif
|
||||
|
||||
#define PREFETCH_B_L1(n) PREFETCH(0, MEM(RBX,(B_L1_PREFETCH_DIST+n)*16*4))
|
||||
|
||||
#if PREFETCH_B_L2
|
||||
#undef PREFETCH_B_L2
|
||||
|
||||
#define PREFETCH_B_L2(n) PREFETCH(1, MEM(RBX,(L2_PREFETCH_DIST+n)*16*4))
|
||||
|
||||
#else
|
||||
#undef PREFETCH_B_L2
|
||||
#define PREFETCH_B_L2(...)
|
||||
#endif
|
||||
|
||||
#define PREFETCH_C_L1_1
|
||||
#define PREFETCH_C_L1_2
|
||||
#define PREFETCH_C_L1_3
|
||||
|
||||
//
|
||||
// n: index in unrolled loop
|
||||
//
|
||||
// a: ZMM register to load the next 16 floats of B (at RBX + (n+1)*64) into
|
||||
// b: ZMM register holding the current 16 floats of B, read by every FMA
|
||||
//
|
||||
// ...: addressing for A, except for offset
|
||||
//
|
||||
#define SUBITER(n,a,b,...) \
|
||||
\
|
||||
PREFETCH_A_L2(n) \
|
||||
\
|
||||
VMOVAPS(ZMM(a), MEM(RBX,(n+1)*64)) \
|
||||
VFMADD231PS(ZMM( 8), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 0)*4)) \
|
||||
VFMADD231PS(ZMM( 9), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 1)*4)) \
|
||||
VFMADD231PS(ZMM(10), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 2)*4)) \
|
||||
PREFETCH_A_L1_1(n) \
|
||||
VFMADD231PS(ZMM(11), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 3)*4)) \
|
||||
VFMADD231PS(ZMM(12), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 4)*4)) \
|
||||
VFMADD231PS(ZMM(13), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 5)*4)) \
|
||||
PREFETCH_C_L1_1 \
|
||||
VFMADD231PS(ZMM(14), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 6)*4)) \
|
||||
VFMADD231PS(ZMM(15), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 7)*4)) \
|
||||
VFMADD231PS(ZMM(16), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 8)*4)) \
|
||||
PREFETCH_A_L1_2(n) \
|
||||
VFMADD231PS(ZMM(17), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 9)*4)) \
|
||||
VFMADD231PS(ZMM(18), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+10)*4)) \
|
||||
VFMADD231PS(ZMM(19), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+11)*4)) \
|
||||
PREFETCH_C_L1_2 \
|
||||
VFMADD231PS(ZMM(20), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+12)*4)) \
|
||||
VFMADD231PS(ZMM(21), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+13)*4)) \
|
||||
VFMADD231PS(ZMM(22), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+14)*4)) \
|
||||
PREFETCH_C_L1_3 \
|
||||
VFMADD231PS(ZMM(23), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+15)*4)) \
|
||||
VFMADD231PS(ZMM(24), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+16)*4)) \
|
||||
VFMADD231PS(ZMM(25), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+17)*4)) \
|
||||
PREFETCH_B_L1(n) \
|
||||
VFMADD231PS(ZMM(26), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+18)*4)) \
|
||||
VFMADD231PS(ZMM(27), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+19)*4)) \
|
||||
VFMADD231PS(ZMM(28), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+20)*4)) \
|
||||
PREFETCH_B_L2(n) \
|
||||
VFMADD231PS(ZMM(29), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+21)*4)) \
|
||||
VFMADD231PS(ZMM(30), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+22)*4)) \
|
||||
VFMADD231PS(ZMM(31), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+23)*4))
|
||||
|
||||
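// Index vector 0..31; the kernel multiplies it by a stride of C to build the
// per-lane offsets used by the gather/scatter (and scatter-prefetch) instructions.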
|
||||
static int32_t offsets[32] __attribute__((aligned(64))) =
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
|
||||
16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
|
||||
|
||||
//#define MONITORS
|
||||
//#define LOOPMON
|
||||
void bli_sgemm_knl_asm_24x16
|
||||
(
|
||||
dim_t k_,
|
||||
double* restrict alpha,
|
||||
double* restrict a,
|
||||
double* restrict b,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c_, inc_t cs_c_,
|
||||
auxinfo_t* data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
(void)data;
|
||||
(void)cntx;
|
||||
|
||||
const double * a_next = bli_auxinfo_next_a( data );
|
||||
const double * b_next = bli_auxinfo_next_b( data );
|
||||
|
||||
const int32_t * offsetPtr = &offsets[0];
|
||||
const int64_t k = k_;
|
||||
const int64_t rs_c = rs_c_;
|
||||
const int64_t cs_c = cs_c_;
|
||||
|
||||
#ifdef MONITORS
|
||||
int toph, topl, both, botl, midl, midh, mid2l, mid2h;
|
||||
#endif
|
||||
#ifdef LOOPMON
|
||||
int tlooph, tloopl, blooph, bloopl;
|
||||
#endif
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
#ifdef MONITORS
|
||||
RDTSC
|
||||
MOV(VAR(topl), EAX)
|
||||
MOV(VAR(toph), EDX)
|
||||
#endif
|
||||
|
||||
VPXORD(ZMM(8), ZMM(8), ZMM(8)) //clear out registers
|
||||
VMOVAPS(ZMM( 9), ZMM(8)) MOV(R12, VAR(rs_c))
|
||||
VMOVAPS(ZMM(10), ZMM(8)) MOV(RSI, VAR(k)) //loop index
|
||||
VMOVAPS(ZMM(11), ZMM(8)) MOV(RAX, VAR(a)) //load address of a
|
||||
VMOVAPS(ZMM(12), ZMM(8)) MOV(RBX, VAR(b)) //load address of b
|
||||
VMOVAPS(ZMM(13), ZMM(8)) MOV(RCX, VAR(c)) //load address of c
|
||||
VMOVAPS(ZMM(14), ZMM(8)) VMOVAPD(ZMM(0), MEM(RBX)) //pre-load b
|
||||
VMOVAPS(ZMM(15), ZMM(8)) MOV(RDI, VAR(offsetPtr))
|
||||
VMOVAPS(ZMM(16), ZMM(8)) VMOVAPS(ZMM(4), MEM(RDI))
|
||||
#if SCATTER_PREFETCH_C
|
||||
VMOVAPS(ZMM(17), ZMM(8))
|
||||
VMOVAPS(ZMM(18), ZMM(8))
|
||||
VMOVAPS(ZMM(19), ZMM(8)) VBROADCASTSS(ZMM(5), VAR(rs_c))
|
||||
VMOVAPS(ZMM(20), ZMM(8))
|
||||
VMOVAPS(ZMM(21), ZMM(8)) VPMULLD(ZMM(2), ZMM(4), ZMM(5))
|
||||
VMOVAPS(ZMM(22), ZMM(8)) VMOVAPS(YMM(3), MEM(RDI,64))
|
||||
VMOVAPS(ZMM(23), ZMM(8)) VPMULLD(YMM(3), YMM(3), YMM(5))
|
||||
#else
|
||||
VMOVAPS(ZMM(17), ZMM(8))
|
||||
VMOVAPS(ZMM(18), ZMM(8)) LEA(R13, MEM(R12,R12,2))
|
||||
VMOVAPS(ZMM(19), ZMM(8)) LEA(R14, MEM(R12,R12,4))
|
||||
VMOVAPS(ZMM(20), ZMM(8)) LEA(R15, MEM(R13,R12,4))
|
||||
VMOVAPS(ZMM(21), ZMM(8))
|
||||
VMOVAPS(ZMM(22), ZMM(8))
|
||||
VMOVAPS(ZMM(23), ZMM(8))
|
||||
#endif
|
||||
VMOVAPS(ZMM(24), ZMM(8)) VPSLLD(ZMM(4), ZMM(4), IMM(2))
|
||||
VMOVAPS(ZMM(25), ZMM(8)) MOV(R8, IMM(4*24*4)) //offset for 4 iterations
|
||||
VMOVAPS(ZMM(26), ZMM(8)) LEA(R9, MEM(R8,R8,2)) //*3
|
||||
VMOVAPS(ZMM(27), ZMM(8)) LEA(R10, MEM(R8,R8,4)) //*5
|
||||
VMOVAPS(ZMM(28), ZMM(8)) LEA(R11, MEM(R9,R8,4)) //*7
|
||||
VMOVAPS(ZMM(29), ZMM(8))
|
||||
VMOVAPS(ZMM(30), ZMM(8))
|
||||
VMOVAPS(ZMM(31), ZMM(8))
|
||||
|
||||
#ifdef MONITORS
|
||||
RDTSC
|
||||
MOV(VAR(midl), EAX)
|
||||
MOV(VAR(midh), EDX)
|
||||
#endif
|
||||
|
||||
SUB(RSI, IMM(32))
|
||||
JLE(TAIL)
|
||||
|
||||
//prefetch C into L2
|
||||
#if SCATTER_PREFETCH_C
|
||||
ADD(RSI, IMM(24))
|
||||
KXNORW(K(1), K(0), K(0))
|
||||
KXNORW(K(2), K(0), K(0))
|
||||
VSCATTERPFDPS(1, MEM(RCX,ZMM(2),8) MASK_K(1))
|
||||
VSCATTERPFDPD(1, MEM(RCX,YMM(3),8) MASK_K(2))
|
||||
#else
|
||||
PREFETCHW1(MEM(RCX ))
|
||||
SUBITER( 0,1,0,RAX )
|
||||
PREFETCHW1(MEM(RCX,R12,1))
|
||||
SUBITER( 1,0,1,RAX )
|
||||
PREFETCHW1(MEM(RCX,R12,2))
|
||||
SUBITER( 2,1,0,RAX )
|
||||
PREFETCHW1(MEM(RCX,R13,1))
|
||||
SUBITER( 3,0,1,RAX )
|
||||
PREFETCHW1(MEM(RCX,R12,4))
|
||||
SUBITER( 4,1,0,RAX,R8, 1)
|
||||
PREFETCHW1(MEM(RCX,R14,1))
|
||||
SUBITER( 5,0,1,RAX,R8, 1)
|
||||
PREFETCHW1(MEM(RCX,R13,2))
|
||||
SUBITER( 6,1,0,RAX,R8, 1)
|
||||
PREFETCHW1(MEM(RCX,R15,1))
|
||||
SUBITER( 7,0,1,RAX,R8, 1)
|
||||
|
||||
LEA(RDX, MEM(RCX,R12,8))
|
||||
|
||||
PREFETCHW1(MEM(RDX ))
|
||||
SUBITER( 8,1,0,RAX,R8, 2)
|
||||
PREFETCHW1(MEM(RDX,R12,1))
|
||||
SUBITER( 9,0,1,RAX,R8, 2)
|
||||
PREFETCHW1(MEM(RDX,R12,2))
|
||||
SUBITER(10,1,0,RAX,R8, 2)
|
||||
PREFETCHW1(MEM(RDX,R13,1))
|
||||
SUBITER(11,0,1,RAX,R8, 2)
|
||||
PREFETCHW1(MEM(RDX,R12,4))
|
||||
SUBITER(12,1,0,RAX,R9, 1)
|
||||
PREFETCHW1(MEM(RDX,R14,1))
|
||||
SUBITER(13,0,1,RAX,R9, 1)
|
||||
PREFETCHW1(MEM(RDX,R13,2))
|
||||
SUBITER(14,1,0,RAX,R9, 1)
|
||||
PREFETCHW1(MEM(RDX,R15,1))
|
||||
SUBITER(15,0,1,RAX,R9, 1)
|
||||
|
||||
LEA(RDI, MEM(RDX,R12,8))
|
||||
|
||||
PREFETCHW1(MEM(RDI ))
|
||||
SUBITER(16,1,0,RAX,R8, 4)
|
||||
PREFETCHW1(MEM(RDI,R12,1))
|
||||
SUBITER(17,0,1,RAX,R8, 4)
|
||||
PREFETCHW1(MEM(RDI,R12,2))
|
||||
SUBITER(18,1,0,RAX,R8, 4)
|
||||
PREFETCHW1(MEM(RDI,R13,1))
|
||||
SUBITER(19,0,1,RAX,R8, 4)
|
||||
PREFETCHW1(MEM(RDI,R12,4))
|
||||
SUBITER(20,1,0,RAX,R10,1)
|
||||
PREFETCHW1(MEM(RDI,R14,1))
|
||||
SUBITER(21,0,1,RAX,R10,1)
|
||||
PREFETCHW1(MEM(RDI,R13,2))
|
||||
SUBITER(22,1,0,RAX,R10,1)
|
||||
PREFETCHW1(MEM(RDI,R15,1))
|
||||
SUBITER(23,0,1,RAX,R10,1)
|
||||
|
||||
ADD(RAX, IMM(24*24*4))
|
||||
ADD(RBX, IMM(24*16*4))
|
||||
#endif
|
||||
|
||||
MOV(RDI, RSI)
|
||||
AND(RDI, IMM(31))
|
||||
SAR(RSI, IMM(5))
|
||||
JZ(REM_1)
|
||||
|
||||
LOOP_ALIGN
|
||||
LABEL(MAIN_LOOP)
|
||||
|
||||
SUBITER( 0,1,0,RAX )
|
||||
SUBITER( 1,0,1,RAX )
|
||||
SUBITER( 2,1,0,RAX )
|
||||
SUBITER( 3,0,1,RAX )
|
||||
SUBITER( 4,1,0,RAX,R8, 1)
|
||||
SUBITER( 5,0,1,RAX,R8, 1)
|
||||
SUBITER( 6,1,0,RAX,R8, 1)
|
||||
SUBITER( 7,0,1,RAX,R8, 1)
|
||||
SUBITER( 8,1,0,RAX,R8, 2)
|
||||
SUBITER( 9,0,1,RAX,R8, 2)
|
||||
SUBITER(10,1,0,RAX,R8, 2)
|
||||
SUBITER(11,0,1,RAX,R8, 2)
|
||||
SUBITER(12,1,0,RAX,R9, 1)
|
||||
SUBITER(13,0,1,RAX,R9, 1)
|
||||
SUBITER(14,1,0,RAX,R9, 1)
|
||||
SUBITER(15,0,1,RAX,R9, 1)
|
||||
SUBITER(16,1,0,RAX,R8, 4)
|
||||
SUBITER(17,0,1,RAX,R8, 4)
|
||||
SUBITER(18,1,0,RAX,R8, 4)
|
||||
SUBITER(19,0,1,RAX,R8, 4)
|
||||
SUBITER(20,1,0,RAX,R10,1)
|
||||
SUBITER(21,0,1,RAX,R10,1)
|
||||
SUBITER(22,1,0,RAX,R10,1)
|
||||
SUBITER(23,0,1,RAX,R10,1)
|
||||
SUBITER(24,1,0,RAX,R9, 2)
|
||||
SUBITER(25,0,1,RAX,R9, 2)
|
||||
SUBITER(26,1,0,RAX,R9, 2)
|
||||
SUBITER(27,0,1,RAX,R9, 2)
|
||||
SUBITER(28,1,0,RAX,R11,1)
|
||||
SUBITER(29,0,1,RAX,R11,1)
|
||||
SUBITER(30,1,0,RAX,R11,1)
|
||||
SUBITER(31,0,1,RAX,R11,1)
|
||||
|
||||
ADD(RAX, IMM(32*24*4))
|
||||
ADD(RBX, IMM(32*16*4))
|
||||
|
||||
SUB(RSI, IMM(1))
|
||||
|
||||
JNZ(MAIN_LOOP)
|
||||
|
||||
LABEL(REM_1)
|
||||
SAR1(RDI)
|
||||
JNC(REM_2)
|
||||
|
||||
SUBITER(0,1,0,RAX)
|
||||
VMOVAPD(ZMM(0), ZMM(1))
|
||||
ADD(RAX, IMM(24*4))
|
||||
ADD(RBX, IMM(16*4))
|
||||
|
||||
LABEL(REM_2)
|
||||
SAR1(RDI)
|
||||
JNC(REM_4)
|
||||
|
||||
SUBITER(0,1,0,RAX)
|
||||
SUBITER(1,0,1,RAX)
|
||||
ADD(RAX, IMM(2*24*4))
|
||||
ADD(RBX, IMM(2*16*4))
|
||||
|
||||
LABEL(REM_4)
|
||||
SAR1(RDI)
|
||||
JNC(REM_8)
|
||||
|
||||
SUBITER(0,1,0,RAX)
|
||||
SUBITER(1,0,1,RAX)
|
||||
SUBITER(2,1,0,RAX)
|
||||
SUBITER(3,0,1,RAX)
|
||||
ADD(RAX, IMM(4*24*4))
|
||||
ADD(RBX, IMM(4*16*4))
|
||||
|
||||
LABEL(REM_8)
|
||||
SAR1(RDI)
|
||||
JNC(REM_16)
|
||||
|
||||
SUBITER(0,1,0,RAX )
|
||||
SUBITER(1,0,1,RAX )
|
||||
SUBITER(2,1,0,RAX )
|
||||
SUBITER(3,0,1,RAX )
|
||||
SUBITER(4,1,0,RAX,R8,1)
|
||||
SUBITER(5,0,1,RAX,R8,1)
|
||||
SUBITER(6,1,0,RAX,R8,1)
|
||||
SUBITER(7,0,1,RAX,R8,1)
|
||||
ADD(RAX, IMM(8*24*4))
|
||||
ADD(RBX, IMM(8*16*4))
|
||||
|
||||
LABEL(REM_16)
|
||||
SAR1(RDI)
|
||||
JNC(AFTER_LOOP)
|
||||
|
||||
SUBITER( 0,1,0,RAX )
|
||||
SUBITER( 1,0,1,RAX )
|
||||
SUBITER( 2,1,0,RAX )
|
||||
SUBITER( 3,0,1,RAX )
|
||||
SUBITER( 4,1,0,RAX,R8, 1)
|
||||
SUBITER( 5,0,1,RAX,R8, 1)
|
||||
SUBITER( 6,1,0,RAX,R8, 1)
|
||||
SUBITER( 7,0,1,RAX,R8, 1)
|
||||
SUBITER( 8,1,0,RAX,R8, 2)
|
||||
SUBITER( 9,0,1,RAX,R8, 2)
|
||||
SUBITER(10,1,0,RAX,R8, 2)
|
||||
SUBITER(11,0,1,RAX,R8, 2)
|
||||
SUBITER(12,1,0,RAX,R9, 1)
|
||||
SUBITER(13,0,1,RAX,R9, 1)
|
||||
SUBITER(14,1,0,RAX,R9, 1)
|
||||
SUBITER(15,0,1,RAX,R9, 1)
|
||||
ADD(RAX, IMM(16*24*4))
|
||||
ADD(RBX, IMM(16*16*4))
|
||||
|
||||
LABEL(AFTER_LOOP)
|
||||
|
||||
//prefetch C into L1
|
||||
#if SCATTER_PREFETCH_C
|
||||
KXNORW(K(1), K(0), K(0))
|
||||
KXNORW(K(2), K(0), K(0))
|
||||
VSCATTERPFDPS(0, MEM(RCX,ZMM(2),8) MASK_K(1))
|
||||
VSCATTERPFDPD(0, MEM(RCX,YMM(3),8) MASK_K(2))
|
||||
|
||||
SUBITER(0,1,0,RAX )
|
||||
SUBITER(1,0,1,RAX )
|
||||
SUBITER(2,1,0,RAX )
|
||||
SUBITER(3,0,1,RAX )
|
||||
SUBITER(4,1,0,RAX,R8,1)
|
||||
SUBITER(5,0,1,RAX,R8,1)
|
||||
SUBITER(6,1,0,RAX,R8,1)
|
||||
SUBITER(7,0,1,RAX,R8,1)
|
||||
#else
|
||||
|
||||
LEA(RDX, MEM(RCX,R12,8))
|
||||
LEA(RDI, MEM(RDX,R12,8))
|
||||
|
||||
#undef PREFETCH_C_L1_1
|
||||
#undef PREFETCH_C_L1_2
|
||||
#undef PREFETCH_C_L1_3
|
||||
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX ))
|
||||
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R12,1))
|
||||
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RCX,R12,2))
|
||||
SUBITER(0,1,0,RAX )
|
||||
#undef PREFETCH_C_L1_1
|
||||
#undef PREFETCH_C_L1_2
|
||||
#undef PREFETCH_C_L1_3
|
||||
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX,R13,1))
|
||||
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R12,4))
|
||||
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RCX,R14,1))
|
||||
SUBITER(1,0,1,RAX )
|
||||
#undef PREFETCH_C_L1_1
|
||||
#undef PREFETCH_C_L1_2
|
||||
#undef PREFETCH_C_L1_3
|
||||
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX,R13,2))
|
||||
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R15,1))
|
||||
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX ))
|
||||
SUBITER(2,1,0,RAX )
|
||||
#undef PREFETCH_C_L1_1
|
||||
#undef PREFETCH_C_L1_2
|
||||
#undef PREFETCH_C_L1_3
|
||||
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R12,1))
|
||||
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R12,2))
|
||||
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX,R13,1))
|
||||
SUBITER(3,0,1,RAX )
|
||||
#undef PREFETCH_C_L1_1
|
||||
#undef PREFETCH_C_L1_2
|
||||
#undef PREFETCH_C_L1_3
|
||||
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R12,4))
|
||||
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R14,1))
|
||||
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX,R13,2))
|
||||
SUBITER(4,1,0,RAX,R8,1)
|
||||
#undef PREFETCH_C_L1_1
|
||||
#undef PREFETCH_C_L1_2
|
||||
#undef PREFETCH_C_L1_3
|
||||
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R15,1))
|
||||
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI ))
|
||||
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R12,1))
|
||||
SUBITER(5,0,1,RAX,R8,1)
|
||||
#undef PREFETCH_C_L1_1
|
||||
#undef PREFETCH_C_L1_2
|
||||
#undef PREFETCH_C_L1_3
|
||||
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDI,R12,2))
|
||||
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI,R13,1))
|
||||
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R12,4))
|
||||
SUBITER(6,1,0,RAX,R8,1)
|
||||
#undef PREFETCH_C_L1_1
|
||||
#undef PREFETCH_C_L1_2
|
||||
#undef PREFETCH_C_L1_3
|
||||
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDI,R14,1))
|
||||
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI,R13,2))
|
||||
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R15,1))
|
||||
SUBITER(7,0,1,RAX,R8,1)
|
||||
#endif
|
||||
|
||||
JMP(POSTACCUM)
|
||||
|
||||
LABEL(TAIL)
|
||||
|
||||
MOV(RDX, RCX)
|
||||
ADD(RSI, IMM(32))
|
||||
JZ(POSTACCUM)
|
||||
|
||||
LABEL(TAIL_LOOP)
|
||||
|
||||
PREFETCHW0(MEM(RDX))
|
||||
ADD(RDX, R12)
|
||||
|
||||
SUBITER(0,1,0,RAX)
|
||||
VMOVAPD(ZMM(0), ZMM(1))
|
||||
ADD(RAX, IMM(24*4))
|
||||
ADD(RBX, IMM(16*4))
|
||||
|
||||
SUB(RSI, IMM(1))
|
||||
|
||||
JNZ(TAIL_LOOP)
|
||||
|
||||
LABEL(POSTACCUM)
|
||||
|
||||
#ifdef MONITORS
|
||||
RDTSC
|
||||
MOV(VAR(mid2l), EAX)
|
||||
MOV(VAR(mid2h), EDX)
|
||||
#endif
|
||||
|
||||
MOV(RAX, VAR(alpha))
|
||||
MOV(RBX, VAR(beta))
|
||||
VBROADCASTSS(ZMM(0), MEM(RAX))
|
||||
VBROADCASTSS(ZMM(1), MEM(RBX))
|
||||
|
||||
// Check if C is row-stored (unit column stride, cs_c == 1). If not, jump to the slow scattered update
|
||||
MOV(RAX, VAR(rs_c))
|
||||
LEA(RAX, MEM(,RAX,4))
|
||||
MOV(RBX, VAR(cs_c))
|
||||
LEA(RDI, MEM(RAX,RAX,2))
|
||||
CMP(RBX, IMM(1))
|
||||
JNE(SCATTEREDUPDATE)
|
||||
|
||||
VMOVD(EDX, XMM(1))
|
||||
SAL1(EDX) //shift out sign bit
|
||||
JZ(COLSTORBZ)
|
||||
|
||||
UPDATE_C_FOUR_ROWS( 8, 9,10,11)
|
||||
UPDATE_C_FOUR_ROWS(12,13,14,15)
|
||||
UPDATE_C_FOUR_ROWS(16,17,18,19)
|
||||
UPDATE_C_FOUR_ROWS(20,21,22,23)
|
||||
UPDATE_C_FOUR_ROWS(24,25,26,27)
|
||||
UPDATE_C_FOUR_ROWS(28,29,30,31)
|
||||
|
||||
JMP(END)
|
||||
|
||||
LABEL(COLSTORBZ)
|
||||
|
||||
UPDATE_C_BZ_FOUR_ROWS( 8, 9,10,11)
|
||||
UPDATE_C_BZ_FOUR_ROWS(12,13,14,15)
|
||||
UPDATE_C_BZ_FOUR_ROWS(16,17,18,19)
|
||||
UPDATE_C_BZ_FOUR_ROWS(20,21,22,23)
|
||||
UPDATE_C_BZ_FOUR_ROWS(24,25,26,27)
|
||||
UPDATE_C_BZ_FOUR_ROWS(28,29,30,31)
|
||||
|
||||
JMP(END)
|
||||
|
||||
LABEL(SCATTEREDUPDATE)
|
||||
|
||||
MOV(RDI, VAR(offsetPtr))
|
||||
VMOVAPS(ZMM(2), MEM(RDI))
|
||||
/* Note that this ignores the upper 32 bits in cs_c */
|
||||
VPBROADCASTD(ZMM(3), EBX)
|
||||
VPMULLD(ZMM(2), ZMM(3), ZMM(2))
|
||||
|
||||
VMOVD(EDX, XMM(1))
|
||||
SAL1(EDX) //shift out sign bit
|
||||
JZ(SCATTERBZ)
|
||||
|
||||
UPDATE_C_ROW_SCATTERED( 8)
|
||||
UPDATE_C_ROW_SCATTERED( 9)
|
||||
UPDATE_C_ROW_SCATTERED(10)
|
||||
UPDATE_C_ROW_SCATTERED(11)
|
||||
UPDATE_C_ROW_SCATTERED(12)
|
||||
UPDATE_C_ROW_SCATTERED(13)
|
||||
UPDATE_C_ROW_SCATTERED(14)
|
||||
UPDATE_C_ROW_SCATTERED(15)
|
||||
UPDATE_C_ROW_SCATTERED(16)
|
||||
UPDATE_C_ROW_SCATTERED(17)
|
||||
UPDATE_C_ROW_SCATTERED(18)
|
||||
UPDATE_C_ROW_SCATTERED(19)
|
||||
UPDATE_C_ROW_SCATTERED(20)
|
||||
UPDATE_C_ROW_SCATTERED(21)
|
||||
UPDATE_C_ROW_SCATTERED(22)
|
||||
UPDATE_C_ROW_SCATTERED(23)
|
||||
UPDATE_C_ROW_SCATTERED(24)
|
||||
UPDATE_C_ROW_SCATTERED(25)
|
||||
UPDATE_C_ROW_SCATTERED(26)
|
||||
UPDATE_C_ROW_SCATTERED(27)
|
||||
UPDATE_C_ROW_SCATTERED(28)
|
||||
UPDATE_C_ROW_SCATTERED(29)
|
||||
UPDATE_C_ROW_SCATTERED(30)
|
||||
UPDATE_C_ROW_SCATTERED(31)
|
||||
|
||||
JMP(END)
|
||||
|
||||
LABEL(SCATTERBZ)
|
||||
|
||||
UPDATE_C_BZ_ROW_SCATTERED( 8)
|
||||
UPDATE_C_BZ_ROW_SCATTERED( 9)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(10)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(11)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(12)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(13)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(14)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(15)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(16)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(17)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(18)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(19)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(20)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(21)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(22)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(23)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(24)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(25)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(26)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(27)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(28)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(29)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(30)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(31)
|
||||
|
||||
LABEL(END)
|
||||
|
||||
#ifdef MONITORS
|
||||
RDTSC
|
||||
MOV(VAR(botl), EAX)
|
||||
MOV(VAR(both), EDX)
|
||||
#endif
|
||||
: // output operands
|
||||
#ifdef MONITORS
|
||||
[topl] "=m" (topl),
|
||||
[toph] "=m" (toph),
|
||||
[midl] "=m" (midl),
|
||||
[midh] "=m" (midh),
|
||||
[mid2l] "=m" (mid2l),
|
||||
[mid2h] "=m" (mid2h),
|
||||
[botl] "=m" (botl),
|
||||
[both] "=m" (both)
|
||||
#endif
|
||||
: // input operands
|
||||
[k] "m" (k),
|
||||
[a] "m" (a),
|
||||
[b] "m" (b),
|
||||
[alpha] "m" (alpha),
|
||||
[beta] "m" (beta),
|
||||
[c] "m" (c),
|
||||
[rs_c] "m" (rs_c),
|
||||
[cs_c] "m" (cs_c),
|
||||
[a_next] "m" (a_next),
|
||||
[b_next] "m" (b_next),
|
||||
[offsetPtr] "m" (offsetPtr)
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
|
||||
"r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
|
||||
"zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13",
|
||||
"zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
|
||||
"zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
|
||||
"zmm30", "zmm31", "memory"
|
||||
);
|
||||
|
||||
#ifdef LOOPMON
|
||||
printf("looptime = \t%d\n", bloopl - tloopl);
|
||||
#endif
|
||||
#ifdef MONITORS
|
||||
dim_t top = ((dim_t)toph << 32) | topl;
|
||||
dim_t mid = ((dim_t)midh << 32) | midl;
|
||||
dim_t mid2 = ((dim_t)mid2h << 32) | mid2l;
|
||||
dim_t bot = ((dim_t)both << 32) | botl;
|
||||
printf("setup =\t%u\tmain loop =\t%u\tcleanup=\t%u\ttotal=\t%u\n", mid - top, mid2 - mid, bot - mid2, bot - top);
|
||||
#endif
|
||||
}
|
||||
@@ -32,13 +32,19 @@
|
||||
|
||||
*/
|
||||
|
||||
GEMM_UKR_PROT( double, d, gemm_knl_asm_12x16 )
|
||||
GEMM_UKR_PROT( double, s, gemm_knl_asm_24x16 )
|
||||
GEMM_UKR_PROT( double, d, gemm_knl_asm_24x8 )
|
||||
GEMM_UKR_PROT( double, d, gemm_knl_asm_30x8 )
|
||||
GEMM_UKR_PROT( double, d, gemm_knl_asm_8x24 )
|
||||
|
||||
PACKM_KER_PROT( double, s, packm_knl_asm_24xk )
|
||||
PACKM_KER_PROT( double, s, packm_knl_asm_16xk )
|
||||
|
||||
PACKM_KER_PROT( double, d, packm_knl_asm_24xk )
|
||||
PACKM_KER_PROT( double, d, packm_knl_asm_8xk )
|
||||
|
||||
// unused:
|
||||
GEMM_UKR_PROT( double, d, gemm_knl_asm_12x16 )
|
||||
GEMM_UKR_PROT( double, d, gemm_knl_asm_30x8 )
|
||||
GEMM_UKR_PROT( double, d, gemm_knl_asm_8x24 )
|
||||
|
||||
PACKM_KER_PROT( double, d, packm_knl_asm_30xk )
|
||||
|
||||
|
||||
@@ -1,173 +0,0 @@
|
||||
#ifndef BLIS_AVX512_MACROS_H
|
||||
#define BLIS_AVX512_MACROS_H
|
||||
|
||||
//
|
||||
// Assembly macros to make AVX-512 with AT&T syntax somewhat less painful
|
||||
//
|
||||
|
||||
#define COMMENT_BEGIN "#"
|
||||
#define COMMENT_END
|
||||
|
||||
#define STRINGIFY(...) #__VA_ARGS__
|
||||
#define ASM(...) STRINGIFY(__VA_ARGS__) "\n\t"
|
||||
#define LABEL(label) STRINGIFY(label) ":\n\t"
|
||||
|
||||
#define XMM(x) %%xmm##x
|
||||
#define YMM(x) %%ymm##x
|
||||
#define ZMM(x) %%zmm##x
|
||||
#define EAX %%eax
|
||||
#define EBX %%ebx
|
||||
#define ECX %%ecx
|
||||
#define EDX %%edx
|
||||
#define EBP %%ebp
|
||||
#define EDI %%edi
|
||||
#define ESI %%esi
|
||||
#define RAX %%rax
|
||||
#define RBX %%rbx
|
||||
#define RCX %%rcx
|
||||
#define RDX %%rdx
|
||||
#define RBP %%rbp
|
||||
#define RDI %%rdi
|
||||
#define RSI %%rsi
|
||||
#define K(x) %%k##x
|
||||
#define R(x) %%r##x
|
||||
#define R8 %%r8
|
||||
#define R9 %%r9
|
||||
#define R10 %%r10
|
||||
#define R11 %%r11
|
||||
#define R12 %%r12
|
||||
#define R13 %%r13
|
||||
#define R14 %%r14
|
||||
#define R15 %%r15
|
||||
#define RD(x) %%r##x##d
|
||||
#define R8D %%r8d
|
||||
#define R9D %%r9d
|
||||
#define R10D %%r10d
|
||||
#define R11D %%r11d
|
||||
#define R12D %%r12d
|
||||
#define R13D %%r13d
|
||||
#define R14D %%r14d
|
||||
#define R15D %%r15d
|
||||
#define IMM(x) $##x
|
||||
#define VAR(x) %[x]
|
||||
|
||||
#define MEM_4(reg,off,scale,disp) disp(reg,off,scale)
|
||||
#define MEM_3(reg,off,scale) (reg,off,scale)
|
||||
#define MEM_2(reg,disp) disp(reg)
|
||||
#define MEM_1(reg) (reg)
|
||||
|
||||
#define MEM_1TO8_4(reg,off,scale,disp) MEM(reg,off,scale,disp) %{1to8%}
|
||||
#define MEM_1TO8_3(reg,off,scale) MEM(reg,off,scale) %{1to8%}
|
||||
#define MEM_1TO8_2(reg,disp) MEM(reg,disp) %{1to8%}
|
||||
#define MEM_1TO8_1(reg) MEM(reg) %{1to8%}
|
||||
|
||||
#define MEM_1TO16_4(reg,off,scale,disp) MEM(reg,off,scale,disp) %{1to16%}
|
||||
#define MEM_1TO16_3(reg,off,scale) MEM(reg,off,scale) %{1to16%}
|
||||
#define MEM_1TO16_2(reg,disp) MEM(reg,disp) %{1to16%}
|
||||
#define MEM_1TO16_1(reg) MEM(reg) %{1to16%}
|
||||
|
||||
#define GET_MACRO(_1,_2,_3,_4,NAME,...) NAME
|
||||
#define MEM(...) GET_MACRO(__VA_ARGS__,MEM_4,MEM_3,MEM_2,MEM_1)(__VA_ARGS__)
|
||||
#define MEM_1TO8(...) GET_MACRO(__VA_ARGS__,MEM_1TO8_4,MEM_1TO8_3,MEM_1TO8_2,MEM_1TO8_1)(__VA_ARGS__)
|
||||
#define MEM_1TO16(...) GET_MACRO(__VA_ARGS__,MEM_1TO16_4,MEM_1TO16_3,MEM_1TO16_2,MEM_1TO16_1)(__VA_ARGS__)
|
||||
|
||||
#define MASK_K(n) %{%%k##n%}
|
||||
#define MASK_KZ(n) %{%%k##n%}%{z%}
|
||||
#define KMOV(to,from) ASM(kmovw from, to)
|
||||
#define JKNZD(kreg,label) \
|
||||
ASM(kortestw kreg, kreg) \
|
||||
ASM(jnz label)
|
||||
#define KXNORW(_0, _1, _2) ASM(kxnorw _2, _1, _0)
|
||||
#define KSHIFTRW(_0, _1, _2) ASM(kshiftrw _2, _1, _0)
|
||||
|
||||
#define ALIGN16 ASM(.p2align 4)
|
||||
#define ALIGN32 ASM(.p2align 5)
|
||||
// Read the processor time-stamp counter; the result is returned in EDX:EAX,
// so both registers are overwritten. (Mnemonic was previously misspelled
// "rdstc", which does not assemble.)
#define RDTSC ASM(rdtsc)
|
||||
#define MOV(_0, _1) ASM(mov _1, _0)
|
||||
#define MOVD(_0, _1) ASM(movd _1, _0)
|
||||
#define MOVL(_0, _1) ASM(movl _1, _0)
|
||||
#define MOVQ(_0, _1) ASM(movq _1, _0)
|
||||
#define VMOVD(_0, _1) ASM(vmovd _1, _0)
|
||||
#define VMOVQ(_0, _1) ASM(vmovq _1, _0)
|
||||
#define CMP(_0, _1) ASM(cmp _1, _0)
|
||||
#define AND(_0, _1) ASM(and _1, _0)
|
||||
#define ADD(_0, _1) ASM(add _1, _0)
|
||||
#define SUB(_0, _1) ASM(sub _1, _0)
|
||||
#define SAL(_0, _1) ASM(sal _1, _0)
|
||||
#define SHLX(_0, _1, _2) ASM(shlx _2, _1, _0)
|
||||
#define SAR(_0, _1) ASM(sar _1, _0)
|
||||
#define SAL1(_0) ASM(sal _0)
|
||||
#define SAR1(_0) ASM(sar _0)
|
||||
#define LEA(_0, _1) ASM(lea _1, _0)
|
||||
#define TEST(_0, _1) ASM(test _1, _0)
|
||||
#define DEC(_0) ASM(dec _0)
|
||||
#define JLE(_0) ASM(jle _0)
|
||||
#define JL(_0) ASM(jl _0)
|
||||
#define JNZ(_0) ASM(jnz _0)
|
||||
#define JZ(_0) ASM(jz _0)
|
||||
#define JNE(_0) ASM(jne _0)
|
||||
#define JE(_0) ASM(je _0)
|
||||
#define JNC(_0) ASM(jnc _0)
|
||||
#define JC(_0) ASM(jc _0)
|
||||
#define JMP(_0) ASM(jmp _0)
|
||||
#define VCOMISS(_0, _1) ASM(vcomiss _1, _0)
|
||||
#define VCOMISD(_0, _1) ASM(vcomisd _1, _0)
|
||||
#define VGATHERDPS(_0, _1) ASM(vgatherdps _1, _0)
|
||||
#define VSCATTERDPS(_0, _1) ASM(vscatterdps _1, _0)
|
||||
#define VGATHERDPD(_0, _1) ASM(vgatherdpd _1, _0)
|
||||
#define VSCATTERDPD(_0, _1) ASM(vscatterdpd _1, _0)
|
||||
#define VGATHERQPS(_0, _1) ASM(vgatherqps _1, _0)
|
||||
#define VSCATTERQPS(_0, _1) ASM(vscatterqps _1, _0)
|
||||
#define VGATHERQPD(_0, _1) ASM(vgatherqpd _1, _0)
|
||||
#define VSCATTERQPD(_0, _1) ASM(vscatterqpd _1, _0)
|
||||
#define VMULSS(_0, _1, _2) ASM(vmulss _2, _1, _0)
|
||||
#define VMULSD(_0, _1, _2) ASM(vmulsd _2, _1, _0)
|
||||
#define VMULPS(_0, _1, _2) ASM(vmulps _2, _1, _0)
|
||||
#define VMULPD(_0, _1, _2) ASM(vmulpd _2, _1, _0)
|
||||
#define VPMULLD(_0, _1, _2) ASM(vpmulld _2, _1, _0)
|
||||
#define VPMULLQ(_0, _1, _2) ASM(vpmullq _2, _1, _0)
|
||||
#define VPADDD(_0, _1, _2) ASM(vpaddd _2, _1, _0)
|
||||
#define VPSLLD(_0, _1, _2) ASM(vpslld _2, _1, _0)
|
||||
#define VPXORD(_0, _1, _2) ASM(vpxord _2, _1, _0)
|
||||
#define VXORPD(_0, _1, _2) ASM(vxorpd _2, _1, _0)
|
||||
#define VFMADD132PS(_0, _1, _2) ASM(vfmadd132ps _2, _1, _0)
|
||||
#define VFMADD213PS(_0, _1, _2) ASM(vfmadd213ps _2, _1, _0)
|
||||
#define VFMADD231PS(_0, _1, _2) ASM(vfmadd231ps _2, _1, _0)
|
||||
#define VFMADD132PD(_0, _1, _2) ASM(vfmadd132pd _2, _1, _0)
|
||||
#define VFMADD213PD(_0, _1, _2) ASM(vfmadd213pd _2, _1, _0)
|
||||
#define VFMADD231PD(_0, _1, _2) ASM(vfmadd231pd _2, _1, _0)
|
||||
#define VMOVDQA(_0, _1) ASM(vmovdqa _1, _0)
|
||||
#define VMOVDQA32(_0, _1) ASM(vmovdqa32 _1, _0)
|
||||
#define VMOVDQA64(_0, _1) ASM(vmovdqa64 _1, _0)
|
||||
#define VMOVSS(_0, _1) ASM(vmovss _1, _0)
|
||||
#define VMOVSD(_0, _1) ASM(vmovsd _1, _0)
|
||||
#define VMOVAPS(_0, _1) ASM(vmovaps _1, _0)
|
||||
#define VMOVUPS(_0, _1) ASM(vmovups _1, _0)
|
||||
#define VMOVAPD(_0, _1) ASM(vmovapd _1, _0)
|
||||
#define VMOVUPD(_0, _1) ASM(vmovupd _1, _0)
|
||||
#define VBROADCASTSS(_0, _1) ASM(vbroadcastss _1, _0)
|
||||
#define VBROADCASTSD(_0, _1) ASM(vbroadcastsd _1, _0)
|
||||
#define VPBROADCASTD(_0, _1) ASM(vpbroadcastd _1, _0)
|
||||
#define VPBROADCASTQ(_0, _1) ASM(vpbroadcastq _1, _0)
|
||||
#define VBROADCASTF64X4(_0, _1) ASM(vbroadcastf64x4 _1, _0)
|
||||
#define VINSERTF64X4(_0, _1, _2, _3) ASM(vinsertf64x4 _3, _2, _1, _0)
|
||||
#define VEXTRACTF64X4(_0, _1, _2) ASM(vextractf64x4 _2, _1, _0)
|
||||
#define VINSERTF128(_0, _1, _2) ASM(vinsertf128 _2, _1, _0)
|
||||
#define VEXTRACTF128(_0, _1, _2) ASM(vextractf128 _2, _1, _0)
|
||||
#define VUNPCKLPD(_0, _1, _2) ASM(vunpcklpd _2, _1, _0)
|
||||
#define VUNPCKHPD(_0, _1, _2) ASM(vunpckhpd _2, _1, _0)
|
||||
#define VSHUFF64X2(_0, _1, _2, _3) ASM(vshuff64x2 _3, _2, _1, _0)
|
||||
#define VUNPCKLPS(_0, _1, _2) ASM(vunpcklps _2, _1, _0)
|
||||
#define VUNPCKHPS(_0, _1, _2) ASM(vunpckhps _2, _1, _0)
|
||||
#define VSHUFPS(_0, _1, _2, _3) ASM(vshufps _3, _2, _1, _0)
|
||||
#define VPERM2F128(_0, _1, _2, _3) ASM(vperm2f128 _3, _2, _1, _0)
|
||||
#define PREFETCH(LEVEL,ADDRESS) ASM(prefetcht##LEVEL ADDRESS)
|
||||
#define PREFETCHW0(ADDRESS) ASM(prefetchw ADDRESS)
|
||||
#define PREFETCHW1(ADDRESS) ASM(prefetchwt1 ADDRESS)
|
||||
#define VGATHERPFDPS(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dps ADDRESS)
|
||||
#define VSCATTERPFDPS(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dps ADDRESS)
|
||||
#define VGATHERPFDPD(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dpd ADDRESS)
|
||||
#define VSCATTERPFDPD(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dpd ADDRESS)
|
||||
#define VZEROUPPER() ASM(vzeroupper)
|
||||
|
||||
#endif
|
||||
@@ -33,7 +33,6 @@
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#include <assert.h>
|
||||
|
||||
#include "bli_avx512_macros.h"
|
||||
|
||||
|
||||
@@ -214,23 +214,17 @@ PDEF_MT := -DP_BEGIN=200 \
|
||||
# --- Targets/rules ------------------------------------------------------------
|
||||
#
|
||||
|
||||
all: blis-all openblas-all
|
||||
all-st: blis-st openblas-st mkl-st
|
||||
all-mt: blis-mt openblas-mt mkl-mt
|
||||
|
||||
intel: blis-all openblas-all mkl-all
|
||||
blis-st: blis-gemm-st
|
||||
blis-mt: blis-gemm-mt
|
||||
|
||||
amd: blis-all openblas-all acml-all
|
||||
openblas-st: openblas-gemm-st
|
||||
openblas-mt: openblas-gemm-mt
|
||||
|
||||
blis-all: blis-gemm-st \
|
||||
blis-gemm-mt
|
||||
|
||||
openblas-all: openblas-gemm-st \
|
||||
openblas-gemm-mt
|
||||
|
||||
mkl-all: mkl-gemm-st \
|
||||
mkl-gemm-mt
|
||||
|
||||
acml-all: acml-gemm-st \
|
||||
acml-gemm-mt
|
||||
mkl-st: mkl-gemm-st
|
||||
mkl-mt: mkl-gemm-mt
|
||||
|
||||
blis-gemm-st: \
|
||||
test_sgemm_asm_blis_st.x \
|
||||
@@ -294,18 +288,6 @@ mkl-gemm-mt: \
|
||||
test_cgemm_mkl_mt.x \
|
||||
test_zgemm_mkl_mt.x
|
||||
|
||||
acml-gemm-st: \
|
||||
test_sgemm_acml_st.x \
|
||||
test_dgemm_acml_st.x \
|
||||
test_cgemm_acml_st.x \
|
||||
test_zgemm_acml_st.x
|
||||
|
||||
acml-gemm-mt: \
|
||||
test_sgemm_acml_mt.x \
|
||||
test_dgemm_acml_mt.x \
|
||||
test_cgemm_acml_mt.x \
|
||||
test_zgemm_acml_mt.x
|
||||
|
||||
|
||||
|
||||
# --Object file rules --
|
||||
@@ -466,31 +448,6 @@ test_z%_mkl_mt.o: test_%.c
|
||||
test_c%_mkl_mt.o: test_%.c
|
||||
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_MT) -c $< -o $@
|
||||
|
||||
# acml
|
||||
test_d%_acml_st.o: test_%.c
|
||||
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_D) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_ST) -c $< -o $@
|
||||
|
||||
test_s%_acml_st.o: test_%.c
|
||||
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_S) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_ST) -c $< -o $@
|
||||
|
||||
test_z%_acml_st.o: test_%.c
|
||||
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_Z) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_ST) -c $< -o $@
|
||||
|
||||
test_c%_acml_st.o: test_%.c
|
||||
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_ST) -c $< -o $@
|
||||
|
||||
test_d%_acml_mt.o: test_%.c
|
||||
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_MT) -c $< -o $@
|
||||
|
||||
test_s%_acml_mt.o: test_%.c
|
||||
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_S) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_MT) -c $< -o $@
|
||||
|
||||
test_z%_acml_mt.o: test_%.c
|
||||
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_MT) -c $< -o $@
|
||||
|
||||
test_c%_acml_mt.o: test_%.c
|
||||
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_MT) -c $< -o $@
|
||||
|
||||
|
||||
# -- Executable file rules --
|
||||
|
||||
@@ -511,12 +468,6 @@ test_%_mkl_st.x: test_%_mkl_st.o $(LIBBLIS_LINK)
|
||||
test_%_mkl_mt.x: test_%_mkl_mt.o $(LIBBLIS_LINK)
|
||||
$(LINKER) $< $(MKLP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
|
||||
|
||||
test_%_acml_st.x: test_%_acml_st.o $(LIBBLIS_LINK)
|
||||
$(LINKER) $< $(ACML_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
|
||||
|
||||
test_%_acml_mt.x: test_%_acml_mt.o $(LIBBLIS_LINK)
|
||||
$(LINKER) $< $(ACMLP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
|
||||
|
||||
test_%_blis_st.x: test_%_blis_st.o $(LIBBLIS_LINK)
|
||||
$(LINKER) $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@
|
||||
|
||||
|
||||
Reference in New Issue
Block a user