From 4423e33dc593115cda92c5763d756d7ad1298aa9 Mon Sep 17 00:00:00 2001 From: dnp Date: Wed, 6 Dec 2017 16:35:03 -0600 Subject: [PATCH] Adding SKX kernels and configuration. --- config/skx/bli_cntx_init_skx.c | 78 +++ config/skx/bli_family_skx.h | 133 +++++ config/skx/make_defs.mk | 115 +++++ config_registry | 1 + frame/base/bli_arch.c | 3 + frame/base/bli_cpuid.c | 120 +++++ frame/base/bli_cpuid.h | 5 + frame/base/bli_gks.c | 5 + frame/include/bli_arch_config.h | 12 +- frame/include/bli_type_defs.h | 5 +- kernels/skx/3/bli_avx512_macros.h | 171 ++++++ kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c | 547 ++++++++++++++++++++ kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c | 572 +++++++++++++++++++++ kernels/skx/bli_kernels_skx.h | 40 ++ 14 files changed, 1802 insertions(+), 5 deletions(-) create mode 100644 config/skx/bli_cntx_init_skx.c create mode 100644 config/skx/bli_family_skx.h create mode 100644 config/skx/make_defs.mk create mode 100644 kernels/skx/3/bli_avx512_macros.h create mode 100644 kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c create mode 100644 kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c create mode 100644 kernels/skx/bli_kernels_skx.h diff --git a/config/skx/bli_cntx_init_skx.c b/config/skx/bli_cntx_init_skx.c new file mode 100644 index 000000000..bc23295ac --- /dev/null +++ b/config/skx/bli_cntx_init_skx.c @@ -0,0 +1,78 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_cntx_init_skx( cntx_t* cntx ) +{ + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + + // Set default kernel blocksizes and functions. + bli_cntx_init_skx_ref( cntx ); + + // ------------------------------------------------------------------------- + + // Update the context with optimized native gemm micro-kernels and + // their storage preferences. 
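+	// (A preference of FALSE marks each micro-kernel as column-preferential:
+	// both microtiles keep C in 24 of the 32 ZMM accumulators -- two vectors
+	// per column of the 16x12 or 32x12 microtile, times 12 columns -- so C
+	// is updated most cheaply when it is column-stored.)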
+ bli_cntx_set_l3_nat_ukrs + ( + 2, + BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x12_l2, FALSE, + cntx + ); + + // Initialize level-3 blocksize objects with architecture-specific values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 12, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 480, 240, 144, 72 ); + bli_blksz_init ( &blkszs[ BLIS_KC ], 384, 384, 256, 256, + 480, 480, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 3072, 4080, 4080 ); + + // Update the context with the current architecture's register and cache + // blocksizes (and multiples) for native execution. + bli_cntx_set_blkszs + ( + BLIS_NAT, 5, + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + cntx + ); +} + diff --git a/config/skx/bli_family_skx.h b/config/skx/bli_family_skx.h new file mode 100644 index 000000000..13d0f788a --- /dev/null +++ b/config/skx/bli_family_skx.h @@ -0,0 +1,133 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+*/
+
+//#ifndef BLIS_FAMILY_H
+//#define BLIS_FAMILY_H
+
+// -- THREADING PARAMETERS -----------------------------------------------------
+
+#define BLIS_DEFAULT_M_THREAD_RATIO 3
+#define BLIS_DEFAULT_N_THREAD_RATIO 2
+
+#define BLIS_DEFAULT_MR_THREAD_MAX 1
+#define BLIS_DEFAULT_NR_THREAD_MAX 4
+
+// -- MEMORY ALLOCATION --------------------------------------------------------
+
+#define BLIS_SIMD_ALIGN_SIZE 64
+
+#define BLIS_SIMD_SIZE 64
+#define BLIS_SIMD_NUM_REGISTERS 32
+
+#ifdef BLIS_NO_HBWMALLOC
+
+#include <stdlib.h>
+
+#define BLIS_MALLOC_POOL malloc
+#define BLIS_FREE_POOL free
+
+#else
+
+#include <hbwmalloc.h>
+
+#define BLIS_MALLOC_POOL hbw_malloc
+#define BLIS_FREE_POOL hbw_free
+
+#endif
+
+
+#if 0
+// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
+
+// -- Cache and register blocksizes --
+
+//
+// Constraints:
+//
+// (1) MC must be a multiple of:
+//     (a) MR (for zero-padding purposes)
+//     (b) NR (for zero-padding purposes when MR and NR are "swapped")
+// (2) NC must be a multiple of:
+//     (a) NR (for zero-padding purposes)
+//     (b) MR (for zero-padding purposes when MR and NR are "swapped")
+//
+
+#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_16x12_l2
+#define BLIS_DEFAULT_MC_D  144
+#define BLIS_DEFAULT_KC_D  336
+#define BLIS_DEFAULT_NC_D  5760
+#define BLIS_DEFAULT_MR_D  16
+#define BLIS_DEFAULT_NR_D  12
+#define BLIS_PACKDIM_MR_D  16
+#define BLIS_PACKDIM_NR_D  12
+
+// NOTE: If the micro-kernel, which is typically unrolled to a factor
+// of f, handles leftover edge cases (i.e., when k % f > 0), then these
+// register blocksizes in the k dimension can be defined to 1.
+
+//#define BLIS_DEFAULT_KR_S 1
+//#define BLIS_DEFAULT_KR_D 1
+//#define BLIS_DEFAULT_KR_C 1
+//#define BLIS_DEFAULT_KR_Z 1
+
+// -- Maximum cache blocksizes (for optimizing edge cases) --
+
+// NOTE: These cache blocksize "extensions" have the same constraints as
+// the corresponding default blocksizes above. When these values are
+// larger than the default blocksizes, blocksizes used at edge cases are
+// enlarged if such an extension would encompass the remaining portion of
+// the matrix dimension.
+
+#define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4)
+#define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4)
+#define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + 0)
+
+#define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4)
+#define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4)
+#define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + 0)
+
+//#define BLIS_MAXIMUM_MC_C (BLIS_DEFAULT_MC_C + BLIS_DEFAULT_MC_C/4)
+//#define BLIS_MAXIMUM_KC_C (BLIS_DEFAULT_KC_C + BLIS_DEFAULT_KC_C/4)
+//#define BLIS_MAXIMUM_NC_C (BLIS_DEFAULT_NC_C + BLIS_DEFAULT_NC_C/4)
+
+//#define BLIS_MAXIMUM_MC_Z (BLIS_DEFAULT_MC_Z + BLIS_DEFAULT_MC_Z/4)
+//#define BLIS_MAXIMUM_KC_Z (BLIS_DEFAULT_KC_Z + BLIS_DEFAULT_KC_Z/4)
+//#define BLIS_MAXIMUM_NC_Z (BLIS_DEFAULT_NC_Z + BLIS_DEFAULT_NC_Z/4)
+
+
+#endif
+
+
+//#endif
+
diff --git a/config/skx/make_defs.mk b/config/skx/make_defs.mk
new file mode 100644
index 000000000..4655c652d
--- /dev/null
+++ b/config/skx/make_defs.mk
@@ -0,0 +1,115 @@
+#
+#
+#  BLIS
+#  An object-based framework for developing high-performance BLAS-like
+#  libraries.
+# +# Copyright (C) 2014, The University of Texas at Austin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# - Neither the name of The University of Texas at Austin nor the names +# of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + + +# Declare the name of the current configuration and add it to the +# running list of configurations included by common.mk. +THIS_CONFIG := skx +#CONFIGS_INCL += $(THIS_CONFIG) + +# +# --- Determine the C compiler and related flags --- +# + +ifeq ($(CC),) +CC := gcc +CC_VENDOR := gcc +endif + +# Enable IEEE Standard 1003.1-2004 (POSIX.1d). +# NOTE: This is needed to enable posix_memalign(). +CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L +CMISCFLAGS := -std=c99 -m64 +CPICFLAGS := -fPIC +CWARNFLAGS := -Wall -Wno-unused-function -Wfatal-errors + +ifneq ($(DEBUG_TYPE),off) +CDBGFLAGS := -g +endif + +ifeq ($(DEBUG_TYPE),noopt) +COPTFLAGS := -O0 -DBLIS_NO_HBWMALLOC +else +COPTFLAGS := -O3 +endif + +ifeq ($(DEBUG_TYPE),sde) +CPPROCFLAGS += -DBLIS_NO_HBWMALLOC +endif + +CKOPTFLAGS := $(COPTFLAGS) + +ifeq ($(CC_VENDOR),gcc) +CVECFLAGS := -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse -march=skylake-avx512 +else +ifeq ($(CC_VENDOR),icc) +CVECFLAGS := -xCORE-AVX512 +else +ifeq ($(CC_VENDOR),clang) +CVECFLAGS := -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse -march=skylake-avx512 +else +$(error gcc, icc, or clang is required for this configuration.) +endif +endif +endif + +# The assembler on OS X won't recognize AVX512 without help +ifneq ($(CC_VENDOR),icc) +ifeq ($(OS_NAME),Darwin) +CVECFLAGS += -Wa,-march=skylake-avx512 +endif +endif + +# --- Determine the archiver and related flags --- +AR := ar +ARFLAGS := cr + +# --- Determine the linker and related flags --- +LINKER := $(CC) +SOFLAGS := -shared + +ifneq ($(DEBUG_TYPE),sde) +LDFLAGS := -lmemkind +else +LDFLAGS := +endif + +ifneq ($(CC_VENDOR),icc) +LDFLAGS += -lm +endif + +# Store all of the variables here to new variables containing the +# configuration name. 
$(eval $(call store-make-defs,$(THIS_CONFIG)))
+
diff --git a/config_registry b/config_registry
index 780f39455..70667b860 100644
--- a/config_registry
+++ b/config_registry
@@ -18,6 +18,7 @@ haswell: haswell
 sandybridge: sandybridge
 penryn: penryn
 knl: knl
+skx: skx
 
 # AMD architectures.
 zen: zen/haswell
diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c
index d8e33672c..a5ef20191 100644
--- a/frame/base/bli_arch.c
+++ b/frame/base/bli_arch.c
@@ -48,6 +48,9 @@ arch_t bli_arch_query_id( void )
 #endif
 
 	// Intel microarchitectures.
+#ifdef BLIS_FAMILY_SKX
+	id = BLIS_ARCH_SKX;
+#endif
 #ifdef BLIS_FAMILY_KNL
 	id = BLIS_ARCH_KNL;
 #endif
diff --git a/frame/base/bli_cpuid.c b/frame/base/bli_cpuid.c
index a9d77d1fb..d3d8a47bd 100644
--- a/frame/base/bli_cpuid.c
+++ b/frame/base/bli_cpuid.c
@@ -47,6 +47,10 @@ arch_t bli_cpuid_query_id( void )
 	{
 		// Check for each Intel configuration that is enabled, check for that
 		// microarchitecture. We check from most recent to most dated.
+#ifdef BLIS_CONFIG_SKX
+		if ( bli_cpuid_is_skx( family, model, features ) )
+			return BLIS_ARCH_SKX;
+#endif
 #ifdef BLIS_CONFIG_KNL
 		if ( bli_cpuid_is_knl( family, model, features ) )
 			return BLIS_ARCH_KNL;
@@ -65,6 +69,7 @@ arch_t bli_cpuid_query_id( void )
 #endif
 		// If none of the other sub-configurations were detected, return
 		// the 'generic' arch_t id value.
+
 		return BLIS_ARCH_GENERIC;
 	}
 	else if ( vendor == VENDOR_AMD )
@@ -105,6 +111,31 @@
 }
 
 // -----------------------------------------------------------------------------
+
+bool_t bli_cpuid_is_skx
+     (
+       uint32_t family,
+       uint32_t model,
+       uint32_t features
+     )
+{
+	// Check for expected CPU features.
+	const uint32_t expected = FEATURE_AVX      |
+	                          FEATURE_FMA3     |
+	                          FEATURE_AVX2     |
+	                          FEATURE_AVX512F  |
+	                          FEATURE_AVX512DQ |
+	                          FEATURE_AVX512BW |
+	                          FEATURE_AVX512VL;
+
+	// Also require two VPUs (AVX-512 FMA units) per core; single-VPU
+	// Skylake-X parts run AVX-512 code at reduced throughput.
+	int nvpu = vpu_count();
+
+	if ( !bli_cpuid_has_features( features, expected ) || nvpu != 2 )
+		return FALSE;
+
+	return TRUE;
+}
 
 bool_t bli_cpuid_is_knl
      (
       uint32_t family,
       uint32_t model,
       uint32_t features
      )
@@ -629,6 +660,93 @@ uint32_t bli_cpuid_query
 	return VENDOR_UNKNOWN;
 }
 
+// Query the CPU brand string (e.g. "Intel(R) Xeon(R) Platinum 8168 ...")
+// via cpuid leaves 0x80000002-0x80000004. cpu_name must hold 48 bytes.
+void get_cpu_name(char *cpu_name)
+{
+	uint32_t eax, ebx, ecx, edx;
+
+	__cpuid(0x80000002u, eax, ebx, ecx, edx);
+
+	*(uint32_t *)&cpu_name[0]  = eax;
+	*(uint32_t *)&cpu_name[4]  = ebx;
+	*(uint32_t *)&cpu_name[8]  = ecx;
+	*(uint32_t *)&cpu_name[12] = edx;
+
+	__cpuid(0x80000003u, eax, ebx, ecx, edx);
+
+	*(uint32_t *)&cpu_name[16+0]  = eax;
+	*(uint32_t *)&cpu_name[16+4]  = ebx;
+	*(uint32_t *)&cpu_name[16+8]  = ecx;
+	*(uint32_t *)&cpu_name[16+12] = edx;
+
+	__cpuid(0x80000004u, eax, ebx, ecx, edx);
+
+	*(uint32_t *)&cpu_name[32+0]  = eax;
+	*(uint32_t *)&cpu_name[32+4]  = ebx;
+	*(uint32_t *)&cpu_name[32+8]  = ecx;
+	*(uint32_t *)&cpu_name[32+12] = edx;
+}
+
+// Infer the number of AVX-512 VPUs per core from the marketing name,
+// since cpuid does not report it directly. Returns 1 or 2, or -1 if
+// the model cannot be identified.
+int vpu_count()
+{
+	char cpu_name[48] = {0};
+	char *loc;
+	char model_num[5];
+	int sku;
+
+	get_cpu_name(cpu_name);
+
+	if (strstr(cpu_name, "Intel(R) Xeon(R)") != NULL)
+	{
+		// Find the four-digit model number that follows the brand
+		// modifier (Platinum, Gold, Silver, Bronze, or W).
+		loc = strstr(cpu_name, "Platinum");
+		if (loc == NULL)
+			loc = strstr(cpu_name, "Gold");
+		if (loc == NULL)
+			loc = strstr(cpu_name, "Silver");
+		if (loc == NULL)
+			loc = strstr(cpu_name, "Bronze");
+		if (loc == NULL)
+			loc = strstr(cpu_name, "W");
+		if (loc == NULL)
+			return -1;
+
+		loc = strstr(loc+1, " ");
+		if (loc == NULL)
+			return -1;
+
+		strncpy(model_num, loc+1, 4);
+		model_num[4] = '\0'; // last valid index of model_num[5]
+
+		sku = atoi(model_num);
+
+		if (8199 >= sku && sku >=
8100) return 2; + else if (6199 >= sku && sku >= 6100) return 2; + else if (sku == 5122) return 2; + else if (5199 >= sku && sku >= 5100) return 1; + else if (4199 >= sku && sku >= 4100) return 1; + else if (3199 >= sku && sku >= 3100) return 1; + else if (2199 >= sku && sku >= 2120) return 2; + else if (2119 >= sku && sku >= 2100) return 1; + else return -1; + } + else if (strstr(cpu_name, "Intel(R) Core(TM) i9") != NULL) + { + return 1; + } + else if (strstr(cpu_name, "Intel(R) Core(TM) i7") != NULL) + { + if (strstr(cpu_name, "7800X") != NULL || + strstr(cpu_name, "7820X") != NULL) + return 1; + else return -1; + } + else + { + return -1; + } +} + #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) int get_cpu_type( int* model, int* part, int* features ) diff --git a/frame/base/bli_cpuid.h b/frame/base/bli_cpuid.h index a9c99fef4..6cdc02387 100644 --- a/frame/base/bli_cpuid.h +++ b/frame/base/bli_cpuid.h @@ -37,6 +37,7 @@ arch_t bli_cpuid_query_id( void ); +bool_t bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features ); bool_t bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features ); bool_t bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features ); bool_t bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features ); @@ -100,6 +101,10 @@ static bool_t bli_cpuid_has_features( uint32_t have, uint32_t want ) #include "cpuid.h" +void get_cpu_name(char *cpu_name); +int vpu_count(); + + enum { VENDOR_INTEL, diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index 2ae0223de..e1c0ed23f 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -71,6 +71,11 @@ void bli_gks_init( void ) // bli_config.h. // Intel architectures +#ifdef BLIS_CONFIG_SKX + bli_gks_register_cntx( BLIS_ARCH_SKX, bli_cntx_init_skx, + bli_cntx_init_skx_ref, + bli_cntx_init_skx_ind ); +#endif #ifdef BLIS_CONFIG_KNL bli_gks_register_cntx( BLIS_ARCH_KNL, bli_cntx_init_knl, bli_cntx_init_knl_ref, diff --git a/frame/include/bli_arch_config.h b/frame/include/bli_arch_config.h index 339cb1f1c..65dad0d71 100644 --- a/frame/include/bli_arch_config.h +++ b/frame/include/bli_arch_config.h @@ -41,7 +41,9 @@ // // -- Intel64 architectures -- - +#ifdef BLIS_CONFIG_SKX +CNTX_INIT_PROTS( skx ) +#endif #ifdef BLIS_CONFIG_KNL CNTX_INIT_PROTS( knl ) #endif @@ -121,7 +123,9 @@ CNTX_INIT_PROTS( generic ) #endif // -- Intel64 architectures -- - +#ifdef BLIS_FAMILY_SKX +#include "bli_family_skx.h" +#endif #ifdef BLIS_FAMILY_KNL #include "bli_family_knl.h" #endif @@ -189,7 +193,9 @@ CNTX_INIT_PROTS( generic ) // // -- Intel64 architectures -- - +#ifdef BLIS_KERNELS_SKX +#include "bli_kernels_skx.h" +#endif #ifdef BLIS_KERNELS_KNL #include "bli_kernels_knl.h" #endif diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index c15b0ed17..d6c1be0d7 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -815,7 +815,8 @@ typedef enum typedef enum { // Intel - BLIS_ARCH_KNL = 0, + BLIS_ARCH_SKX =0, + BLIS_ARCH_KNL, BLIS_ARCH_KNC, BLIS_ARCH_HASWELL, BLIS_ARCH_SANDYBRIDGE, @@ -842,7 +843,7 @@ typedef enum } arch_t; -#define BLIS_NUM_ARCHS 16 +#define BLIS_NUM_ARCHS 17 // diff --git a/kernels/skx/3/bli_avx512_macros.h b/kernels/skx/3/bli_avx512_macros.h new file mode 100644 index 000000000..2dfe6c6f6 --- /dev/null +++ b/kernels/skx/3/bli_avx512_macros.h @@ -0,0 +1,171 @@ +#ifndef BLIS_AVX512_MACROS_H +#define BLIS_AVX512_MACROS_H + +// +// Assembly macros to make AVX-512 with AT&T syntax somewhat less painful +// + 
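+// All macros take their operands in Intel (destination-first) order and
+// emit AT&T-order text; for example, inside an __asm__ block
+//
+//     MOV(RSI, VAR(k))
+//
+// expands to the string "mov %[k], %%rsi\n\t". This keeps the kernels
+// readable while still assembling as GCC extended inline assembly.
+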
+#define COMMENT_BEGIN "#" +#define COMMENT_END + +#define STRINGIFY(...) #__VA_ARGS__ +#define ASM(...) STRINGIFY(__VA_ARGS__) "\n\t" +#define LABEL(label) STRINGIFY(label) ":\n\t" + +#define XMM(x) %%xmm##x +#define YMM(x) %%ymm##x +#define ZMM(x) %%zmm##x +#define EAX %%eax +#define EBX %%ebx +#define ECX %%ecx +#define EDX %%edx +#define EBP %%ebp +#define EDI %%edi +#define ESI %%esi +#define RAX %%rax +#define RBX %%rbx +#define RCX %%rcx +#define RDX %%rdx +#define RBP %%rbp +#define RDI %%rdi +#define RSI %%rsi +#define K(x) %%k##x +#define R(x) %%r##x +#define R8 %%r8 +#define R9 %%r9 +#define R10 %%r10 +#define R11 %%r11 +#define R12 %%r12 +#define R13 %%r13 +#define R14 %%r14 +#define R15 %%r15 +#define RD(x) %%r##x##d +#define R8D %%r8d +#define R9D %%r9d +#define R10D %%r10d +#define R11D %%r11d +#define R12D %%r12d +#define R13D %%r13d +#define R14D %%r14d +#define R15D %%r15d +#define IMM(x) $##x +#define VAR(x) %[x] + +#define MEM_4(reg,off,scale,disp) disp(reg,off,scale) +#define MEM_3(reg,off,scale) (reg,off,scale) +#define MEM_2(reg,disp) disp(reg) +#define MEM_1(reg) (reg) + +#define MEM_1TO8_4(reg,off,scale,disp) MEM(reg,off,scale,disp) %{1to8%} +#define MEM_1TO8_3(reg,off,scale) MEM(reg,off,scale) %{1to8%} +#define MEM_1TO8_2(reg,disp) MEM(reg,disp) %{1to8%} +#define MEM_1TO8_1(reg) MEM(reg) %{1to8%} + +#define MEM_1TO16_4(reg,off,scale,disp) MEM(reg,off,scale,disp) %{1to16%} +#define MEM_1TO16_3(reg,off,scale) MEM(reg,off,scale) %{1to16%} +#define MEM_1TO16_2(reg,disp) MEM(reg,disp) %{1to16%} +#define MEM_1TO16_1(reg) MEM(reg) %{1to16%} + +#define GET_MACRO(_1,_2,_3,_4,NAME,...) NAME +#define MEM(...) GET_MACRO(__VA_ARGS__,MEM_4,MEM_3,MEM_2,MEM_1)(__VA_ARGS__) +#define MEM_1TO8(...) GET_MACRO(__VA_ARGS__,MEM_1TO8_4,MEM_1TO8_3,MEM_1TO8_2,MEM_1TO8_1)(__VA_ARGS__) +#define MEM_1TO16(...) 
GET_MACRO(__VA_ARGS__,MEM_1TO16_4,MEM_1TO16_3,MEM_1TO16_2,MEM_1TO16_1)(__VA_ARGS__) + +#define MASK_K(n) %{%%k##n%} +#define MASK_KZ(n) %{%%k##n%}%{z%} +#define KMOV(to,from) ASM(kmovw from, to) +#define JKNZD(kreg,label) \ + ASM(kortestw kreg, kreg) \ + ASM(jnz label) +#define KXNORW(_0, _1, _2) ASM(kxnorw _2, _1, _0) +#define KSHIFTRW(_0, _1, _2) ASM(kshiftrw _2, _1, _0) + +#define ALIGN16 ASM(.p2align 4) +#define ALIGN32 ASM(.p2align 5) +#define RDTSC ASM(rdstc) +#define MOV(_0, _1) ASM(mov _1, _0) +#define MOVD(_0, _1) ASM(movd _1, _0) +#define MOVL(_0, _1) ASM(movl _1, _0) +#define MOVQ(_0, _1) ASM(movq _1, _0) +#define VMOVD(_0, _1) ASM(vmovd _1, _0) +#define VMOVQ(_0, _1) ASM(vmovq _1, _0) +#define CMP(_0, _1) ASM(cmp _1, _0) +#define AND(_0, _1) ASM(and _1, _0) +#define ADD(_0, _1) ASM(add _1, _0) +#define SUB(_0, _1) ASM(sub _1, _0) +#define SAL(_0, _1) ASM(sal _1, _0) +#define SHLX(_0, _1, _2) ASM(shlx _2, _1, _0) +#define SAR(_0, _1) ASM(sar _1, _0) +#define SAL1(_0) ASM(sal _0) +#define SAR1(_0) ASM(sar _0) +#define LEA(_0, _1) ASM(lea _1, _0) +#define TEST(_0, _1) ASM(test _1, _0) +#define DEC(_0) ASM(dec _0) +#define JLE(_0) ASM(jle _0) +#define JL(_0) ASM(jl _0) +#define JNZ(_0) ASM(jnz _0) +#define JZ(_0) ASM(jz _0) +#define JNE(_0) ASM(jne _0) +#define JE(_0) ASM(je _0) +#define JNC(_0) ASM(jnc _0) +#define JC(_0) ASM(jc _0) +#define JMP(_0) ASM(jmp _0) +#define VCOMISS(_0, _1) ASM(vcomiss _1, _0) +#define VCOMISD(_0, _1) ASM(vcomisd _1, _0) +#define VGATHERDPS(_0, _1) ASM(vgatherdps _1, _0) +#define VSCATTERDPS(_0, _1) ASM(vscatterdps _1, _0) +#define VGATHERDPD(_0, _1) ASM(vgatherdpd _1, _0) +#define VSCATTERDPD(_0, _1) ASM(vscatterdpd _1, _0) +#define VGATHERQPS(_0, _1) ASM(vgatherqps _1, _0) +#define VSCATTERQPS(_0, _1) ASM(vscatterqps _1, _0) +#define VGATHERQPD(_0, _1) ASM(vgatherqpd _1, _0) +#define VSCATTERQPD(_0, _1) ASM(vscatterqpd _1, _0) +#define VMULSS(_0, _1, _2) ASM(vmulss _2, _1, _0) +#define VMULSD(_0, _1, _2) ASM(vmulsd _2, _1, _0) +#define VMULPS(_0, _1, _2) ASM(vmulps _2, _1, _0) +#define VMULPD(_0, _1, _2) ASM(vmulpd _2, _1, _0) +#define VPMULLD(_0, _1, _2) ASM(vpmulld _2, _1, _0) +#define VPMULLQ(_0, _1, _2) ASM(vpmullq _2, _1, _0) +#define VPADDD(_0, _1, _2) ASM(vpaddd _2, _1, _0) +#define VPSLLD(_0, _1, _2) ASM(vpslld _2, _1, _0) +#define VPXORD(_0, _1, _2) ASM(vpxord _2, _1, _0) +#define VXORPD(_0, _1, _2) ASM(vxorpd _2, _1, _0) +#define VFMADD132PS(_0, _1, _2) ASM(vfmadd132ps _2, _1, _0) +#define VFMADD213PS(_0, _1, _2) ASM(vfmadd213ps _2, _1, _0) +#define VFMADD231PS(_0, _1, _2) ASM(vfmadd231ps _2, _1, _0) +#define VFMADD132PD(_0, _1, _2) ASM(vfmadd132pd _2, _1, _0) +#define VFMADD213PD(_0, _1, _2) ASM(vfmadd213pd _2, _1, _0) +#define VFMADD231PD(_0, _1, _2) ASM(vfmadd231pd _2, _1, _0) +#define VMOVDQA(_0, _1) ASM(vmovdqa _1, _0) +#define VMOVDQA32(_0, _1) ASM(vmovdqa32 _1, _0) +#define VMOVDQA64(_0, _1) ASM(vmovdqa64 _1, _0) +#define VMOVSS(_0, _1) ASM(vmovss _1, _0) +#define VMOVSD(_0, _1) ASM(vmovsd _1, _0) +#define VMOVAPS(_0, _1) ASM(vmovaps _1, _0) +#define VMOVUPS(_0, _1) ASM(vmovups _1, _0) +#define VMOVAPD(_0, _1) ASM(vmovapd _1, _0) +#define VMOVUPD(_0, _1) ASM(vmovupd _1, _0) +#define VBROADCASTSS(_0, _1) ASM(vbroadcastss _1, _0) +#define VBROADCASTSD(_0, _1) ASM(vbroadcastsd _1, _0) +#define VPBROADCASTD(_0, _1) ASM(vpbroadcastd _1, _0) +#define VPBROADCASTQ(_0, _1) ASM(vpbroadcastq _1, _0) +#define VBROADCASTF64X4(_0, _1) ASM(vbroadcastf64x4 _1, _0) +#define VINSERTF64X4(_0, _1, _2, _3) ASM(vinsertf64x4 _3, _2, _1, _0) +#define 
VEXTRACTF64X4(_0, _1, _2) ASM(vextractf64x4 _2, _1, _0)
+#define VUNPCKLPD(_0, _1, _2) ASM(vunpcklpd _2, _1, _0)
+#define VUNPCKHPD(_0, _1, _2) ASM(vunpckhpd _2, _1, _0)
+#define VSHUFF64X2(_0, _1, _2, _3) ASM(vshuff64x2 _3, _2, _1, _0)
+#define VUNPCKLPS(_0, _1, _2) ASM(vunpcklps _2, _1, _0)
+#define VUNPCKHPS(_0, _1, _2) ASM(vunpckhps _2, _1, _0)
+#define VSHUFPS(_0, _1, _2, _3) ASM(vshufps _3, _2, _1, _0)
+#define VPERM2F128(_0, _1, _2, _3) ASM(vperm2f128 _3, _2, _1, _0)
+#define PREFETCH(LEVEL,ADDRESS) ASM(prefetcht##LEVEL ADDRESS)
+#define PREFETCHW0(ADDRESS) ASM(prefetchw ADDRESS)
+#define PREFETCHW1(ADDRESS) ASM(prefetchwt1 ADDRESS)
+#define VGATHERPFDPS(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dps ADDRESS)
+#define VSCATTERPFDPS(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dps ADDRESS)
+#define VGATHERPFDPD(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dpd ADDRESS)
+#define VSCATTERPFDPD(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dpd ADDRESS)
+#define VZEROUPPER() ASM(vzeroupper)
+
+#endif
diff --git a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c
new file mode 100644
index 000000000..0e705f763
--- /dev/null
+++ b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c
@@ -0,0 +1,547 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name of The University of Texas at Austin nor the names
+      of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <stdint.h>
+
+#include "bli_avx512_macros.h"
+
+#define A_L1_PREFETCH_DIST 4 // should be a multiple of 2
+
+/* The pointer to B is moved ahead by one iteration of k before the
+   loop starts, so a prefetch distance of 4 actually prefetches
+   3 k iterations ahead. */
+#define B_L1_PREFETCH_DIST 4
+
+#define TAIL_NITER 8
+
+#define CACHELINE_SIZE 64 // size of a cache line in bytes
+
+/* During each subiteration, prefetch 2 cache lines of A, UNROLL
+ * factor ahead. 2 cache lines = 16 doubles (MR).
+ * */
+#define PREFETCH_A_L1(n, k) \
+    PREFETCH(0, MEM(RAX, A_L1_PREFETCH_DIST*16*8 + (2*n+k) * CACHELINE_SIZE))
+
+/* Preloading B for the first iteration of the main loop,
+ * for subiter(1), subiter(2), and subiter(3) */ +#define PREFETCH_B_L1_1ITER \ + PREFETCH(0, MEM(RBX )) \ + PREFETCH(0, MEM(RBX, CACHELINE_SIZE)) \ + PREFETCH(0, MEM(RBX, 2*CACHELINE_SIZE)) \ + PREFETCH(0, MEM(RBX, 3*CACHELINE_SIZE)) \ + PREFETCH(0, MEM(RBX, 4*CACHELINE_SIZE)) \ + PREFETCH(0, MEM(RBX, 5*CACHELINE_SIZE)) + +#define LOOP_ALIGN ALIGN16 + +#define UPDATE_C(R1,R2,R3,R4) \ +\ + VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \ + VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \ + VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \ + VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \ + VFMADD231PD(ZMM(R1), ZMM(1), MEM(RCX,0*64)) \ + VFMADD231PD(ZMM(R2), ZMM(1), MEM(RCX,1*64)) \ + VFMADD231PD(ZMM(R3), ZMM(1), MEM(RCX,RAX,1,0*64)) \ + VFMADD231PD(ZMM(R4), ZMM(1), MEM(RCX,RAX,1,1*64)) \ + VMOVUPD(MEM(RCX,0*64), ZMM(R1)) \ + VMOVUPD(MEM(RCX,1*64), ZMM(R2)) \ + VMOVUPD(MEM(RCX,RAX,1,0*64), ZMM(R3)) \ + VMOVUPD(MEM(RCX,RAX,1,1*64), ZMM(R4)) \ + LEA(RCX, MEM(RCX,RAX,2)) + +#define UPDATE_C_BZ(R1,R2,R3,R4) \ +\ + VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \ + VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \ + VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \ + VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \ + VMOVUPD(MEM(RCX,0*64), ZMM(R1)) \ + VMOVUPD(MEM(RCX,1*64), ZMM(R2)) \ + VMOVUPD(MEM(RCX,RAX,1,0*64), ZMM(R3)) \ + VMOVUPD(MEM(RCX,RAX,1,1*64), ZMM(R4)) \ + LEA(RCX, MEM(RCX,RAX,2)) + +#define UPDATE_C_ROW_SCATTERED(R1,R2,R3,R4) \ +\ + KXNORW(K(1), K(0), K(0)) \ + KXNORW(K(2), K(0), K(0)) \ + VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \ + VGATHERQPD(ZMM(6) MASK_K(1), MEM(RCX,ZMM(2),8)) \ + VFMADD231PD(ZMM(R1), ZMM(6), ZMM(1)) \ + VSCATTERQPD(MEM(RCX,ZMM(2),8) MASK_K(2), ZMM(R1)) \ +\ + KXNORW(K(1), K(0), K(0)) \ + KXNORW(K(2), K(0), K(0)) \ + VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \ + VGATHERQPD(ZMM(6) MASK_K(1), MEM(RCX,ZMM(3),8)) \ + VFMADD231PD(ZMM(R2), ZMM(6), ZMM(1)) \ + VSCATTERQPD(MEM(RCX,ZMM(3),8) MASK_K(2), ZMM(R2)) \ +\ + LEA(RCX, MEM(RCX,RAX,1)) \ +\ + KXNORW(K(1), K(0), K(0)) \ + KXNORW(K(2), K(0), K(0)) \ + VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \ + VGATHERQPD(ZMM(6) MASK_K(1), MEM(RCX,ZMM(2),8)) \ + VFMADD231PD(ZMM(R3), ZMM(6), ZMM(1)) \ + VSCATTERQPD(MEM(RCX,ZMM(2),8) MASK_K(2), ZMM(R3)) \ +\ + KXNORW(K(1), K(0), K(0)) \ + KXNORW(K(2), K(0), K(0)) \ + VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \ + VGATHERQPD(ZMM(6) MASK_K(1), MEM(RCX,ZMM(3),8)) \ + VFMADD231PD(ZMM(R4), ZMM(6), ZMM(1)) \ + VSCATTERQPD(MEM(RCX,ZMM(3),8) MASK_K(2), ZMM(R4)) \ +\ + LEA(RCX, MEM(RCX,RAX,1)) + +#define UPDATE_C_BZ_ROW_SCATTERED(R1,R2,R3,R4) \ +\ + KXNORW(K(1), K(0), K(0)) \ + VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \ + VSCATTERQPD(MEM(RCX,ZMM(2),8) MASK_K(1), ZMM(R1)) \ +\ + KXNORW(K(1), K(0), K(0)) \ + VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \ + VSCATTERQPD(MEM(RCX,ZMM(3),8) MASK_K(1), ZMM(R2)) \ +\ + LEA(RCX, MEM(RCX,RAX,1)) \ +\ + KXNORW(K(1), K(0), K(0)) \ + VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \ + VSCATTERQPD(MEM(RCX,ZMM(2),8) MASK_K(1), ZMM(R3)) \ +\ + KXNORW(K(1), K(0), K(0)) \ + VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \ + VSCATTERQPD(MEM(RCX,ZMM(3),8) MASK_K(1), ZMM(R4)) \ +\ + LEA(RCX, MEM(RCX,RAX,1)) + +#ifdef PREFETCH_C_L2 +#undef PREFETCH_C_L2 +#define PREFETCH_C_L2 \ +\ + PREFETCH(1, MEM(RCX, 0*64)) \ + PREFETCH(1, MEM(RCX, 1*64)) \ + \ + PREFETCH(1, MEM(RCX,R12,1,0*64)) \ + PREFETCH(1, MEM(RCX,R12,1,1*64)) \ + \ + PREFETCH(1, MEM(RCX,R12,2,0*64)) \ + PREFETCH(1, MEM(RCX,R12,2,1*64)) \ + \ + PREFETCH(1, MEM(RCX,R13,1,0*64)) \ + PREFETCH(1, MEM(RCX,R13,1,1*64)) \ + \ + PREFETCH(1, MEM(RCX,R12,4,0*64)) \ + PREFETCH(1, MEM(RCX,R12,4,1*64)) \ + \ + PREFETCH(1, MEM(RCX,R14,1,0*64)) \ + PREFETCH(1, MEM(RCX,R14,1,1*64)) \ + \ + PREFETCH(1, 
MEM(RCX,R13,2,0*64)) \ + PREFETCH(1, MEM(RCX,R13,2,1*64)) \ + \ + PREFETCH(1, MEM(RCX,R15,1,0*64)) \ + PREFETCH(1, MEM(RCX,R15,1,1*64)) \ + \ + PREFETCH(1, MEM(RDX, 0*64)) \ + PREFETCH(1, MEM(RDX, 1*64)) \ + \ + PREFETCH(1, MEM(RDX,R12,1,0*64)) \ + PREFETCH(1, MEM(RDX,R12,1,1*64)) \ + \ + PREFETCH(1, MEM(RDX,R12,2,0*64)) \ + PREFETCH(1, MEM(RDX,R12,2,1*64)) \ + \ + PREFETCH(1, MEM(RDX,R13,1,0*64)) \ + PREFETCH(1, MEM(RDX,R13,1,1*64)) + +#else +#undef PREFETCH_C_L2 +#define PREFETCH_C_L2 +#endif + + +#define PREFETCH_C_L1 \ +\ + PREFETCHW0(MEM(RCX, 0*64)) \ + PREFETCHW0(MEM(RCX, 1*64)) \ + PREFETCHW0(MEM(RCX,R12,1,0*64)) \ + PREFETCHW0(MEM(RCX,R12,1,1*64)) \ + PREFETCHW0(MEM(RCX,R12,2,0*64)) \ + PREFETCHW0(MEM(RCX,R12,2,1*64)) \ + PREFETCHW0(MEM(RCX,R13,1,0*64)) \ + PREFETCHW0(MEM(RCX,R13,1,1*64)) \ + PREFETCHW0(MEM(RCX,R12,4,0*64)) \ + PREFETCHW0(MEM(RCX,R12,4,1*64)) \ + PREFETCHW0(MEM(RCX,R14,1,0*64)) \ + PREFETCHW0(MEM(RCX,R14,1,1*64)) \ + PREFETCHW0(MEM(RCX,R13,2,0*64)) \ + PREFETCHW0(MEM(RCX,R13,2,1*64)) \ + PREFETCHW0(MEM(RCX,R15,1,0*64)) \ + PREFETCHW0(MEM(RCX,R15,1,1*64)) \ + PREFETCHW0(MEM(RDX, 0*64)) \ + PREFETCHW0(MEM(RDX, 1*64)) \ + PREFETCHW0(MEM(RDX,R12,1,0*64)) \ + PREFETCHW0(MEM(RDX,R12,1,1*64)) \ + PREFETCHW0(MEM(RDX,R12,2,0*64)) \ + PREFETCHW0(MEM(RDX,R12,2,1*64)) \ + PREFETCHW0(MEM(RDX,R13,1,0*64)) \ + PREFETCHW0(MEM(RDX,R13,1,1*64)) + +// +// n: index in unrolled loop +// +// a: ZMM register to load into +// b: ZMM register to read from +// +// ...: addressing for A, except for offset +// +#define SUBITER(n) \ +\ + PREFETCH_A_L1(n, 0) \ + \ + VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+ 0)*8)) \ + VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+ 1)*8)) \ + VFMADD231PD(ZMM( 8), ZMM(0), ZMM(3)) \ + VFMADD231PD(ZMM( 9), ZMM(1), ZMM(3)) \ + VFMADD231PD(ZMM(10), ZMM(0), ZMM(4)) \ + VFMADD231PD(ZMM(11), ZMM(1), ZMM(4)) \ + \ + VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+ 2)*8)) \ + VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+ 3)*8)) \ + VFMADD231PD(ZMM(12), ZMM(0), ZMM(3)) \ + VFMADD231PD(ZMM(13), ZMM(1), ZMM(3)) \ + VFMADD231PD(ZMM(14), ZMM(0), ZMM(4)) \ + VFMADD231PD(ZMM(15), ZMM(1), ZMM(4)) \ + \ + VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+ 4)*8)) \ + VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+ 5)*8)) \ + VFMADD231PD(ZMM(16), ZMM(0), ZMM(3)) \ + VFMADD231PD(ZMM(17), ZMM(1), ZMM(3)) \ + VFMADD231PD(ZMM(18), ZMM(0), ZMM(4)) \ + VFMADD231PD(ZMM(19), ZMM(1), ZMM(4)) \ + \ + PREFETCH_A_L1(n, 1) \ + \ + VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+ 6)*8)) \ + VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+ 7)*8)) \ + VFMADD231PD(ZMM(20), ZMM(0), ZMM(3)) \ + VFMADD231PD(ZMM(21), ZMM(1), ZMM(3)) \ + VFMADD231PD(ZMM(22), ZMM(0), ZMM(4)) \ + VFMADD231PD(ZMM(23), ZMM(1), ZMM(4)) \ + \ + VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+ 8)*8)) \ + VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+ 9)*8)) \ + VFMADD231PD(ZMM(24), ZMM(0), ZMM(3)) \ + VFMADD231PD(ZMM(25), ZMM(1), ZMM(3)) \ + VFMADD231PD(ZMM(26), ZMM(0), ZMM(4)) \ + VFMADD231PD(ZMM(27), ZMM(1), ZMM(4)) \ + \ + VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+10)*8)) \ + VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+11)*8)) \ + VFMADD231PD(ZMM(28), ZMM(0), ZMM(3)) \ + VFMADD231PD(ZMM(29), ZMM(1), ZMM(3)) \ + VFMADD231PD(ZMM(30), ZMM(0), ZMM(4)) \ + VFMADD231PD(ZMM(31), ZMM(1), ZMM(4)) \ + \ + VMOVAPD(ZMM(0), MEM(RAX,(16*n+0)*8)) \ + VMOVAPD(ZMM(1), MEM(RAX,(16*n+8)*8)) + +//This is an array used for the scatter/gather instructions. 
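+// The update code loads offsets 0-7 into zmm2 and 8-15 into zmm3,
+// broadcasts rs_c, and multiplies the two element-wise, so that a
+// gather such as VGATHERQPD(ZMM(6) MASK_K(1), MEM(RCX,ZMM(2),8))
+// fetches c[i*rs_c] for i = 0..7 (the scale of 8 is sizeof(double)).
+// This is what lets the kernel update a general-stride C one column
+// of the 16x12 microtile at a time.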
+static int64_t offsets[16] __attribute__((aligned(64))) = + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}; + + +void bli_dgemm_skx_asm_16x12_l2( + dim_t k_, + double* restrict alpha, + double* restrict a, + double* restrict b, + double* restrict beta, + double* restrict c, inc_t rs_c_, inc_t cs_c_, + auxinfo_t* data, + cntx_t* restrict cntx + ) +{ + (void)data; + (void)cntx; + + const int64_t* offsetPtr = &offsets[0]; + const int64_t k = k_; + const int64_t rs_c = rs_c_; + const int64_t cs_c = cs_c_; + + __asm__ volatile + ( + + VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers + VMOVAPD(YMM( 7), YMM(8)) + VMOVAPD(YMM( 9), YMM(8)) + VMOVAPD(YMM(10), YMM(8)) MOV(RSI, VAR(k)) //loop index + VMOVAPD(YMM(11), YMM(8)) MOV(RAX, VAR(a)) //load address of a + VMOVAPD(YMM(12), YMM(8)) MOV(RBX, VAR(b)) //load address of b + VMOVAPD(YMM(13), YMM(8)) MOV(RCX, VAR(c)) //load address of c + VMOVAPD(YMM(14), YMM(8)) + VMOVAPD(YMM(15), YMM(8)) VMOVAPD(ZMM(0), MEM(RAX, 0*8)) //pre-load a + VMOVAPD(YMM(16), YMM(8)) VMOVAPD(ZMM(1), MEM(RAX, 8*8)) //pre-load a + VMOVAPD(YMM(17), YMM(8)) + VMOVAPD(YMM(18), YMM(8)) + VMOVAPD(YMM(19), YMM(8)) MOV(R12, VAR(cs_c)) //cs_c + VMOVAPD(YMM(20), YMM(8)) LEA(R13, MEM(R12,R12,2)) //*3 + VMOVAPD(YMM(21), YMM(8)) LEA(R14, MEM(R12,R12,4)) //*5 + VMOVAPD(YMM(22), YMM(8)) LEA(R15, MEM(R14,R12,2)) //*7 + VMOVAPD(YMM(23), YMM(8)) LEA(RDX, MEM(RCX,R12,8)) //c + 8*cs_c + VMOVAPD(YMM(24), YMM(8)) + VMOVAPD(YMM(25), YMM(8)) MOV(R8, IMM(16*8)) //mr*sizeof(double) + VMOVAPD(YMM(26), YMM(8)) MOV(R9, IMM(12*8)) //nr*sizeof(double) + VMOVAPD(YMM(27), YMM(8)) + VMOVAPD(YMM(28), YMM(8)) LEA(RAX, MEM(RAX,R8,1)) //adjust a for pre-load + VMOVAPD(YMM(29), YMM(8)) + VMOVAPD(YMM(30), YMM(8)) + VMOVAPD(YMM(31), YMM(8)) + + TEST(RSI, RSI) + JZ(POSTACCUM) + +#ifdef PREFETCH_A_BEFORE + PREFETCH(0, MEM(RAX,0*64)) + PREFETCH(0, MEM(RAX,1*64)) + PREFETCH(0, MEM(RAX,2*64)) + PREFETCH(0, MEM(RAX,3*64)) + PREFETCH(0, MEM(RAX,4*64)) + PREFETCH(0, MEM(RAX,5*64)) + PREFETCH(0, MEM(RAX,6*64)) + PREFETCH(0, MEM(RAX,7*64)) + +#endif + +#ifdef PREFETCH_B_BEFORE + PREFETCH(0, MEM(RBX,0*64)) + PREFETCH(0, MEM(RBX,1*64)) + PREFETCH(0, MEM(RBX,2*64)) + PREFETCH(0, MEM(RBX,3*64)) + PREFETCH(0, MEM(RBX,4*64)) + PREFETCH(0, MEM(RBX,5*64)) +#endif + + PREFETCH_C_L2 + + MOV(RDI, RSI) + AND(RSI, IMM(3)) + SAR(RDI, IMM(2)) + + SUB(RDI, IMM(0+TAIL_NITER)) + JLE(K_SMALL) + + LOOP_ALIGN + LABEL(MAIN_LOOP) + + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8)) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+64)) + SUBITER(0) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+128)) + SUBITER(1) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+192)) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+256)) + SUBITER(2) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+320)) + SUBITER(3) + + LEA(RAX, MEM(RAX,R8,4)) + LEA(RBX, MEM(RBX,R9,4)) + + DEC(RDI) + + JNZ(MAIN_LOOP) + + LABEL(K_SMALL) + + PREFETCH_C_L1 + + ADD(RDI, IMM(0+TAIL_NITER)) + JZ(TAIL_LOOP) + + LOOP_ALIGN + LABEL(SMALL_LOOP) + + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8)) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+64)) + SUBITER(0) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+128)) + SUBITER(1) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+192)) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+256)) + SUBITER(2) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+320)) + SUBITER(3) + + LEA(RAX, MEM(RAX,R8,4)) + LEA(RBX, MEM(RBX,R9,4)) + + DEC(RDI) + + JNZ(SMALL_LOOP) + + TEST(RSI, RSI) + JZ(POSTACCUM) + + LOOP_ALIGN + LABEL(TAIL_LOOP) + + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8)) 
+ PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+64)) + SUBITER(0) + + ADD(RAX, R8) + ADD(RBX, R9) + + DEC(RSI) + + JNZ(TAIL_LOOP) + + LABEL(POSTACCUM) + +#ifdef PREFETCH_A_AFTER + MOV(R8, VAR(a)) + PREFETCH(0, MEM(R8,0*64)) + PREFETCH(0, MEM(R8,1*64)) + PREFETCH(0, MEM(R8,2*64)) + PREFETCH(0, MEM(R8,3*64)) + PREFETCH(0, MEM(R8,4*64)) + PREFETCH(0, MEM(R8,5*64)) + PREFETCH(0, MEM(R8,6*64)) + PREFETCH(0, MEM(R8,7*64)) +#endif + +#ifdef PREFETCH_B_AFTER + MOV(R9, VAR(b)) + PREFETCH(0, MEM(R9,0*64)) + PREFETCH(0, MEM(R9,1*64)) + PREFETCH(0, MEM(R9,2*64)) + PREFETCH(0, MEM(R9,3*64)) + PREFETCH(0, MEM(R9,4*64)) + PREFETCH(0, MEM(R9,5*64)) +#endif + + MOV(RAX, VAR(alpha)) + MOV(RBX, VAR(beta)) + VBROADCASTSD(ZMM(0), MEM(RAX)) + VBROADCASTSD(ZMM(1), MEM(RBX)) + + MOV(RAX, VAR(cs_c)) + LEA(RAX, MEM(,RAX,8)) + MOV(RBX, VAR(rs_c)) + + // Check if C is column stride. If not, jump to the slow scattered update + CMP(RBX, IMM(1)) + JNE(SCATTEREDUPDATE) + + VCOMISD(XMM(1), XMM(7)) + JE(COLSTORBZ) + + UPDATE_C( 8, 9,10,11) + UPDATE_C(12,13,14,15) + UPDATE_C(16,17,18,19) + UPDATE_C(20,21,22,23) + UPDATE_C(24,25,26,27) + UPDATE_C(28,29,30,31) + + JMP(END) + LABEL(COLSTORBZ) + + UPDATE_C_BZ( 8, 9,10,11) + UPDATE_C_BZ(12,13,14,15) + UPDATE_C_BZ(16,17,18,19) + UPDATE_C_BZ(20,21,22,23) + UPDATE_C_BZ(24,25,26,27) + UPDATE_C_BZ(28,29,30,31) + + JMP(END) + LABEL(SCATTEREDUPDATE) + + MOV(RDI, VAR(offsetPtr)) + VMOVDQA64(ZMM(2), MEM(RDI,0*64)) + VMOVDQA64(ZMM(3), MEM(RDI,1*64)) + VPBROADCASTQ(ZMM(6), RBX) + VPMULLQ(ZMM(2), ZMM(6), ZMM(2)) + VPMULLQ(ZMM(3), ZMM(6), ZMM(3)) + + VCOMISD(XMM(1), XMM(7)) + JE(SCATTERBZ) + + UPDATE_C_ROW_SCATTERED( 8, 9,10,11) + UPDATE_C_ROW_SCATTERED(12,13,14,15) + UPDATE_C_ROW_SCATTERED(16,17,18,19) + UPDATE_C_ROW_SCATTERED(20,21,22,23) + UPDATE_C_ROW_SCATTERED(24,25,26,27) + UPDATE_C_ROW_SCATTERED(28,29,30,31) + + JMP(END) + LABEL(SCATTERBZ) + + UPDATE_C_BZ_ROW_SCATTERED( 8, 9,10,11) + UPDATE_C_BZ_ROW_SCATTERED(12,13,14,15) + UPDATE_C_BZ_ROW_SCATTERED(16,17,18,19) + UPDATE_C_BZ_ROW_SCATTERED(20,21,22,23) + UPDATE_C_BZ_ROW_SCATTERED(24,25,26,27) + UPDATE_C_BZ_ROW_SCATTERED(28,29,30,31) + + LABEL(END) + + VZEROUPPER() + + : // output operands + : // input operands + [k] "m" (k), + [a] "m" (a), + [b] "m" (b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [offsetPtr] "m" (offsetPtr) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", + "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", + "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", + "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", + "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", + "zmm30", "zmm31", "memory" + ); +} diff --git a/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c new file mode 100644 index 000000000..a69b92086 --- /dev/null +++ b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c @@ -0,0 +1,572 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY + OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#include "bli_avx512_macros.h" + +#define CACHELINE_SIZE 64 //size of cache line in bytes + +#define A_L1_PREFETCH_DIST 4 //should be multiple of 2 + +/*The pointer of B is moved ahead by one iteration of k +before the loop starts.Therefore, prefetching 3 k iterations +ahead*/ +#define B_L1_PREFETCH_DIST 4 + +#define TAIL_NITER 8 + + +/* During each subiteration, prefetching 2 cache lines of B + * UNROLL factor ahead. 2cache lines = 32 floats (NR). + * */ +#define PREFETCH_A_L1(n, k) \ + PREFETCH(0, MEM(RAX, A_L1_PREFETCH_DIST*32*4 + (2*n+k) * CACHELINE_SIZE)) + +#define LOOP_ALIGN ALIGN16 + +#define UPDATE_C(R1,R2,R3,R4) \ +\ + VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \ + VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \ + VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \ + VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \ + VFMADD231PS(ZMM(R1), ZMM(1), MEM(RCX,0*64)) \ + VFMADD231PS(ZMM(R2), ZMM(1), MEM(RCX,1*64)) \ + VFMADD231PS(ZMM(R3), ZMM(1), MEM(RCX,RAX,1,0*64)) \ + VFMADD231PS(ZMM(R4), ZMM(1), MEM(RCX,RAX,1,1*64)) \ + VMOVUPS(MEM(RCX,0*64), ZMM(R1)) \ + VMOVUPS(MEM(RCX,1*64), ZMM(R2)) \ + VMOVUPS(MEM(RCX,RAX,1,0*64), ZMM(R3)) \ + VMOVUPS(MEM(RCX,RAX,1,1*64), ZMM(R4)) \ + LEA(RCX, MEM(RCX,RAX,2)) + +#define UPDATE_C_BZ(R1,R2,R3,R4) \ +\ + VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \ + VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \ + VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \ + VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \ + VMOVUPS(MEM(RCX,0*64), ZMM(R1)) \ + VMOVUPS(MEM(RCX,1*64), ZMM(R2)) \ + VMOVUPS(MEM(RCX,RAX,1,0*64), ZMM(R3)) \ + VMOVUPS(MEM(RCX,RAX,1,1*64), ZMM(R4)) \ + LEA(RCX, MEM(RCX,RAX,2)) + +#define UPDATE_C_ROW_SCATTERED(R1,R2,R3,R4) \ +\ + KXNORW(K(1), K(0), K(0)) \ + KXNORW(K(2), K(0), K(0)) \ + KXNORW(K(3), K(0), K(0)) \ + KXNORW(K(4), K(0), K(0)) \ + VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \ + VEXTRACTF64X4(YMM(5), ZMM(R1), IMM(1)) \ + VGATHERQPS(YMM(6) MASK_K(1), MEM(RCX,ZMM(2),8)) \ + VGATHERQPS(YMM(7) MASK_K(2), MEM(RCX,ZMM(3),8)) \ + VFMADD231PS(YMM(R1), YMM(6), YMM(1)) \ + VFMADD231PS(YMM( 5), YMM(7), YMM(1)) \ + VSCATTERQPS(MEM(RCX,ZMM(2),8) MASK_K(3), YMM(R1)) \ + VSCATTERQPS(MEM(RCX,ZMM(3),8) MASK_K(4), YMM( 5)) \ +\ + KXNORW(K(1), K(0), K(0)) \ + KXNORW(K(2), K(0), K(0)) \ + KXNORW(K(3), K(0), K(0)) \ + KXNORW(K(4), K(0), K(0)) \ + VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \ + VEXTRACTF64X4(YMM(5), ZMM(R2), IMM(1)) \ + VGATHERQPS(YMM(6) MASK_K(1), 
MEM(RDX,ZMM(2),8)) \ + VGATHERQPS(YMM(7) MASK_K(2), MEM(RDX,ZMM(3),8)) \ + VFMADD231PS(YMM(R2), YMM(6), YMM(1)) \ + VFMADD231PS(YMM( 5), YMM(7), YMM(1)) \ + VSCATTERQPS(MEM(RDX,ZMM(2),8) MASK_K(3), YMM(R2)) \ + VSCATTERQPS(MEM(RDX,ZMM(3),8) MASK_K(4), YMM( 5)) \ +\ + LEA(RCX, MEM(RCX,RAX,1)) \ + LEA(RDX, MEM(RDX,RAX,1)) \ +\ + KXNORW(K(1), K(0), K(0)) \ + KXNORW(K(2), K(0), K(0)) \ + KXNORW(K(3), K(0), K(0)) \ + KXNORW(K(4), K(0), K(0)) \ + VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \ + VEXTRACTF64X4(YMM(5), ZMM(R3), IMM(1)) \ + VGATHERQPS(YMM(6) MASK_K(1), MEM(RCX,ZMM(2),8)) \ + VGATHERQPS(YMM(7) MASK_K(2), MEM(RCX,ZMM(3),8)) \ + VFMADD231PS(YMM(R3), YMM(6), YMM(1)) \ + VFMADD231PS(YMM( 5), YMM(7), YMM(1)) \ + VSCATTERQPS(MEM(RCX,ZMM(2),8) MASK_K(3), YMM(R3)) \ + VSCATTERQPS(MEM(RCX,ZMM(3),8) MASK_K(4), YMM( 5)) \ +\ + KXNORW(K(1), K(0), K(0)) \ + KXNORW(K(2), K(0), K(0)) \ + KXNORW(K(3), K(0), K(0)) \ + KXNORW(K(4), K(0), K(0)) \ + VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \ + VEXTRACTF64X4(YMM(5), ZMM(R4), IMM(1)) \ + VGATHERQPS(YMM(6) MASK_K(1), MEM(RDX,ZMM(2),8)) \ + VGATHERQPS(YMM(7) MASK_K(2), MEM(RDX,ZMM(3),8)) \ + VFMADD231PS(YMM(R4), YMM(6), YMM(1)) \ + VFMADD231PS(YMM( 5), YMM(7), YMM(1)) \ + VSCATTERQPS(MEM(RDX,ZMM(2),8) MASK_K(3), YMM(R4)) \ + VSCATTERQPS(MEM(RDX,ZMM(3),8) MASK_K(4), YMM( 5)) \ +\ + LEA(RCX, MEM(RCX,RAX,1)) \ + LEA(RDX, MEM(RDX,RAX,1)) + +#define UPDATE_C_BZ_ROW_SCATTERED(R1,R2,R3,R4) \ +\ + KXNORW(K(1), K(0), K(0)) \ + KXNORW(K(2), K(0), K(0)) \ + VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \ + VEXTRACTF64X4(YMM(5), ZMM(R1), IMM(1)) \ + VSCATTERQPS(MEM(RCX,ZMM(2),8) MASK_K(1), YMM(R1)) \ + VSCATTERQPS(MEM(RCX,ZMM(3),8) MASK_K(2), YMM( 5)) \ +\ + KXNORW(K(1), K(0), K(0)) \ + KXNORW(K(2), K(0), K(0)) \ + VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \ + VEXTRACTF64X4(YMM(5), ZMM(R2), IMM(1)) \ + VSCATTERQPS(MEM(RDX,ZMM(2),8) MASK_K(1), YMM(R2)) \ + VSCATTERQPS(MEM(RDX,ZMM(3),8) MASK_K(2), YMM( 5)) \ +\ + LEA(RCX, MEM(RCX,RAX,1)) \ + LEA(RDX, MEM(RDX,RAX,1)) \ +\ + KXNORW(K(1), K(0), K(0)) \ + KXNORW(K(2), K(0), K(0)) \ + VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \ + VEXTRACTF64X4(YMM(5), ZMM(R3), IMM(1)) \ + VSCATTERQPS(MEM(RCX,ZMM(2),8) MASK_K(1), YMM(R3)) \ + VSCATTERQPS(MEM(RCX,ZMM(3),8) MASK_K(2), YMM( 5)) \ +\ + KXNORW(K(1), K(0), K(0)) \ + KXNORW(K(2), K(0), K(0)) \ + VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \ + VEXTRACTF64X4(YMM(5), ZMM(R4), IMM(1)) \ + VSCATTERQPS(MEM(RDX,ZMM(2),8) MASK_K(1), YMM(R4)) \ + VSCATTERQPS(MEM(RDX,ZMM(3),8) MASK_K(2), YMM( 5)) \ +\ + LEA(RCX, MEM(RCX,RAX,1)) \ + LEA(RDX, MEM(RDX,RAX,1)) + +#ifdef PREFETCH_C_L2 +#undef PREFETCH_C_L2 +#define PREFETCH_C_L2 \ +\ + PREFETCH(1, MEM(RCX, 0*64)) \ + PREFETCH(1, MEM(RCX, 1*64)) \ + \ + PREFETCH(1, MEM(RCX,R12,1,0*64)) \ + PREFETCH(1, MEM(RCX,R12,1,1*64)) \ + \ + PREFETCH(1, MEM(RCX,R12,2,0*64)) \ + PREFETCH(1, MEM(RCX,R12,2,1*64)) \ + \ + PREFETCH(1, MEM(RCX,R13,1,0*64)) \ + PREFETCH(1, MEM(RCX,R13,1,1*64)) \ + \ + PREFETCH(1, MEM(RCX,R12,4,0*64)) \ + PREFETCH(1, MEM(RCX,R12,4,1*64)) \ + \ + PREFETCH(1, MEM(RCX,R14,1,0*64)) \ + PREFETCH(1, MEM(RCX,R14,1,1*64)) \ + \ + PREFETCH(1, MEM(RCX,R13,2,0*64)) \ + PREFETCH(1, MEM(RCX,R13,2,1*64)) \ + \ + PREFETCH(1, MEM(RCX,R15,1,0*64)) \ + PREFETCH(1, MEM(RCX,R15,1,1*64)) \ + \ + PREFETCH(1, MEM(RDX, 0*64)) \ + PREFETCH(1, MEM(RDX, 1*64)) \ + \ + PREFETCH(1, MEM(RDX,R12,1,0*64)) \ + PREFETCH(1, MEM(RDX,R12,1,1*64)) \ + \ + PREFETCH(1, MEM(RDX,R12,2,0*64)) \ + PREFETCH(1, MEM(RDX,R12,2,1*64)) \ + \ + PREFETCH(1, MEM(RDX,R13,1,0*64)) \ + PREFETCH(1, MEM(RDX,R13,1,1*64)) + +#else +#undef PREFETCH_C_L2 
+#define PREFETCH_C_L2
+#endif
+
+
+#define PREFETCH_C_L1 \
+\
+    PREFETCHW0(MEM(RCX,      0*64)) \
+    PREFETCHW0(MEM(RCX,      1*64)) \
+    PREFETCHW0(MEM(RCX,R12,1,0*64)) \
+    PREFETCHW0(MEM(RCX,R12,1,1*64)) \
+    PREFETCHW0(MEM(RCX,R12,2,0*64)) \
+    PREFETCHW0(MEM(RCX,R12,2,1*64)) \
+    PREFETCHW0(MEM(RCX,R13,1,0*64)) \
+    PREFETCHW0(MEM(RCX,R13,1,1*64)) \
+    PREFETCHW0(MEM(RCX,R12,4,0*64)) \
+    PREFETCHW0(MEM(RCX,R12,4,1*64)) \
+    PREFETCHW0(MEM(RCX,R14,1,0*64)) \
+    PREFETCHW0(MEM(RCX,R14,1,1*64)) \
+    PREFETCHW0(MEM(RCX,R13,2,0*64)) \
+    PREFETCHW0(MEM(RCX,R13,2,1*64)) \
+    PREFETCHW0(MEM(RCX,R15,1,0*64)) \
+    PREFETCHW0(MEM(RCX,R15,1,1*64)) \
+    PREFETCHW0(MEM(RDX,      0*64)) \
+    PREFETCHW0(MEM(RDX,      1*64)) \
+    PREFETCHW0(MEM(RDX,R12,1,0*64)) \
+    PREFETCHW0(MEM(RDX,R12,1,1*64)) \
+    PREFETCHW0(MEM(RDX,R12,2,0*64)) \
+    PREFETCHW0(MEM(RDX,R12,2,1*64)) \
+    PREFETCHW0(MEM(RDX,R13,1,0*64)) \
+    PREFETCHW0(MEM(RDX,R13,1,1*64))
+
+//
+// n: index in the unrolled loop (0..3)
+//
+// Each subiteration broadcasts two elements of B at a time into
+// zmm3/zmm4, accumulates them against the two preloaded A vectors
+// (zmm0/zmm1), and then preloads A for the next subiteration. Note
+// that B elements are floats here, so offsets scale by 4 bytes.
+//
+#define SUBITER(n) \
+\
+    PREFETCH_A_L1(n, 0) \
+    \
+    VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+ 0)*4)) \
+    VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+ 1)*4)) \
+    VFMADD231PS(ZMM( 8), ZMM(0), ZMM(3)) \
+    VFMADD231PS(ZMM( 9), ZMM(1), ZMM(3)) \
+    VFMADD231PS(ZMM(10), ZMM(0), ZMM(4)) \
+    VFMADD231PS(ZMM(11), ZMM(1), ZMM(4)) \
+    \
+    VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+ 2)*4)) \
+    VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+ 3)*4)) \
+    VFMADD231PS(ZMM(12), ZMM(0), ZMM(3)) \
+    VFMADD231PS(ZMM(13), ZMM(1), ZMM(3)) \
+    VFMADD231PS(ZMM(14), ZMM(0), ZMM(4)) \
+    VFMADD231PS(ZMM(15), ZMM(1), ZMM(4)) \
+    \
+    VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+ 4)*4)) \
+    VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+ 5)*4)) \
+    VFMADD231PS(ZMM(16), ZMM(0), ZMM(3)) \
+    VFMADD231PS(ZMM(17), ZMM(1), ZMM(3)) \
+    VFMADD231PS(ZMM(18), ZMM(0), ZMM(4)) \
+    VFMADD231PS(ZMM(19), ZMM(1), ZMM(4)) \
+    \
+    PREFETCH_A_L1(n, 1) \
+    \
+    VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+ 6)*4)) \
+    VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+ 7)*4)) \
+    VFMADD231PS(ZMM(20), ZMM(0), ZMM(3)) \
+    VFMADD231PS(ZMM(21), ZMM(1), ZMM(3)) \
+    VFMADD231PS(ZMM(22), ZMM(0), ZMM(4)) \
+    VFMADD231PS(ZMM(23), ZMM(1), ZMM(4)) \
+    \
+    VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+ 8)*4)) \
+    VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+ 9)*4)) \
+    VFMADD231PS(ZMM(24), ZMM(0), ZMM(3)) \
+    VFMADD231PS(ZMM(25), ZMM(1), ZMM(3)) \
+    VFMADD231PS(ZMM(26), ZMM(0), ZMM(4)) \
+    VFMADD231PS(ZMM(27), ZMM(1), ZMM(4)) \
+    \
+    VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+10)*4)) \
+    VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+11)*4)) \
+    VFMADD231PS(ZMM(28), ZMM(0), ZMM(3)) \
+    VFMADD231PS(ZMM(29), ZMM(1), ZMM(3)) \
+    VFMADD231PS(ZMM(30), ZMM(0), ZMM(4)) \
+    VFMADD231PS(ZMM(31), ZMM(1), ZMM(4)) \
+    \
+    VMOVAPD(ZMM(0), MEM(RAX,(32*n+ 0)*4)) \
+    VMOVAPD(ZMM(1), MEM(RAX,(32*n+16)*4))
+
+//This is an array used for the scatter/gather instructions.
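+// In the sgemm update, each 16-float ZMM accumulator is split into two
+// YMM halves (VEXTRACTF64X4) so that VGATHERQPS/VSCATTERQPS can move
+// eight floats per instruction using the qword offsets in zmm2/zmm3
+// scaled by the row stride of C; a general-stride C is thus updated
+// one 32-float microtile column at a time, four gathers per column.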
+static int64_t offsets[16] __attribute__((aligned(64))) = + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}; + +void bli_sgemm_skx_asm_32x12_l2( + dim_t k_, + float* restrict alpha, + float* restrict a, + float* restrict b, + float* restrict beta, + float* restrict c, inc_t rs_c_, inc_t cs_c_, + auxinfo_t* data, + cntx_t* restrict cntx + ) +{ + (void)data; + (void)cntx; + + const int64_t* offsetPtr = &offsets[0]; + const int64_t k = k_; + const int64_t rs_c = rs_c_; + const int64_t cs_c = cs_c_; + + __asm__ volatile + ( + + VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers + VMOVAPD(YMM( 7), YMM(8)) + VMOVAPD(YMM( 9), YMM(8)) + VMOVAPD(YMM(10), YMM(8)) MOV(RSI, VAR(k)) //loop index + VMOVAPD(YMM(11), YMM(8)) MOV(RAX, VAR(a)) //load address of a + VMOVAPD(YMM(12), YMM(8)) MOV(RBX, VAR(b)) //load address of b + VMOVAPD(YMM(13), YMM(8)) MOV(RCX, VAR(c)) //load address of c + VMOVAPD(YMM(14), YMM(8)) + VMOVAPD(YMM(15), YMM(8)) VMOVAPD(ZMM(0), MEM(RAX, 0*4)) //pre-load a + VMOVAPD(YMM(16), YMM(8)) VMOVAPD(ZMM(1), MEM(RAX, 16*4)) //pre-load a + VMOVAPD(YMM(17), YMM(8)) + VMOVAPD(YMM(18), YMM(8)) + VMOVAPD(YMM(19), YMM(8)) MOV(R12, VAR(cs_c)) //cs_c + VMOVAPD(YMM(20), YMM(8)) LEA(R13, MEM(R12,R12,2)) //*3 + VMOVAPD(YMM(21), YMM(8)) LEA(R14, MEM(R12,R12,4)) //*5 + VMOVAPD(YMM(22), YMM(8)) LEA(R15, MEM(R14,R12,2)) //*7 + VMOVAPD(YMM(23), YMM(8)) LEA(RDX, MEM(RCX,R12,8)) //c + 8*cs_c + VMOVAPD(YMM(24), YMM(8)) + VMOVAPD(YMM(25), YMM(8)) MOV(R8, IMM(32*4)) //mr*sizeof(float) + VMOVAPD(YMM(26), YMM(8)) MOV(R9, IMM(12*4)) //nr*sizeof(float) + VMOVAPD(YMM(27), YMM(8)) + VMOVAPD(YMM(28), YMM(8)) LEA(RAX, MEM(RAX,R8,1)) //adjust a for pre-load + VMOVAPD(YMM(29), YMM(8)) + VMOVAPD(YMM(30), YMM(8)) + VMOVAPD(YMM(31), YMM(8)) + + TEST(RSI, RSI) + JZ(POSTACCUM) + +#ifdef PREFETCH_A_BEFORE + /* Prefetching 8 cachlines of A (4 iterations worth of data + (32 (MR) x4 (sizeof(float)) x4 iter /64 = 8 cachelines) */ + PREFETCH(0, MEM(RAX,0*64)) + PREFETCH(0, MEM(RAX,1*64)) + PREFETCH(0, MEM(RAX,2*64)) + PREFETCH(0, MEM(RAX,3*64)) + PREFETCH(0, MEM(RAX,4*64)) + PREFETCH(0, MEM(RAX,5*64)) + PREFETCH(0, MEM(RAX,6*64)) + PREFETCH(0, MEM(RAX,7*64)) +#endif + +#ifdef PREFETCH_B_BEFORE + /* Prefetching 3 cachlines of B (4 iterations worth of data + (12 (NR) x 4 (sizeof(float)) x 4 iter /64 = 3 cachelines) */ + PREFETCH(0, MEM(RBX,0*64)) + PREFETCH(0, MEM(RBX,1*64)) + PREFETCH(0, MEM(RBX,2*64)) +#endif + + PREFETCH_C_L2 + + MOV(RDI, RSI) + AND(RSI, IMM(3)) + SAR(RDI, IMM(2)) + + SUB(RDI, IMM(0+TAIL_NITER)) + JLE(K_SMALL) + + LOOP_ALIGN + LABEL(MAIN_LOOP) + + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4)) + SUBITER(0) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4+64)) + SUBITER(1) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4+128)) + SUBITER(2) + SUBITER(3) + + LEA(RAX, MEM(RAX,R8,4)) + LEA(RBX, MEM(RBX,R9,4)) + + DEC(RDI) + + JNZ(MAIN_LOOP) + + LABEL(K_SMALL) + + PREFETCH_C_L1 + + ADD(RDI, IMM(0+TAIL_NITER)) + JZ(TAIL_LOOP) + + LOOP_ALIGN + LABEL(SMALL_LOOP) + + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4)) + SUBITER(0) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4+64)) + SUBITER(1) + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4+128)) + SUBITER(2) + SUBITER(3) + + LEA(RAX, MEM(RAX,R8,4)) + LEA(RBX, MEM(RBX,R9,4)) + + DEC(RDI) + + JNZ(SMALL_LOOP) + + TEST(RSI, RSI) + JZ(POSTACCUM) + + LOOP_ALIGN + LABEL(TAIL_LOOP) + + PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4)) + SUBITER(0) + + ADD(RAX, R8) + ADD(RBX, R9) + + DEC(RSI) + + JNZ(TAIL_LOOP) + + + LABEL(POSTACCUM) + +#ifdef PREFETCH_A_AFTER + MOV(R8, VAR(a)) + PREFETCH(0, 
MEM(R8,0*64)) + PREFETCH(0, MEM(R8,1*64)) + PREFETCH(0, MEM(R8,2*64)) + PREFETCH(0, MEM(R8,3*64)) + PREFETCH(0, MEM(R8,4*64)) + PREFETCH(0, MEM(R8,5*64)) + PREFETCH(0, MEM(R8,6*64)) + PREFETCH(0, MEM(R8,7*64)) +#endif + +#ifdef PREFETCH_B_AFTER + MOV(R9, VAR(b)) + PREFETCH(0, MEM(R9,0*64)) + PREFETCH(0, MEM(R9,1*64)) + PREFETCH(0, MEM(R9,2*64)) +#endif + + MOV(RAX, VAR(alpha)) + MOV(RBX, VAR(beta)) + VBROADCASTSS(ZMM(0), MEM(RAX)) + VBROADCASTSS(ZMM(1), MEM(RBX)) + + MOV(RAX, VAR(cs_c)) + LEA(RAX, MEM(,RAX,4)) + MOV(RBX, VAR(rs_c)) + LEA(RBX, MEM(,RBX,4)) + + + // Check if C is column major (rs_c = 1). If not, jump to the slow scattered update + CMP(RBX, IMM(4)) + JNE(SCATTEREDUPDATE) + + VCOMISD(XMM(1), XMM(7)) + JE(COLSTORBZ) + + UPDATE_C( 8, 9,10,11) + UPDATE_C(12,13,14,15) + UPDATE_C(16,17,18,19) + UPDATE_C(20,21,22,23) + UPDATE_C(24,25,26,27) + UPDATE_C(28,29,30,31) + + JMP(END) + LABEL(COLSTORBZ) + + UPDATE_C_BZ( 8, 9,10,11) + UPDATE_C_BZ(12,13,14,15) + UPDATE_C_BZ(16,17,18,19) + UPDATE_C_BZ(20,21,22,23) + UPDATE_C_BZ(24,25,26,27) + UPDATE_C_BZ(28,29,30,31) + + JMP(END) + LABEL(SCATTEREDUPDATE) + + LEA(RDX, MEM(RCX,RBX,8)) + LEA(RDX, MEM(RDX,RBX,8)) + + MOV(RDI, VAR(offsetPtr)) + VMOVDQA64(ZMM(2), MEM(RDI,0*64)) + VMOVDQA64(ZMM(3), MEM(RDI,1*64)) + VPBROADCASTQ(ZMM(6), RBX) + VPMULLQ(ZMM(2), ZMM(6), ZMM(2)) + VPMULLQ(ZMM(3), ZMM(6), ZMM(3)) + + VCOMISD(XMM(1), XMM(7)) + JE(SCATTERBZ) + + UPDATE_C_ROW_SCATTERED( 8, 9,10,11) + UPDATE_C_ROW_SCATTERED(12,13,14,15) + UPDATE_C_ROW_SCATTERED(16,17,18,19) + UPDATE_C_ROW_SCATTERED(20,21,22,23) + UPDATE_C_ROW_SCATTERED(24,25,26,27) + UPDATE_C_ROW_SCATTERED(28,29,30,31) + + JMP(END) + LABEL(SCATTERBZ) + + UPDATE_C_BZ_ROW_SCATTERED( 8, 9,10,11) + UPDATE_C_BZ_ROW_SCATTERED(12,13,14,15) + UPDATE_C_BZ_ROW_SCATTERED(16,17,18,19) + UPDATE_C_BZ_ROW_SCATTERED(20,21,22,23) + UPDATE_C_BZ_ROW_SCATTERED(24,25,26,27) + UPDATE_C_BZ_ROW_SCATTERED(28,29,30,31) + + LABEL(END) + + VZEROUPPER() + + : // output operands + : // input operands + [k] "m" (k), + [a] "m" (a), + [b] "m" (b), + [alpha] "m" (alpha), + [beta] "m" (beta), + [c] "m" (c), + [rs_c] "m" (rs_c), + [cs_c] "m" (cs_c), + [offsetPtr] "m" (offsetPtr) + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", + "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", + "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", + "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", + "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", + "zmm30", "zmm31", "memory" + ); +} diff --git a/kernels/skx/bli_kernels_skx.h b/kernels/skx/bli_kernels_skx.h new file mode 100644 index 000000000..1217277e2 --- /dev/null +++ b/kernels/skx/bli_kernels_skx.h @@ -0,0 +1,40 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name of The University of Texas at Austin nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +GEMM_UKR_PROT( float , s, gemm_skx_asm_32x12_l2 ) +GEMM_UKR_PROT( float , s, gemm_skx_asm_12x32_l2 ) + +GEMM_UKR_PROT( double, d, gemm_skx_asm_16x12_l2 ) + +
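
A note for reviewers: the new micro-kernels can be smoke-tested on hand-packed
panels before exercising them through the full framework. The sketch below is
illustrative only (not part of the patch); it assumes C11 aligned_alloc() and
a built BLIS to link against. Only the packed A panel needs 64-byte alignment
(the kernel loads it with VMOVAPD), B is read via broadcasts and C via
unaligned stores, and rs_c = 1, cs_c = 16 selects the fast column-stored
update path.

    #include <stdio.h>
    #include <stdlib.h>
    #include "blis.h"

    int main( void )
    {
        dim_t  k     = 64;
        double alpha = 1.0, beta = 0.0;

        // One packed micro-panel each: A is 16 x k (column-major),
        // B is k x 12 (row-major), C is a 16x12 column-major microtile.
        double* a = aligned_alloc( 64, 16 * k  * sizeof( double ) );
        double* b = malloc(        12 * k  * sizeof( double ) );
        double* c = malloc(        16 * 12 * sizeof( double ) );

        for ( dim_t i = 0; i < 16 * k; i++ ) a[ i ] = 1.0;
        for ( dim_t i = 0; i < 12 * k; i++ ) b[ i ] = 1.0;

        // beta == 0 takes the store-only (BZ) path; the trailing
        // auxinfo_t*/cntx_t* arguments are unused by this kernel.
        bli_dgemm_skx_asm_16x12_l2( k, &alpha, a, b, &beta,
                                    c, 1, 16, NULL, NULL );

        // With all-ones inputs, every element of C should equal k.
        printf( "c[0] = %.1f (expected %d)\n", c[ 0 ], (int)k );

        free( a ); free( b ); free( c );
        return 0;
    }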