Merge branch 'master' of github.com:flame/blis

Field G. Van Zee
2017-12-07 17:36:44 -06:00
14 changed files with 1802 additions and 5 deletions


@@ -0,0 +1,78 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_cntx_init_skx( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
// Set default kernel blocksizes and functions.
bli_cntx_init_skx_ref( cntx );
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
(
2,
BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x12_l2, FALSE,
cntx
);
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, 3, 3 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 12, 8, 4 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 480, 240, 144, 72 );
bli_blksz_init ( &blkszs[ BLIS_KC ], 384, 384, 256, 256,
480, 480, 256, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 3072, 4080, 4080 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
);
}
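As a rough aside for readers tuning these values: the cache footprints implied by the double-precision blocksizes above can be checked with simple arithmetic. The sketch below is illustrative only; the blocksizes come from this file, while the cache-level annotations in the comments are typical Skylake-X sizes and are an assumption, not something stated in this commit.

#include <stdio.h>

// Hedged sanity check of the dgemm footprints implied by the blocksizes
// registered above (MR=16, NR=12, MC=240, KC=384, NC=3072, sizeof(double)=8).
int main( void )
{
    const long mr = 16, nr = 12, mc = 240, kc = 384, nc = 3072, s = 8;
    printf( "C micro-tile  (MR x NR): %ld B   (= 24 zmm accumulators)\n", mr*nr*s );
    printf( "B micro-panel (KC x NR): %ld KiB (streams through L1)\n", kc*nr*s/1024 );
    printf( "A block       (MC x KC): %ld KiB (vs. a 1 MiB private L2)\n", mc*kc*s/1024 );
    printf( "B panel       (KC x NC): %ld MiB (held in the shared L3)\n", kc*nc*s/(1024*1024) );
    return 0;
}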

config/skx/bli_family_skx.h (new file, 133 lines)

@@ -0,0 +1,133 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// -- THREADING PARAMETERS -----------------------------------------------------
#define BLIS_DEFAULT_M_THREAD_RATIO 3
#define BLIS_DEFAULT_N_THREAD_RATIO 2
#define BLIS_DEFAULT_MR_THREAD_MAX 1
#define BLIS_DEFAULT_NR_THREAD_MAX 4
// -- MEMORY ALLOCATION --------------------------------------------------------
#define BLIS_SIMD_ALIGN_SIZE 64
#define BLIS_SIMD_SIZE 64
#define BLIS_SIMD_NUM_REGISTERS 32
#ifdef BLIS_NO_HBWMALLOC
#include <stdlib.h>
#define BLIS_MALLOC_POOL malloc
#define BLIS_FREE_POOL free
#else
#include <hbwmalloc.h>
#define BLIS_MALLOC_POOL hbw_malloc
#define BLIS_FREE_POOL hbw_free
#endif
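The #ifdef above selects the pool allocator at compile time: hbw_malloc()/hbw_free() from the memkind library when high-bandwidth-memory support is enabled, plain malloc()/free() otherwise. Below is a minimal sketch of that allocation path, assuming memkind is installed; hbw_check_available() is part of <hbwmalloc.h> but is not used by this configuration itself.

#include <stdio.h>
#include <stdlib.h>
#ifndef BLIS_NO_HBWMALLOC
#include <hbwmalloc.h>
#endif

int main( void )
{
#ifndef BLIS_NO_HBWMALLOC
    // 0 means high-bandwidth memory is actually present on this node.
    if ( hbw_check_available() == 0 )
        puts( "HBM pool available; BLIS_MALLOC_POOL -> hbw_malloc" );
    void* p = hbw_malloc( 4096 );
    hbw_free( p );
#else
    void* p = malloc( 4096 );  // BLIS_MALLOC_POOL -> malloc
    free( p );
#endif
    return 0;
}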
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// -- Cache and register blocksizes --
//
// Constraints:
//
// (1) MC must be a multiple of:
// (a) MR (for zero-padding purposes)
// (b) NR (for zero-padding purposes when MR and NR are "swapped")
// (2) NC must be a multiple of
// (a) NR (for zero-padding purposes)
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
//
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_16x12_l2
#define BLIS_DEFAULT_MC_D 144
#define BLIS_DEFAULT_KC_D 336
#define BLIS_DEFAULT_NC_D 5760
#define BLIS_DEFAULT_MR_D 16
#define BLIS_DEFAULT_NR_D 12
#define BLIS_PACKDIM_MR_D 16
#define BLIS_PACKDIM_NR_D 12
// NOTE: If the micro-kernel, which is typically unrolled to a factor
// of f, handles leftover edge cases (i.e., when k % f > 0), then these
// register blocksizes in the k dimension can be defined to 1.
//#define BLIS_DEFAULT_KR_S 1
//#define BLIS_DEFAULT_KR_D 1
//#define BLIS_DEFAULT_KR_C 1
//#define BLIS_DEFAULT_KR_Z 1
// -- Maximum cache blocksizes (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// larger than the default blocksizes, blocksizes used at edge cases are
// enlarged if such an extension would encompass the remaining portion of
// the matrix dimension.
#define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4)
#define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4)
#define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + 0)
#define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4)
#define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4)
#define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + 0)
//#define BLIS_MAXIMUM_MC_C (BLIS_DEFAULT_MC_C + BLIS_DEFAULT_MC_C/4)
//#define BLIS_MAXIMUM_KC_C (BLIS_DEFAULT_KC_C + BLIS_DEFAULT_KC_C/4)
//#define BLIS_MAXIMUM_NC_C (BLIS_DEFAULT_NC_C + BLIS_DEFAULT_NC_C/4)
//#define BLIS_MAXIMUM_MC_Z (BLIS_DEFAULT_MC_Z + BLIS_DEFAULT_MC_Z/4)
//#define BLIS_MAXIMUM_KC_Z (BLIS_DEFAULT_KC_Z + BLIS_DEFAULT_KC_Z/4)
//#define BLIS_MAXIMUM_NC_Z (BLIS_DEFAULT_NC_Z + BLIS_DEFAULT_NC_Z/4)
#endif
//#endif

config/skx/make_defs.mk (new file, 115 lines)

@@ -0,0 +1,115 @@
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name of The University of Texas at Austin nor the names
# of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Declare the name of the current configuration and add it to the
# running list of configurations included by common.mk.
THIS_CONFIG := skx
#CONFIGS_INCL += $(THIS_CONFIG)
#
# --- Determine the C compiler and related flags ---
#
ifeq ($(CC),)
CC := gcc
CC_VENDOR := gcc
endif
# Enable IEEE Std 1003.1-2001 (POSIX.1-2001).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 -m64
CPICFLAGS := -fPIC
CWARNFLAGS := -Wall -Wno-unused-function -Wfatal-errors
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0 -DBLIS_NO_HBWMALLOC
else
COPTFLAGS := -O3
endif
ifeq ($(DEBUG_TYPE),sde)
CPPROCFLAGS += -DBLIS_NO_HBWMALLOC
endif
CKOPTFLAGS := $(COPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CVECFLAGS := -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse -march=skylake-avx512
else
ifeq ($(CC_VENDOR),icc)
CVECFLAGS := -xCORE-AVX512
else
ifeq ($(CC_VENDOR),clang)
CVECFLAGS := -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse -march=skylake-avx512
else
$(error gcc, icc, or clang is required for this configuration.)
endif
endif
endif
# The assembler on OS X won't recognize AVX512 without help
ifneq ($(CC_VENDOR),icc)
ifeq ($(OS_NAME),Darwin)
CVECFLAGS += -Wa,-march=skylake-avx512
endif
endif
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cr
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
ifneq ($(DEBUG_TYPE),sde)
LDFLAGS := -lmemkind
else
LDFLAGS :=
endif
ifneq ($(CC_VENDOR),icc)
LDFLAGS += -lm
endif
# Store the values of these variables in new variables whose names
# include the configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))


@@ -18,6 +18,7 @@ haswell: haswell
sandybridge: sandybridge
penryn: penryn
knl: knl
skx: skx
# AMD architectures.
zen: zen/haswell


@@ -48,6 +48,9 @@ arch_t bli_arch_query_id( void )
#endif
// Intel microarchitectures.
#ifdef BLIS_FAMILY_SKX
id = BLIS_ARCH_SKX;
#endif
#ifdef BLIS_FAMILY_KNL
id = BLIS_ARCH_KNL;
#endif


@@ -47,6 +47,10 @@ arch_t bli_cpuid_query_id( void )
{
// For each Intel configuration that is enabled, check for that
// microarchitecture. We check from most recent to most dated.
#ifdef BLIS_CONFIG_SKX
if ( bli_cpuid_is_skx( family, model, features ) )
return BLIS_ARCH_SKX;
#endif
#ifdef BLIS_CONFIG_KNL
if ( bli_cpuid_is_knl( family, model, features ) )
return BLIS_ARCH_KNL;
@@ -65,6 +69,8 @@ arch_t bli_cpuid_query_id( void )
#endif
// If none of the other sub-configurations were detected, return
// the 'generic' arch_t id value.
printf("generic\n");
return BLIS_ARCH_GENERIC;
}
else if ( vendor == VENDOR_AMD )
@@ -105,6 +111,31 @@ arch_t bli_cpuid_query_id( void )
}
// -----------------------------------------------------------------------------
bool_t bli_cpuid_is_skx
(
uint32_t family,
uint32_t model,
uint32_t features
)
{
// Check for expected CPU features.
const uint32_t expected = FEATURE_AVX |
FEATURE_FMA3 |
FEATURE_AVX2 |
FEATURE_AVX512F |
FEATURE_AVX512DQ |
FEATURE_AVX512BW |
FEATURE_AVX512VL ;
int nvpu = vpu_count();
if ( !bli_cpuid_has_features( features, expected ) || nvpu != 2 ) {
return FALSE;
}
return TRUE;
}
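The check above is a pure bitmask test: bli_cpuid_has_features() succeeds only if every wanted feature bit is set, and the two-VPU requirement additionally filters out AVX-512 parts with a single FMA unit. A self-contained illustration of the mask logic follows; the FEATURE_* bit positions here are invented for the example and are not the values BLIS defines.

#include <stdint.h>
#include <stdio.h>

#define FEATURE_AVX512F  (1u << 0)  // hypothetical bit positions
#define FEATURE_AVX512DQ (1u << 1)
#define FEATURE_AVX512VL (1u << 2)

static int has_features( uint32_t have, uint32_t want )
{
    return ( have & want ) == want;  // all wanted bits must be present
}

int main( void )
{
    uint32_t have = FEATURE_AVX512F | FEATURE_AVX512DQ;   // e.g. from CPUID
    uint32_t want = FEATURE_AVX512F | FEATURE_AVX512DQ | FEATURE_AVX512VL;
    printf( "skx-capable: %s\n", has_features( have, want ) ? "yes" : "no" );
    return 0;
}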
bool_t bli_cpuid_is_knl
(
@@ -629,6 +660,95 @@ uint32_t bli_cpuid_query
return VENDOR_UNKNOWN;
}
void get_cpu_name(char *cpu_name)
{
uint32_t eax, ebx, ecx, edx;
__cpuid(0x80000002u, eax, ebx, ecx, edx);
//printf("%x %x %x %x\n", eax, ebx, ecx, edx);
*(uint32_t *)&cpu_name[0] = eax;
*(uint32_t *)&cpu_name[4] = ebx;
*(uint32_t *)&cpu_name[8] = ecx;
*(uint32_t *)&cpu_name[12] = edx;
__cpuid(0x80000003u, eax, ebx, ecx, edx);
//printf("%x %x %x %x\n", eax, ebx, ecx, edx);
*(uint32_t *)&cpu_name[16+0] = eax;
*(uint32_t *)&cpu_name[16+4] = ebx;
*(uint32_t *)&cpu_name[16+8] = ecx;
*(uint32_t *)&cpu_name[16+12] = edx;
__cpuid(0x80000004u, eax, ebx, ecx, edx);
//printf("%x %x %x %x\n", eax, ebx, ecx, edx);
*(uint32_t *)&cpu_name[32+0] = eax;
*(uint32_t *)&cpu_name[32+4] = ebx;
*(uint32_t *)&cpu_name[32+8] = ecx;
*(uint32_t *)&cpu_name[32+12] = edx;
}
int vpu_count()
{
char cpu_name[48] = {0};
char *loc;
char model_num[5];
int sku;
get_cpu_name(cpu_name);
if (strstr(cpu_name, "Intel(R) Xeon(R)") != NULL)
{
loc = strstr(cpu_name, "Platinum");
if (loc == NULL)
loc = strstr(cpu_name, "Gold");
if (loc == NULL)
loc = strstr(cpu_name, "Silver");
if (loc == NULL)
loc = strstr(cpu_name, "Bronze");
if (loc == NULL)
loc = strstr(cpu_name, "W");
if (loc == NULL)
return -1;
loc = strstr(loc+1," ");
if(loc == NULL)
return -1;
strncpy(model_num, loc+1, 4);
model_num[4] = '\0'; // index 4, not 5: model_num holds only 5 bytes
sku = atoi(model_num);
if (8199 >= sku && sku >= 8100) return 2;
else if (6199 >= sku && sku >= 6100) return 2;
else if (sku == 5122) return 2;
else if (5199 >= sku && sku >= 5100) return 1;
else if (4199 >= sku && sku >= 4100) return 1;
else if (3199 >= sku && sku >= 3100) return 1;
else if (2199 >= sku && sku >= 2120) return 2;
else if (2119 >= sku && sku >= 2100) return 1;
else return -1;
}
else if (strstr(cpu_name, "Intel(R) Core(TM) i9") != NULL)
{
return 1;
}
else if (strstr(cpu_name, "Intel(R) Core(TM) i7") != NULL)
{
if (strstr(cpu_name, "7800X") != NULL ||
strstr(cpu_name, "7820X") != NULL)
return 1;
else return -1;
}
else
{
return -1;
}
}
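For reference, the SKU decoding above scans the CPUID brand string for the tier name, skips to the model number, and maps SKU ranges to the number of 512-bit FMA units. Here is a standalone sketch of the same parsing on a sample brand string (the string and SKU are illustrative; on real hardware they come from get_cpu_name()):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main( void )
{
    const char* cpu_name = "Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz";
    char model_num[5];
    const char* loc = strstr( cpu_name, "Gold" );  // tier name
    loc = strchr( loc + 1, ' ' );                  // skip to the model number
    strncpy( model_num, loc + 1, 4 );
    model_num[4] = '\0';
    int sku = atoi( model_num );                   // 6148: a two-VPU part
    printf( "sku = %d -> %d FMA unit(s)\n", sku,
            ( sku >= 6100 && sku <= 6199 ) ? 2 : 1 );
    return 0;
}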
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM)
int get_cpu_type( int* model, int* part, int* features )


@@ -37,6 +37,7 @@
arch_t bli_cpuid_query_id( void );
bool_t bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features );
bool_t bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features );
bool_t bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features );
bool_t bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features );
@@ -100,6 +101,10 @@ static bool_t bli_cpuid_has_features( uint32_t have, uint32_t want )
#include "cpuid.h"
void get_cpu_name(char *cpu_name);
int vpu_count();
enum
{
VENDOR_INTEL,


@@ -71,6 +71,11 @@ void bli_gks_init( void )
// bli_config.h.
// Intel architectures
#ifdef BLIS_CONFIG_SKX
bli_gks_register_cntx( BLIS_ARCH_SKX, bli_cntx_init_skx,
bli_cntx_init_skx_ref,
bli_cntx_init_skx_ind );
#endif
#ifdef BLIS_CONFIG_KNL
bli_gks_register_cntx( BLIS_ARCH_KNL, bli_cntx_init_knl,
bli_cntx_init_knl_ref,


@@ -41,7 +41,9 @@
//
// -- Intel64 architectures --
#ifdef BLIS_CONFIG_SKX
CNTX_INIT_PROTS( skx )
#endif
#ifdef BLIS_CONFIG_KNL
CNTX_INIT_PROTS( knl )
#endif
@@ -121,7 +123,9 @@ CNTX_INIT_PROTS( generic )
#endif
// -- Intel64 architectures --
#ifdef BLIS_FAMILY_SKX
#include "bli_family_skx.h"
#endif
#ifdef BLIS_FAMILY_KNL
#include "bli_family_knl.h"
#endif
@@ -189,7 +193,9 @@ CNTX_INIT_PROTS( generic )
//
// -- Intel64 architectures --
#ifdef BLIS_KERNELS_SKX
#include "bli_kernels_skx.h"
#endif
#ifdef BLIS_KERNELS_KNL
#include "bli_kernels_knl.h"
#endif


@@ -815,7 +815,8 @@ typedef enum
typedef enum
{
// Intel
BLIS_ARCH_KNL = 0,
BLIS_ARCH_SKX = 0,
BLIS_ARCH_KNL,
BLIS_ARCH_KNC,
BLIS_ARCH_HASWELL,
BLIS_ARCH_SANDYBRIDGE,
@@ -842,7 +843,7 @@ typedef enum
} arch_t;
#define BLIS_NUM_ARCHS 16
#define BLIS_NUM_ARCHS 17
//


@@ -0,0 +1,171 @@
#ifndef BLIS_AVX512_MACROS_H
#define BLIS_AVX512_MACROS_H
//
// Assembly macros to make AVX-512 with AT&T syntax somewhat less painful
//
#define COMMENT_BEGIN "#"
#define COMMENT_END
#define STRINGIFY(...) #__VA_ARGS__
#define ASM(...) STRINGIFY(__VA_ARGS__) "\n\t"
#define LABEL(label) STRINGIFY(label) ":\n\t"
#define XMM(x) %%xmm##x
#define YMM(x) %%ymm##x
#define ZMM(x) %%zmm##x
#define EAX %%eax
#define EBX %%ebx
#define ECX %%ecx
#define EDX %%edx
#define EBP %%ebp
#define EDI %%edi
#define ESI %%esi
#define RAX %%rax
#define RBX %%rbx
#define RCX %%rcx
#define RDX %%rdx
#define RBP %%rbp
#define RDI %%rdi
#define RSI %%rsi
#define K(x) %%k##x
#define R(x) %%r##x
#define R8 %%r8
#define R9 %%r9
#define R10 %%r10
#define R11 %%r11
#define R12 %%r12
#define R13 %%r13
#define R14 %%r14
#define R15 %%r15
#define RD(x) %%r##x##d
#define R8D %%r8d
#define R9D %%r9d
#define R10D %%r10d
#define R11D %%r11d
#define R12D %%r12d
#define R13D %%r13d
#define R14D %%r14d
#define R15D %%r15d
#define IMM(x) $##x
#define VAR(x) %[x]
#define MEM_4(reg,off,scale,disp) disp(reg,off,scale)
#define MEM_3(reg,off,scale) (reg,off,scale)
#define MEM_2(reg,disp) disp(reg)
#define MEM_1(reg) (reg)
#define MEM_1TO8_4(reg,off,scale,disp) MEM(reg,off,scale,disp) %{1to8%}
#define MEM_1TO8_3(reg,off,scale) MEM(reg,off,scale) %{1to8%}
#define MEM_1TO8_2(reg,disp) MEM(reg,disp) %{1to8%}
#define MEM_1TO8_1(reg) MEM(reg) %{1to8%}
#define MEM_1TO16_4(reg,off,scale,disp) MEM(reg,off,scale,disp) %{1to16%}
#define MEM_1TO16_3(reg,off,scale) MEM(reg,off,scale) %{1to16%}
#define MEM_1TO16_2(reg,disp) MEM(reg,disp) %{1to16%}
#define MEM_1TO16_1(reg) MEM(reg) %{1to16%}
#define GET_MACRO(_1,_2,_3,_4,NAME,...) NAME
#define MEM(...) GET_MACRO(__VA_ARGS__,MEM_4,MEM_3,MEM_2,MEM_1)(__VA_ARGS__)
#define MEM_1TO8(...) GET_MACRO(__VA_ARGS__,MEM_1TO8_4,MEM_1TO8_3,MEM_1TO8_2,MEM_1TO8_1)(__VA_ARGS__)
#define MEM_1TO16(...) GET_MACRO(__VA_ARGS__,MEM_1TO16_4,MEM_1TO16_3,MEM_1TO16_2,MEM_1TO16_1)(__VA_ARGS__)
#define MASK_K(n) %{%%k##n%}
#define MASK_KZ(n) %{%%k##n%}%{z%}
#define KMOV(to,from) ASM(kmovw from, to)
#define JKNZD(kreg,label) \
ASM(kortestw kreg, kreg) \
ASM(jnz label)
#define KXNORW(_0, _1, _2) ASM(kxnorw _2, _1, _0)
#define KSHIFTRW(_0, _1, _2) ASM(kshiftrw _2, _1, _0)
#define ALIGN16 ASM(.p2align 4)
#define ALIGN32 ASM(.p2align 5)
#define RDTSC ASM(rdtsc)
#define MOV(_0, _1) ASM(mov _1, _0)
#define MOVD(_0, _1) ASM(movd _1, _0)
#define MOVL(_0, _1) ASM(movl _1, _0)
#define MOVQ(_0, _1) ASM(movq _1, _0)
#define VMOVD(_0, _1) ASM(vmovd _1, _0)
#define VMOVQ(_0, _1) ASM(vmovq _1, _0)
#define CMP(_0, _1) ASM(cmp _1, _0)
#define AND(_0, _1) ASM(and _1, _0)
#define ADD(_0, _1) ASM(add _1, _0)
#define SUB(_0, _1) ASM(sub _1, _0)
#define SAL(_0, _1) ASM(sal _1, _0)
#define SHLX(_0, _1, _2) ASM(shlx _2, _1, _0)
#define SAR(_0, _1) ASM(sar _1, _0)
#define SAL1(_0) ASM(sal _0)
#define SAR1(_0) ASM(sar _0)
#define LEA(_0, _1) ASM(lea _1, _0)
#define TEST(_0, _1) ASM(test _1, _0)
#define DEC(_0) ASM(dec _0)
#define JLE(_0) ASM(jle _0)
#define JL(_0) ASM(jl _0)
#define JNZ(_0) ASM(jnz _0)
#define JZ(_0) ASM(jz _0)
#define JNE(_0) ASM(jne _0)
#define JE(_0) ASM(je _0)
#define JNC(_0) ASM(jnc _0)
#define JC(_0) ASM(jc _0)
#define JMP(_0) ASM(jmp _0)
#define VCOMISS(_0, _1) ASM(vcomiss _1, _0)
#define VCOMISD(_0, _1) ASM(vcomisd _1, _0)
#define VGATHERDPS(_0, _1) ASM(vgatherdps _1, _0)
#define VSCATTERDPS(_0, _1) ASM(vscatterdps _1, _0)
#define VGATHERDPD(_0, _1) ASM(vgatherdpd _1, _0)
#define VSCATTERDPD(_0, _1) ASM(vscatterdpd _1, _0)
#define VGATHERQPS(_0, _1) ASM(vgatherqps _1, _0)
#define VSCATTERQPS(_0, _1) ASM(vscatterqps _1, _0)
#define VGATHERQPD(_0, _1) ASM(vgatherqpd _1, _0)
#define VSCATTERQPD(_0, _1) ASM(vscatterqpd _1, _0)
#define VMULSS(_0, _1, _2) ASM(vmulss _2, _1, _0)
#define VMULSD(_0, _1, _2) ASM(vmulsd _2, _1, _0)
#define VMULPS(_0, _1, _2) ASM(vmulps _2, _1, _0)
#define VMULPD(_0, _1, _2) ASM(vmulpd _2, _1, _0)
#define VPMULLD(_0, _1, _2) ASM(vpmulld _2, _1, _0)
#define VPMULLQ(_0, _1, _2) ASM(vpmullq _2, _1, _0)
#define VPADDD(_0, _1, _2) ASM(vpaddd _2, _1, _0)
#define VPSLLD(_0, _1, _2) ASM(vpslld _2, _1, _0)
#define VPXORD(_0, _1, _2) ASM(vpxord _2, _1, _0)
#define VXORPD(_0, _1, _2) ASM(vxorpd _2, _1, _0)
#define VFMADD132PS(_0, _1, _2) ASM(vfmadd132ps _2, _1, _0)
#define VFMADD213PS(_0, _1, _2) ASM(vfmadd213ps _2, _1, _0)
#define VFMADD231PS(_0, _1, _2) ASM(vfmadd231ps _2, _1, _0)
#define VFMADD132PD(_0, _1, _2) ASM(vfmadd132pd _2, _1, _0)
#define VFMADD213PD(_0, _1, _2) ASM(vfmadd213pd _2, _1, _0)
#define VFMADD231PD(_0, _1, _2) ASM(vfmadd231pd _2, _1, _0)
#define VMOVDQA(_0, _1) ASM(vmovdqa _1, _0)
#define VMOVDQA32(_0, _1) ASM(vmovdqa32 _1, _0)
#define VMOVDQA64(_0, _1) ASM(vmovdqa64 _1, _0)
#define VMOVSS(_0, _1) ASM(vmovss _1, _0)
#define VMOVSD(_0, _1) ASM(vmovsd _1, _0)
#define VMOVAPS(_0, _1) ASM(vmovaps _1, _0)
#define VMOVUPS(_0, _1) ASM(vmovups _1, _0)
#define VMOVAPD(_0, _1) ASM(vmovapd _1, _0)
#define VMOVUPD(_0, _1) ASM(vmovupd _1, _0)
#define VBROADCASTSS(_0, _1) ASM(vbroadcastss _1, _0)
#define VBROADCASTSD(_0, _1) ASM(vbroadcastsd _1, _0)
#define VPBROADCASTD(_0, _1) ASM(vpbroadcastd _1, _0)
#define VPBROADCASTQ(_0, _1) ASM(vpbroadcastq _1, _0)
#define VBROADCASTF64X4(_0, _1) ASM(vbroadcastf64x4 _1, _0)
#define VINSERTF64X4(_0, _1, _2, _3) ASM(vinsertf64x4 _3, _2, _1, _0)
#define VEXTRACTF64X4(_0, _1, _2) ASM(vextractf64x4 _2, _1, _0)
#define VUNPCKLPD(_0, _1, _2) ASM(vunpcklpd _2, _1, _0)
#define VUNPCKHPD(_0, _1, _2) ASM(vunpckhpd _2, _1, _0)
#define VSHUFF64X2(_0, _1, _2, _3) ASM(vshuff64x2 _3, _2, _1, _0)
#define VUNPCKLPS(_0, _1, _2) ASM(vunpcklps _2, _1, _0)
#define VUNPCKHPS(_0, _1, _2) ASM(vunpckhps _2, _1, _0)
#define VSHUFPS(_0, _1, _2, _3) ASM(vshufps _3, _2, _1, _0)
#define VPERM2F128(_0, _1, _2, _3) ASM(vperm2f128 _3, _2, _1, _0)
#define PREFETCH(LEVEL,ADDRESS) ASM(prefetcht##LEVEL ADDRESS)
#define PREFETCHW0(ADDRESS) ASM(prefetchw ADDRESS)
#define PREFETCHW1(ADDRESS) ASM(prefetchwt1 ADDRESS)
#define VGATHERPFDPS(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dps ADDRESS)
#define VSCATTERPFDPS(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dps ADDRESS)
#define VGATHERPFDPD(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dpd ADDRESS)
#define VSCATTERPFDPD(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dpd ADDRESS)
#define VZEROUPPER() ASM(vzeroupper)
#endif
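To see what these macros actually produce, the stringizing chain can be exercised in isolation. The snippet below redefines a few of them locally (so it compiles standalone instead of including this header) and prints the AT&T-syntax template string; inside a real asm() block, GCC later collapses each %% to %.

#include <stdio.h>

#define STRINGIFY(...) #__VA_ARGS__
#define ASM(...) STRINGIFY(__VA_ARGS__) "\n\t"
#define RCX %%rcx
#define ZMM(x) %%zmm##x
#define MEM_2(reg,disp) disp(reg)
#define VMOVUPD(_0, _1) ASM(vmovupd _1, _0)

int main( void )
{
    // Prints: vmovupd 64(%%rcx), %%zmm1  -- the destination-first
    // arguments were flipped into AT&T source-first order by the macro.
    printf( "%s", VMOVUPD(ZMM(1), MEM_2(RCX,64)) );
    return 0;
}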


@@ -0,0 +1,547 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include <assert.h>
#include "bli_avx512_macros.h"
#define A_L1_PREFETCH_DIST 4 // should be a multiple of 2
/* The pointer to B is moved ahead by one iteration of k
   before the loop starts; therefore, this effectively
   prefetches 3 k iterations ahead. */
#define B_L1_PREFETCH_DIST 4
#define TAIL_NITER 8
#define CACHELINE_SIZE 64 // size of a cache line in bytes
/* During each subiteration, prefetch 2 cache lines of B
 * one UNROLL factor ahead. 2 cache lines = 16 doubles (covers NR = 12).
 */
#define PREFETCH_A_L1(n, k) \
PREFETCH(0, MEM(RAX, A_L1_PREFETCH_DIST*16*8 + (2*n+k) * CACHELINE_SIZE))
/* Preload B for the first iteration of the main loop,
 * for subiter(1), subiter(2), and subiter(3). */
#define PREFETCH_B_L1_1ITER \
PREFETCH(0, MEM(RBX )) \
PREFETCH(0, MEM(RBX, CACHELINE_SIZE)) \
PREFETCH(0, MEM(RBX, 2*CACHELINE_SIZE)) \
PREFETCH(0, MEM(RBX, 3*CACHELINE_SIZE)) \
PREFETCH(0, MEM(RBX, 4*CACHELINE_SIZE)) \
PREFETCH(0, MEM(RBX, 5*CACHELINE_SIZE))
#define LOOP_ALIGN ALIGN16
#define UPDATE_C(R1,R2,R3,R4) \
\
VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \
VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \
VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \
VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \
VFMADD231PD(ZMM(R1), ZMM(1), MEM(RCX,0*64)) \
VFMADD231PD(ZMM(R2), ZMM(1), MEM(RCX,1*64)) \
VFMADD231PD(ZMM(R3), ZMM(1), MEM(RCX,RAX,1,0*64)) \
VFMADD231PD(ZMM(R4), ZMM(1), MEM(RCX,RAX,1,1*64)) \
VMOVUPD(MEM(RCX,0*64), ZMM(R1)) \
VMOVUPD(MEM(RCX,1*64), ZMM(R2)) \
VMOVUPD(MEM(RCX,RAX,1,0*64), ZMM(R3)) \
VMOVUPD(MEM(RCX,RAX,1,1*64), ZMM(R4)) \
LEA(RCX, MEM(RCX,RAX,2))
#define UPDATE_C_BZ(R1,R2,R3,R4) \
\
VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \
VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \
VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \
VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \
VMOVUPD(MEM(RCX,0*64), ZMM(R1)) \
VMOVUPD(MEM(RCX,1*64), ZMM(R2)) \
VMOVUPD(MEM(RCX,RAX,1,0*64), ZMM(R3)) \
VMOVUPD(MEM(RCX,RAX,1,1*64), ZMM(R4)) \
LEA(RCX, MEM(RCX,RAX,2))
#define UPDATE_C_ROW_SCATTERED(R1,R2,R3,R4) \
\
KXNORW(K(1), K(0), K(0)) \
KXNORW(K(2), K(0), K(0)) \
VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \
VGATHERQPD(ZMM(6) MASK_K(1), MEM(RCX,ZMM(2),8)) \
VFMADD231PD(ZMM(R1), ZMM(6), ZMM(1)) \
VSCATTERQPD(MEM(RCX,ZMM(2),8) MASK_K(2), ZMM(R1)) \
\
KXNORW(K(1), K(0), K(0)) \
KXNORW(K(2), K(0), K(0)) \
VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \
VGATHERQPD(ZMM(6) MASK_K(1), MEM(RCX,ZMM(3),8)) \
VFMADD231PD(ZMM(R2), ZMM(6), ZMM(1)) \
VSCATTERQPD(MEM(RCX,ZMM(3),8) MASK_K(2), ZMM(R2)) \
\
LEA(RCX, MEM(RCX,RAX,1)) \
\
KXNORW(K(1), K(0), K(0)) \
KXNORW(K(2), K(0), K(0)) \
VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \
VGATHERQPD(ZMM(6) MASK_K(1), MEM(RCX,ZMM(2),8)) \
VFMADD231PD(ZMM(R3), ZMM(6), ZMM(1)) \
VSCATTERQPD(MEM(RCX,ZMM(2),8) MASK_K(2), ZMM(R3)) \
\
KXNORW(K(1), K(0), K(0)) \
KXNORW(K(2), K(0), K(0)) \
VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \
VGATHERQPD(ZMM(6) MASK_K(1), MEM(RCX,ZMM(3),8)) \
VFMADD231PD(ZMM(R4), ZMM(6), ZMM(1)) \
VSCATTERQPD(MEM(RCX,ZMM(3),8) MASK_K(2), ZMM(R4)) \
\
LEA(RCX, MEM(RCX,RAX,1))
#define UPDATE_C_BZ_ROW_SCATTERED(R1,R2,R3,R4) \
\
KXNORW(K(1), K(0), K(0)) \
VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \
VSCATTERQPD(MEM(RCX,ZMM(2),8) MASK_K(1), ZMM(R1)) \
\
KXNORW(K(1), K(0), K(0)) \
VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \
VSCATTERQPD(MEM(RCX,ZMM(3),8) MASK_K(1), ZMM(R2)) \
\
LEA(RCX, MEM(RCX,RAX,1)) \
\
KXNORW(K(1), K(0), K(0)) \
VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \
VSCATTERQPD(MEM(RCX,ZMM(2),8) MASK_K(1), ZMM(R3)) \
\
KXNORW(K(1), K(0), K(0)) \
VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \
VSCATTERQPD(MEM(RCX,ZMM(3),8) MASK_K(1), ZMM(R4)) \
\
LEA(RCX, MEM(RCX,RAX,1))
#ifdef PREFETCH_C_L2
#undef PREFETCH_C_L2
#define PREFETCH_C_L2 \
\
PREFETCH(1, MEM(RCX, 0*64)) \
PREFETCH(1, MEM(RCX, 1*64)) \
\
PREFETCH(1, MEM(RCX,R12,1,0*64)) \
PREFETCH(1, MEM(RCX,R12,1,1*64)) \
\
PREFETCH(1, MEM(RCX,R12,2,0*64)) \
PREFETCH(1, MEM(RCX,R12,2,1*64)) \
\
PREFETCH(1, MEM(RCX,R13,1,0*64)) \
PREFETCH(1, MEM(RCX,R13,1,1*64)) \
\
PREFETCH(1, MEM(RCX,R12,4,0*64)) \
PREFETCH(1, MEM(RCX,R12,4,1*64)) \
\
PREFETCH(1, MEM(RCX,R14,1,0*64)) \
PREFETCH(1, MEM(RCX,R14,1,1*64)) \
\
PREFETCH(1, MEM(RCX,R13,2,0*64)) \
PREFETCH(1, MEM(RCX,R13,2,1*64)) \
\
PREFETCH(1, MEM(RCX,R15,1,0*64)) \
PREFETCH(1, MEM(RCX,R15,1,1*64)) \
\
PREFETCH(1, MEM(RDX, 0*64)) \
PREFETCH(1, MEM(RDX, 1*64)) \
\
PREFETCH(1, MEM(RDX,R12,1,0*64)) \
PREFETCH(1, MEM(RDX,R12,1,1*64)) \
\
PREFETCH(1, MEM(RDX,R12,2,0*64)) \
PREFETCH(1, MEM(RDX,R12,2,1*64)) \
\
PREFETCH(1, MEM(RDX,R13,1,0*64)) \
PREFETCH(1, MEM(RDX,R13,1,1*64))
#else
#undef PREFETCH_C_L2
#define PREFETCH_C_L2
#endif
#define PREFETCH_C_L1 \
\
PREFETCHW0(MEM(RCX, 0*64)) \
PREFETCHW0(MEM(RCX, 1*64)) \
PREFETCHW0(MEM(RCX,R12,1,0*64)) \
PREFETCHW0(MEM(RCX,R12,1,1*64)) \
PREFETCHW0(MEM(RCX,R12,2,0*64)) \
PREFETCHW0(MEM(RCX,R12,2,1*64)) \
PREFETCHW0(MEM(RCX,R13,1,0*64)) \
PREFETCHW0(MEM(RCX,R13,1,1*64)) \
PREFETCHW0(MEM(RCX,R12,4,0*64)) \
PREFETCHW0(MEM(RCX,R12,4,1*64)) \
PREFETCHW0(MEM(RCX,R14,1,0*64)) \
PREFETCHW0(MEM(RCX,R14,1,1*64)) \
PREFETCHW0(MEM(RCX,R13,2,0*64)) \
PREFETCHW0(MEM(RCX,R13,2,1*64)) \
PREFETCHW0(MEM(RCX,R15,1,0*64)) \
PREFETCHW0(MEM(RCX,R15,1,1*64)) \
PREFETCHW0(MEM(RDX, 0*64)) \
PREFETCHW0(MEM(RDX, 1*64)) \
PREFETCHW0(MEM(RDX,R12,1,0*64)) \
PREFETCHW0(MEM(RDX,R12,1,1*64)) \
PREFETCHW0(MEM(RDX,R12,2,0*64)) \
PREFETCHW0(MEM(RDX,R12,2,1*64)) \
PREFETCHW0(MEM(RDX,R13,1,0*64)) \
PREFETCHW0(MEM(RDX,R13,1,1*64))
//
// n: index in unrolled loop
//
// a: ZMM register to load into
// b: ZMM register to read from
//
// ...: addressing for A, except for offset
//
#define SUBITER(n) \
\
PREFETCH_A_L1(n, 0) \
\
VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+ 0)*8)) \
VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+ 1)*8)) \
VFMADD231PD(ZMM( 8), ZMM(0), ZMM(3)) \
VFMADD231PD(ZMM( 9), ZMM(1), ZMM(3)) \
VFMADD231PD(ZMM(10), ZMM(0), ZMM(4)) \
VFMADD231PD(ZMM(11), ZMM(1), ZMM(4)) \
\
VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+ 2)*8)) \
VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+ 3)*8)) \
VFMADD231PD(ZMM(12), ZMM(0), ZMM(3)) \
VFMADD231PD(ZMM(13), ZMM(1), ZMM(3)) \
VFMADD231PD(ZMM(14), ZMM(0), ZMM(4)) \
VFMADD231PD(ZMM(15), ZMM(1), ZMM(4)) \
\
VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+ 4)*8)) \
VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+ 5)*8)) \
VFMADD231PD(ZMM(16), ZMM(0), ZMM(3)) \
VFMADD231PD(ZMM(17), ZMM(1), ZMM(3)) \
VFMADD231PD(ZMM(18), ZMM(0), ZMM(4)) \
VFMADD231PD(ZMM(19), ZMM(1), ZMM(4)) \
\
PREFETCH_A_L1(n, 1) \
\
VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+ 6)*8)) \
VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+ 7)*8)) \
VFMADD231PD(ZMM(20), ZMM(0), ZMM(3)) \
VFMADD231PD(ZMM(21), ZMM(1), ZMM(3)) \
VFMADD231PD(ZMM(22), ZMM(0), ZMM(4)) \
VFMADD231PD(ZMM(23), ZMM(1), ZMM(4)) \
\
VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+ 8)*8)) \
VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+ 9)*8)) \
VFMADD231PD(ZMM(24), ZMM(0), ZMM(3)) \
VFMADD231PD(ZMM(25), ZMM(1), ZMM(3)) \
VFMADD231PD(ZMM(26), ZMM(0), ZMM(4)) \
VFMADD231PD(ZMM(27), ZMM(1), ZMM(4)) \
\
VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+10)*8)) \
VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+11)*8)) \
VFMADD231PD(ZMM(28), ZMM(0), ZMM(3)) \
VFMADD231PD(ZMM(29), ZMM(1), ZMM(3)) \
VFMADD231PD(ZMM(30), ZMM(0), ZMM(4)) \
VFMADD231PD(ZMM(31), ZMM(1), ZMM(4)) \
\
VMOVAPD(ZMM(0), MEM(RAX,(16*n+0)*8)) \
VMOVAPD(ZMM(1), MEM(RAX,(16*n+8)*8))
// This array holds the offsets used by the scatter/gather instructions.
static int64_t offsets[16] __attribute__((aligned(64))) =
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15};
void bli_dgemm_skx_asm_16x12_l2(
dim_t k_,
double* restrict alpha,
double* restrict a,
double* restrict b,
double* restrict beta,
double* restrict c, inc_t rs_c_, inc_t cs_c_,
auxinfo_t* data,
cntx_t* restrict cntx
)
{
(void)data;
(void)cntx;
const int64_t* offsetPtr = &offsets[0];
const int64_t k = k_;
const int64_t rs_c = rs_c_;
const int64_t cs_c = cs_c_;
__asm__ volatile
(
VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers
VMOVAPD(YMM( 7), YMM(8))
VMOVAPD(YMM( 9), YMM(8))
VMOVAPD(YMM(10), YMM(8)) MOV(RSI, VAR(k)) //loop index
VMOVAPD(YMM(11), YMM(8)) MOV(RAX, VAR(a)) //load address of a
VMOVAPD(YMM(12), YMM(8)) MOV(RBX, VAR(b)) //load address of b
VMOVAPD(YMM(13), YMM(8)) MOV(RCX, VAR(c)) //load address of c
VMOVAPD(YMM(14), YMM(8))
VMOVAPD(YMM(15), YMM(8)) VMOVAPD(ZMM(0), MEM(RAX, 0*8)) //pre-load a
VMOVAPD(YMM(16), YMM(8)) VMOVAPD(ZMM(1), MEM(RAX, 8*8)) //pre-load a
VMOVAPD(YMM(17), YMM(8))
VMOVAPD(YMM(18), YMM(8))
VMOVAPD(YMM(19), YMM(8)) MOV(R12, VAR(cs_c)) //cs_c
VMOVAPD(YMM(20), YMM(8)) LEA(R13, MEM(R12,R12,2)) //*3
VMOVAPD(YMM(21), YMM(8)) LEA(R14, MEM(R12,R12,4)) //*5
VMOVAPD(YMM(22), YMM(8)) LEA(R15, MEM(R14,R12,2)) //*7
VMOVAPD(YMM(23), YMM(8)) LEA(RDX, MEM(RCX,R12,8)) //c + 8*cs_c
VMOVAPD(YMM(24), YMM(8))
VMOVAPD(YMM(25), YMM(8)) MOV(R8, IMM(16*8)) //mr*sizeof(double)
VMOVAPD(YMM(26), YMM(8)) MOV(R9, IMM(12*8)) //nr*sizeof(double)
VMOVAPD(YMM(27), YMM(8))
VMOVAPD(YMM(28), YMM(8)) LEA(RAX, MEM(RAX,R8,1)) //adjust a for pre-load
VMOVAPD(YMM(29), YMM(8))
VMOVAPD(YMM(30), YMM(8))
VMOVAPD(YMM(31), YMM(8))
TEST(RSI, RSI)
JZ(POSTACCUM)
#ifdef PREFETCH_A_BEFORE
PREFETCH(0, MEM(RAX,0*64))
PREFETCH(0, MEM(RAX,1*64))
PREFETCH(0, MEM(RAX,2*64))
PREFETCH(0, MEM(RAX,3*64))
PREFETCH(0, MEM(RAX,4*64))
PREFETCH(0, MEM(RAX,5*64))
PREFETCH(0, MEM(RAX,6*64))
PREFETCH(0, MEM(RAX,7*64))
#endif
#ifdef PREFETCH_B_BEFORE
PREFETCH(0, MEM(RBX,0*64))
PREFETCH(0, MEM(RBX,1*64))
PREFETCH(0, MEM(RBX,2*64))
PREFETCH(0, MEM(RBX,3*64))
PREFETCH(0, MEM(RBX,4*64))
PREFETCH(0, MEM(RBX,5*64))
#endif
PREFETCH_C_L2
MOV(RDI, RSI)
AND(RSI, IMM(3))
SAR(RDI, IMM(2))
SUB(RDI, IMM(0+TAIL_NITER))
JLE(K_SMALL)
LOOP_ALIGN
LABEL(MAIN_LOOP)
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8))
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+64))
SUBITER(0)
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+128))
SUBITER(1)
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+192))
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+256))
SUBITER(2)
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+320))
SUBITER(3)
LEA(RAX, MEM(RAX,R8,4))
LEA(RBX, MEM(RBX,R9,4))
DEC(RDI)
JNZ(MAIN_LOOP)
LABEL(K_SMALL)
PREFETCH_C_L1
ADD(RDI, IMM(0+TAIL_NITER))
JZ(TAIL_LOOP)
LOOP_ALIGN
LABEL(SMALL_LOOP)
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8))
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+64))
SUBITER(0)
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+128))
SUBITER(1)
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+192))
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+256))
SUBITER(2)
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+320))
SUBITER(3)
LEA(RAX, MEM(RAX,R8,4))
LEA(RBX, MEM(RBX,R9,4))
DEC(RDI)
JNZ(SMALL_LOOP)
TEST(RSI, RSI)
JZ(POSTACCUM)
LOOP_ALIGN
LABEL(TAIL_LOOP)
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8))
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+64))
SUBITER(0)
ADD(RAX, R8)
ADD(RBX, R9)
DEC(RSI)
JNZ(TAIL_LOOP)
LABEL(POSTACCUM)
#ifdef PREFETCH_A_AFTER
MOV(R8, VAR(a))
PREFETCH(0, MEM(R8,0*64))
PREFETCH(0, MEM(R8,1*64))
PREFETCH(0, MEM(R8,2*64))
PREFETCH(0, MEM(R8,3*64))
PREFETCH(0, MEM(R8,4*64))
PREFETCH(0, MEM(R8,5*64))
PREFETCH(0, MEM(R8,6*64))
PREFETCH(0, MEM(R8,7*64))
#endif
#ifdef PREFETCH_B_AFTER
MOV(R9, VAR(b))
PREFETCH(0, MEM(R9,0*64))
PREFETCH(0, MEM(R9,1*64))
PREFETCH(0, MEM(R9,2*64))
PREFETCH(0, MEM(R9,3*64))
PREFETCH(0, MEM(R9,4*64))
PREFETCH(0, MEM(R9,5*64))
#endif
MOV(RAX, VAR(alpha))
MOV(RBX, VAR(beta))
VBROADCASTSD(ZMM(0), MEM(RAX))
VBROADCASTSD(ZMM(1), MEM(RBX))
MOV(RAX, VAR(cs_c))
LEA(RAX, MEM(,RAX,8))
MOV(RBX, VAR(rs_c))
// Check whether C is column-stored (rs_c == 1). If not, jump to the slow scattered update.
CMP(RBX, IMM(1))
JNE(SCATTEREDUPDATE)
VCOMISD(XMM(1), XMM(7))
JE(COLSTORBZ)
UPDATE_C( 8, 9,10,11)
UPDATE_C(12,13,14,15)
UPDATE_C(16,17,18,19)
UPDATE_C(20,21,22,23)
UPDATE_C(24,25,26,27)
UPDATE_C(28,29,30,31)
JMP(END)
LABEL(COLSTORBZ)
UPDATE_C_BZ( 8, 9,10,11)
UPDATE_C_BZ(12,13,14,15)
UPDATE_C_BZ(16,17,18,19)
UPDATE_C_BZ(20,21,22,23)
UPDATE_C_BZ(24,25,26,27)
UPDATE_C_BZ(28,29,30,31)
JMP(END)
LABEL(SCATTEREDUPDATE)
MOV(RDI, VAR(offsetPtr))
VMOVDQA64(ZMM(2), MEM(RDI,0*64))
VMOVDQA64(ZMM(3), MEM(RDI,1*64))
VPBROADCASTQ(ZMM(6), RBX)
VPMULLQ(ZMM(2), ZMM(6), ZMM(2))
VPMULLQ(ZMM(3), ZMM(6), ZMM(3))
VCOMISD(XMM(1), XMM(7))
JE(SCATTERBZ)
UPDATE_C_ROW_SCATTERED( 8, 9,10,11)
UPDATE_C_ROW_SCATTERED(12,13,14,15)
UPDATE_C_ROW_SCATTERED(16,17,18,19)
UPDATE_C_ROW_SCATTERED(20,21,22,23)
UPDATE_C_ROW_SCATTERED(24,25,26,27)
UPDATE_C_ROW_SCATTERED(28,29,30,31)
JMP(END)
LABEL(SCATTERBZ)
UPDATE_C_BZ_ROW_SCATTERED( 8, 9,10,11)
UPDATE_C_BZ_ROW_SCATTERED(12,13,14,15)
UPDATE_C_BZ_ROW_SCATTERED(16,17,18,19)
UPDATE_C_BZ_ROW_SCATTERED(20,21,22,23)
UPDATE_C_BZ_ROW_SCATTERED(24,25,26,27)
UPDATE_C_BZ_ROW_SCATTERED(28,29,30,31)
LABEL(END)
VZEROUPPER()
: // output operands
: // input operands
[k] "m" (k),
[a] "m" (a),
[b] "m" (b),
[alpha] "m" (alpha),
[beta] "m" (beta),
[c] "m" (c),
[rs_c] "m" (rs_c),
[cs_c] "m" (cs_c),
[offsetPtr] "m" (offsetPtr)
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
"r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
"zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13",
"zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
"zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
"zmm30", "zmm31", "memory"
);
}
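As an aid to reading the kernel above, here is a plain-C model of the contract it implements: c := beta*c + alpha*a*b on one MR x NR tile, with a packed in MR-major micro-panels and b in NR-major micro-panels. This is an explanatory sketch, not BLIS code; the beta == 0 store-only path (UPDATE_C_BZ) is folded into the general update here.

#include <stddef.h>

enum { MR = 16, NR = 12 };

// Reference model of the math in bli_dgemm_skx_asm_16x12_l2 (sketch only).
static void dgemm_ukr_ref( size_t k, double alpha, const double* a,
                           const double* b, double beta, double* c,
                           ptrdiff_t rs_c, ptrdiff_t cs_c )
{
    double ab[MR*NR] = {0};                  // plays the role of zmm8..zmm31
    for ( size_t p = 0; p < k; p++ )         // MAIN_LOOP/SMALL_LOOP/TAIL_LOOP
        for ( int j = 0; j < NR; j++ )       // VBROADCASTSD of one b element
            for ( int i = 0; i < MR; i++ )   // two zmm columns of a
                ab[i + j*MR] += a[i + p*MR] * b[j + p*NR];
    for ( int j = 0; j < NR; j++ )           // UPDATE_C / scattered update
        for ( int i = 0; i < MR; i++ )
            c[i*rs_c + j*cs_c] = alpha * ab[i + j*MR]
                               + beta  * c[i*rs_c + j*cs_c];
}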


@@ -0,0 +1,572 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "bli_avx512_macros.h"
#define CACHELINE_SIZE 64 // size of a cache line in bytes
#define A_L1_PREFETCH_DIST 4 // should be a multiple of 2
/* The pointer to B is moved ahead by one iteration of k
   before the loop starts; therefore, this effectively
   prefetches 3 k iterations ahead. */
#define B_L1_PREFETCH_DIST 4
#define TAIL_NITER 8
/* During each subiteration, prefetch 2 cache lines of B
 * one UNROLL factor ahead. 2 cache lines = 32 floats.
 */
#define PREFETCH_A_L1(n, k) \
PREFETCH(0, MEM(RAX, A_L1_PREFETCH_DIST*32*4 + (2*n+k) * CACHELINE_SIZE))
#define LOOP_ALIGN ALIGN16
#define UPDATE_C(R1,R2,R3,R4) \
\
VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \
VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \
VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \
VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \
VFMADD231PS(ZMM(R1), ZMM(1), MEM(RCX,0*64)) \
VFMADD231PS(ZMM(R2), ZMM(1), MEM(RCX,1*64)) \
VFMADD231PS(ZMM(R3), ZMM(1), MEM(RCX,RAX,1,0*64)) \
VFMADD231PS(ZMM(R4), ZMM(1), MEM(RCX,RAX,1,1*64)) \
VMOVUPS(MEM(RCX,0*64), ZMM(R1)) \
VMOVUPS(MEM(RCX,1*64), ZMM(R2)) \
VMOVUPS(MEM(RCX,RAX,1,0*64), ZMM(R3)) \
VMOVUPS(MEM(RCX,RAX,1,1*64), ZMM(R4)) \
LEA(RCX, MEM(RCX,RAX,2))
#define UPDATE_C_BZ(R1,R2,R3,R4) \
\
VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \
VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \
VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \
VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \
VMOVUPS(MEM(RCX,0*64), ZMM(R1)) \
VMOVUPS(MEM(RCX,1*64), ZMM(R2)) \
VMOVUPS(MEM(RCX,RAX,1,0*64), ZMM(R3)) \
VMOVUPS(MEM(RCX,RAX,1,1*64), ZMM(R4)) \
LEA(RCX, MEM(RCX,RAX,2))
#define UPDATE_C_ROW_SCATTERED(R1,R2,R3,R4) \
\
KXNORW(K(1), K(0), K(0)) \
KXNORW(K(2), K(0), K(0)) \
KXNORW(K(3), K(0), K(0)) \
KXNORW(K(4), K(0), K(0)) \
VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \
VEXTRACTF64X4(YMM(5), ZMM(R1), IMM(1)) \
VGATHERQPS(YMM(6) MASK_K(1), MEM(RCX,ZMM(2),8)) \
VGATHERQPS(YMM(7) MASK_K(2), MEM(RCX,ZMM(3),8)) \
VFMADD231PS(YMM(R1), YMM(6), YMM(1)) \
VFMADD231PS(YMM( 5), YMM(7), YMM(1)) \
VSCATTERQPS(MEM(RCX,ZMM(2),8) MASK_K(3), YMM(R1)) \
VSCATTERQPS(MEM(RCX,ZMM(3),8) MASK_K(4), YMM( 5)) \
\
KXNORW(K(1), K(0), K(0)) \
KXNORW(K(2), K(0), K(0)) \
KXNORW(K(3), K(0), K(0)) \
KXNORW(K(4), K(0), K(0)) \
VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \
VEXTRACTF64X4(YMM(5), ZMM(R2), IMM(1)) \
VGATHERQPS(YMM(6) MASK_K(1), MEM(RDX,ZMM(2),8)) \
VGATHERQPS(YMM(7) MASK_K(2), MEM(RDX,ZMM(3),8)) \
VFMADD231PS(YMM(R2), YMM(6), YMM(1)) \
VFMADD231PS(YMM( 5), YMM(7), YMM(1)) \
VSCATTERQPS(MEM(RDX,ZMM(2),8) MASK_K(3), YMM(R2)) \
VSCATTERQPS(MEM(RDX,ZMM(3),8) MASK_K(4), YMM( 5)) \
\
LEA(RCX, MEM(RCX,RAX,1)) \
LEA(RDX, MEM(RDX,RAX,1)) \
\
KXNORW(K(1), K(0), K(0)) \
KXNORW(K(2), K(0), K(0)) \
KXNORW(K(3), K(0), K(0)) \
KXNORW(K(4), K(0), K(0)) \
VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \
VEXTRACTF64X4(YMM(5), ZMM(R3), IMM(1)) \
VGATHERQPS(YMM(6) MASK_K(1), MEM(RCX,ZMM(2),8)) \
VGATHERQPS(YMM(7) MASK_K(2), MEM(RCX,ZMM(3),8)) \
VFMADD231PS(YMM(R3), YMM(6), YMM(1)) \
VFMADD231PS(YMM( 5), YMM(7), YMM(1)) \
VSCATTERQPS(MEM(RCX,ZMM(2),8) MASK_K(3), YMM(R3)) \
VSCATTERQPS(MEM(RCX,ZMM(3),8) MASK_K(4), YMM( 5)) \
\
KXNORW(K(1), K(0), K(0)) \
KXNORW(K(2), K(0), K(0)) \
KXNORW(K(3), K(0), K(0)) \
KXNORW(K(4), K(0), K(0)) \
VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \
VEXTRACTF64X4(YMM(5), ZMM(R4), IMM(1)) \
VGATHERQPS(YMM(6) MASK_K(1), MEM(RDX,ZMM(2),8)) \
VGATHERQPS(YMM(7) MASK_K(2), MEM(RDX,ZMM(3),8)) \
VFMADD231PS(YMM(R4), YMM(6), YMM(1)) \
VFMADD231PS(YMM( 5), YMM(7), YMM(1)) \
VSCATTERQPS(MEM(RDX,ZMM(2),8) MASK_K(3), YMM(R4)) \
VSCATTERQPS(MEM(RDX,ZMM(3),8) MASK_K(4), YMM( 5)) \
\
LEA(RCX, MEM(RCX,RAX,1)) \
LEA(RDX, MEM(RDX,RAX,1))
#define UPDATE_C_BZ_ROW_SCATTERED(R1,R2,R3,R4) \
\
KXNORW(K(1), K(0), K(0)) \
KXNORW(K(2), K(0), K(0)) \
VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \
VEXTRACTF64X4(YMM(5), ZMM(R1), IMM(1)) \
VSCATTERQPS(MEM(RCX,ZMM(2),8) MASK_K(1), YMM(R1)) \
VSCATTERQPS(MEM(RCX,ZMM(3),8) MASK_K(2), YMM( 5)) \
\
KXNORW(K(1), K(0), K(0)) \
KXNORW(K(2), K(0), K(0)) \
VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \
VEXTRACTF64X4(YMM(5), ZMM(R2), IMM(1)) \
VSCATTERQPS(MEM(RDX,ZMM(2),8) MASK_K(1), YMM(R2)) \
VSCATTERQPS(MEM(RDX,ZMM(3),8) MASK_K(2), YMM( 5)) \
\
LEA(RCX, MEM(RCX,RAX,1)) \
LEA(RDX, MEM(RDX,RAX,1)) \
\
KXNORW(K(1), K(0), K(0)) \
KXNORW(K(2), K(0), K(0)) \
VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \
VEXTRACTF64X4(YMM(5), ZMM(R3), IMM(1)) \
VSCATTERQPS(MEM(RCX,ZMM(2),8) MASK_K(1), YMM(R3)) \
VSCATTERQPS(MEM(RCX,ZMM(3),8) MASK_K(2), YMM( 5)) \
\
KXNORW(K(1), K(0), K(0)) \
KXNORW(K(2), K(0), K(0)) \
VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \
VEXTRACTF64X4(YMM(5), ZMM(R4), IMM(1)) \
VSCATTERQPS(MEM(RDX,ZMM(2),8) MASK_K(1), YMM(R4)) \
VSCATTERQPS(MEM(RDX,ZMM(3),8) MASK_K(2), YMM( 5)) \
\
LEA(RCX, MEM(RCX,RAX,1)) \
LEA(RDX, MEM(RDX,RAX,1))
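Note why the float macros above differ from the double versions: VGATHERQPS/VSCATTERQPS pair eight 32-bit C elements with eight 64-bit indices, so each 16-float zmm accumulator must be split into two ymm halves (via VEXTRACTF64X4) before the gather/scatter. Below is an intrinsics sketch of that splitting idea, under the assumption that the index vectors hold element offsets (scatter16_f32 is a hypothetical helper, not a BLIS function):

#include <immintrin.h>

// Scatter 16 floats using two 8-lane 64-bit-index scatters (AVX-512F).
static void scatter16_f32( float* base, __m512i idx_lo, __m512i idx_hi,
                           __m512 v )
{
    __m256 lo = _mm512_castps512_ps256( v );            // lanes 0..7
    __m256 hi = _mm256_castpd_ps(                       // lanes 8..15
                    _mm512_extractf64x4_pd( _mm512_castps_pd( v ), 1 ) );
    _mm512_i64scatter_ps( base, idx_lo, lo, 4 );        // scale 4 = sizeof(float)
    _mm512_i64scatter_ps( base, idx_hi, hi, 4 );
}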
#ifdef PREFETCH_C_L2
#undef PREFETCH_C_L2
#define PREFETCH_C_L2 \
\
PREFETCH(1, MEM(RCX, 0*64)) \
PREFETCH(1, MEM(RCX, 1*64)) \
\
PREFETCH(1, MEM(RCX,R12,1,0*64)) \
PREFETCH(1, MEM(RCX,R12,1,1*64)) \
\
PREFETCH(1, MEM(RCX,R12,2,0*64)) \
PREFETCH(1, MEM(RCX,R12,2,1*64)) \
\
PREFETCH(1, MEM(RCX,R13,1,0*64)) \
PREFETCH(1, MEM(RCX,R13,1,1*64)) \
\
PREFETCH(1, MEM(RCX,R12,4,0*64)) \
PREFETCH(1, MEM(RCX,R12,4,1*64)) \
\
PREFETCH(1, MEM(RCX,R14,1,0*64)) \
PREFETCH(1, MEM(RCX,R14,1,1*64)) \
\
PREFETCH(1, MEM(RCX,R13,2,0*64)) \
PREFETCH(1, MEM(RCX,R13,2,1*64)) \
\
PREFETCH(1, MEM(RCX,R15,1,0*64)) \
PREFETCH(1, MEM(RCX,R15,1,1*64)) \
\
PREFETCH(1, MEM(RDX, 0*64)) \
PREFETCH(1, MEM(RDX, 1*64)) \
\
PREFETCH(1, MEM(RDX,R12,1,0*64)) \
PREFETCH(1, MEM(RDX,R12,1,1*64)) \
\
PREFETCH(1, MEM(RDX,R12,2,0*64)) \
PREFETCH(1, MEM(RDX,R12,2,1*64)) \
\
PREFETCH(1, MEM(RDX,R13,1,0*64)) \
PREFETCH(1, MEM(RDX,R13,1,1*64))
#else
#undef PREFETCH_C_L2
#define PREFETCH_C_L2
#endif
#define PREFETCH_C_L1 \
\
PREFETCHW0(MEM(RCX, 0*64)) \
PREFETCHW0(MEM(RCX, 1*64)) \
PREFETCHW0(MEM(RCX,R12,1,0*64)) \
PREFETCHW0(MEM(RCX,R12,1,1*64)) \
PREFETCHW0(MEM(RCX,R12,2,0*64)) \
PREFETCHW0(MEM(RCX,R12,2,1*64)) \
PREFETCHW0(MEM(RCX,R13,1,0*64)) \
PREFETCHW0(MEM(RCX,R13,1,1*64)) \
PREFETCHW0(MEM(RCX,R12,4,0*64)) \
PREFETCHW0(MEM(RCX,R12,4,1*64)) \
PREFETCHW0(MEM(RCX,R14,1,0*64)) \
PREFETCHW0(MEM(RCX,R14,1,1*64)) \
PREFETCHW0(MEM(RCX,R13,2,0*64)) \
PREFETCHW0(MEM(RCX,R13,2,1*64)) \
PREFETCHW0(MEM(RCX,R15,1,0*64)) \
PREFETCHW0(MEM(RCX,R15,1,1*64)) \
PREFETCHW0(MEM(RDX, 0*64)) \
PREFETCHW0(MEM(RDX, 1*64)) \
PREFETCHW0(MEM(RDX,R12,1,0*64)) \
PREFETCHW0(MEM(RDX,R12,1,1*64)) \
PREFETCHW0(MEM(RDX,R12,2,0*64)) \
PREFETCHW0(MEM(RDX,R12,2,1*64)) \
PREFETCHW0(MEM(RDX,R13,1,0*64)) \
PREFETCHW0(MEM(RDX,R13,1,1*64))
//
// n: index in unrolled loop
//
// a: ZMM register to load into
// b: ZMM register to read from
//
// ...: addressing for B, except for offset
//
#define SUBITER(n) \
\
PREFETCH_A_L1(n, 0) \
\
VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+ 0)*4)) \
VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+ 1)*4)) \
VFMADD231PS(ZMM( 8), ZMM(0), ZMM(3)) \
VFMADD231PS(ZMM( 9), ZMM(1), ZMM(3)) \
VFMADD231PS(ZMM(10), ZMM(0), ZMM(4)) \
VFMADD231PS(ZMM(11), ZMM(1), ZMM(4)) \
\
VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+ 2)*4)) \
VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+ 3)*4)) \
VFMADD231PS(ZMM(12), ZMM(0), ZMM(3)) \
VFMADD231PS(ZMM(13), ZMM(1), ZMM(3)) \
VFMADD231PS(ZMM(14), ZMM(0), ZMM(4)) \
VFMADD231PS(ZMM(15), ZMM(1), ZMM(4)) \
\
VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+ 4)*4)) \
VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+ 5)*4)) \
VFMADD231PS(ZMM(16), ZMM(0), ZMM(3)) \
VFMADD231PS(ZMM(17), ZMM(1), ZMM(3)) \
VFMADD231PS(ZMM(18), ZMM(0), ZMM(4)) \
VFMADD231PS(ZMM(19), ZMM(1), ZMM(4)) \
\
PREFETCH_A_L1(n, 1) \
\
VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+ 6)*4)) \
VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+ 7)*4)) \
VFMADD231PS(ZMM(20), ZMM(0), ZMM(3)) \
VFMADD231PS(ZMM(21), ZMM(1), ZMM(3)) \
VFMADD231PS(ZMM(22), ZMM(0), ZMM(4)) \
VFMADD231PS(ZMM(23), ZMM(1), ZMM(4)) \
\
VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+ 8)*4)) \
VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+ 9)*4)) \
VFMADD231PS(ZMM(24), ZMM(0), ZMM(3)) \
VFMADD231PS(ZMM(25), ZMM(1), ZMM(3)) \
VFMADD231PS(ZMM(26), ZMM(0), ZMM(4)) \
VFMADD231PS(ZMM(27), ZMM(1), ZMM(4)) \
\
VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+10)*4)) \
VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+11)*4)) \
VFMADD231PS(ZMM(28), ZMM(0), ZMM(3)) \
VFMADD231PS(ZMM(29), ZMM(1), ZMM(3)) \
VFMADD231PS(ZMM(30), ZMM(0), ZMM(4)) \
VFMADD231PS(ZMM(31), ZMM(1), ZMM(4)) \
\
VMOVAPD(ZMM(0), MEM(RAX,(16*n+0)*8)) \
VMOVAPD(ZMM(1), MEM(RAX,(16*n+8)*8))
// This array holds the offsets used by the scatter/gather instructions.
static int64_t offsets[16] __attribute__((aligned(64))) =
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15};
void bli_sgemm_skx_asm_32x12_l2(
dim_t k_,
float* restrict alpha,
float* restrict a,
float* restrict b,
float* restrict beta,
float* restrict c, inc_t rs_c_, inc_t cs_c_,
auxinfo_t* data,
cntx_t* restrict cntx
)
{
(void)data;
(void)cntx;
const int64_t* offsetPtr = &offsets[0];
const int64_t k = k_;
const int64_t rs_c = rs_c_;
const int64_t cs_c = cs_c_;
__asm__ volatile
(
VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers
VMOVAPD(YMM( 7), YMM(8))
VMOVAPD(YMM( 9), YMM(8))
VMOVAPD(YMM(10), YMM(8)) MOV(RSI, VAR(k)) //loop index
VMOVAPD(YMM(11), YMM(8)) MOV(RAX, VAR(a)) //load address of a
VMOVAPD(YMM(12), YMM(8)) MOV(RBX, VAR(b)) //load address of b
VMOVAPD(YMM(13), YMM(8)) MOV(RCX, VAR(c)) //load address of c
VMOVAPD(YMM(14), YMM(8))
VMOVAPD(YMM(15), YMM(8)) VMOVAPD(ZMM(0), MEM(RAX, 0*4)) //pre-load a
VMOVAPD(YMM(16), YMM(8)) VMOVAPD(ZMM(1), MEM(RAX, 16*4)) //pre-load a
VMOVAPD(YMM(17), YMM(8))
VMOVAPD(YMM(18), YMM(8))
VMOVAPD(YMM(19), YMM(8)) MOV(R12, VAR(cs_c)) //cs_c
VMOVAPD(YMM(20), YMM(8)) LEA(R13, MEM(R12,R12,2)) //*3
VMOVAPD(YMM(21), YMM(8)) LEA(R14, MEM(R12,R12,4)) //*5
VMOVAPD(YMM(22), YMM(8)) LEA(R15, MEM(R14,R12,2)) //*7
VMOVAPD(YMM(23), YMM(8)) LEA(RDX, MEM(RCX,R12,8)) //c + 8*cs_c
VMOVAPD(YMM(24), YMM(8))
VMOVAPD(YMM(25), YMM(8)) MOV(R8, IMM(32*4)) //mr*sizeof(float)
VMOVAPD(YMM(26), YMM(8)) MOV(R9, IMM(12*4)) //nr*sizeof(float)
VMOVAPD(YMM(27), YMM(8))
VMOVAPD(YMM(28), YMM(8)) LEA(RAX, MEM(RAX,R8,1)) //adjust a for pre-load
VMOVAPD(YMM(29), YMM(8))
VMOVAPD(YMM(30), YMM(8))
VMOVAPD(YMM(31), YMM(8))
TEST(RSI, RSI)
JZ(POSTACCUM)
#ifdef PREFETCH_A_BEFORE
/* Prefetch 8 cache lines of A (4 iterations worth of data:
   32 (MR) x 4 (sizeof(float)) x 4 iters / 64 = 8 cache lines). */
PREFETCH(0, MEM(RAX,0*64))
PREFETCH(0, MEM(RAX,1*64))
PREFETCH(0, MEM(RAX,2*64))
PREFETCH(0, MEM(RAX,3*64))
PREFETCH(0, MEM(RAX,4*64))
PREFETCH(0, MEM(RAX,5*64))
PREFETCH(0, MEM(RAX,6*64))
PREFETCH(0, MEM(RAX,7*64))
#endif
#ifdef PREFETCH_B_BEFORE
/* Prefetch 3 cache lines of B (4 iterations worth of data:
   12 (NR) x 4 (sizeof(float)) x 4 iters / 64 = 3 cache lines). */
PREFETCH(0, MEM(RBX,0*64))
PREFETCH(0, MEM(RBX,1*64))
PREFETCH(0, MEM(RBX,2*64))
#endif
PREFETCH_C_L2
MOV(RDI, RSI)
AND(RSI, IMM(3))
SAR(RDI, IMM(2))
SUB(RDI, IMM(0+TAIL_NITER))
JLE(K_SMALL)
LOOP_ALIGN
LABEL(MAIN_LOOP)
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4))
SUBITER(0)
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4+64))
SUBITER(1)
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4+128))
SUBITER(2)
SUBITER(3)
LEA(RAX, MEM(RAX,R8,4))
LEA(RBX, MEM(RBX,R9,4))
DEC(RDI)
JNZ(MAIN_LOOP)
LABEL(K_SMALL)
PREFETCH_C_L1
ADD(RDI, IMM(0+TAIL_NITER))
JZ(TAIL_LOOP)
LOOP_ALIGN
LABEL(SMALL_LOOP)
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4))
SUBITER(0)
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4+64))
SUBITER(1)
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4+128))
SUBITER(2)
SUBITER(3)
LEA(RAX, MEM(RAX,R8,4))
LEA(RBX, MEM(RBX,R9,4))
DEC(RDI)
JNZ(SMALL_LOOP)
TEST(RSI, RSI)
JZ(POSTACCUM)
LOOP_ALIGN
LABEL(TAIL_LOOP)
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4))
SUBITER(0)
ADD(RAX, R8)
ADD(RBX, R9)
DEC(RSI)
JNZ(TAIL_LOOP)
LABEL(POSTACCUM)
#ifdef PREFETCH_A_AFTER
MOV(R8, VAR(a))
PREFETCH(0, MEM(R8,0*64))
PREFETCH(0, MEM(R8,1*64))
PREFETCH(0, MEM(R8,2*64))
PREFETCH(0, MEM(R8,3*64))
PREFETCH(0, MEM(R8,4*64))
PREFETCH(0, MEM(R8,5*64))
PREFETCH(0, MEM(R8,6*64))
PREFETCH(0, MEM(R8,7*64))
#endif
#ifdef PREFETCH_B_AFTER
MOV(R9, VAR(b))
PREFETCH(0, MEM(R9,0*64))
PREFETCH(0, MEM(R9,1*64))
PREFETCH(0, MEM(R9,2*64))
#endif
MOV(RAX, VAR(alpha))
MOV(RBX, VAR(beta))
VBROADCASTSS(ZMM(0), MEM(RAX))
VBROADCASTSS(ZMM(1), MEM(RBX))
MOV(RAX, VAR(cs_c))
LEA(RAX, MEM(,RAX,4))
MOV(RBX, VAR(rs_c))
LEA(RBX, MEM(,RBX,4))
// Check whether C is column-stored (rs_c == 1). If not, jump to the slow scattered update.
CMP(RBX, IMM(4))
JNE(SCATTEREDUPDATE)
VCOMISD(XMM(1), XMM(7))
JE(COLSTORBZ)
UPDATE_C( 8, 9,10,11)
UPDATE_C(12,13,14,15)
UPDATE_C(16,17,18,19)
UPDATE_C(20,21,22,23)
UPDATE_C(24,25,26,27)
UPDATE_C(28,29,30,31)
JMP(END)
LABEL(COLSTORBZ)
UPDATE_C_BZ( 8, 9,10,11)
UPDATE_C_BZ(12,13,14,15)
UPDATE_C_BZ(16,17,18,19)
UPDATE_C_BZ(20,21,22,23)
UPDATE_C_BZ(24,25,26,27)
UPDATE_C_BZ(28,29,30,31)
JMP(END)
LABEL(SCATTEREDUPDATE)
LEA(RDX, MEM(RCX,RBX,8))
LEA(RDX, MEM(RDX,RBX,8))
MOV(RDI, VAR(offsetPtr))
VMOVDQA64(ZMM(2), MEM(RDI,0*64))
VMOVDQA64(ZMM(3), MEM(RDI,1*64))
VPBROADCASTQ(ZMM(6), RBX)
VPMULLQ(ZMM(2), ZMM(6), ZMM(2))
VPMULLQ(ZMM(3), ZMM(6), ZMM(3))
VCOMISD(XMM(1), XMM(7))
JE(SCATTERBZ)
UPDATE_C_ROW_SCATTERED( 8, 9,10,11)
UPDATE_C_ROW_SCATTERED(12,13,14,15)
UPDATE_C_ROW_SCATTERED(16,17,18,19)
UPDATE_C_ROW_SCATTERED(20,21,22,23)
UPDATE_C_ROW_SCATTERED(24,25,26,27)
UPDATE_C_ROW_SCATTERED(28,29,30,31)
JMP(END)
LABEL(SCATTERBZ)
UPDATE_C_BZ_ROW_SCATTERED( 8, 9,10,11)
UPDATE_C_BZ_ROW_SCATTERED(12,13,14,15)
UPDATE_C_BZ_ROW_SCATTERED(16,17,18,19)
UPDATE_C_BZ_ROW_SCATTERED(20,21,22,23)
UPDATE_C_BZ_ROW_SCATTERED(24,25,26,27)
UPDATE_C_BZ_ROW_SCATTERED(28,29,30,31)
LABEL(END)
VZEROUPPER()
: // output operands
: // input operands
[k] "m" (k),
[a] "m" (a),
[b] "m" (b),
[alpha] "m" (alpha),
[beta] "m" (beta),
[c] "m" (c),
[rs_c] "m" (rs_c),
[cs_c] "m" (cs_c),
[offsetPtr] "m" (offsetPtr)
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
"r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
"zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13",
"zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
"zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
"zmm30", "zmm31", "memory"
);
}


@@ -0,0 +1,40 @@
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of The University of Texas at Austin nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
GEMM_UKR_PROT( float , s, gemm_skx_asm_32x12_l2 )
GEMM_UKR_PROT( float , s, gemm_skx_asm_12x32_l2 )
GEMM_UKR_PROT( double, d, gemm_skx_asm_16x12_l2 )