mirror of
https://github.com/amd/blis.git
synced 2026-05-11 09:39:59 +00:00
Merge branch 'master' of github.com:flame/blis
This commit is contained in:
78
config/skx/bli_cntx_init_skx.c
Normal file
78
config/skx/bli_cntx_init_skx.c
Normal file
@@ -0,0 +1,78 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
// Initialize the BLIS context for the Skylake-X (skx) subconfiguration:
// install the optimized native gemm micro-kernels and the architecture-
// specific register/cache blocksizes on top of the reference defaults.
void bli_cntx_init_skx( cntx_t* cntx )
{
	blksz_t bs[ BLIS_NUM_BLKSZS ];

	// Begin with reference settings so every context slot has a sane default.
	bli_cntx_init_skx_ref( cntx );

	// -------------------------------------------------------------------------

	// Install the optimized native gemm micro-kernels along with their
	// storage preferences (FALSE => column-preferential output).
	bli_cntx_set_l3_nat_ukrs
	(
	  2,
	  BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE,
	  BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x12_l2, FALSE,
	  cntx
	);

	// Populate the level-3 blocksize objects with skx-tuned values.
	// KC also carries distinct maximum (edge-case) values.
	//                                            s      d      c      z
	bli_blksz_init_easy( &bs[ BLIS_MR ],         32,    16,     3,     3 );
	bli_blksz_init_easy( &bs[ BLIS_NR ],         12,    12,     8,     4 );
	bli_blksz_init_easy( &bs[ BLIS_MC ],        480,   240,   144,    72 );
	bli_blksz_init     ( &bs[ BLIS_KC ],        384,   384,   256,   256,
	                                            480,   480,   256,   256 );
	bli_blksz_init_easy( &bs[ BLIS_NC ],       3072,  3072,  4080,  4080 );

	// Commit the register and cache blocksizes (and their multiples) to the
	// context for native execution.
	bli_cntx_set_blkszs
	(
	  BLIS_NAT, 5,
	  BLIS_NC, &bs[ BLIS_NC ], BLIS_NR,
	  BLIS_KC, &bs[ BLIS_KC ], BLIS_KR,
	  BLIS_MC, &bs[ BLIS_MC ], BLIS_MR,
	  BLIS_NR, &bs[ BLIS_NR ], BLIS_NR,
	  BLIS_MR, &bs[ BLIS_MR ], BLIS_MR,
	  cntx
	);
}
|
||||
|
||||
133
config/skx/bli_family_skx.h
Normal file
133
config/skx/bli_family_skx.h
Normal file
@@ -0,0 +1,133 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
//#ifndef BLIS_FAMILY_H
|
||||
//#define BLIS_FAMILY_H
|
||||
|
||||
// -- THREADING PARAMETERS -----------------------------------------------------
|
||||
|
||||
#define BLIS_DEFAULT_M_THREAD_RATIO 3
|
||||
#define BLIS_DEFAULT_N_THREAD_RATIO 2
|
||||
|
||||
#define BLIS_DEFAULT_MR_THREAD_MAX 1
|
||||
#define BLIS_DEFAULT_NR_THREAD_MAX 4
|
||||
|
||||
// -- MEMORY ALLOCATION --------------------------------------------------------
|
||||
|
||||
#define BLIS_SIMD_ALIGN_SIZE 64
|
||||
|
||||
#define BLIS_SIMD_SIZE 64
|
||||
#define BLIS_SIMD_NUM_REGISTERS 32
|
||||
|
||||
#ifdef BLIS_NO_HBWMALLOC
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#define BLIS_MALLOC_POOL malloc
|
||||
#define BLIS_FREE_POOL free
|
||||
|
||||
#else
|
||||
|
||||
#include <hbwmalloc.h>
|
||||
|
||||
#define BLIS_MALLOC_POOL hbw_malloc
|
||||
#define BLIS_FREE_POOL hbw_free
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#if 0
|
||||
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
|
||||
|
||||
// -- Cache and register blocksizes --
|
||||
|
||||
//
|
||||
// Constraints:
|
||||
//
|
||||
// (1) MC must be a multiple of:
|
||||
// (a) MR (for zero-padding purposes)
|
||||
// (b) NR (for zero-padding purposes when MR and NR are "swapped")
|
||||
// (2) NC must be a multiple of
|
||||
// (a) NR (for zero-padding purposes)
|
||||
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
|
||||
//
|
||||
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_16x12_l2
|
||||
#define BLIS_DEFAULT_MC_D 144
|
||||
#define BLIS_DEFAULT_KC_D 336
|
||||
#define BLIS_DEFAULT_NC_D 5760
|
||||
#define BLIS_DEFAULT_MR_D 16
|
||||
#define BLIS_DEFAULT_NR_D 12
|
||||
#define BLIS_PACKDIM_MR_D 16
|
||||
#define BLIS_PACKDIM_NR_D 12
|
||||
|
||||
// NOTE: If the micro-kernel, which is typically unrolled to a factor
|
||||
// of f, handles leftover edge cases (ie: when k % f > 0) then these
|
||||
// register blocksizes in the k dimension can be defined to 1.
|
||||
|
||||
//#define BLIS_DEFAULT_KR_S 1
|
||||
//#define BLIS_DEFAULT_KR_D 1
|
||||
//#define BLIS_DEFAULT_KR_C 1
|
||||
//#define BLIS_DEFAULT_KR_Z 1
|
||||
|
||||
// -- Maximum cache blocksizes (for optimizing edge cases) --
|
||||
|
||||
// NOTE: These cache blocksize "extensions" have the same constraints as
|
||||
// the corresponding default blocksizes above. When these values are
|
||||
// larger than the default blocksizes, blocksizes used at edge cases are
|
||||
// enlarged if such an extension would encompass the remaining portion of
|
||||
// the matrix dimension.
|
||||
|
||||
#define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4)
|
||||
#define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4)
|
||||
#define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + 0)
|
||||
|
||||
#define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4)
|
||||
#define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4)
|
||||
#define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + 0)
|
||||
|
||||
//#define BLIS_MAXIMUM_MC_C (BLIS_DEFAULT_MC_C + BLIS_DEFAULT_MC_C/4)
|
||||
//#define BLIS_MAXIMUM_KC_C (BLIS_DEFAULT_KC_C + BLIS_DEFAULT_KC_C/4)
|
||||
//#define BLIS_MAXIMUM_NC_C (BLIS_DEFAULT_NC_C + BLIS_DEFAULT_NC_C/4)
|
||||
|
||||
//#define BLIS_MAXIMUM_MC_Z (BLIS_DEFAULT_MC_Z + BLIS_DEFAULT_MC_Z/4)
|
||||
//#define BLIS_MAXIMUM_KC_Z (BLIS_DEFAULT_KC_Z + BLIS_DEFAULT_KC_Z/4)
|
||||
//#define BLIS_MAXIMUM_NC_Z (BLIS_DEFAULT_NC_Z + BLIS_DEFAULT_NC_Z/4)
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
//#endif
|
||||
|
||||
115
config/skx/make_defs.mk
Normal file
115
config/skx/make_defs.mk
Normal file
@@ -0,0 +1,115 @@
|
||||
#
|
||||
#
|
||||
# BLIS
|
||||
# An object-based framework for developing high-performance BLAS-like
|
||||
# libraries.
|
||||
#
|
||||
# Copyright (C) 2014, The University of Texas at Austin
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are
|
||||
# met:
|
||||
# - Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# - Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# - Neither the name of The University of Texas at Austin nor the names
|
||||
# of its contributors may be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#
|
||||
|
||||
|
||||
# Declare the name of the current configuration and add it to the
|
||||
# running list of configurations included by common.mk.
|
||||
THIS_CONFIG := skx
|
||||
#CONFIGS_INCL += $(THIS_CONFIG)
|
||||
|
||||
#
|
||||
# --- Determine the C compiler and related flags ---
|
||||
#
|
||||
|
||||
ifeq ($(CC),)
|
||||
CC := gcc
|
||||
CC_VENDOR := gcc
|
||||
endif
|
||||
|
||||
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 -m64
|
||||
CPICFLAGS := -fPIC
|
||||
CWARNFLAGS := -Wall -Wno-unused-function -Wfatal-errors
|
||||
|
||||
ifneq ($(DEBUG_TYPE),off)
|
||||
CDBGFLAGS := -g
|
||||
endif
|
||||
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0 -DBLIS_NO_HBWMALLOC
|
||||
else
|
||||
COPTFLAGS := -O3
|
||||
endif
|
||||
|
||||
ifeq ($(DEBUG_TYPE),sde)
|
||||
CPPROCFLAGS += -DBLIS_NO_HBWMALLOC
|
||||
endif
|
||||
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CVECFLAGS := -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse -march=skylake-avx512
|
||||
else
|
||||
ifeq ($(CC_VENDOR),icc)
|
||||
CVECFLAGS := -xCORE-AVX512
|
||||
else
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
CVECFLAGS := -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse -march=skylake-avx512
|
||||
else
|
||||
$(error gcc, icc, or clang is required for this configuration.)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# The assembler on OS X won't recognize AVX512 without help
|
||||
ifneq ($(CC_VENDOR),icc)
|
||||
ifeq ($(OS_NAME),Darwin)
|
||||
CVECFLAGS += -Wa,-march=skylake-avx512
|
||||
endif
|
||||
endif
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cr
|
||||
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
|
||||
ifneq ($(DEBUG_TYPE),sde)
|
||||
LDFLAGS := -lmemkind
|
||||
else
|
||||
LDFLAGS :=
|
||||
endif
|
||||
|
||||
ifneq ($(CC_VENDOR),icc)
|
||||
LDFLAGS += -lm
|
||||
endif
|
||||
|
||||
# Store all of the variables here to new variables containing the
|
||||
# configuration name.
|
||||
$(eval $(call store-make-defs,$(THIS_CONFIG)))
|
||||
|
||||
@@ -18,6 +18,7 @@ haswell: haswell
|
||||
sandybridge: sandybridge
|
||||
penryn: penryn
|
||||
knl: knl
|
||||
skx: skx
|
||||
|
||||
# AMD architectures.
|
||||
zen: zen/haswell
|
||||
|
||||
@@ -48,6 +48,9 @@ arch_t bli_arch_query_id( void )
|
||||
#endif
|
||||
|
||||
// Intel microarchitectures.
|
||||
#ifdef BLIS_FAMILY_SKX
|
||||
id = BLIS_ARCH_SKX;
|
||||
#endif
|
||||
#ifdef BLIS_FAMILY_KNL
|
||||
id = BLIS_ARCH_KNL;
|
||||
#endif
|
||||
|
||||
@@ -47,6 +47,10 @@ arch_t bli_cpuid_query_id( void )
|
||||
{
|
||||
// Check for each Intel configuration that is enabled, check for that
|
||||
// microarchitecture. We check from most recent to most dated.
|
||||
#ifdef BLIS_CONFIG_SKX
|
||||
if ( bli_cpuid_is_skx( family, model, features ) )
|
||||
return BLIS_ARCH_SKX;
|
||||
#endif
|
||||
#ifdef BLIS_CONFIG_KNL
|
||||
if ( bli_cpuid_is_knl( family, model, features ) )
|
||||
return BLIS_ARCH_KNL;
|
||||
@@ -65,6 +69,8 @@ arch_t bli_cpuid_query_id( void )
|
||||
#endif
|
||||
// If none of the other sub-configurations were detected, return
|
||||
// the 'generic' arch_t id value.
|
||||
printf("generic\n");
|
||||
|
||||
return BLIS_ARCH_GENERIC;
|
||||
}
|
||||
else if ( vendor == VENDOR_AMD )
|
||||
@@ -105,6 +111,31 @@ arch_t bli_cpuid_query_id( void )
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
bool_t bli_cpuid_is_skx
|
||||
(
|
||||
uint32_t family,
|
||||
uint32_t model,
|
||||
uint32_t features
|
||||
)
|
||||
{
|
||||
// Check for expected CPU features.
|
||||
const uint32_t expected = FEATURE_AVX |
|
||||
FEATURE_FMA3 |
|
||||
FEATURE_AVX2 |
|
||||
FEATURE_AVX512F |
|
||||
FEATURE_AVX512DQ |
|
||||
FEATURE_AVX512BW |
|
||||
FEATURE_AVX512VL ;
|
||||
|
||||
|
||||
int nvpu = vpu_count();
|
||||
|
||||
if ( !bli_cpuid_has_features( features, expected ) || nvpu != 2 ) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
bool_t bli_cpuid_is_knl
|
||||
(
|
||||
@@ -629,6 +660,95 @@ uint32_t bli_cpuid_query
|
||||
return VENDOR_UNKNOWN;
|
||||
}
|
||||
|
||||
// Fill 'cpu_name' with the 48-byte processor brand string obtained from
// CPUID leaves 0x80000002..0x80000004 (16 bytes per leaf, returned in
// eax/ebx/ecx/edx). The caller must supply a buffer of at least 48 bytes;
// note the brand string is not guaranteed to be NUL-terminated within
// those 48 bytes, so callers should reserve/zero an extra byte.
void get_cpu_name(char *cpu_name)
{
	uint32_t eax, ebx, ecx, edx;

	// Copy each 32-bit register into the byte buffer with memcpy rather
	// than via '*(uint32_t *)&cpu_name[i]' casts: the casts invoke
	// undefined behavior (strict-aliasing violation and potentially
	// misaligned access); memcpy compiles to the same stores.
	__cpuid(0x80000002u, eax, ebx, ecx, edx);

	memcpy(&cpu_name[ 0], &eax, sizeof(eax));
	memcpy(&cpu_name[ 4], &ebx, sizeof(ebx));
	memcpy(&cpu_name[ 8], &ecx, sizeof(ecx));
	memcpy(&cpu_name[12], &edx, sizeof(edx));

	__cpuid(0x80000003u, eax, ebx, ecx, edx);

	memcpy(&cpu_name[16+ 0], &eax, sizeof(eax));
	memcpy(&cpu_name[16+ 4], &ebx, sizeof(ebx));
	memcpy(&cpu_name[16+ 8], &ecx, sizeof(ecx));
	memcpy(&cpu_name[16+12], &edx, sizeof(edx));

	__cpuid(0x80000004u, eax, ebx, ecx, edx);

	memcpy(&cpu_name[32+ 0], &eax, sizeof(eax));
	memcpy(&cpu_name[32+ 4], &ebx, sizeof(ebx));
	memcpy(&cpu_name[32+ 8], &ecx, sizeof(ecx));
	memcpy(&cpu_name[32+12], &edx, sizeof(edx));
}
|
||||
|
||||
// Infer the number of AVX-512 VPUs (FMA units) per core for the running
// Intel CPU by parsing the brand string / SKU number.
// Returns 2 or 1 for recognized parts, -1 if the part cannot be classified.
int vpu_count()
{
	char  cpu_name[48] = {0};   // '= {}' is not valid C11; zero-init explicitly
	char *loc;
	char  model_num[5];
	int   sku;

	get_cpu_name(cpu_name);

	if (strstr(cpu_name, "Intel(R) Xeon(R)") != NULL)
	{
		// Find the tier word preceding the 4-digit SKU; "W" covers Xeon W.
		loc = strstr(cpu_name, "Platinum");
		if (loc == NULL)
			loc = strstr(cpu_name, "Gold");
		if (loc == NULL)
			loc = strstr(cpu_name, "Silver");
		if (loc == NULL)
			loc = strstr(cpu_name, "Bronze");
		if (loc == NULL)
			loc = strstr(cpu_name, "W");
		if (loc == NULL)
			return -1;

		// Advance to the space before the SKU digits.
		loc = strstr(loc+1," ");
		if (loc == NULL)
			return -1;

		// Copy the 4-digit SKU. strncpy() does not guarantee termination,
		// so terminate explicitly at index 4. (The original code wrote
		// model_num[5], one element past the end of the array -- an
		// out-of-bounds write -- and left index 4 indeterminate.)
		strncpy(model_num, loc+1, 4);
		model_num[4] = '\0';

		sku = atoi(model_num);

		// SKU ranges with two VPUs per core return 2; one VPU returns 1.
		if      (8199 >= sku && sku >= 8100) return 2;
		else if (6199 >= sku && sku >= 6100) return 2;
		else if (sku == 5122)                return 2;
		else if (5199 >= sku && sku >= 5100) return 1;
		else if (4199 >= sku && sku >= 4100) return 1;
		else if (3199 >= sku && sku >= 3100) return 1;
		else if (2199 >= sku && sku >= 2120) return 2;
		else if (2119 >= sku && sku >= 2100) return 1;
		else                                 return -1;
	}
	else if (strstr(cpu_name, "Intel(R) Core(TM) i9") != NULL)
	{
		return 1;
	}
	else if (strstr(cpu_name, "Intel(R) Core(TM) i7") != NULL)
	{
		// Only the Skylake-X i7 parts have AVX-512 at all.
		if (strstr(cpu_name, "7800X") != NULL ||
		    strstr(cpu_name, "7820X") != NULL)
			return 1;
		else
			return -1;
	}
	else
	{
		return -1;
	}
}
|
||||
|
||||
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM)
|
||||
|
||||
int get_cpu_type( int* model, int* part, int* features )
|
||||
|
||||
@@ -37,6 +37,7 @@
|
||||
|
||||
arch_t bli_cpuid_query_id( void );
|
||||
|
||||
bool_t bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features );
|
||||
bool_t bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features );
|
||||
bool_t bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features );
|
||||
bool_t bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features );
|
||||
@@ -100,6 +101,10 @@ static bool_t bli_cpuid_has_features( uint32_t have, uint32_t want )
|
||||
|
||||
#include "cpuid.h"
|
||||
|
||||
void get_cpu_name(char *cpu_name);
|
||||
int vpu_count();
|
||||
|
||||
|
||||
enum
|
||||
{
|
||||
VENDOR_INTEL,
|
||||
|
||||
@@ -71,6 +71,11 @@ void bli_gks_init( void )
|
||||
// bli_config.h.
|
||||
|
||||
// Intel architectures
|
||||
#ifdef BLIS_CONFIG_SKX
|
||||
bli_gks_register_cntx( BLIS_ARCH_SKX, bli_cntx_init_skx,
|
||||
bli_cntx_init_skx_ref,
|
||||
bli_cntx_init_skx_ind );
|
||||
#endif
|
||||
#ifdef BLIS_CONFIG_KNL
|
||||
bli_gks_register_cntx( BLIS_ARCH_KNL, bli_cntx_init_knl,
|
||||
bli_cntx_init_knl_ref,
|
||||
|
||||
@@ -41,7 +41,9 @@
|
||||
//
|
||||
|
||||
// -- Intel64 architectures --
|
||||
|
||||
#ifdef BLIS_CONFIG_SKX
|
||||
CNTX_INIT_PROTS( skx )
|
||||
#endif
|
||||
#ifdef BLIS_CONFIG_KNL
|
||||
CNTX_INIT_PROTS( knl )
|
||||
#endif
|
||||
@@ -121,7 +123,9 @@ CNTX_INIT_PROTS( generic )
|
||||
#endif
|
||||
|
||||
// -- Intel64 architectures --
|
||||
|
||||
#ifdef BLIS_FAMILY_SKX
|
||||
#include "bli_family_skx.h"
|
||||
#endif
|
||||
#ifdef BLIS_FAMILY_KNL
|
||||
#include "bli_family_knl.h"
|
||||
#endif
|
||||
@@ -189,7 +193,9 @@ CNTX_INIT_PROTS( generic )
|
||||
//
|
||||
|
||||
// -- Intel64 architectures --
|
||||
|
||||
#ifdef BLIS_KERNELS_SKX
|
||||
#include "bli_kernels_skx.h"
|
||||
#endif
|
||||
#ifdef BLIS_KERNELS_KNL
|
||||
#include "bli_kernels_knl.h"
|
||||
#endif
|
||||
|
||||
@@ -815,7 +815,8 @@ typedef enum
|
||||
typedef enum
|
||||
{
|
||||
// Intel
|
||||
BLIS_ARCH_KNL = 0,
|
||||
BLIS_ARCH_SKX =0,
|
||||
BLIS_ARCH_KNL,
|
||||
BLIS_ARCH_KNC,
|
||||
BLIS_ARCH_HASWELL,
|
||||
BLIS_ARCH_SANDYBRIDGE,
|
||||
@@ -842,7 +843,7 @@ typedef enum
|
||||
|
||||
} arch_t;
|
||||
|
||||
#define BLIS_NUM_ARCHS 16
|
||||
#define BLIS_NUM_ARCHS 17
|
||||
|
||||
|
||||
//
|
||||
|
||||
171
kernels/skx/3/bli_avx512_macros.h
Normal file
171
kernels/skx/3/bli_avx512_macros.h
Normal file
@@ -0,0 +1,171 @@
|
||||
#ifndef BLIS_AVX512_MACROS_H
#define BLIS_AVX512_MACROS_H

//
// Assembly macros to make AVX-512 with AT&T syntax somewhat less painful.
// Each mnemonic macro takes its operands in Intel (destination-first) order
// and emits an AT&T-order instruction string for use inside asm() blocks.
//

#define COMMENT_BEGIN "#"
#define COMMENT_END

// Turn tokens into a string and append the "\n\t" separator asm() expects.
#define STRINGIFY(...) #__VA_ARGS__
#define ASM(...) STRINGIFY(__VA_ARGS__) "\n\t"
#define LABEL(label) STRINGIFY(label) ":\n\t"

// Register name helpers ('%%' survives into the asm template).
#define XMM(x) %%xmm##x
#define YMM(x) %%ymm##x
#define ZMM(x) %%zmm##x
#define EAX %%eax
#define EBX %%ebx
#define ECX %%ecx
#define EDX %%edx
#define EBP %%ebp
#define EDI %%edi
#define ESI %%esi
#define RAX %%rax
#define RBX %%rbx
#define RCX %%rcx
#define RDX %%rdx
#define RBP %%rbp
#define RDI %%rdi
#define RSI %%rsi
#define K(x) %%k##x
#define R(x) %%r##x
#define R8 %%r8
#define R9 %%r9
#define R10 %%r10
#define R11 %%r11
#define R12 %%r12
#define R13 %%r13
#define R14 %%r14
#define R15 %%r15
#define RD(x) %%r##x##d
#define R8D %%r8d
#define R9D %%r9d
#define R10D %%r10d
#define R11D %%r11d
#define R12D %%r12d
#define R13D %%r13d
#define R14D %%r14d
#define R15D %%r15d
#define IMM(x) $##x
#define VAR(x) %[x]

// Memory operand builders: disp(base,index,scale) with 1-4 components.
#define MEM_4(reg,off,scale,disp) disp(reg,off,scale)
#define MEM_3(reg,off,scale) (reg,off,scale)
#define MEM_2(reg,disp) disp(reg)
#define MEM_1(reg) (reg)

// Broadcast-load variants ({1to8} for doubles, {1to16} for floats).
#define MEM_1TO8_4(reg,off,scale,disp) MEM(reg,off,scale,disp) %{1to8%}
#define MEM_1TO8_3(reg,off,scale) MEM(reg,off,scale) %{1to8%}
#define MEM_1TO8_2(reg,disp) MEM(reg,disp) %{1to8%}
#define MEM_1TO8_1(reg) MEM(reg) %{1to8%}

#define MEM_1TO16_4(reg,off,scale,disp) MEM(reg,off,scale,disp) %{1to16%}
#define MEM_1TO16_3(reg,off,scale) MEM(reg,off,scale) %{1to16%}
#define MEM_1TO16_2(reg,disp) MEM(reg,disp) %{1to16%}
#define MEM_1TO16_1(reg) MEM(reg) %{1to16%}

// Arity dispatch for the MEM* families.
#define GET_MACRO(_1,_2,_3,_4,NAME,...) NAME
#define MEM(...) GET_MACRO(__VA_ARGS__,MEM_4,MEM_3,MEM_2,MEM_1)(__VA_ARGS__)
#define MEM_1TO8(...) GET_MACRO(__VA_ARGS__,MEM_1TO8_4,MEM_1TO8_3,MEM_1TO8_2,MEM_1TO8_1)(__VA_ARGS__)
#define MEM_1TO16(...) GET_MACRO(__VA_ARGS__,MEM_1TO16_4,MEM_1TO16_3,MEM_1TO16_2,MEM_1TO16_1)(__VA_ARGS__)

// Opmask suffixes: merge-masking and zero-masking.
#define MASK_K(n) %{%%k##n%}
#define MASK_KZ(n) %{%%k##n%}%{z%}
#define KMOV(to,from) ASM(kmovw from, to)
// Jump if the given mask register is nonzero.
#define JKNZD(kreg,label) \
    ASM(kortestw kreg, kreg) \
    ASM(jnz label)
#define KXNORW(_0, _1, _2) ASM(kxnorw _2, _1, _0)
#define KSHIFTRW(_0, _1, _2) ASM(kshiftrw _2, _1, _0)

#define ALIGN16 ASM(.p2align 4)
#define ALIGN32 ASM(.p2align 5)
// FIX: the instruction is "rdtsc"; the original defined ASM(rdstc), which
// would fail to assemble if this macro were ever used.
#define RDTSC ASM(rdtsc)
#define MOV(_0, _1) ASM(mov _1, _0)
#define MOVD(_0, _1) ASM(movd _1, _0)
#define MOVL(_0, _1) ASM(movl _1, _0)
#define MOVQ(_0, _1) ASM(movq _1, _0)
#define VMOVD(_0, _1) ASM(vmovd _1, _0)
#define VMOVQ(_0, _1) ASM(vmovq _1, _0)
#define CMP(_0, _1) ASM(cmp _1, _0)
#define AND(_0, _1) ASM(and _1, _0)
#define ADD(_0, _1) ASM(add _1, _0)
#define SUB(_0, _1) ASM(sub _1, _0)
#define SAL(_0, _1) ASM(sal _1, _0)
#define SHLX(_0, _1, _2) ASM(shlx _2, _1, _0)
#define SAR(_0, _1) ASM(sar _1, _0)
#define SAL1(_0) ASM(sal _0)
#define SAR1(_0) ASM(sar _0)
#define LEA(_0, _1) ASM(lea _1, _0)
#define TEST(_0, _1) ASM(test _1, _0)
#define DEC(_0) ASM(dec _0)
#define JLE(_0) ASM(jle _0)
#define JL(_0) ASM(jl _0)
#define JNZ(_0) ASM(jnz _0)
#define JZ(_0) ASM(jz _0)
#define JNE(_0) ASM(jne _0)
#define JE(_0) ASM(je _0)
#define JNC(_0) ASM(jnc _0)
#define JC(_0) ASM(jc _0)
#define JMP(_0) ASM(jmp _0)
#define VCOMISS(_0, _1) ASM(vcomiss _1, _0)
#define VCOMISD(_0, _1) ASM(vcomisd _1, _0)
#define VGATHERDPS(_0, _1) ASM(vgatherdps _1, _0)
#define VSCATTERDPS(_0, _1) ASM(vscatterdps _1, _0)
#define VGATHERDPD(_0, _1) ASM(vgatherdpd _1, _0)
#define VSCATTERDPD(_0, _1) ASM(vscatterdpd _1, _0)
#define VGATHERQPS(_0, _1) ASM(vgatherqps _1, _0)
#define VSCATTERQPS(_0, _1) ASM(vscatterqps _1, _0)
#define VGATHERQPD(_0, _1) ASM(vgatherqpd _1, _0)
#define VSCATTERQPD(_0, _1) ASM(vscatterqpd _1, _0)
#define VMULSS(_0, _1, _2) ASM(vmulss _2, _1, _0)
#define VMULSD(_0, _1, _2) ASM(vmulsd _2, _1, _0)
#define VMULPS(_0, _1, _2) ASM(vmulps _2, _1, _0)
#define VMULPD(_0, _1, _2) ASM(vmulpd _2, _1, _0)
#define VPMULLD(_0, _1, _2) ASM(vpmulld _2, _1, _0)
#define VPMULLQ(_0, _1, _2) ASM(vpmullq _2, _1, _0)
#define VPADDD(_0, _1, _2) ASM(vpaddd _2, _1, _0)
#define VPSLLD(_0, _1, _2) ASM(vpslld _2, _1, _0)
#define VPXORD(_0, _1, _2) ASM(vpxord _2, _1, _0)
#define VXORPD(_0, _1, _2) ASM(vxorpd _2, _1, _0)
#define VFMADD132PS(_0, _1, _2) ASM(vfmadd132ps _2, _1, _0)
#define VFMADD213PS(_0, _1, _2) ASM(vfmadd213ps _2, _1, _0)
#define VFMADD231PS(_0, _1, _2) ASM(vfmadd231ps _2, _1, _0)
#define VFMADD132PD(_0, _1, _2) ASM(vfmadd132pd _2, _1, _0)
#define VFMADD213PD(_0, _1, _2) ASM(vfmadd213pd _2, _1, _0)
#define VFMADD231PD(_0, _1, _2) ASM(vfmadd231pd _2, _1, _0)
#define VMOVDQA(_0, _1) ASM(vmovdqa _1, _0)
#define VMOVDQA32(_0, _1) ASM(vmovdqa32 _1, _0)
#define VMOVDQA64(_0, _1) ASM(vmovdqa64 _1, _0)
#define VMOVSS(_0, _1) ASM(vmovss _1, _0)
#define VMOVSD(_0, _1) ASM(vmovsd _1, _0)
#define VMOVAPS(_0, _1) ASM(vmovaps _1, _0)
#define VMOVUPS(_0, _1) ASM(vmovups _1, _0)
#define VMOVAPD(_0, _1) ASM(vmovapd _1, _0)
#define VMOVUPD(_0, _1) ASM(vmovupd _1, _0)
#define VBROADCASTSS(_0, _1) ASM(vbroadcastss _1, _0)
#define VBROADCASTSD(_0, _1) ASM(vbroadcastsd _1, _0)
#define VPBROADCASTD(_0, _1) ASM(vpbroadcastd _1, _0)
#define VPBROADCASTQ(_0, _1) ASM(vpbroadcastq _1, _0)
#define VBROADCASTF64X4(_0, _1) ASM(vbroadcastf64x4 _1, _0)
#define VINSERTF64X4(_0, _1, _2, _3) ASM(vinsertf64x4 _3, _2, _1, _0)
#define VEXTRACTF64X4(_0, _1, _2) ASM(vextractf64x4 _2, _1, _0)
#define VUNPCKLPD(_0, _1, _2) ASM(vunpcklpd _2, _1, _0)
#define VUNPCKHPD(_0, _1, _2) ASM(vunpckhpd _2, _1, _0)
#define VSHUFF64X2(_0, _1, _2, _3) ASM(vshuff64x2 _3, _2, _1, _0)
#define VUNPCKLPS(_0, _1, _2) ASM(vunpcklps _2, _1, _0)
#define VUNPCKHPS(_0, _1, _2) ASM(vunpckhps _2, _1, _0)
#define VSHUFPS(_0, _1, _2, _3) ASM(vshufps _3, _2, _1, _0)
#define VPERM2F128(_0, _1, _2, _3) ASM(vperm2f128 _3, _2, _1, _0)
#define PREFETCH(LEVEL,ADDRESS) ASM(prefetcht##LEVEL ADDRESS)
#define PREFETCHW0(ADDRESS) ASM(prefetchw ADDRESS)
#define PREFETCHW1(ADDRESS) ASM(prefetchwt1 ADDRESS)
#define VGATHERPFDPS(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dps ADDRESS)
#define VSCATTERPFDPS(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dps ADDRESS)
#define VGATHERPFDPD(LEVEL,ADDRESS) ASM(vgatherpf##LEVEL##dpd ADDRESS)
#define VSCATTERPFDPD(LEVEL,ADDRESS) ASM(vscatterpf##LEVEL##dpd ADDRESS)
#define VZEROUPPER() ASM(vzeroupper)

#endif
|
||||
547
kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c
Normal file
547
kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c
Normal file
@@ -0,0 +1,547 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
|
||||
OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "blis.h"

#include <assert.h>

#include "bli_avx512_macros.h"

#define A_L1_PREFETCH_DIST 4 //should be multiple of 2

/*The pointer of B is moved ahead by one iteration of k
  before the loop starts.Therefore, prefetching 3 k iterations
  ahead*/
#define B_L1_PREFETCH_DIST 4

/* Number of unrolled-by-4 k iterations peeled off the end of the main
 * loop, so that the C prefetches issued at K_SMALL land shortly before
 * the C update. */
#define TAIL_NITER 8

#define CACHELINE_SIZE 64 //size of cache line in bytes

/* During each subiteration, prefetch 2 cache lines of A one UNROLL
 * factor ahead. 2 cache lines = 16 doubles (MR).
 * n: subiteration index (0..3); k: which of the two lines (0 or 1). */
#define PREFETCH_A_L1(n, k) \
    PREFETCH(0, MEM(RAX, A_L1_PREFETCH_DIST*16*8 + (2*n+k) * CACHELINE_SIZE))

/* Preload B for the first iteration of the main loop:
 * 6 cache lines = 4 k iterations x 12 doubles (NR). */
#define PREFETCH_B_L1_1ITER \
    PREFETCH(0, MEM(RBX                 )) \
    PREFETCH(0, MEM(RBX,   CACHELINE_SIZE)) \
    PREFETCH(0, MEM(RBX, 2*CACHELINE_SIZE)) \
    PREFETCH(0, MEM(RBX, 3*CACHELINE_SIZE)) \
    PREFETCH(0, MEM(RBX, 4*CACHELINE_SIZE)) \
    PREFETCH(0, MEM(RBX, 5*CACHELINE_SIZE))

#define LOOP_ALIGN ALIGN16

/* Column-stored C update (general beta): for two adjacent columns held in
 * R1..R4, C = alpha*AB + beta*C. RAX = cs_c in bytes, ZMM0 = alpha,
 * ZMM1 = beta; advances RCX by two columns. */
#define UPDATE_C(R1,R2,R3,R4) \
\
    VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \
    VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \
    VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \
    VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \
    VFMADD231PD(ZMM(R1), ZMM(1), MEM(RCX,0*64)) \
    VFMADD231PD(ZMM(R2), ZMM(1), MEM(RCX,1*64)) \
    VFMADD231PD(ZMM(R3), ZMM(1), MEM(RCX,RAX,1,0*64)) \
    VFMADD231PD(ZMM(R4), ZMM(1), MEM(RCX,RAX,1,1*64)) \
    VMOVUPD(MEM(RCX,0*64), ZMM(R1)) \
    VMOVUPD(MEM(RCX,1*64), ZMM(R2)) \
    VMOVUPD(MEM(RCX,RAX,1,0*64), ZMM(R3)) \
    VMOVUPD(MEM(RCX,RAX,1,1*64), ZMM(R4)) \
    LEA(RCX, MEM(RCX,RAX,2))

/* Column-stored C update for beta == 0: C = alpha*AB (no load of C). */
#define UPDATE_C_BZ(R1,R2,R3,R4) \
\
    VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \
    VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \
    VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \
    VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \
    VMOVUPD(MEM(RCX,0*64), ZMM(R1)) \
    VMOVUPD(MEM(RCX,1*64), ZMM(R2)) \
    VMOVUPD(MEM(RCX,RAX,1,0*64), ZMM(R3)) \
    VMOVUPD(MEM(RCX,RAX,1,1*64), ZMM(R4)) \
    LEA(RCX, MEM(RCX,RAX,2))

/* General-stride C update (general beta) via gather/scatter.
 * ZMM2/ZMM3 hold row offsets pre-multiplied by rs_c (in elements);
 * the gather/scatter scale of 8 converts them to bytes (doubles).
 * The k-mask registers are consumed by each gather/scatter and must be
 * re-set to all-ones (KXNORW with K0) before every use. */
#define UPDATE_C_ROW_SCATTERED(R1,R2,R3,R4) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \
    VGATHERQPD(ZMM(6) MASK_K(1), MEM(RCX,ZMM(2),8)) \
    VFMADD231PD(ZMM(R1), ZMM(6), ZMM(1)) \
    VSCATTERQPD(MEM(RCX,ZMM(2),8) MASK_K(2), ZMM(R1)) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \
    VGATHERQPD(ZMM(6) MASK_K(1), MEM(RCX,ZMM(3),8)) \
    VFMADD231PD(ZMM(R2), ZMM(6), ZMM(1)) \
    VSCATTERQPD(MEM(RCX,ZMM(3),8) MASK_K(2), ZMM(R2)) \
\
    LEA(RCX, MEM(RCX,RAX,1)) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \
    VGATHERQPD(ZMM(6) MASK_K(1), MEM(RCX,ZMM(2),8)) \
    VFMADD231PD(ZMM(R3), ZMM(6), ZMM(1)) \
    VSCATTERQPD(MEM(RCX,ZMM(2),8) MASK_K(2), ZMM(R3)) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \
    VGATHERQPD(ZMM(6) MASK_K(1), MEM(RCX,ZMM(3),8)) \
    VFMADD231PD(ZMM(R4), ZMM(6), ZMM(1)) \
    VSCATTERQPD(MEM(RCX,ZMM(3),8) MASK_K(2), ZMM(R4)) \
\
    LEA(RCX, MEM(RCX,RAX,1))

/* General-stride C update for beta == 0: scatter only, no gather of C. */
#define UPDATE_C_BZ_ROW_SCATTERED(R1,R2,R3,R4) \
\
    KXNORW(K(1), K(0), K(0)) \
    VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \
    VSCATTERQPD(MEM(RCX,ZMM(2),8) MASK_K(1), ZMM(R1)) \
\
    KXNORW(K(1), K(0), K(0)) \
    VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \
    VSCATTERQPD(MEM(RCX,ZMM(3),8) MASK_K(1), ZMM(R2)) \
\
    LEA(RCX, MEM(RCX,RAX,1)) \
\
    KXNORW(K(1), K(0), K(0)) \
    VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \
    VSCATTERQPD(MEM(RCX,ZMM(2),8) MASK_K(1), ZMM(R3)) \
\
    KXNORW(K(1), K(0), K(0)) \
    VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \
    VSCATTERQPD(MEM(RCX,ZMM(3),8) MASK_K(1), ZMM(R4)) \
\
    LEA(RCX, MEM(RCX,RAX,1))

/* Optional L2 prefetch of the 16x12 C tile (2 cache lines per column;
 * RCX = c, RDX = c + 8*cs_c, R12/R13/R14/R15 = cs_c * {1,3,5,7}).
 * Enabled only when PREFETCH_C_L2 is defined at build time. */
#ifdef PREFETCH_C_L2
#undef PREFETCH_C_L2
#define PREFETCH_C_L2 \
\
    PREFETCH(1, MEM(RCX,      0*64)) \
    PREFETCH(1, MEM(RCX,      1*64)) \
\
    PREFETCH(1, MEM(RCX,R12,1,0*64)) \
    PREFETCH(1, MEM(RCX,R12,1,1*64)) \
\
    PREFETCH(1, MEM(RCX,R12,2,0*64)) \
    PREFETCH(1, MEM(RCX,R12,2,1*64)) \
\
    PREFETCH(1, MEM(RCX,R13,1,0*64)) \
    PREFETCH(1, MEM(RCX,R13,1,1*64)) \
\
    PREFETCH(1, MEM(RCX,R12,4,0*64)) \
    PREFETCH(1, MEM(RCX,R12,4,1*64)) \
\
    PREFETCH(1, MEM(RCX,R14,1,0*64)) \
    PREFETCH(1, MEM(RCX,R14,1,1*64)) \
\
    PREFETCH(1, MEM(RCX,R13,2,0*64)) \
    PREFETCH(1, MEM(RCX,R13,2,1*64)) \
\
    PREFETCH(1, MEM(RCX,R15,1,0*64)) \
    PREFETCH(1, MEM(RCX,R15,1,1*64)) \
\
    PREFETCH(1, MEM(RDX,      0*64)) \
    PREFETCH(1, MEM(RDX,      1*64)) \
\
    PREFETCH(1, MEM(RDX,R12,1,0*64)) \
    PREFETCH(1, MEM(RDX,R12,1,1*64)) \
\
    PREFETCH(1, MEM(RDX,R12,2,0*64)) \
    PREFETCH(1, MEM(RDX,R12,2,1*64)) \
\
    PREFETCH(1, MEM(RDX,R13,1,0*64)) \
    PREFETCH(1, MEM(RDX,R13,1,1*64))

#else
#undef PREFETCH_C_L2
#define PREFETCH_C_L2
#endif

/* L1 prefetch (with intent to write) of the full 16x12 C tile,
 * issued TAIL_NITER unrolled iterations before the end of the k loop. */
#define PREFETCH_C_L1 \
\
    PREFETCHW0(MEM(RCX,      0*64)) \
    PREFETCHW0(MEM(RCX,      1*64)) \
    PREFETCHW0(MEM(RCX,R12,1,0*64)) \
    PREFETCHW0(MEM(RCX,R12,1,1*64)) \
    PREFETCHW0(MEM(RCX,R12,2,0*64)) \
    PREFETCHW0(MEM(RCX,R12,2,1*64)) \
    PREFETCHW0(MEM(RCX,R13,1,0*64)) \
    PREFETCHW0(MEM(RCX,R13,1,1*64)) \
    PREFETCHW0(MEM(RCX,R12,4,0*64)) \
    PREFETCHW0(MEM(RCX,R12,4,1*64)) \
    PREFETCHW0(MEM(RCX,R14,1,0*64)) \
    PREFETCHW0(MEM(RCX,R14,1,1*64)) \
    PREFETCHW0(MEM(RCX,R13,2,0*64)) \
    PREFETCHW0(MEM(RCX,R13,2,1*64)) \
    PREFETCHW0(MEM(RCX,R15,1,0*64)) \
    PREFETCHW0(MEM(RCX,R15,1,1*64)) \
    PREFETCHW0(MEM(RDX,      0*64)) \
    PREFETCHW0(MEM(RDX,      1*64)) \
    PREFETCHW0(MEM(RDX,R12,1,0*64)) \
    PREFETCHW0(MEM(RDX,R12,1,1*64)) \
    PREFETCHW0(MEM(RDX,R12,2,0*64)) \
    PREFETCHW0(MEM(RDX,R12,2,1*64)) \
    PREFETCHW0(MEM(RDX,R13,1,0*64)) \
    PREFETCHW0(MEM(RDX,R13,1,1*64))

//
// One rank-1 update of the 16x12 accumulator tile (ZMM8..ZMM31):
//
// n: index in unrolled loop (0..3)
//
// ZMM0/ZMM1: the two halves of the current A column (pre-loaded);
// ZMM3/ZMM4: broadcast B elements. B offsets are (12*n+j)*8 bytes
// (NR = 12 doubles); the next A column is loaded at the end.
//
#define SUBITER(n) \
\
    PREFETCH_A_L1(n, 0) \
\
    VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+ 0)*8)) \
    VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+ 1)*8)) \
    VFMADD231PD(ZMM( 8), ZMM(0), ZMM(3)) \
    VFMADD231PD(ZMM( 9), ZMM(1), ZMM(3)) \
    VFMADD231PD(ZMM(10), ZMM(0), ZMM(4)) \
    VFMADD231PD(ZMM(11), ZMM(1), ZMM(4)) \
\
    VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+ 2)*8)) \
    VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+ 3)*8)) \
    VFMADD231PD(ZMM(12), ZMM(0), ZMM(3)) \
    VFMADD231PD(ZMM(13), ZMM(1), ZMM(3)) \
    VFMADD231PD(ZMM(14), ZMM(0), ZMM(4)) \
    VFMADD231PD(ZMM(15), ZMM(1), ZMM(4)) \
\
    VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+ 4)*8)) \
    VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+ 5)*8)) \
    VFMADD231PD(ZMM(16), ZMM(0), ZMM(3)) \
    VFMADD231PD(ZMM(17), ZMM(1), ZMM(3)) \
    VFMADD231PD(ZMM(18), ZMM(0), ZMM(4)) \
    VFMADD231PD(ZMM(19), ZMM(1), ZMM(4)) \
\
    PREFETCH_A_L1(n, 1) \
\
    VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+ 6)*8)) \
    VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+ 7)*8)) \
    VFMADD231PD(ZMM(20), ZMM(0), ZMM(3)) \
    VFMADD231PD(ZMM(21), ZMM(1), ZMM(3)) \
    VFMADD231PD(ZMM(22), ZMM(0), ZMM(4)) \
    VFMADD231PD(ZMM(23), ZMM(1), ZMM(4)) \
\
    VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+ 8)*8)) \
    VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+ 9)*8)) \
    VFMADD231PD(ZMM(24), ZMM(0), ZMM(3)) \
    VFMADD231PD(ZMM(25), ZMM(1), ZMM(3)) \
    VFMADD231PD(ZMM(26), ZMM(0), ZMM(4)) \
    VFMADD231PD(ZMM(27), ZMM(1), ZMM(4)) \
\
    VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+10)*8)) \
    VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+11)*8)) \
    VFMADD231PD(ZMM(28), ZMM(0), ZMM(3)) \
    VFMADD231PD(ZMM(29), ZMM(1), ZMM(3)) \
    VFMADD231PD(ZMM(30), ZMM(0), ZMM(4)) \
    VFMADD231PD(ZMM(31), ZMM(1), ZMM(4)) \
\
    VMOVAPD(ZMM(0), MEM(RAX,(16*n+0)*8)) \
    VMOVAPD(ZMM(1), MEM(RAX,(16*n+8)*8))

//This is an array used for the scatter/gather instructions.
static int64_t offsets[16] __attribute__((aligned(64))) =
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15};
|
||||
|
||||
|
||||
void bli_dgemm_skx_asm_16x12_l2(
|
||||
dim_t k_,
|
||||
double* restrict alpha,
|
||||
double* restrict a,
|
||||
double* restrict b,
|
||||
double* restrict beta,
|
||||
double* restrict c, inc_t rs_c_, inc_t cs_c_,
|
||||
auxinfo_t* data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
(void)data;
|
||||
(void)cntx;
|
||||
|
||||
const int64_t* offsetPtr = &offsets[0];
|
||||
const int64_t k = k_;
|
||||
const int64_t rs_c = rs_c_;
|
||||
const int64_t cs_c = cs_c_;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
|
||||
VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers
|
||||
VMOVAPD(YMM( 7), YMM(8))
|
||||
VMOVAPD(YMM( 9), YMM(8))
|
||||
VMOVAPD(YMM(10), YMM(8)) MOV(RSI, VAR(k)) //loop index
|
||||
VMOVAPD(YMM(11), YMM(8)) MOV(RAX, VAR(a)) //load address of a
|
||||
VMOVAPD(YMM(12), YMM(8)) MOV(RBX, VAR(b)) //load address of b
|
||||
VMOVAPD(YMM(13), YMM(8)) MOV(RCX, VAR(c)) //load address of c
|
||||
VMOVAPD(YMM(14), YMM(8))
|
||||
VMOVAPD(YMM(15), YMM(8)) VMOVAPD(ZMM(0), MEM(RAX, 0*8)) //pre-load a
|
||||
VMOVAPD(YMM(16), YMM(8)) VMOVAPD(ZMM(1), MEM(RAX, 8*8)) //pre-load a
|
||||
VMOVAPD(YMM(17), YMM(8))
|
||||
VMOVAPD(YMM(18), YMM(8))
|
||||
VMOVAPD(YMM(19), YMM(8)) MOV(R12, VAR(cs_c)) //cs_c
|
||||
VMOVAPD(YMM(20), YMM(8)) LEA(R13, MEM(R12,R12,2)) //*3
|
||||
VMOVAPD(YMM(21), YMM(8)) LEA(R14, MEM(R12,R12,4)) //*5
|
||||
VMOVAPD(YMM(22), YMM(8)) LEA(R15, MEM(R14,R12,2)) //*7
|
||||
VMOVAPD(YMM(23), YMM(8)) LEA(RDX, MEM(RCX,R12,8)) //c + 8*cs_c
|
||||
VMOVAPD(YMM(24), YMM(8))
|
||||
VMOVAPD(YMM(25), YMM(8)) MOV(R8, IMM(16*8)) //mr*sizeof(double)
|
||||
VMOVAPD(YMM(26), YMM(8)) MOV(R9, IMM(12*8)) //nr*sizeof(double)
|
||||
VMOVAPD(YMM(27), YMM(8))
|
||||
VMOVAPD(YMM(28), YMM(8)) LEA(RAX, MEM(RAX,R8,1)) //adjust a for pre-load
|
||||
VMOVAPD(YMM(29), YMM(8))
|
||||
VMOVAPD(YMM(30), YMM(8))
|
||||
VMOVAPD(YMM(31), YMM(8))
|
||||
|
||||
TEST(RSI, RSI)
|
||||
JZ(POSTACCUM)
|
||||
|
||||
#ifdef PREFETCH_A_BEFORE
|
||||
PREFETCH(0, MEM(RAX,0*64))
|
||||
PREFETCH(0, MEM(RAX,1*64))
|
||||
PREFETCH(0, MEM(RAX,2*64))
|
||||
PREFETCH(0, MEM(RAX,3*64))
|
||||
PREFETCH(0, MEM(RAX,4*64))
|
||||
PREFETCH(0, MEM(RAX,5*64))
|
||||
PREFETCH(0, MEM(RAX,6*64))
|
||||
PREFETCH(0, MEM(RAX,7*64))
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef PREFETCH_B_BEFORE
|
||||
PREFETCH(0, MEM(RBX,0*64))
|
||||
PREFETCH(0, MEM(RBX,1*64))
|
||||
PREFETCH(0, MEM(RBX,2*64))
|
||||
PREFETCH(0, MEM(RBX,3*64))
|
||||
PREFETCH(0, MEM(RBX,4*64))
|
||||
PREFETCH(0, MEM(RBX,5*64))
|
||||
#endif
|
||||
|
||||
PREFETCH_C_L2
|
||||
|
||||
MOV(RDI, RSI)
|
||||
AND(RSI, IMM(3))
|
||||
SAR(RDI, IMM(2))
|
||||
|
||||
SUB(RDI, IMM(0+TAIL_NITER))
|
||||
JLE(K_SMALL)
|
||||
|
||||
LOOP_ALIGN
|
||||
LABEL(MAIN_LOOP)
|
||||
|
||||
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8))
|
||||
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+64))
|
||||
SUBITER(0)
|
||||
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+128))
|
||||
SUBITER(1)
|
||||
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+192))
|
||||
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+256))
|
||||
SUBITER(2)
|
||||
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+320))
|
||||
SUBITER(3)
|
||||
|
||||
LEA(RAX, MEM(RAX,R8,4))
|
||||
LEA(RBX, MEM(RBX,R9,4))
|
||||
|
||||
DEC(RDI)
|
||||
|
||||
JNZ(MAIN_LOOP)
|
||||
|
||||
LABEL(K_SMALL)
|
||||
|
||||
PREFETCH_C_L1
|
||||
|
||||
ADD(RDI, IMM(0+TAIL_NITER))
|
||||
JZ(TAIL_LOOP)
|
||||
|
||||
LOOP_ALIGN
|
||||
LABEL(SMALL_LOOP)
|
||||
|
||||
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8))
|
||||
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+64))
|
||||
SUBITER(0)
|
||||
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+128))
|
||||
SUBITER(1)
|
||||
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+192))
|
||||
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+256))
|
||||
SUBITER(2)
|
||||
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+320))
|
||||
SUBITER(3)
|
||||
|
||||
LEA(RAX, MEM(RAX,R8,4))
|
||||
LEA(RBX, MEM(RBX,R9,4))
|
||||
|
||||
DEC(RDI)
|
||||
|
||||
JNZ(SMALL_LOOP)
|
||||
|
||||
TEST(RSI, RSI)
|
||||
JZ(POSTACCUM)
|
||||
|
||||
LOOP_ALIGN
|
||||
LABEL(TAIL_LOOP)
|
||||
|
||||
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8))
|
||||
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+64))
|
||||
SUBITER(0)
|
||||
|
||||
ADD(RAX, R8)
|
||||
ADD(RBX, R9)
|
||||
|
||||
DEC(RSI)
|
||||
|
||||
JNZ(TAIL_LOOP)
|
||||
|
||||
LABEL(POSTACCUM)
|
||||
|
||||
#ifdef PREFETCH_A_AFTER
|
||||
MOV(R8, VAR(a))
|
||||
PREFETCH(0, MEM(R8,0*64))
|
||||
PREFETCH(0, MEM(R8,1*64))
|
||||
PREFETCH(0, MEM(R8,2*64))
|
||||
PREFETCH(0, MEM(R8,3*64))
|
||||
PREFETCH(0, MEM(R8,4*64))
|
||||
PREFETCH(0, MEM(R8,5*64))
|
||||
PREFETCH(0, MEM(R8,6*64))
|
||||
PREFETCH(0, MEM(R8,7*64))
|
||||
#endif
|
||||
|
||||
#ifdef PREFETCH_B_AFTER
|
||||
MOV(R9, VAR(b))
|
||||
PREFETCH(0, MEM(R9,0*64))
|
||||
PREFETCH(0, MEM(R9,1*64))
|
||||
PREFETCH(0, MEM(R9,2*64))
|
||||
PREFETCH(0, MEM(R9,3*64))
|
||||
PREFETCH(0, MEM(R9,4*64))
|
||||
PREFETCH(0, MEM(R9,5*64))
|
||||
#endif
|
||||
|
||||
MOV(RAX, VAR(alpha))
|
||||
MOV(RBX, VAR(beta))
|
||||
VBROADCASTSD(ZMM(0), MEM(RAX))
|
||||
VBROADCASTSD(ZMM(1), MEM(RBX))
|
||||
|
||||
MOV(RAX, VAR(cs_c))
|
||||
LEA(RAX, MEM(,RAX,8))
|
||||
MOV(RBX, VAR(rs_c))
|
||||
|
||||
// Check if C is column stride. If not, jump to the slow scattered update
|
||||
CMP(RBX, IMM(1))
|
||||
JNE(SCATTEREDUPDATE)
|
||||
|
||||
VCOMISD(XMM(1), XMM(7))
|
||||
JE(COLSTORBZ)
|
||||
|
||||
UPDATE_C( 8, 9,10,11)
|
||||
UPDATE_C(12,13,14,15)
|
||||
UPDATE_C(16,17,18,19)
|
||||
UPDATE_C(20,21,22,23)
|
||||
UPDATE_C(24,25,26,27)
|
||||
UPDATE_C(28,29,30,31)
|
||||
|
||||
JMP(END)
|
||||
LABEL(COLSTORBZ)
|
||||
|
||||
UPDATE_C_BZ( 8, 9,10,11)
|
||||
UPDATE_C_BZ(12,13,14,15)
|
||||
UPDATE_C_BZ(16,17,18,19)
|
||||
UPDATE_C_BZ(20,21,22,23)
|
||||
UPDATE_C_BZ(24,25,26,27)
|
||||
UPDATE_C_BZ(28,29,30,31)
|
||||
|
||||
JMP(END)
|
||||
LABEL(SCATTEREDUPDATE)
|
||||
|
||||
MOV(RDI, VAR(offsetPtr))
|
||||
VMOVDQA64(ZMM(2), MEM(RDI,0*64))
|
||||
VMOVDQA64(ZMM(3), MEM(RDI,1*64))
|
||||
VPBROADCASTQ(ZMM(6), RBX)
|
||||
VPMULLQ(ZMM(2), ZMM(6), ZMM(2))
|
||||
VPMULLQ(ZMM(3), ZMM(6), ZMM(3))
|
||||
|
||||
VCOMISD(XMM(1), XMM(7))
|
||||
JE(SCATTERBZ)
|
||||
|
||||
UPDATE_C_ROW_SCATTERED( 8, 9,10,11)
|
||||
UPDATE_C_ROW_SCATTERED(12,13,14,15)
|
||||
UPDATE_C_ROW_SCATTERED(16,17,18,19)
|
||||
UPDATE_C_ROW_SCATTERED(20,21,22,23)
|
||||
UPDATE_C_ROW_SCATTERED(24,25,26,27)
|
||||
UPDATE_C_ROW_SCATTERED(28,29,30,31)
|
||||
|
||||
JMP(END)
|
||||
LABEL(SCATTERBZ)
|
||||
|
||||
UPDATE_C_BZ_ROW_SCATTERED( 8, 9,10,11)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(12,13,14,15)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(16,17,18,19)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(20,21,22,23)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(24,25,26,27)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(28,29,30,31)
|
||||
|
||||
LABEL(END)
|
||||
|
||||
VZEROUPPER()
|
||||
|
||||
: // output operands
|
||||
: // input operands
|
||||
[k] "m" (k),
|
||||
[a] "m" (a),
|
||||
[b] "m" (b),
|
||||
[alpha] "m" (alpha),
|
||||
[beta] "m" (beta),
|
||||
[c] "m" (c),
|
||||
[rs_c] "m" (rs_c),
|
||||
[cs_c] "m" (cs_c),
|
||||
[offsetPtr] "m" (offsetPtr)
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
|
||||
"r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
|
||||
"zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13",
|
||||
"zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
|
||||
"zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
|
||||
"zmm30", "zmm31", "memory"
|
||||
);
|
||||
}
|
||||
572
kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c
Normal file
572
kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c
Normal file
@@ -0,0 +1,572 @@
|
||||
/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name of The University of Texas at Austin nor the names
      of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/
|
||||
|
||||
#include "blis.h"

#include "bli_avx512_macros.h"

#define CACHELINE_SIZE 64 //size of cache line in bytes

#define A_L1_PREFETCH_DIST 4 //should be multiple of 2

/*The pointer of B is moved ahead by one iteration of k
  before the loop starts.Therefore, prefetching 3 k iterations
  ahead*/
#define B_L1_PREFETCH_DIST 4

/* Number of unrolled-by-4 k iterations peeled off the end of the main
 * loop, so that the C prefetches issued at K_SMALL land shortly before
 * the C update. */
#define TAIL_NITER 8

/* During each subiteration, prefetch 2 cache lines of A one UNROLL
 * factor ahead. 2 cache lines = 32 floats (MR).
 * n: subiteration index (0..3); k: which of the two lines (0 or 1). */
#define PREFETCH_A_L1(n, k) \
    PREFETCH(0, MEM(RAX, A_L1_PREFETCH_DIST*32*4 + (2*n+k) * CACHELINE_SIZE))

#define LOOP_ALIGN ALIGN16

/* Column-stored C update (general beta): for two adjacent columns held in
 * R1..R4, C = alpha*AB + beta*C. RAX = cs_c in bytes, ZMM0 = alpha,
 * ZMM1 = beta; advances RCX by two columns. */
#define UPDATE_C(R1,R2,R3,R4) \
\
    VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \
    VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \
    VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \
    VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \
    VFMADD231PS(ZMM(R1), ZMM(1), MEM(RCX,0*64)) \
    VFMADD231PS(ZMM(R2), ZMM(1), MEM(RCX,1*64)) \
    VFMADD231PS(ZMM(R3), ZMM(1), MEM(RCX,RAX,1,0*64)) \
    VFMADD231PS(ZMM(R4), ZMM(1), MEM(RCX,RAX,1,1*64)) \
    VMOVUPS(MEM(RCX,0*64), ZMM(R1)) \
    VMOVUPS(MEM(RCX,1*64), ZMM(R2)) \
    VMOVUPS(MEM(RCX,RAX,1,0*64), ZMM(R3)) \
    VMOVUPS(MEM(RCX,RAX,1,1*64), ZMM(R4)) \
    LEA(RCX, MEM(RCX,RAX,2))

/* Column-stored C update for beta == 0: C = alpha*AB (no load of C). */
#define UPDATE_C_BZ(R1,R2,R3,R4) \
\
    VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \
    VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \
    VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \
    VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \
    VMOVUPS(MEM(RCX,0*64), ZMM(R1)) \
    VMOVUPS(MEM(RCX,1*64), ZMM(R2)) \
    VMOVUPS(MEM(RCX,RAX,1,0*64), ZMM(R3)) \
    VMOVUPS(MEM(RCX,RAX,1,1*64), ZMM(R4)) \
    LEA(RCX, MEM(RCX,RAX,1))

/* General-stride C update (general beta) via gather/scatter. Each ZMM of
 * 16 floats is split into two YMM halves (VEXTRACTF64X4) because QPS
 * gathers/scatters with 64-bit indices move at most 8 floats. ZMM2/ZMM3
 * hold row offsets pre-multiplied by rs_c; k-masks are consumed by each
 * gather/scatter and re-armed with KXNORW.
 * NOTE(review): VMULPD is applied to float data here (presumably should
 * be VMULPS), and the gather/scatter scale of 8 is used even though the
 * offsets were built from a byte-scaled rs_c — confirm both against the
 * upstream kernel before relying on this path. Kept byte-for-byte. */
#define UPDATE_C_ROW_SCATTERED(R1,R2,R3,R4) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    KXNORW(K(3), K(0), K(0)) \
    KXNORW(K(4), K(0), K(0)) \
    VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \
    VEXTRACTF64X4(YMM(5), ZMM(R1), IMM(1)) \
    VGATHERQPS(YMM(6) MASK_K(1), MEM(RCX,ZMM(2),8)) \
    VGATHERQPS(YMM(7) MASK_K(2), MEM(RCX,ZMM(3),8)) \
    VFMADD231PS(YMM(R1), YMM(6), YMM(1)) \
    VFMADD231PS(YMM( 5), YMM(7), YMM(1)) \
    VSCATTERQPS(MEM(RCX,ZMM(2),8) MASK_K(3), YMM(R1)) \
    VSCATTERQPS(MEM(RCX,ZMM(3),8) MASK_K(4), YMM( 5)) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    KXNORW(K(3), K(0), K(0)) \
    KXNORW(K(4), K(0), K(0)) \
    VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \
    VEXTRACTF64X4(YMM(5), ZMM(R2), IMM(1)) \
    VGATHERQPS(YMM(6) MASK_K(1), MEM(RDX,ZMM(2),8)) \
    VGATHERQPS(YMM(7) MASK_K(2), MEM(RDX,ZMM(3),8)) \
    VFMADD231PS(YMM(R2), YMM(6), YMM(1)) \
    VFMADD231PS(YMM( 5), YMM(7), YMM(1)) \
    VSCATTERQPS(MEM(RDX,ZMM(2),8) MASK_K(3), YMM(R2)) \
    VSCATTERQPS(MEM(RDX,ZMM(3),8) MASK_K(4), YMM( 5)) \
\
    LEA(RCX, MEM(RCX,RAX,1)) \
    LEA(RDX, MEM(RDX,RAX,1)) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    KXNORW(K(3), K(0), K(0)) \
    KXNORW(K(4), K(0), K(0)) \
    VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \
    VEXTRACTF64X4(YMM(5), ZMM(R3), IMM(1)) \
    VGATHERQPS(YMM(6) MASK_K(1), MEM(RCX,ZMM(2),8)) \
    VGATHERQPS(YMM(7) MASK_K(2), MEM(RCX,ZMM(3),8)) \
    VFMADD231PS(YMM(R3), YMM(6), YMM(1)) \
    VFMADD231PS(YMM( 5), YMM(7), YMM(1)) \
    VSCATTERQPS(MEM(RCX,ZMM(2),8) MASK_K(3), YMM(R3)) \
    VSCATTERQPS(MEM(RCX,ZMM(3),8) MASK_K(4), YMM( 5)) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    KXNORW(K(3), K(0), K(0)) \
    KXNORW(K(4), K(0), K(0)) \
    VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \
    VEXTRACTF64X4(YMM(5), ZMM(R4), IMM(1)) \
    VGATHERQPS(YMM(6) MASK_K(1), MEM(RDX,ZMM(2),8)) \
    VGATHERQPS(YMM(7) MASK_K(2), MEM(RDX,ZMM(3),8)) \
    VFMADD231PS(YMM(R4), YMM(6), YMM(1)) \
    VFMADD231PS(YMM( 5), YMM(7), YMM(1)) \
    VSCATTERQPS(MEM(RDX,ZMM(2),8) MASK_K(3), YMM(R4)) \
    VSCATTERQPS(MEM(RDX,ZMM(3),8) MASK_K(4), YMM( 5)) \
\
    LEA(RCX, MEM(RCX,RAX,1)) \
    LEA(RDX, MEM(RDX,RAX,1))

/* General-stride C update for beta == 0: scatter only, no gather of C.
 * NOTE(review): same VMULPD-on-floats / scale-8 concerns as
 * UPDATE_C_ROW_SCATTERED above — confirm against upstream. */
#define UPDATE_C_BZ_ROW_SCATTERED(R1,R2,R3,R4) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \
    VEXTRACTF64X4(YMM(5), ZMM(R1), IMM(1)) \
    VSCATTERQPS(MEM(RCX,ZMM(2),8) MASK_K(1), YMM(R1)) \
    VSCATTERQPS(MEM(RCX,ZMM(3),8) MASK_K(2), YMM( 5)) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \
    VEXTRACTF64X4(YMM(5), ZMM(R2), IMM(1)) \
    VSCATTERQPS(MEM(RDX,ZMM(2),8) MASK_K(1), YMM(R2)) \
    VSCATTERQPS(MEM(RDX,ZMM(3),8) MASK_K(2), YMM( 5)) \
\
    LEA(RCX, MEM(RCX,RAX,1)) \
    LEA(RDX, MEM(RDX,RAX,1)) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \
    VEXTRACTF64X4(YMM(5), ZMM(R3), IMM(1)) \
    VSCATTERQPS(MEM(RCX,ZMM(2),8) MASK_K(1), YMM(R3)) \
    VSCATTERQPS(MEM(RCX,ZMM(3),8) MASK_K(2), YMM( 5)) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \
    VEXTRACTF64X4(YMM(5), ZMM(R4), IMM(1)) \
    VSCATTERQPS(MEM(RDX,ZMM(2),8) MASK_K(1), YMM(R4)) \
    VSCATTERQPS(MEM(RDX,ZMM(3),8) MASK_K(2), YMM( 5)) \
\
    LEA(RCX, MEM(RCX,RAX,1)) \
    LEA(RDX, MEM(RDX,RAX,1))

/* Optional L2 prefetch of the 32x12 C tile (2 cache lines per column;
 * RCX = c, RDX = c + 8*cs_c, R12/R13/R14/R15 = cs_c * {1,3,5,7}).
 * Enabled only when PREFETCH_C_L2 is defined at build time. */
#ifdef PREFETCH_C_L2
#undef PREFETCH_C_L2
#define PREFETCH_C_L2 \
\
    PREFETCH(1, MEM(RCX,      0*64)) \
    PREFETCH(1, MEM(RCX,      1*64)) \
\
    PREFETCH(1, MEM(RCX,R12,1,0*64)) \
    PREFETCH(1, MEM(RCX,R12,1,1*64)) \
\
    PREFETCH(1, MEM(RCX,R12,2,0*64)) \
    PREFETCH(1, MEM(RCX,R12,2,1*64)) \
\
    PREFETCH(1, MEM(RCX,R13,1,0*64)) \
    PREFETCH(1, MEM(RCX,R13,1,1*64)) \
\
    PREFETCH(1, MEM(RCX,R12,4,0*64)) \
    PREFETCH(1, MEM(RCX,R12,4,1*64)) \
\
    PREFETCH(1, MEM(RCX,R14,1,0*64)) \
    PREFETCH(1, MEM(RCX,R14,1,1*64)) \
\
    PREFETCH(1, MEM(RCX,R13,2,0*64)) \
    PREFETCH(1, MEM(RCX,R13,2,1*64)) \
\
    PREFETCH(1, MEM(RCX,R15,1,0*64)) \
    PREFETCH(1, MEM(RCX,R15,1,1*64)) \
\
    PREFETCH(1, MEM(RDX,      0*64)) \
    PREFETCH(1, MEM(RDX,      1*64)) \
\
    PREFETCH(1, MEM(RDX,R12,1,0*64)) \
    PREFETCH(1, MEM(RDX,R12,1,1*64)) \
\
    PREFETCH(1, MEM(RDX,R12,2,0*64)) \
    PREFETCH(1, MEM(RDX,R12,2,1*64)) \
\
    PREFETCH(1, MEM(RDX,R13,1,0*64)) \
    PREFETCH(1, MEM(RDX,R13,1,1*64))

#else
#undef PREFETCH_C_L2
#define PREFETCH_C_L2
#endif

/* L1 prefetch (with intent to write) of the full 32x12 C tile,
 * issued TAIL_NITER unrolled iterations before the end of the k loop. */
#define PREFETCH_C_L1 \
\
    PREFETCHW0(MEM(RCX,      0*64)) \
    PREFETCHW0(MEM(RCX,      1*64)) \
    PREFETCHW0(MEM(RCX,R12,1,0*64)) \
    PREFETCHW0(MEM(RCX,R12,1,1*64)) \
    PREFETCHW0(MEM(RCX,R12,2,0*64)) \
    PREFETCHW0(MEM(RCX,R12,2,1*64)) \
    PREFETCHW0(MEM(RCX,R13,1,0*64)) \
    PREFETCHW0(MEM(RCX,R13,1,1*64)) \
    PREFETCHW0(MEM(RCX,R12,4,0*64)) \
    PREFETCHW0(MEM(RCX,R12,4,1*64)) \
    PREFETCHW0(MEM(RCX,R14,1,0*64)) \
    PREFETCHW0(MEM(RCX,R14,1,1*64)) \
    PREFETCHW0(MEM(RCX,R13,2,0*64)) \
    PREFETCHW0(MEM(RCX,R13,2,1*64)) \
    PREFETCHW0(MEM(RCX,R15,1,0*64)) \
    PREFETCHW0(MEM(RCX,R15,1,1*64)) \
    PREFETCHW0(MEM(RDX,      0*64)) \
    PREFETCHW0(MEM(RDX,      1*64)) \
    PREFETCHW0(MEM(RDX,R12,1,0*64)) \
    PREFETCHW0(MEM(RDX,R12,1,1*64)) \
    PREFETCHW0(MEM(RDX,R12,2,0*64)) \
    PREFETCHW0(MEM(RDX,R12,2,1*64)) \
    PREFETCHW0(MEM(RDX,R13,1,0*64)) \
    PREFETCHW0(MEM(RDX,R13,1,1*64))

//
// One rank-1 update of the 32x12 accumulator tile (ZMM8..ZMM31):
//
// n: index in unrolled loop (0..3)
//
// ZMM0/ZMM1: the two halves of the current A column (pre-loaded);
// ZMM3/ZMM4: broadcast B elements; next A column loaded at the end.
// A offsets (16*n+{0,8})*8 equal (32*n+{0,16})*4 bytes, i.e. MR = 32
// floats per k iteration.
// NOTE(review): the B offsets use (12*n+j)*8, i.e. an 8-byte element
// stride, although B here holds 4-byte floats with NR = 12 — presumably
// this should be *4; confirm against the upstream kernel. Kept as-is.
//
#define SUBITER(n) \
\
    PREFETCH_A_L1(n, 0) \
\
    VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+ 0)*8)) \
    VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+ 1)*8)) \
    VFMADD231PS(ZMM( 8), ZMM(0), ZMM(3)) \
    VFMADD231PS(ZMM( 9), ZMM(1), ZMM(3)) \
    VFMADD231PS(ZMM(10), ZMM(0), ZMM(4)) \
    VFMADD231PS(ZMM(11), ZMM(1), ZMM(4)) \
\
    VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+ 2)*8)) \
    VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+ 3)*8)) \
    VFMADD231PS(ZMM(12), ZMM(0), ZMM(3)) \
    VFMADD231PS(ZMM(13), ZMM(1), ZMM(3)) \
    VFMADD231PS(ZMM(14), ZMM(0), ZMM(4)) \
    VFMADD231PS(ZMM(15), ZMM(1), ZMM(4)) \
\
    VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+ 4)*8)) \
    VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+ 5)*8)) \
    VFMADD231PS(ZMM(16), ZMM(0), ZMM(3)) \
    VFMADD231PS(ZMM(17), ZMM(1), ZMM(3)) \
    VFMADD231PS(ZMM(18), ZMM(0), ZMM(4)) \
    VFMADD231PS(ZMM(19), ZMM(1), ZMM(4)) \
\
    PREFETCH_A_L1(n, 1) \
\
    VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+ 6)*8)) \
    VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+ 7)*8)) \
    VFMADD231PS(ZMM(20), ZMM(0), ZMM(3)) \
    VFMADD231PS(ZMM(21), ZMM(1), ZMM(3)) \
    VFMADD231PS(ZMM(22), ZMM(0), ZMM(4)) \
    VFMADD231PS(ZMM(23), ZMM(1), ZMM(4)) \
\
    VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+ 8)*8)) \
    VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+ 9)*8)) \
    VFMADD231PS(ZMM(24), ZMM(0), ZMM(3)) \
    VFMADD231PS(ZMM(25), ZMM(1), ZMM(3)) \
    VFMADD231PS(ZMM(26), ZMM(0), ZMM(4)) \
    VFMADD231PS(ZMM(27), ZMM(1), ZMM(4)) \
\
    VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+10)*8)) \
    VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+11)*8)) \
    VFMADD231PS(ZMM(28), ZMM(0), ZMM(3)) \
    VFMADD231PS(ZMM(29), ZMM(1), ZMM(3)) \
    VFMADD231PS(ZMM(30), ZMM(0), ZMM(4)) \
    VFMADD231PS(ZMM(31), ZMM(1), ZMM(4)) \
\
    VMOVAPD(ZMM(0), MEM(RAX,(16*n+0)*8)) \
    VMOVAPD(ZMM(1), MEM(RAX,(16*n+8)*8))

//This is an array used for the scatter/gather instructions.
static int64_t offsets[16] __attribute__((aligned(64))) =
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15};
|
||||
|
||||
void bli_sgemm_skx_asm_32x12_l2(
|
||||
dim_t k_,
|
||||
float* restrict alpha,
|
||||
float* restrict a,
|
||||
float* restrict b,
|
||||
float* restrict beta,
|
||||
float* restrict c, inc_t rs_c_, inc_t cs_c_,
|
||||
auxinfo_t* data,
|
||||
cntx_t* restrict cntx
|
||||
)
|
||||
{
|
||||
(void)data;
|
||||
(void)cntx;
|
||||
|
||||
const int64_t* offsetPtr = &offsets[0];
|
||||
const int64_t k = k_;
|
||||
const int64_t rs_c = rs_c_;
|
||||
const int64_t cs_c = cs_c_;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
|
||||
VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers
|
||||
VMOVAPD(YMM( 7), YMM(8))
|
||||
VMOVAPD(YMM( 9), YMM(8))
|
||||
VMOVAPD(YMM(10), YMM(8)) MOV(RSI, VAR(k)) //loop index
|
||||
VMOVAPD(YMM(11), YMM(8)) MOV(RAX, VAR(a)) //load address of a
|
||||
VMOVAPD(YMM(12), YMM(8)) MOV(RBX, VAR(b)) //load address of b
|
||||
VMOVAPD(YMM(13), YMM(8)) MOV(RCX, VAR(c)) //load address of c
|
||||
VMOVAPD(YMM(14), YMM(8))
|
||||
VMOVAPD(YMM(15), YMM(8)) VMOVAPD(ZMM(0), MEM(RAX, 0*4)) //pre-load a
|
||||
VMOVAPD(YMM(16), YMM(8)) VMOVAPD(ZMM(1), MEM(RAX, 16*4)) //pre-load a
|
||||
VMOVAPD(YMM(17), YMM(8))
|
||||
VMOVAPD(YMM(18), YMM(8))
|
||||
VMOVAPD(YMM(19), YMM(8)) MOV(R12, VAR(cs_c)) //cs_c
|
||||
VMOVAPD(YMM(20), YMM(8)) LEA(R13, MEM(R12,R12,2)) //*3
|
||||
VMOVAPD(YMM(21), YMM(8)) LEA(R14, MEM(R12,R12,4)) //*5
|
||||
VMOVAPD(YMM(22), YMM(8)) LEA(R15, MEM(R14,R12,2)) //*7
|
||||
VMOVAPD(YMM(23), YMM(8)) LEA(RDX, MEM(RCX,R12,8)) //c + 8*cs_c
|
||||
VMOVAPD(YMM(24), YMM(8))
|
||||
VMOVAPD(YMM(25), YMM(8)) MOV(R8, IMM(32*4)) //mr*sizeof(float)
|
||||
VMOVAPD(YMM(26), YMM(8)) MOV(R9, IMM(12*4)) //nr*sizeof(float)
|
||||
VMOVAPD(YMM(27), YMM(8))
|
||||
VMOVAPD(YMM(28), YMM(8)) LEA(RAX, MEM(RAX,R8,1)) //adjust a for pre-load
|
||||
VMOVAPD(YMM(29), YMM(8))
|
||||
VMOVAPD(YMM(30), YMM(8))
|
||||
VMOVAPD(YMM(31), YMM(8))
|
||||
|
||||
TEST(RSI, RSI)
|
||||
JZ(POSTACCUM)
|
||||
|
||||
#ifdef PREFETCH_A_BEFORE
|
||||
/* Prefetching 8 cachlines of A (4 iterations worth of data
|
||||
(32 (MR) x4 (sizeof(float)) x4 iter /64 = 8 cachelines) */
|
||||
PREFETCH(0, MEM(RAX,0*64))
|
||||
PREFETCH(0, MEM(RAX,1*64))
|
||||
PREFETCH(0, MEM(RAX,2*64))
|
||||
PREFETCH(0, MEM(RAX,3*64))
|
||||
PREFETCH(0, MEM(RAX,4*64))
|
||||
PREFETCH(0, MEM(RAX,5*64))
|
||||
PREFETCH(0, MEM(RAX,6*64))
|
||||
PREFETCH(0, MEM(RAX,7*64))
|
||||
#endif
|
||||
|
||||
#ifdef PREFETCH_B_BEFORE
|
||||
/* Prefetching 3 cachlines of B (4 iterations worth of data
|
||||
(12 (NR) x 4 (sizeof(float)) x 4 iter /64 = 3 cachelines) */
|
||||
PREFETCH(0, MEM(RBX,0*64))
|
||||
PREFETCH(0, MEM(RBX,1*64))
|
||||
PREFETCH(0, MEM(RBX,2*64))
|
||||
#endif
|
||||
|
||||
PREFETCH_C_L2
|
||||
|
||||
MOV(RDI, RSI)
|
||||
AND(RSI, IMM(3))
|
||||
SAR(RDI, IMM(2))
|
||||
|
||||
SUB(RDI, IMM(0+TAIL_NITER))
|
||||
JLE(K_SMALL)
|
||||
|
||||
LOOP_ALIGN
|
||||
LABEL(MAIN_LOOP)
|
||||
|
||||
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4))
|
||||
SUBITER(0)
|
||||
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4+64))
|
||||
SUBITER(1)
|
||||
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4+128))
|
||||
SUBITER(2)
|
||||
SUBITER(3)
|
||||
|
||||
LEA(RAX, MEM(RAX,R8,4))
|
||||
LEA(RBX, MEM(RBX,R9,4))
|
||||
|
||||
DEC(RDI)
|
||||
|
||||
JNZ(MAIN_LOOP)
|
||||
|
||||
LABEL(K_SMALL)
|
||||
|
||||
PREFETCH_C_L1
|
||||
|
||||
ADD(RDI, IMM(0+TAIL_NITER))
|
||||
JZ(TAIL_LOOP)
|
||||
|
||||
LOOP_ALIGN
|
||||
LABEL(SMALL_LOOP)
|
||||
|
||||
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4))
|
||||
SUBITER(0)
|
||||
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4+64))
|
||||
SUBITER(1)
|
||||
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4+128))
|
||||
SUBITER(2)
|
||||
SUBITER(3)
|
||||
|
||||
LEA(RAX, MEM(RAX,R8,4))
|
||||
LEA(RBX, MEM(RBX,R9,4))
|
||||
|
||||
DEC(RDI)
|
||||
|
||||
JNZ(SMALL_LOOP)
|
||||
|
||||
TEST(RSI, RSI)
|
||||
JZ(POSTACCUM)
|
||||
|
||||
LOOP_ALIGN
|
||||
LABEL(TAIL_LOOP)
|
||||
|
||||
PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4))
|
||||
SUBITER(0)
|
||||
|
||||
ADD(RAX, R8)
|
||||
ADD(RBX, R9)
|
||||
|
||||
DEC(RSI)
|
||||
|
||||
JNZ(TAIL_LOOP)
|
||||
|
||||
|
||||
LABEL(POSTACCUM)
|
||||
|
||||
#ifdef PREFETCH_A_AFTER
|
||||
MOV(R8, VAR(a))
|
||||
PREFETCH(0, MEM(R8,0*64))
|
||||
PREFETCH(0, MEM(R8,1*64))
|
||||
PREFETCH(0, MEM(R8,2*64))
|
||||
PREFETCH(0, MEM(R8,3*64))
|
||||
PREFETCH(0, MEM(R8,4*64))
|
||||
PREFETCH(0, MEM(R8,5*64))
|
||||
PREFETCH(0, MEM(R8,6*64))
|
||||
PREFETCH(0, MEM(R8,7*64))
|
||||
#endif
|
||||
|
||||
#ifdef PREFETCH_B_AFTER
|
||||
MOV(R9, VAR(b))
|
||||
PREFETCH(0, MEM(R9,0*64))
|
||||
PREFETCH(0, MEM(R9,1*64))
|
||||
PREFETCH(0, MEM(R9,2*64))
|
||||
#endif
|
||||
|
||||
MOV(RAX, VAR(alpha))
|
||||
MOV(RBX, VAR(beta))
|
||||
VBROADCASTSS(ZMM(0), MEM(RAX))
|
||||
VBROADCASTSS(ZMM(1), MEM(RBX))
|
||||
|
||||
MOV(RAX, VAR(cs_c))
|
||||
LEA(RAX, MEM(,RAX,4))
|
||||
MOV(RBX, VAR(rs_c))
|
||||
LEA(RBX, MEM(,RBX,4))
|
||||
|
||||
|
||||
// Check if C is column major (rs_c = 1). If not, jump to the slow scattered update
|
||||
CMP(RBX, IMM(4))
|
||||
JNE(SCATTEREDUPDATE)
|
||||
|
||||
VCOMISD(XMM(1), XMM(7))
|
||||
JE(COLSTORBZ)
|
||||
|
||||
UPDATE_C( 8, 9,10,11)
|
||||
UPDATE_C(12,13,14,15)
|
||||
UPDATE_C(16,17,18,19)
|
||||
UPDATE_C(20,21,22,23)
|
||||
UPDATE_C(24,25,26,27)
|
||||
UPDATE_C(28,29,30,31)
|
||||
|
||||
JMP(END)
|
||||
LABEL(COLSTORBZ)
|
||||
|
||||
UPDATE_C_BZ( 8, 9,10,11)
|
||||
UPDATE_C_BZ(12,13,14,15)
|
||||
UPDATE_C_BZ(16,17,18,19)
|
||||
UPDATE_C_BZ(20,21,22,23)
|
||||
UPDATE_C_BZ(24,25,26,27)
|
||||
UPDATE_C_BZ(28,29,30,31)
|
||||
|
||||
JMP(END)
|
||||
LABEL(SCATTEREDUPDATE)
|
||||
|
||||
LEA(RDX, MEM(RCX,RBX,8))
|
||||
LEA(RDX, MEM(RDX,RBX,8))
|
||||
|
||||
MOV(RDI, VAR(offsetPtr))
|
||||
VMOVDQA64(ZMM(2), MEM(RDI,0*64))
|
||||
VMOVDQA64(ZMM(3), MEM(RDI,1*64))
|
||||
VPBROADCASTQ(ZMM(6), RBX)
|
||||
VPMULLQ(ZMM(2), ZMM(6), ZMM(2))
|
||||
VPMULLQ(ZMM(3), ZMM(6), ZMM(3))
|
||||
|
||||
VCOMISD(XMM(1), XMM(7))
|
||||
JE(SCATTERBZ)
|
||||
|
||||
UPDATE_C_ROW_SCATTERED( 8, 9,10,11)
|
||||
UPDATE_C_ROW_SCATTERED(12,13,14,15)
|
||||
UPDATE_C_ROW_SCATTERED(16,17,18,19)
|
||||
UPDATE_C_ROW_SCATTERED(20,21,22,23)
|
||||
UPDATE_C_ROW_SCATTERED(24,25,26,27)
|
||||
UPDATE_C_ROW_SCATTERED(28,29,30,31)
|
||||
|
||||
JMP(END)
|
||||
LABEL(SCATTERBZ)
|
||||
|
||||
UPDATE_C_BZ_ROW_SCATTERED( 8, 9,10,11)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(12,13,14,15)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(16,17,18,19)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(20,21,22,23)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(24,25,26,27)
|
||||
UPDATE_C_BZ_ROW_SCATTERED(28,29,30,31)
|
||||
|
||||
LABEL(END)
|
||||
|
||||
VZEROUPPER()
|
||||
|
||||
: // output operands
|
||||
: // input operands
|
||||
[k] "m" (k),
|
||||
[a] "m" (a),
|
||||
[b] "m" (b),
|
||||
[alpha] "m" (alpha),
|
||||
[beta] "m" (beta),
|
||||
[c] "m" (c),
|
||||
[rs_c] "m" (rs_c),
|
||||
[cs_c] "m" (cs_c),
|
||||
[offsetPtr] "m" (offsetPtr)
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
|
||||
"r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
|
||||
"zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13",
|
||||
"zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
|
||||
"zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
|
||||
"zmm30", "zmm31", "memory"
|
||||
);
|
||||
}
|
||||
40
kernels/skx/bli_kernels_skx.h
Normal file
40
kernels/skx/bli_kernels_skx.h
Normal file
@@ -0,0 +1,40 @@
|
||||
/*
|
||||
|
||||
BLIS
|
||||
An object-based framework for developing high-performance BLAS-like
|
||||
libraries.
|
||||
|
||||
Copyright (C) 2014, The University of Texas at Austin
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
- Neither the name of The University of Texas at Austin nor the names
|
||||
of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
// Prototype for the single-precision Skylake-X gemm microkernel.
// GEMM_UKR_PROT( ctype, ch, name ) expands to the standard BLIS gemm
// microkernel prototype for element type `ctype` with type prefix `ch`.
// NOTE(review): "32x12" presumably encodes the MR x NR register block
// sizes and "_l2" an L2-prefetch variant — confirm against the skx
// context initialization (bli_cntx_init_skx.c).
GEMM_UKR_PROT( float , s, gemm_skx_asm_32x12_l2 )
|
||||
// Prototype for the transposed-shape single-precision Skylake-X gemm
// microkernel (12x32 rather than 32x12). NOTE(review): presumably used
// when the context selects the swapped MR/NR register blocking — verify
// against the skx context registration.
GEMM_UKR_PROT( float , s, gemm_skx_asm_12x32_l2 )
|
||||
|
||||
// Prototype for the double-precision Skylake-X gemm microkernel
// (16x12 register blocking, L2-prefetch variant — TODO confirm naming
// convention against the skx context initialization).
GEMM_UKR_PROT( double, d, gemm_skx_asm_16x12_l2 )
|
||||
|
||||
|
||||
Reference in New Issue
Block a user