mirror of
https://github.com/amd/blis.git
synced 2026-05-13 10:35:38 +00:00
Merge branch 'master' into const_correctness
This commit is contained in:
@@ -15,20 +15,19 @@ env:
|
||||
- RUN_TEST=0 BUILD_CONFIG="carrizo"
|
||||
|
||||
install:
|
||||
- if [ "$CC" = "gcc" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi
|
||||
- if [ "$CC" = "gcc" ]; then export CC="gcc-4.9"; fi
|
||||
addons:
|
||||
apt:
|
||||
sources:
|
||||
- ubuntu-toolchain-r-test
|
||||
packages:
|
||||
- gcc-4.8
|
||||
- g++-4.8
|
||||
- gcc-4.9
|
||||
- clang
|
||||
|
||||
|
||||
|
||||
script:
|
||||
- ./configure $BUILD_CONFIG
|
||||
- make CC=gcc-4.8
|
||||
- make CC=gcc-4.9
|
||||
- if [ $RUN_TEST -eq 1 ]; then make BLIS_ENABLE_TEST_OUTPUT=yes test; fi
|
||||
- if [ $RUN_TEST -eq 1 ]; then ./build/check-test.sh ./output.testsuite; fi
|
||||
- if [ $RUN_TEST -eq 1 ]; then ./build/check-test.sh ./output.testsuite; fi
|
||||
|
||||
99
Makefile
99
Makefile
@@ -138,6 +138,51 @@ BASE_LIB_PATH := ./$(LIB_DIR)/$(CONFIG_NAME)
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Utility program definitions ----------------------------------------------
|
||||
#
|
||||
|
||||
SH := /bin/sh
|
||||
MV := mv
|
||||
MKDIR := mkdir -p
|
||||
RM_F := rm -f
|
||||
RM_RF := rm -rf
|
||||
SYMLINK := ln -sf
|
||||
FIND := find
|
||||
GREP := grep
|
||||
XARGS := xargs
|
||||
RANLIB := ranlib
|
||||
INSTALL := install -c
|
||||
|
||||
# Used to refresh CHANGELOG.
|
||||
GIT := git
|
||||
GIT_LOG := $(GIT) log --decorate
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Determine the compiler vendor --------------------------------------------
|
||||
#
|
||||
|
||||
ifneq ($(CC),)
|
||||
|
||||
VENDOR_STRING := $(shell $(CC) --version 2>/dev/null)
|
||||
ifeq ($(VENDOR_STRING),)
|
||||
VENDOR_STRING := $(shell $(CC) -qversion 2>/dev/null)
|
||||
endif
|
||||
ifeq ($(VENDOR_STRING),)
|
||||
$(error Unable to determine compiler vendor.)
|
||||
endif
|
||||
|
||||
CC_VENDOR := $(firstword $(shell echo '$(VENDOR_STRING)' | grep -Eo 'icc|gcc|clang|emcc|pnacl|IBM'))
|
||||
ifeq ($(CC_VENDOR),)
|
||||
$(error Unable to determine compiler vendor.)
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Include makefile definitions file ----------------------------------------
|
||||
#
|
||||
@@ -159,6 +204,60 @@ endif
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Configuration-agnostic flags ---------------------------------------------
|
||||
#
|
||||
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
ifeq ($(THREADING_MODEL),auto)
|
||||
THREADING_MODEL := omp
|
||||
endif
|
||||
ifeq ($(THREADING_MODEL),omp)
|
||||
CTHREADFLAGS := -fopenmp -DBLIS_ENABLE_OPENMP
|
||||
LDFLAGS += -fopenmp
|
||||
endif
|
||||
ifeq ($(THREADING_MODEL),pthreads)
|
||||
CTHREADFLAGS := -pthread -DBLIS_ENABLE_PTHREADS
|
||||
LDFLAGS += -pthread
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CC_VENDOR),icc)
|
||||
ifeq ($(THREADING_MODEL),auto)
|
||||
THREADING_MODEL := omp
|
||||
endif
|
||||
ifeq ($(THREADING_MODEL),omp)
|
||||
CTHREADFLAGS := -openmp -DBLIS_ENABLE_OPENMP
|
||||
LDFLAGS += -openmp
|
||||
endif
|
||||
ifeq ($(THREADING_MODEL),pthreads)
|
||||
CTHREADFLAGS := -pthread -DBLIS_ENABLE_PTHREADS
|
||||
LDFLAGS += -pthread
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
ifeq ($(THREADING_MODEL),auto)
|
||||
THREADING_MODEL := pthreads
|
||||
endif
|
||||
ifeq ($(THREADING_MODEL),omp)
|
||||
$(error OpenMP is not supported with Clang.)
|
||||
endif
|
||||
ifeq ($(THREADING_MODEL),pthreads)
|
||||
CTHREADFLAGS := -pthread -DBLIS_ENABLE_PTHREADS
|
||||
LDFLAGS += -pthread
|
||||
endif
|
||||
endif
|
||||
|
||||
# Aggregate all of the flags into multiple groups: one for standard compilation,
|
||||
# and one for each of the supported "special" compilation modes.
|
||||
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CTHREADFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Adjust verbosity level manually using make V=[0,1] -----------------------
|
||||
#
|
||||
|
||||
@@ -37,26 +37,32 @@ ifndef CONFIG_MK_INCLUDED
|
||||
CONFIG_MK_INCLUDED := yes
|
||||
|
||||
# The name of the configuration sub-directory.
|
||||
CONFIG_NAME := @config_name@
|
||||
CONFIG_NAME := @config_name@
|
||||
|
||||
# The operating system name, which should be either 'Linux' or 'Darwin'.
|
||||
OS_NAME := $(shell uname -s)
|
||||
# The operating system name, which should be either 'Linux' or 'Darwin'.
|
||||
OS_NAME := $(shell uname -s)
|
||||
|
||||
# The directory path to the top level of the source distribution.
|
||||
DIST_PATH := @dist_path@
|
||||
DIST_PATH := @dist_path@
|
||||
|
||||
# The level of debugging info to generate.
|
||||
DEBUG_TYPE := @debug_type@
|
||||
DEBUG_TYPE := @debug_type@
|
||||
|
||||
# The C compiler.
|
||||
CC := @CC@
|
||||
CC_VENDOR := @cc_vendor@
|
||||
CC := @CC@
|
||||
|
||||
# The requested threading model.
|
||||
THREADING_MODEL := @threading_model@
|
||||
|
||||
# The install prefix tells us where to install the libraries and header file
|
||||
# directory. Notice that we support the use of DESTDIR so that advanced users
|
||||
# may install to a temporary location.
|
||||
INSTALL_PREFIX := $(DESTDIR)@install_prefix@
|
||||
|
||||
# Variables corresponding to other configure-time options.
|
||||
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := @enable_verbose@
|
||||
BLIS_ENABLE_STATIC_BUILD := @enable_static@
|
||||
BLIS_ENABLE_DYNAMIC_BUILD := @enable_dynamic@
|
||||
|
||||
# end of ifndef CONFIG_MK_INCLUDED conditional block
|
||||
endif
|
||||
|
||||
@@ -38,62 +38,37 @@ MAKE_DEFS_MK_INCLUDED := yes
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Build definitions --------------------------------------------------------
|
||||
#
|
||||
|
||||
# Variables corresponding to other configure-time options.
|
||||
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := yes
|
||||
BLIS_ENABLE_STATIC_BUILD := yes
|
||||
BLIS_ENABLE_DYNAMIC_BUILD := no
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Utility program definitions ----------------------------------------------
|
||||
#
|
||||
|
||||
SH := /bin/sh
|
||||
MV := mv
|
||||
MKDIR := mkdir -p
|
||||
RM_F := rm -f
|
||||
RM_RF := rm -rf
|
||||
SYMLINK := ln -sf
|
||||
FIND := find
|
||||
GREP := grep
|
||||
XARGS := xargs
|
||||
RANLIB := ranlib
|
||||
INSTALL := install -c
|
||||
|
||||
# Used to refresh CHANGELOG.
|
||||
GIT := git
|
||||
GIT_LOG := $(GIT) log --decorate
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Development tools definitions --------------------------------------------
|
||||
#
|
||||
|
||||
# --- Determine the C compiler and related flags ---
|
||||
ifeq ($(CC),)
|
||||
CC := gcc
|
||||
CC_VENDOR := gcc
|
||||
endif
|
||||
ifneq ($(CC_VENDOR),gcc)
|
||||
$(error gcc is required for this configuration.)
|
||||
endif
|
||||
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 -O3 -mfloat-abi=hard -mfpu=vfpv3 -marm -march=armv7-a #-g
|
||||
CMISCFLAGS := -std=c99 -mfloat-abi=hard
|
||||
CPICFLAGS := -fPIC
|
||||
CDBGFLAGS := #-g
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -marm -march=armv7-a -mfpu=vfpv3 -O3 -mfloat-abi=hard #-g
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
CVECFLAGS := #-msse3 # -mfpmath=sse
|
||||
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
ifneq ($(DEBUG_TYPE),off)
|
||||
CDBGFLAGS := -g
|
||||
endif
|
||||
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -O3
|
||||
endif
|
||||
|
||||
CVECFLAGS := -mfpu=vfpv3 -marm -march=armv7-a
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
|
||||
@@ -51,13 +51,13 @@
|
||||
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
|
||||
//
|
||||
|
||||
#define BLIS_DEFAULT_MC_S 336
|
||||
#define BLIS_DEFAULT_KC_S 336
|
||||
#define BLIS_DEFAULT_NC_S 4096
|
||||
#define BLIS_DEFAULT_MC_S 120 //1536 //336 //416 // 1280 //160 // 160 // 160 //2048 //336
|
||||
#define BLIS_DEFAULT_KC_S 640 //1536 //336 //704 //1280 //672 //528 // 856 //2048 //528
|
||||
#define BLIS_DEFAULT_NC_S 3072
|
||||
|
||||
#define BLIS_DEFAULT_MC_D 160
|
||||
#define BLIS_DEFAULT_KC_D 304
|
||||
#define BLIS_DEFAULT_NC_D 4096
|
||||
#define BLIS_DEFAULT_MC_D 120 //1536 //160 //80 //176
|
||||
#define BLIS_DEFAULT_KC_D 240 //1536 //304 //336 //368
|
||||
#define BLIS_DEFAULT_NC_D 3072
|
||||
|
||||
#define BLIS_DEFAULT_MC_C 64
|
||||
#define BLIS_DEFAULT_KC_C 128
|
||||
@@ -69,11 +69,11 @@
|
||||
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 4
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 12
|
||||
|
||||
#define BLIS_DEFAULT_MR_D 4
|
||||
#define BLIS_DEFAULT_NR_D 4
|
||||
#define BLIS_DEFAULT_MR_D 6
|
||||
#define BLIS_DEFAULT_NR_D 8
|
||||
|
||||
#define BLIS_DEFAULT_MR_C 8
|
||||
#define BLIS_DEFAULT_NR_C 4
|
||||
@@ -132,6 +132,8 @@
|
||||
//#define BLIS_PACKDIM_MR_Z (BLIS_DEFAULT_MR_Z + ...)
|
||||
//#define BLIS_PACKDIM_NR_Z (BLIS_DEFAULT_NR_Z + ...)
|
||||
|
||||
|
||||
|
||||
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
|
||||
|
||||
|
||||
@@ -146,8 +148,8 @@
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_4x4
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_8x12
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_6x8
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
|
||||
@@ -38,63 +38,37 @@ MAKE_DEFS_MK_INCLUDED := yes
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Build definitions --------------------------------------------------------
|
||||
#
|
||||
|
||||
# Variables corresponding to other configure-time options.
|
||||
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := yes
|
||||
BLIS_ENABLE_STATIC_BUILD := yes
|
||||
BLIS_ENABLE_DYNAMIC_BUILD := no
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Utility program definitions ----------------------------------------------
|
||||
#
|
||||
|
||||
SH := /bin/sh
|
||||
MV := mv
|
||||
MKDIR := mkdir -p
|
||||
RM_F := rm -f
|
||||
RM_RF := rm -rf
|
||||
SYMLINK := ln -sf
|
||||
FIND := find
|
||||
GREP := grep
|
||||
XARGS := xargs
|
||||
RANLIB := ranlib
|
||||
INSTALL := install -c
|
||||
|
||||
# Used to refresh CHANGELOG.
|
||||
GIT := git
|
||||
GIT_LOG := $(GIT) log --decorate
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Development tools definitions --------------------------------------------
|
||||
#
|
||||
|
||||
# --- Determine the C compiler and related flags ---
|
||||
ifeq ($(CC),)
|
||||
CC := gcc
|
||||
|
||||
CC_VENDOR := gcc
|
||||
endif
|
||||
ifneq ($(CC_VENDOR),gcc)
|
||||
$(error gcc is required for this configuration.)
|
||||
endif
|
||||
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_GNU_SOURCE
|
||||
CMISCFLAGS := -std=c99 -march=armv8-a+fp+simd -ftree-vectorize -O3 -fopenmp -mcpu=cortex-a57.cortex-a53 -mtune=cortex-a57.cortex-a53
|
||||
CMISCFLAGS := -std=c99
|
||||
CPICFLAGS := -fPIC
|
||||
CDBGFLAGS := -g #-g3 -gdwarf-2
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -march=armv8-a+fp+simd -ftree-vectorize -O3 -mcpu=cortex-a57.cortex-a53 -mtune=cortex-a57.cortex-a53
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
CVECFLAGS := #-march=armv8-a+fp+simd -ftree-vectorize -O3 -mcpu=cortex-a57.cortex-a53 -mtune=cortex-a57.cortex-a53 #-march=armv8-a -O2 -mtune=cortex-a57 -mfpu=neon-fp-armv8
|
||||
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
ifneq ($(DEBUG_TYPE),off)
|
||||
CDBGFLAGS := -g
|
||||
endif
|
||||
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -O3 -ftree-vectorize -mtune=cortex-a57.cortex-a53
|
||||
endif
|
||||
|
||||
CVECFLAGS := -march=armv8-a+fp+simd -mcpu=cortex-a57.cortex-a53
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
@@ -103,7 +77,7 @@ ARFLAGS := cru
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
LDFLAGS := -lm -fopenmp
|
||||
LDFLAGS := -lm
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -38,45 +38,13 @@ MAKE_DEFS_MK_INCLUDED := yes
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Build definitions --------------------------------------------------------
|
||||
#
|
||||
|
||||
# Variables corresponding to other configure-time options.
|
||||
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
|
||||
BLIS_ENABLE_STATIC_BUILD := yes
|
||||
BLIS_ENABLE_DYNAMIC_BUILD := no
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Utility program definitions ----------------------------------------------
|
||||
#
|
||||
|
||||
SH := /bin/sh
|
||||
MV := mv
|
||||
MKDIR := mkdir -p
|
||||
RM_F := rm -f
|
||||
RM_RF := rm -rf
|
||||
SYMLINK := ln -sf
|
||||
FIND := find
|
||||
GREP := grep
|
||||
XARGS := xargs
|
||||
RANLIB := ranlib
|
||||
INSTALL := install -c
|
||||
|
||||
# Used to refresh CHANGELOG.
|
||||
GIT := git
|
||||
GIT_LOG := $(GIT) log --decorate
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Development tools definitions --------------------------------------------
|
||||
#
|
||||
|
||||
# --- Determine the C compiler and related flags ---
|
||||
CC := /bgsys/drivers/ppcfloor/comm/gcc.legacy/bin/mpixlc_r
|
||||
CC_VENDOR := IBM
|
||||
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L \
|
||||
@@ -89,13 +57,6 @@ COPTFLAGS := -O3
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
CVECFLAGS := -qarch=qp -qtune=qp -qsimd=auto -qhot=level=1 -qprefetch -qunroll=yes -qnoipa
|
||||
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
|
||||
@@ -51,9 +51,9 @@
|
||||
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
|
||||
//
|
||||
|
||||
#define BLIS_DEFAULT_MC_S 256
|
||||
#define BLIS_DEFAULT_KC_S 256
|
||||
#define BLIS_DEFAULT_NC_S 8192
|
||||
#define BLIS_DEFAULT_MC_S 128
|
||||
#define BLIS_DEFAULT_KC_S 384
|
||||
#define BLIS_DEFAULT_NC_S 4096
|
||||
|
||||
#define BLIS_DEFAULT_MC_D 1080
|
||||
#define BLIS_DEFAULT_KC_D 120
|
||||
@@ -70,7 +70,7 @@
|
||||
// -- Register blocksizes --
|
||||
|
||||
#define BLIS_DEFAULT_MR_S 8
|
||||
#define BLIS_DEFAULT_NR_S 4
|
||||
#define BLIS_DEFAULT_NR_S 8
|
||||
|
||||
#define BLIS_DEFAULT_MR_D 4
|
||||
#define BLIS_DEFAULT_NR_D 6
|
||||
@@ -149,6 +149,7 @@
|
||||
|
||||
// -- gemm --
|
||||
|
||||
#define BLIS_SGEMM_UKERNEL bli_sgemm_8x8_FMA4
|
||||
#define BLIS_DGEMM_UKERNEL bli_dgemm_4x6_FMA4
|
||||
|
||||
// -- trsm-related --
|
||||
|
||||
@@ -1 +1 @@
|
||||
../../kernels/x86_64/bulldozer
|
||||
../../kernels/x86_64/bulldozer/
|
||||
@@ -38,62 +38,37 @@ MAKE_DEFS_MK_INCLUDED := yes
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Build definitions --------------------------------------------------------
|
||||
#
|
||||
|
||||
# Variables corresponding to other configure-time options.
|
||||
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
|
||||
BLIS_ENABLE_STATIC_BUILD := yes
|
||||
BLIS_ENABLE_DYNAMIC_BUILD := no
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Utility program definitions ----------------------------------------------
|
||||
#
|
||||
|
||||
SH := /bin/sh
|
||||
MV := mv
|
||||
MKDIR := mkdir -p
|
||||
RM_F := rm -f
|
||||
RM_RF := rm -rf
|
||||
SYMLINK := ln -sf
|
||||
FIND := find
|
||||
GREP := grep
|
||||
XARGS := xargs
|
||||
RANLIB := ranlib
|
||||
INSTALL := install -c
|
||||
|
||||
# Used to refresh CHANGELOG.
|
||||
GIT := git
|
||||
GIT_LOG := $(GIT) log --decorate
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Development tools definitions --------------------------------------------
|
||||
#
|
||||
|
||||
# --- Determine the C compiler and related flags ---
|
||||
ifeq ($(CC),)
|
||||
CC := gcc
|
||||
CC_VENDOR := gcc
|
||||
endif
|
||||
ifneq ($(CC_VENDOR),gcc)
|
||||
$(error gcc is required for this configuration.)
|
||||
endif
|
||||
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 -fopenmp
|
||||
CMISCFLAGS := -std=c99
|
||||
CPICFLAGS := -fPIC
|
||||
CDBGFLAGS := -g
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -O0 -malign-double -funroll-all-loops
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
CVECFLAGS := -mavx -mfma -march=bdver2 -mfpmath=sse
|
||||
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
ifneq ($(DEBUG_TYPE),off)
|
||||
CDBGFLAGS := -g
|
||||
endif
|
||||
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -O2 -malign-double -funroll-all-loops
|
||||
endif
|
||||
|
||||
CVECFLAGS := -mavx -mfma -march=bdver2 -mfpmath=sse
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
|
||||
@@ -36,9 +36,6 @@
|
||||
#define BLIS_CONFIG_H
|
||||
|
||||
|
||||
//#define BLIS_ENABLE_PTHREADS
|
||||
#define BLIS_ENABLE_OPENMP
|
||||
|
||||
#define BLIS_SIMD_ALIGN_SIZE 16
|
||||
|
||||
|
||||
|
||||
@@ -38,62 +38,37 @@ MAKE_DEFS_MK_INCLUDED := yes
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Build definitions --------------------------------------------------------
|
||||
#
|
||||
|
||||
# Variables corresponding to other configure-time options.
|
||||
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
|
||||
BLIS_ENABLE_STATIC_BUILD := yes
|
||||
BLIS_ENABLE_DYNAMIC_BUILD := no
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Utility program definitions ----------------------------------------------
|
||||
#
|
||||
|
||||
SH := /bin/sh
|
||||
MV := mv
|
||||
MKDIR := mkdir -p
|
||||
RM_F := rm -f
|
||||
RM_RF := rm -rf
|
||||
SYMLINK := ln -sf
|
||||
FIND := find
|
||||
GREP := grep
|
||||
XARGS := xargs
|
||||
RANLIB := ranlib
|
||||
INSTALL := install -c
|
||||
|
||||
# Used to refresh CHANGELOG.
|
||||
GIT := git
|
||||
GIT_LOG := $(GIT) log --decorate
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Development tools definitions --------------------------------------------
|
||||
#
|
||||
|
||||
# --- Determine the C compiler and related flags ---
|
||||
ifeq ($(CC),)
|
||||
CC := gcc
|
||||
CC_VENDOR := gcc
|
||||
endif
|
||||
ifneq ($(CC_VENDOR),gcc)
|
||||
$(error gcc is required for this configuration.)
|
||||
endif
|
||||
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 -fopenmp
|
||||
CMISCFLAGS := -std=c99
|
||||
CPICFLAGS := -fPIC
|
||||
CDBGFLAGS := #-g
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -O2 -mfpmath=sse -fomit-frame-pointer
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
CVECFLAGS := -mavx -mfma -march=native
|
||||
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
ifneq ($(DEBUG_TYPE),off)
|
||||
CDBGFLAGS := -g
|
||||
endif
|
||||
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -O2 -fomit-frame-pointer
|
||||
endif
|
||||
|
||||
CVECFLAGS := -mavx -mfma -march=bdver4 -mfpmath=sse
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
@@ -102,7 +77,7 @@ ARFLAGS := cru
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
LDFLAGS := -lm -fopenmp
|
||||
LDFLAGS := -lm
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Build definitions --------------------------------------------------------
|
||||
#
|
||||
|
||||
# Variables corresponding to other configure-time options.
|
||||
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
|
||||
BLIS_ENABLE_STATIC_BUILD := yes
|
||||
BLIS_ENABLE_DYNAMIC_BUILD := no
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Utility program definitions ----------------------------------------------
|
||||
#
|
||||
|
||||
SH := /bin/sh
|
||||
MV := mv
|
||||
MKDIR := mkdir -p
|
||||
RM_F := rm -f
|
||||
RM_RF := rm -rf
|
||||
SYMLINK := ln -sf
|
||||
FIND := find
|
||||
GREP := grep
|
||||
XARGS := xargs
|
||||
RANLIB := ranlib
|
||||
INSTALL := install -c
|
||||
|
||||
# Used to refresh CHANGELOG.
|
||||
GIT := git
|
||||
GIT_LOG := $(GIT) log --decorate
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Development tools definitions --------------------------------------------
|
||||
#
|
||||
@@ -97,22 +64,12 @@ endif
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -march=armv7-a -mfpu=neon -O2
|
||||
endif
|
||||
|
||||
ifneq ($(DEBUG_TYPE),noopt)
|
||||
CVECFLAGS := #-msse3 -march=native # -mfpmath=sse
|
||||
COPTFLAGS := -O2
|
||||
endif
|
||||
|
||||
CVECFLAGS := -march=armv7-a
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
|
||||
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Build definitions --------------------------------------------------------
|
||||
#
|
||||
|
||||
# Variables corresponding to other configure-time options.
|
||||
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
|
||||
BLIS_ENABLE_STATIC_BUILD := yes
|
||||
BLIS_ENABLE_DYNAMIC_BUILD := no
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Utility program definitions ----------------------------------------------
|
||||
#
|
||||
|
||||
SH := /bin/sh
|
||||
MV := mv
|
||||
MKDIR := mkdir -p
|
||||
RM_F := rm -f
|
||||
RM_RF := rm -rf
|
||||
SYMLINK := ln -sf
|
||||
FIND := find
|
||||
GREP := grep
|
||||
XARGS := xargs
|
||||
RANLIB := ranlib
|
||||
INSTALL := install -c
|
||||
|
||||
# Used to refresh CHANGELOG.
|
||||
GIT := git
|
||||
GIT_LOG := $(GIT) log --decorate
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Development tools definitions --------------------------------------------
|
||||
#
|
||||
@@ -97,22 +64,12 @@ endif
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -march=armv7-a -mfpu=neon -O2 -mfloat-abi=hard
|
||||
endif
|
||||
|
||||
ifneq ($(DEBUG_TYPE),noopt)
|
||||
CVECFLAGS := #-msse3 -march=native # -mfpmath=sse
|
||||
COPTFLAGS := -O2
|
||||
endif
|
||||
|
||||
CVECFLAGS := -march=armv7-a
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
|
||||
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Build definitions --------------------------------------------------------
|
||||
#
|
||||
|
||||
# Variables corresponding to other configure-time options.
|
||||
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
|
||||
BLIS_ENABLE_STATIC_BUILD := yes
|
||||
BLIS_ENABLE_DYNAMIC_BUILD := no
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Utility program definitions ----------------------------------------------
|
||||
#
|
||||
|
||||
SH := /bin/sh
|
||||
MV := mv
|
||||
MKDIR := mkdir -p
|
||||
RM_F := rm -f
|
||||
RM_RF := rm -rf
|
||||
SYMLINK := ln -sf
|
||||
FIND := find
|
||||
GREP := grep
|
||||
XARGS := xargs
|
||||
RANLIB := ranlib
|
||||
INSTALL := install -c
|
||||
|
||||
# Used to refresh CHANGELOG.
|
||||
GIT := git
|
||||
GIT_LOG := $(GIT) log --decorate
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Development tools definitions --------------------------------------------
|
||||
#
|
||||
@@ -80,13 +47,11 @@ ifeq ($(CC),)
|
||||
CC := gcc
|
||||
CC_VENDOR := gcc
|
||||
endif
|
||||
ifneq ($(CC_VENDOR),gcc)
|
||||
$(error gcc is required for this configuration.)
|
||||
endif
|
||||
|
||||
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 # -fopenmp -pg
|
||||
CMISCFLAGS := -std=c99
|
||||
CPICFLAGS := -fPIC
|
||||
CWARNFLAGS := -Wall
|
||||
|
||||
@@ -97,21 +62,24 @@ endif
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -O2 -mfpmath=sse -fomit-frame-pointer
|
||||
endif
|
||||
|
||||
ifneq ($(DEBUG_TYPE),noopt)
|
||||
CVECFLAGS := -msse3 -march=native
|
||||
COPTFLAGS := -O2 -fomit-frame-pointer
|
||||
endif
|
||||
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CVECFLAGS := -msse3 -march=corei7 -mfpmath=sse
|
||||
else
|
||||
ifeq ($(CC_VENDOR),icc)
|
||||
CVECFLAGS := -xSSE4.2
|
||||
else
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
CVECFLAGS := -msse3 -mfpmath=sse -march=corei7
|
||||
else
|
||||
$(error gcc, icc, or clang is required for this configuration.)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
|
||||
@@ -38,45 +38,13 @@ MAKE_DEFS_MK_INCLUDED := yes
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Build definitions --------------------------------------------------------
|
||||
#
|
||||
|
||||
# Variables corresponding to other configure-time options.
|
||||
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
|
||||
BLIS_ENABLE_STATIC_BUILD := yes
|
||||
BLIS_ENABLE_DYNAMIC_BUILD := no
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Utility program definitions ----------------------------------------------
|
||||
#
|
||||
|
||||
SH := /bin/sh
|
||||
MV := mv
|
||||
MKDIR := mkdir -p
|
||||
RM_F := rm -f
|
||||
RM_RF := rm -rf
|
||||
SYMLINK := ln -sf
|
||||
FIND := find
|
||||
GREP := grep
|
||||
XARGS := xargs
|
||||
RANLIB := emranlib
|
||||
INSTALL := install -c
|
||||
|
||||
# Used to refresh CHANGELOG.
|
||||
GIT := git
|
||||
GIT_LOG := $(GIT) log --decorate
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Development tools definitions --------------------------------------------
|
||||
#
|
||||
|
||||
# --- Determine the C compiler and related flags ---
|
||||
CC := emcc
|
||||
CC_VENDOR := emcc
|
||||
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
@@ -88,13 +56,6 @@ COPTFLAGS := -O2
|
||||
CKOPTFLAGS := -O3
|
||||
CVECFLAGS :=
|
||||
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := emar
|
||||
ARFLAGS := cru
|
||||
|
||||
@@ -35,11 +35,6 @@
|
||||
#ifndef BLIS_CONFIG_H
|
||||
#define BLIS_CONFIG_H
|
||||
|
||||
// Enable multithreading via POSIX threads.
|
||||
//#define BLIS_ENABLE_PTHREADS
|
||||
|
||||
// Enable multithreading via OpenMP.
|
||||
#define BLIS_ENABLE_OPENMP
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Build definitions --------------------------------------------------------
|
||||
#
|
||||
|
||||
# Variables corresponding to other configure-time options.
|
||||
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
|
||||
BLIS_ENABLE_STATIC_BUILD := yes
|
||||
BLIS_ENABLE_DYNAMIC_BUILD := no
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Utility program definitions ----------------------------------------------
|
||||
#
|
||||
|
||||
SH := /bin/sh
|
||||
MV := mv
|
||||
MKDIR := mkdir -p
|
||||
RM_F := rm -f
|
||||
RM_RF := rm -rf
|
||||
SYMLINK := ln -sf
|
||||
FIND := find
|
||||
GREP := grep
|
||||
XARGS := xargs
|
||||
RANLIB := ranlib
|
||||
INSTALL := install -c
|
||||
|
||||
# Used to refresh CHANGELOG.
|
||||
GIT := git
|
||||
GIT_LOG := $(GIT) log --decorate
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Development tools definitions --------------------------------------------
|
||||
#
|
||||
@@ -80,13 +47,11 @@ ifeq ($(CC),)
|
||||
CC := gcc
|
||||
CC_VENDOR := gcc
|
||||
endif
|
||||
ifneq ($(CC_VENDOR),gcc)
|
||||
$(error gcc is required for this configuration.)
|
||||
endif
|
||||
|
||||
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 -m64 -fopenmp # -fopenmp -pg
|
||||
CMISCFLAGS := -std=c99 -m64
|
||||
CPICFLAGS := -fPIC
|
||||
CWARNFLAGS := -Wall
|
||||
|
||||
@@ -97,21 +62,24 @@ endif
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -O3 -march=native
|
||||
endif
|
||||
|
||||
ifneq ($(DEBUG_TYPE),noopt)
|
||||
CVECFLAGS := -mavx2 -mfma -mfpmath=sse #-msse3 -march=native # -mfpmath=sse
|
||||
COPTFLAGS := -O3
|
||||
endif
|
||||
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=core-avx2
|
||||
else
|
||||
ifeq ($(CC_VENDOR),icc)
|
||||
CVECFLAGS := -xCORE-AVX2
|
||||
else
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
CVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=core-avx2
|
||||
else
|
||||
$(error gcc, icc, or clang is required for this configuration.)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
@@ -120,7 +88,7 @@ ARFLAGS := cru
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
LDFLAGS := -lm -fopenmp -lpthread
|
||||
LDFLAGS := -lm
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Build definitions --------------------------------------------------------
|
||||
#
|
||||
|
||||
# Variables corresponding to other configure-time options.
|
||||
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
|
||||
BLIS_ENABLE_STATIC_BUILD := yes
|
||||
BLIS_ENABLE_DYNAMIC_BUILD := no
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Utility program definitions ----------------------------------------------
|
||||
#
|
||||
|
||||
SH := /bin/sh
|
||||
MV := mv
|
||||
MKDIR := mkdir -p
|
||||
RM_F := rm -f
|
||||
RM_RF := rm -rf
|
||||
SYMLINK := ln -sf
|
||||
FIND := find
|
||||
GREP := grep
|
||||
XARGS := xargs
|
||||
RANLIB := ranlib
|
||||
INSTALL := install -c
|
||||
|
||||
# Used to refresh CHANGELOG.
|
||||
GIT := git
|
||||
GIT_LOG := $(GIT) log --decorate
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Development tools definitions --------------------------------------------
|
||||
#
|
||||
@@ -86,7 +53,7 @@ endif
|
||||
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L -mabi=64
|
||||
CMISCFLAGS := -std=c99 -fopenmp #-pg
|
||||
CMISCFLAGS := -std=c99
|
||||
CPICFLAGS := -fPIC
|
||||
CWARNFLAGS := -Wall
|
||||
|
||||
@@ -97,22 +64,12 @@ endif
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -O3 -march=loongson3a -mtune=loongson3a
|
||||
endif
|
||||
|
||||
ifneq ($(DEBUG_TYPE),noopt)
|
||||
CVECFLAGS := #-msse3 -march=native # -mfpmath=sse
|
||||
COPTFLAGS := -O3 -mtune=loongson3a
|
||||
endif
|
||||
|
||||
CVECFLAGS := -march=loongson3a
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
|
||||
@@ -39,7 +39,6 @@
|
||||
#define BLIS_TREE_BARRIER
|
||||
#define BLIS_TREE_BARRIER_ARITY 4
|
||||
|
||||
#define BLIS_ENABLE_OPENMP
|
||||
|
||||
#define BLIS_SIMD_ALIGN_SIZE 32
|
||||
|
||||
|
||||
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Build definitions --------------------------------------------------------
|
||||
#
|
||||
|
||||
# Variables corresponding to other configure-time options.
|
||||
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
|
||||
BLIS_ENABLE_STATIC_BUILD := yes
|
||||
BLIS_ENABLE_DYNAMIC_BUILD := no
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Utility program definitions ----------------------------------------------
|
||||
#
|
||||
|
||||
SH := /bin/sh
|
||||
MV := mv
|
||||
MKDIR := mkdir -p
|
||||
RM_F := rm -f
|
||||
RM_RF := rm -rf
|
||||
SYMLINK := ln -sf
|
||||
FIND := find
|
||||
GREP := grep
|
||||
XARGS := xargs
|
||||
RANLIB := ranlib
|
||||
INSTALL := install -c
|
||||
|
||||
# Used to refresh CHANGELOG.
|
||||
GIT := git
|
||||
GIT_LOG := $(GIT) log --decorate
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Development tools definitions --------------------------------------------
|
||||
#
|
||||
@@ -86,7 +53,7 @@ endif
|
||||
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -mmic -fasm-blocks -std=c99 -openmp
|
||||
CMISCFLAGS := -mmic -fasm-blocks -std=c99
|
||||
CPICFLAGS := -fPIC
|
||||
CWARNFLAGS := -Wall
|
||||
|
||||
@@ -100,19 +67,9 @@ else
|
||||
COPTFLAGS := -O3
|
||||
endif
|
||||
|
||||
ifneq ($(DEBUG_TYPE),noopt)
|
||||
CVECFLAGS :=
|
||||
endif
|
||||
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
@@ -120,7 +77,7 @@ ARFLAGS := cru
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
LDFLAGS := -mmic -lm -openmp
|
||||
LDFLAGS := -mmic -lm
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -36,9 +36,6 @@
|
||||
#define BLIS_CONFIG_H
|
||||
|
||||
|
||||
//#define BLIS_ENABLE_PTHREADS
|
||||
|
||||
#define BLIS_ENABLE_OPENMP
|
||||
|
||||
#define BLIS_SIMD_ALIGN_SIZE 16
|
||||
|
||||
|
||||
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Build definitions --------------------------------------------------------
|
||||
#
|
||||
|
||||
# Variables corresponding to other configure-time options.
|
||||
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
|
||||
BLIS_ENABLE_STATIC_BUILD := yes
|
||||
BLIS_ENABLE_DYNAMIC_BUILD := no
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Utility program definitions ----------------------------------------------
|
||||
#
|
||||
|
||||
SH := /bin/sh
|
||||
MV := mv
|
||||
MKDIR := mkdir -p
|
||||
RM_F := rm -f
|
||||
RM_RF := rm -rf
|
||||
SYMLINK := ln -sf
|
||||
FIND := find
|
||||
GREP := grep
|
||||
XARGS := xargs
|
||||
RANLIB := ranlib
|
||||
INSTALL := install -c
|
||||
|
||||
# Used to refresh CHANGELOG.
|
||||
GIT := git
|
||||
GIT_LOG := $(GIT) log --decorate
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Development tools definitions --------------------------------------------
|
||||
#
|
||||
@@ -86,7 +53,7 @@ endif
|
||||
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 -fopenmp
|
||||
CMISCFLAGS := -std=c99
|
||||
CPICFLAGS := -fPIC
|
||||
CWARNFLAGS := -Wall
|
||||
|
||||
@@ -97,22 +64,12 @@ endif
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -O2 -mfpmath=sse -fomit-frame-pointer
|
||||
endif
|
||||
|
||||
ifneq ($(DEBUG_TYPE),noopt)
|
||||
CVECFLAGS := -mavx -mfma -march=native
|
||||
COPTFLAGS := -O2 -fomit-frame-pointer
|
||||
endif
|
||||
|
||||
CVECFLAGS := -mavx -mfma -march=bdver2 -mfpmath=sse
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
@@ -120,7 +77,7 @@ ARFLAGS := cru
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
LDFLAGS := -lm -fopenmp
|
||||
LDFLAGS := -lm
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -38,45 +38,13 @@ MAKE_DEFS_MK_INCLUDED := yes
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Build definitions --------------------------------------------------------
|
||||
#
|
||||
|
||||
# Variables corresponding to other configure-time options.
|
||||
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
|
||||
BLIS_ENABLE_STATIC_BUILD := yes
|
||||
BLIS_ENABLE_DYNAMIC_BUILD := no
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Utility program definitions ----------------------------------------------
|
||||
#
|
||||
|
||||
SH := /bin/sh
|
||||
MV := mv
|
||||
MKDIR := mkdir -p
|
||||
RM_F := rm -f
|
||||
RM_RF := rm -rf
|
||||
SYMLINK := ln -sf
|
||||
FIND := find
|
||||
GREP := grep
|
||||
XARGS := xargs
|
||||
RANLIB := pnacl-ranlib
|
||||
INSTALL := install -c
|
||||
|
||||
# Used to refresh CHANGELOG.
|
||||
GIT := git
|
||||
GIT_LOG := $(GIT) log --decorate
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Development tools definitions --------------------------------------------
|
||||
#
|
||||
|
||||
# --- Determine the C compiler and related flags ---
|
||||
CC := pnacl-clang
|
||||
CC_VENDOR := pnacl-clang
|
||||
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
@@ -88,13 +56,6 @@ COPTFLAGS := -O3
|
||||
CKOPTFLAGS := $(COPTFLAGS) -ffast-math
|
||||
CVECFLAGS :=
|
||||
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := pnacl-ar
|
||||
ARFLAGS := rcs
|
||||
|
||||
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Build definitions --------------------------------------------------------
|
||||
#
|
||||
|
||||
# Variables corresponding to other configure-time options.
|
||||
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
|
||||
BLIS_ENABLE_STATIC_BUILD := yes
|
||||
BLIS_ENABLE_DYNAMIC_BUILD := no
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Utility program definitions ----------------------------------------------
|
||||
#
|
||||
|
||||
SH := /bin/sh
|
||||
MV := mv
|
||||
MKDIR := mkdir -p
|
||||
RM_F := rm -f
|
||||
RM_RF := rm -rf
|
||||
SYMLINK := ln -sf
|
||||
FIND := find
|
||||
GREP := grep
|
||||
XARGS := xargs
|
||||
RANLIB := ranlib
|
||||
INSTALL := install -c
|
||||
|
||||
# Used to refresh CHANGELOG.
|
||||
GIT := git
|
||||
GIT_LOG := $(GIT) log --decorate
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Development tools definitions --------------------------------------------
|
||||
#
|
||||
@@ -86,7 +53,7 @@ endif
|
||||
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 -m64 -mcpu=power7 #-fopenmp -pg
|
||||
CMISCFLAGS := -std=c99 -m64 -mcpu=power7
|
||||
CPICFLAGS := -fPIC
|
||||
CWARNFLAGS := -Wall
|
||||
|
||||
@@ -100,19 +67,9 @@ else
|
||||
COPTFLAGS := -O3 -mtune=power7
|
||||
endif
|
||||
|
||||
ifneq ($(DEBUG_TYPE),noopt)
|
||||
CVECFLAGS := -mvsx
|
||||
endif
|
||||
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
|
||||
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Build definitions --------------------------------------------------------
|
||||
#
|
||||
|
||||
# Variables corresponding to other configure-time options.
|
||||
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
|
||||
BLIS_ENABLE_STATIC_BUILD := yes
|
||||
BLIS_ENABLE_DYNAMIC_BUILD := no
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Utility program definitions ----------------------------------------------
|
||||
#
|
||||
|
||||
SH := /bin/sh
|
||||
MV := mv
|
||||
MKDIR := mkdir -p
|
||||
RM_F := rm -f
|
||||
RM_RF := rm -rf
|
||||
SYMLINK := ln -sf
|
||||
FIND := find
|
||||
GREP := grep
|
||||
XARGS := xargs
|
||||
RANLIB := ranlib
|
||||
INSTALL := install -c
|
||||
|
||||
# Used to refresh CHANGELOG.
|
||||
GIT := git
|
||||
GIT_LOG := $(GIT) log --decorate
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Development tools definitions --------------------------------------------
|
||||
#
|
||||
@@ -86,7 +53,7 @@ endif
|
||||
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 # -fopenmp -pg
|
||||
CMISCFLAGS := -std=c99
|
||||
CPICFLAGS := -fPIC
|
||||
CWARNFLAGS := -Wall
|
||||
|
||||
@@ -100,19 +67,9 @@ else
|
||||
COPTFLAGS := -O2
|
||||
endif
|
||||
|
||||
ifneq ($(DEBUG_TYPE),noopt)
|
||||
CVECFLAGS := #-msse3 -march=native # -mfpmath=sse
|
||||
endif
|
||||
|
||||
CVECFLAGS :=
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
|
||||
@@ -35,12 +35,6 @@
|
||||
#ifndef BLIS_CONFIG_H
|
||||
#define BLIS_CONFIG_H
|
||||
|
||||
// Enable multithreading via POSIX threads.
|
||||
//#define BLIS_ENABLE_PTHREADS
|
||||
|
||||
// Enable multithreading via OpenMP.
|
||||
#define BLIS_ENABLE_OPENMP
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Build definitions --------------------------------------------------------
|
||||
#
|
||||
|
||||
# Variables corresponding to other configure-time options.
|
||||
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
|
||||
BLIS_ENABLE_STATIC_BUILD := yes
|
||||
BLIS_ENABLE_DYNAMIC_BUILD := no
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Utility program definitions ----------------------------------------------
|
||||
#
|
||||
|
||||
SH := /bin/sh
|
||||
MV := mv
|
||||
MKDIR := mkdir -p
|
||||
RM_F := rm -f
|
||||
RM_RF := rm -rf
|
||||
SYMLINK := ln -sf
|
||||
FIND := find
|
||||
GREP := grep
|
||||
XARGS := xargs
|
||||
RANLIB := ranlib
|
||||
INSTALL := install -c
|
||||
|
||||
# Used to refresh CHANGELOG.
|
||||
GIT := git
|
||||
GIT_LOG := $(GIT) log --decorate
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Development tools definitions --------------------------------------------
|
||||
#
|
||||
@@ -80,13 +47,11 @@ ifeq ($(CC),)
|
||||
CC := gcc
|
||||
CC_VENDOR := gcc
|
||||
endif
|
||||
ifneq ($(CC_VENDOR),gcc)
|
||||
$(error gcc is required for this configuration.)
|
||||
endif
|
||||
|
||||
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 -m64 -fopenmp # -fopenmp -pg
|
||||
CMISCFLAGS := -std=c99 -m64
|
||||
CPICFLAGS := -fPIC
|
||||
CWARNFLAGS := -Wall
|
||||
|
||||
@@ -97,21 +62,24 @@ endif
|
||||
ifeq ($(DEBUG_TYPE),noopt)
|
||||
COPTFLAGS := -O0
|
||||
else
|
||||
COPTFLAGS := -O3 -march=native
|
||||
endif
|
||||
|
||||
ifneq ($(DEBUG_TYPE),noopt)
|
||||
CVECFLAGS := -mavx -mfpmath=sse #-msse3 -march=native # -mfpmath=sse
|
||||
COPTFLAGS := -O3
|
||||
endif
|
||||
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
ifeq ($(CC_VENDOR),gcc)
|
||||
CVECFLAGS := -mavx -mfpmath=sse -march=corei7-avx
|
||||
else
|
||||
ifeq ($(CC_VENDOR),icc)
|
||||
CVECFLAGS := -xAVX
|
||||
else
|
||||
ifeq ($(CC_VENDOR),clang)
|
||||
CVECFLAGS := -mavx -mfpmath=sse -march=corei7-avx
|
||||
else
|
||||
$(error gcc, icc, or clang is required for this configuration.)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
@@ -120,7 +88,7 @@ ARFLAGS := cru
|
||||
# --- Determine the linker and related flags ---
|
||||
LINKER := $(CC)
|
||||
SOFLAGS := -shared
|
||||
LDFLAGS := -lm -fopenmp -lpthread
|
||||
LDFLAGS := -lm
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Build definitions --------------------------------------------------------
|
||||
#
|
||||
|
||||
# Variables corresponding to other configure-time options.
|
||||
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
|
||||
BLIS_ENABLE_STATIC_BUILD := yes
|
||||
BLIS_ENABLE_DYNAMIC_BUILD := no
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Utility program definitions ----------------------------------------------
|
||||
#
|
||||
|
||||
SH := /bin/sh
|
||||
MV := mv
|
||||
MKDIR := mkdir -p
|
||||
RM_F := rm -f
|
||||
RM_RF := rm -rf
|
||||
SYMLINK := ln -sf
|
||||
FIND := find
|
||||
GREP := grep
|
||||
XARGS := xargs
|
||||
RANLIB := ranlib
|
||||
INSTALL := install -c
|
||||
|
||||
# Used to refresh CHANGELOG.
|
||||
GIT := git
|
||||
GIT_LOG := $(GIT) log --decorate
|
||||
|
||||
|
||||
|
||||
#
|
||||
# --- Development tools definitions --------------------------------------------
|
||||
#
|
||||
@@ -86,7 +53,7 @@ endif
|
||||
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
|
||||
# NOTE: This is needed to enable posix_memalign().
|
||||
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 # -fopenmp -pg
|
||||
CMISCFLAGS := -std=c99
|
||||
CPICFLAGS := -fPIC
|
||||
CWARNFLAGS := -Wall
|
||||
|
||||
@@ -100,19 +67,9 @@ else
|
||||
COPTFLAGS := -O2
|
||||
endif
|
||||
|
||||
ifneq ($(DEBUG_TYPE),noopt)
|
||||
CVECFLAGS := #-msse3 -march=native # -mfpmath=sse
|
||||
endif
|
||||
|
||||
CVECFLAGS := #-msse3 -march=core2 # -mfpmath=sse
|
||||
CKOPTFLAGS := $(COPTFLAGS)
|
||||
|
||||
# Aggregate all of the flags into multiple groups: one for standard
|
||||
# compilation, and one for each of the supported "special" compilation
|
||||
# modes.
|
||||
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
|
||||
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
|
||||
|
||||
# --- Determine the archiver and related flags ---
|
||||
AR := ar
|
||||
ARFLAGS := cru
|
||||
|
||||
125
configure
vendored
125
configure
vendored
@@ -73,6 +73,29 @@ print_usage()
|
||||
echo " kept in the framework, otherwise optimization is"
|
||||
echo " turned off."
|
||||
echo " "
|
||||
echo " --enable-verbose-make, --disable-verbose-make"
|
||||
echo " "
|
||||
echo " Enable (disabled by default) verbose compilation"
|
||||
echo " output during make."
|
||||
echo " "
|
||||
echo " --disable-static, --enable-static"
|
||||
echo " "
|
||||
echo " Disable (enabled by default) building BLIS as a static"
|
||||
echo " library. May be combined with --enable-shared."
|
||||
echo " "
|
||||
echo " --enable-shared, --disable-static"
|
||||
echo " "
|
||||
echo " Enable (disabled by default) building BLIS as a shared"
|
||||
echo " library. May be combined with --enable-static."
|
||||
echo " "
|
||||
echo " -t MODEL, --enable-threading[=MODEL], --disable-threading"
|
||||
echo " "
|
||||
echo " Enable threading in the library, using threading model"
|
||||
echo " MODEL={auto,omp,pthreads,no}. If MODEL=no or "
|
||||
echo " --disable-threading is specified, threading will be"
|
||||
echo " disabled. If MODEL=auto or is unspecified, a model"
|
||||
echo " will be chosen automatically. The default is 'auto'."
|
||||
echo " "
|
||||
echo " -q, --quiet Suppress informational output. By default, configure"
|
||||
echo " is verbose. (NOTE: -q is not yet implemented)"
|
||||
echo " "
|
||||
@@ -85,7 +108,7 @@ print_usage()
|
||||
echo " Environment variables may also be specified as command line"
|
||||
echo " options, e.g.:"
|
||||
echo " "
|
||||
echo " ./configure CC=gcc sandybridge"
|
||||
echo " ./configure [options] CC=gcc sandybridge"
|
||||
echo " "
|
||||
echo " Note that not all compilers are compatible with a given"
|
||||
echo " configuration."
|
||||
@@ -164,8 +187,16 @@ main()
|
||||
debug_type=''
|
||||
debug_flag=''
|
||||
|
||||
# The threading flag.
|
||||
threading_model='auto'
|
||||
|
||||
# Option variables.
|
||||
quiet_flag=''
|
||||
|
||||
# Additional flags.
|
||||
enable_verbose='no'
|
||||
enable_static='yes'
|
||||
enable_shared='no'
|
||||
|
||||
# The path to the auto-detection script.
|
||||
auto_detect_sh="${build_dirpath}/auto-detect/auto-detect.sh"
|
||||
@@ -190,7 +221,7 @@ main()
|
||||
|
||||
|
||||
# Process our command line options.
|
||||
while getopts ":hp:d:q-:" opt; do
|
||||
while getopts ":hp:d:t:q-:" opt; do
|
||||
case $opt in
|
||||
-)
|
||||
case "$OPTARG" in
|
||||
@@ -212,6 +243,36 @@ main()
|
||||
debug_flag=1
|
||||
debug_type=${OPTARG#*=}
|
||||
;;
|
||||
disable-debug)
|
||||
debug_flag=0
|
||||
;;
|
||||
enable-verbose-make)
|
||||
enable_verbose='yes'
|
||||
;;
|
||||
disable-verbose-make)
|
||||
enable_verbose='no'
|
||||
;;
|
||||
enable-static)
|
||||
enable_static='yes'
|
||||
;;
|
||||
disable-static)
|
||||
enable_static='no'
|
||||
;;
|
||||
enable-shared)
|
||||
enable_shared='yes'
|
||||
;;
|
||||
disable-shared)
|
||||
enable_shared='no'
|
||||
;;
|
||||
enable-threading)
|
||||
threading_model='auto'
|
||||
;;
|
||||
enable-threading=*)
|
||||
threading_model=${OPTARG#*=}
|
||||
;;
|
||||
disable-threading)
|
||||
threading_model='no'
|
||||
;;
|
||||
*)
|
||||
print_usage
|
||||
;;
|
||||
@@ -230,6 +291,9 @@ main()
|
||||
q)
|
||||
quiet_flag=1
|
||||
;;
|
||||
t)
|
||||
threading_model=$OPTARG
|
||||
;;
|
||||
\?)
|
||||
print_usage
|
||||
;;
|
||||
@@ -338,23 +402,45 @@ main()
|
||||
debug_type='off'
|
||||
echo "${script_name}: debug symbols disabled."
|
||||
fi
|
||||
|
||||
|
||||
# Check if the verbose make flag was specified.
|
||||
if [ "x${enable_verbose}" = "xyes" ]; then
|
||||
echo "${script_name}: enabling verbose make output, disable with 'make V=0'."
|
||||
else
|
||||
echo "${script_name}: disabling verbose make output, enable with 'make V=1'."
|
||||
fi
|
||||
|
||||
|
||||
# Check if the static lib flag was specified.
|
||||
if [ "x${enable_static}" = "xyes" ]; then
|
||||
echo "${script_name}: building BLIS as a static library."
|
||||
fi
|
||||
|
||||
# Check if the shared lib flag was specified.
|
||||
if [ "x${enable_shared}" = "xyes" ]; then
|
||||
echo "${script_name}: building BLIS as a shared library."
|
||||
fi
|
||||
|
||||
# Check if neither flag was specified.
|
||||
if [ "x${enable_static}" = "xno" -a "x${enable_shared}" = "xno" ]; then
|
||||
echo "Neither a shared nor static library build has been requested."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
# Determine the compiler vendor if CC was specified.
|
||||
if [ -n "$CC" ]; then
|
||||
if $CC --version 2>/dev/null | grep -q 'pnacl-version'; then
|
||||
cc_vendor='pnacl-clang'
|
||||
else
|
||||
cc_vendor=`$CC --version 2>/dev/null | grep -Eo 'icc|gcc|clang|emcc'`
|
||||
fi
|
||||
if [ -z "$cc_vendor" ]; then
|
||||
cc_vendor=`$CC -qversion 2>/dev/null | grep -o 'IBM'`
|
||||
fi
|
||||
if [ -z "$cc_vendor" ]; then
|
||||
echo Unable to determine compiler vendor.
|
||||
exit 1
|
||||
fi
|
||||
cc_vendor=`echo $cc_vendor | { read first rest; echo $first; }`
|
||||
# Check the threading model flag.
|
||||
if [ "x${threading_model}" = "xauto" ]; then
|
||||
echo "${script_name}: determining the threading model automatically."
|
||||
elif [ "x${threading_model}" = "xomp" ]; then
|
||||
echo "${script_name}: using OpenMP for threading."
|
||||
elif [ "x${threading_model}" = "xpthreads" ]; then
|
||||
echo "${script_name}: using Pthreads for threading."
|
||||
elif [ "x${threading_model}" = "xno" ]; then
|
||||
echo "${script_name}: threading is disabled."
|
||||
else
|
||||
echo "Unsupported threading model: ${threading_model}."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
@@ -373,9 +459,12 @@ main()
|
||||
| sed "s/@config_name@/${config_name}/g" \
|
||||
| sed "s/@dist_path@/${dist_path_esc}/g" \
|
||||
| sed "s/@CC@/${cc_esc}/g" \
|
||||
| sed "s/@cc_vendor@/${cc_vendor}/g" \
|
||||
| sed "s/@debug_type@/${debug_type}/g" \
|
||||
| sed "s/@install_prefix@/${install_prefix_esc}/g" \
|
||||
| sed "s/@enable_verbose@/${enable_verbose}/g" \
|
||||
| sed "s/@enable_static@/${enable_static}/g" \
|
||||
| sed "s/@enable_dynamic@/${enable_shared}/g" \
|
||||
| sed "s/@threading_model@/${threading_model}/g" \
|
||||
> "${config_mk_out_path}"
|
||||
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -34,6 +34,933 @@
|
||||
|
||||
#include "blis.h"
|
||||
|
||||
void bli_sgemm_8x8_FMA4(
|
||||
dim_t k,
|
||||
float* restrict alpha,
|
||||
float* restrict a,
|
||||
float* restrict b,
|
||||
float* restrict beta,
|
||||
float* restrict c, inc_t rs_c, inc_t cs_c,
|
||||
auxinfo_t* data
|
||||
)
|
||||
{
|
||||
dim_t k_iter = k / 4;
|
||||
dim_t k_left = k % 4;
|
||||
|
||||
__asm__ volatile
|
||||
(
|
||||
" \n\t"
|
||||
"movq %2, %%rax \n\t" // load address of a.
|
||||
"movq %3, %%rbx \n\t" // load address of b.
|
||||
" \n\t"
|
||||
"vmovaps 0 * 32(%%rax), %%ymm0 \n\t" // initialize loop by pre-loading
|
||||
"vmovsldup 0 * 32(%%rbx), %%ymm2 \n\t" // elements of a and b.
|
||||
"vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
|
||||
" \n\t"
|
||||
"movq %6, %%rcx \n\t" // load address of c
|
||||
"movq %8, %%rdi \n\t" // load cs_c
|
||||
"leaq (,%%rdi,4), %%rdi \n\t" // cs_c *= sizeof(float)
|
||||
"leaq (%%rcx,%%rdi,4), %%r10 \n\t" // load address of c + 4*cs_c;
|
||||
" \n\t"
|
||||
"leaq (%%rdi,%%rdi,2), %%r14 \n\t" // r14 = 3*cs_c;
|
||||
"prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c
|
||||
"prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c
|
||||
"prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*cs_c
|
||||
"prefetcht0 7 * 8(%%rcx,%%r14) \n\t" // prefetch c + 3*cs_c
|
||||
"prefetcht0 7 * 8(%%r10) \n\t" // prefetch c + 4*cs_c
|
||||
"prefetcht0 7 * 8(%%r10,%%rdi) \n\t" // prefetch c + 5*cs_c
|
||||
"prefetcht0 7 * 8(%%r10,%%rdi,2) \n\t" // prefetch c + 6*cs_c
|
||||
"prefetcht0 7 * 8(%%r10,%%r14) \n\t" // prefetch c + 7*cs_c
|
||||
" \n\t"
|
||||
"vxorps %%ymm8, %%ymm8, %%ymm8 \n\t"
|
||||
"vxorps %%ymm9, %%ymm9, %%ymm9 \n\t"
|
||||
"vxorps %%ymm10, %%ymm10, %%ymm10 \n\t"
|
||||
"vxorps %%ymm11, %%ymm11, %%ymm11 \n\t"
|
||||
"vxorps %%ymm12, %%ymm12, %%ymm12 \n\t"
|
||||
"vxorps %%ymm13, %%ymm13, %%ymm13 \n\t"
|
||||
"vxorps %%ymm14, %%ymm14, %%ymm14 \n\t"
|
||||
"vxorps %%ymm15, %%ymm15, %%ymm15 \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movq %0, %%rsi \n\t" // i = k_iter;
|
||||
"testq %%rsi, %%rsi \n\t" // check i via logical AND.
|
||||
"je .SCONSIDKLEFT \n\t" // if i == 0, jump to code that
|
||||
" \n\t" // contains the k_left loop.
|
||||
" \n\t"
|
||||
".SLOOPKITER: \n\t" // MAIN LOOP
|
||||
" \n\t"
|
||||
" \n\t" // iteration 0
|
||||
"prefetcht0 16 * 32(%%rax) \n\t"
|
||||
"vfmaddps %%ymm15, %%ymm0, %%ymm2, %%ymm15\n\t"
|
||||
"vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
|
||||
"vmovshdup 0 * 32(%%rbx), %%ymm2 \n\t"
|
||||
"vfmaddps %%ymm13, %%ymm0, %%ymm3, %%ymm13\n\t"
|
||||
"vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
|
||||
" \n\t"
|
||||
"vmovaps 1 * 32(%%rax), %%ymm1 \n\t"
|
||||
"vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
|
||||
"vfmaddps %%ymm11, %%ymm0, %%ymm4, %%ymm11\n\t"
|
||||
"vfmaddps %%ymm9, %%ymm0, %%ymm5, %%ymm9 \n\t"
|
||||
" \n\t"
|
||||
"vfmaddps %%ymm14, %%ymm0, %%ymm2, %%ymm14\n\t"
|
||||
"vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
|
||||
"vmovsldup 1 * 32(%%rbx), %%ymm2 \n\t"
|
||||
"vfmaddps %%ymm12, %%ymm0, %%ymm3, %%ymm12\n\t"
|
||||
"vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
|
||||
" \n\t"
|
||||
"vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
|
||||
"vfmaddps %%ymm10, %%ymm0, %%ymm4, %%ymm10\n\t"
|
||||
"vfmaddps %%ymm8, %%ymm0, %%ymm5, %%ymm8\n\t"
|
||||
" \n\t"
|
||||
" \n\t" // iteration 1
|
||||
"vfmaddps %%ymm15, %%ymm1, %%ymm2, %%ymm15\n\t"
|
||||
"vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
|
||||
"vmovshdup 1 * 32(%%rbx), %%ymm2 \n\t"
|
||||
"vfmaddps %%ymm13, %%ymm1, %%ymm3, %%ymm13\n\t"
|
||||
"vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
|
||||
" \n\t"
|
||||
"vmovaps 2 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
|
||||
"vfmaddps %%ymm11, %%ymm1, %%ymm4, %%ymm11\n\t"
|
||||
"vfmaddps %%ymm9, %%ymm1, %%ymm5, %%ymm9\n\t"
|
||||
" \n\t"
|
||||
"vfmaddps %%ymm14, %%ymm1, %%ymm2, %%ymm14\n\t"
|
||||
"vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
|
||||
"vmovsldup 2 * 32(%%rbx), %%ymm2 \n\t"
|
||||
"vfmaddps %%ymm12, %%ymm1, %%ymm3, %%ymm12\n\t"
|
||||
"vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
|
||||
" \n\t"
|
||||
"vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
|
||||
"vfmaddps %%ymm10, %%ymm1, %%ymm4, %%ymm10\n\t"
|
||||
"vfmaddps %%ymm8, %%ymm1, %%ymm5, %%ymm8\n\t"
|
||||
" \n\t"
|
||||
" \n\t" // iteration 2
|
||||
"prefetcht0 18 * 32(%%rax) \n\t"
|
||||
"vfmaddps %%ymm15, %%ymm0, %%ymm2, %%ymm15\n\t"
|
||||
"vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
|
||||
"vmovshdup 2 * 32(%%rbx), %%ymm2 \n\t"
|
||||
"vfmaddps %%ymm13, %%ymm0, %%ymm3, %%ymm13\n\t"
|
||||
"vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
|
||||
" \n\t"
|
||||
"vmovaps 3 * 32(%%rax), %%ymm1 \n\t"
|
||||
"addq $4 * 8 * 4, %%rax \n\t" // a += 4*8 (unroll x mr)
|
||||
"vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
|
||||
"vfmaddps %%ymm11, %%ymm0, %%ymm4, %%ymm11\n\t"
|
||||
"vfmaddps %%ymm9, %%ymm0, %%ymm5, %%ymm9\n\t"
|
||||
" \n\t"
|
||||
"vfmaddps %%ymm14, %%ymm0, %%ymm2, %%ymm14\n\t"
|
||||
"vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
|
||||
"vmovsldup 3 * 32(%%rbx), %%ymm2 \n\t"
|
||||
"vfmaddps %%ymm12, %%ymm0, %%ymm3, %%ymm12\n\t"
|
||||
"vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
|
||||
" \n\t"
|
||||
"vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
|
||||
"vfmaddps %%ymm10, %%ymm0, %%ymm4, %%ymm10\n\t"
|
||||
"vfmaddps %%ymm8, %%ymm0, %%ymm5, %%ymm8\n\t"
|
||||
" \n\t"
|
||||
" \n\t" // iteration 3
|
||||
"vfmaddps %%ymm15, %%ymm1, %%ymm2, %%ymm15\n\t"
|
||||
"vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
|
||||
"vmovshdup 3 * 32(%%rbx), %%ymm2 \n\t"
|
||||
"addq $4 * 8 * 4, %%rbx \n\t" // b += 4*8 (unroll x nr)
|
||||
"vfmaddps %%ymm13, %%ymm1, %%ymm3, %%ymm13\n\t"
|
||||
"vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
|
||||
" \n\t"
|
||||
"vmovaps 0 * 32(%%rax), %%ymm0 \n\t"
|
||||
"vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
|
||||
"vfmaddps %%ymm11, %%ymm1, %%ymm4, %%ymm11\n\t"
|
||||
"vfmaddps %%ymm9, %%ymm1, %%ymm5, %%ymm9\n\t"
|
||||
" \n\t"
|
||||
"vfmaddps %%ymm14, %%ymm1, %%ymm2, %%ymm14\n\t"
|
||||
"vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
|
||||
"vmovsldup 0 * 32(%%rbx), %%ymm2 \n\t"
|
||||
"vfmaddps %%ymm12, %%ymm1, %%ymm3, %%ymm12\n\t"
|
||||
"vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
|
||||
" \n\t"
|
||||
"vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
|
||||
"vfmaddps %%ymm10, %%ymm1, %%ymm4, %%ymm10\n\t"
|
||||
"vfmaddps %%ymm8, %%ymm1, %%ymm5, %%ymm8\n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"decq %%rsi \n\t" // i -= 1;
|
||||
"jne .SLOOPKITER \n\t" // iterate again if i != 0.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".SCONSIDKLEFT: \n\t"
|
||||
" \n\t"
|
||||
"movq %1, %%rsi \n\t" // i = k_left;
|
||||
"testq %%rsi, %%rsi \n\t" // check i via logical AND.
|
||||
"je .SPOSTACCUM \n\t" // if i == 0, we're done; jump to end.
|
||||
" \n\t" // else, we prepare to enter k_left loop.
|
||||
" \n\t"
|
||||
".SLOOPKLEFT: \n\t" // EDGE LOOP
|
||||
" \n\t"
|
||||
"prefetcht0 16 * 32(%%rax) \n\t"
|
||||
"vfmaddps %%ymm15, %%ymm0, %%ymm2, %%ymm15\n\t"
|
||||
"vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
|
||||
"vmovshdup 0 * 32(%%rbx), %%ymm2 \n\t"
|
||||
"vfmaddps %%ymm13, %%ymm0, %%ymm3, %%ymm13\n\t"
|
||||
"vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
|
||||
" \n\t"
|
||||
"vmovaps 1 * 32(%%rax), %%ymm1 \n\t"
|
||||
"addq $8 * 1 * 4, %%rax \n\t" // a += 8 (1 x mr)
|
||||
"vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
|
||||
"vfmaddps %%ymm11, %%ymm0, %%ymm4, %%ymm11\n\t"
|
||||
"vfmaddps %%ymm9, %%ymm0, %%ymm5, %%ymm9\n\t"
|
||||
" \n\t"
|
||||
"vfmaddps %%ymm14, %%ymm0, %%ymm2, %%ymm14\n\t"
|
||||
"vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
|
||||
"vmovsldup 1 * 32(%%rbx), %%ymm2 \n\t"
|
||||
"addq $8 * 1 * 4, %%rbx \n\t" // b += 8 (1 x nr)
|
||||
"vfmaddps %%ymm12, %%ymm0, %%ymm3, %%ymm12\n\t"
|
||||
"vmulps %%ymm0, %%ymm3, %%ymm7 \n\t"
|
||||
" \n\t"
|
||||
"vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
|
||||
"vfmaddps %%ymm10, %%ymm0, %%ymm4, %%ymm10\n\t"
|
||||
"vfmaddps %%ymm8, %%ymm0, %%ymm5, %%ymm8\n\t"
|
||||
"vmovaps %%ymm1, %%ymm0 \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"decq %%rsi \n\t" // i -= 1;
|
||||
"jne .SLOOPKLEFT \n\t" // iterate again if i != 0.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".SPOSTACCUM: \n\t"
|
||||
" \n\t" // ymm15: ymm13: ymm11: ymm9:
|
||||
" \n\t" // ( ab00 ( ab02 ( ab04 ( ab06
|
||||
" \n\t" // ab10 ab12 ab14 ab16
|
||||
" \n\t" // ab22 ab20 ab26 ab24
|
||||
" \n\t" // ab32 ab30 ab36 ab34
|
||||
" \n\t" // ab44 ab46 ab40 ab42
|
||||
" \n\t" // ab54 ab56 ab50 ab52
|
||||
" \n\t" // ab66 ab64 ab62 ab60
|
||||
" \n\t" // ab76 ) ab74 ) ab72 ) ab70 )
|
||||
" \n\t"
|
||||
" \n\t" // ymm14: ymm12: ymm10: ymm8:
|
||||
" \n\t" // ( ab01 ( ab03 ( ab05 ( ab07
|
||||
" \n\t" // ab11 ab13 ab15 ab17
|
||||
" \n\t" // ab23 ab21 ab27 ab25
|
||||
" \n\t" // ab33 ab31 ab37 ab35
|
||||
" \n\t" // ab45 ab47 ab41 ab43
|
||||
" \n\t" // ab55 ab57 ab51 ab53
|
||||
" \n\t" // ab67 ab65 ab63 ab61
|
||||
" \n\t" // ab77 ) ab75 ) ab73 ) ab71 )
|
||||
"vmovaps %%ymm15, %%ymm7 \n\t"
|
||||
"vshufps $0xe4, %%ymm13, %%ymm15, %%ymm15 \n\t"
|
||||
"vshufps $0xe4, %%ymm7, %%ymm13, %%ymm13 \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm11, %%ymm7 \n\t"
|
||||
"vshufps $0xe4, %%ymm9, %%ymm11, %%ymm11 \n\t"
|
||||
"vshufps $0xe4, %%ymm7, %%ymm9, %%ymm9 \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm14, %%ymm7 \n\t"
|
||||
"vshufps $0xe4, %%ymm12, %%ymm14, %%ymm14 \n\t"
|
||||
"vshufps $0xe4, %%ymm7, %%ymm12, %%ymm12 \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm10, %%ymm7 \n\t"
|
||||
"vshufps $0xe4, %%ymm8, %%ymm10, %%ymm10 \n\t"
|
||||
"vshufps $0xe4, %%ymm7, %%ymm8, %%ymm8 \n\t"
|
||||
" \n\t" // ymm15: ymm13: ymm11: ymm9:
|
||||
" \n\t" // ( ab00 ( ab02 ( ab04 ( ab06
|
||||
" \n\t" // ab10 ab12 ab14 ab16
|
||||
" \n\t" // ab20 ab22 ab24 ab26
|
||||
" \n\t" // ab30 ab32 ab34 ab36
|
||||
" \n\t" // ab44 ab46 ab40 ab42
|
||||
" \n\t" // ab54 ab56 ab50 ab52
|
||||
" \n\t" // ab64 ab66 ab60 ab62
|
||||
" \n\t" // ab74 ) ab76 ) ab70 ) ab72 )
|
||||
" \n\t"
|
||||
" \n\t" // ymm14: ymm12: ymm10: ymm8:
|
||||
" \n\t" // ( ab01 ( ab03 ( ab05 ( ab07
|
||||
" \n\t" // ab11 ab13 ab15 ab17
|
||||
" \n\t" // ab21 ab23 ab25 ab27
|
||||
" \n\t" // ab31 ab33 ab35 ab37
|
||||
" \n\t" // ab45 ab47 ab41 ab43
|
||||
" \n\t" // ab55 ab57 ab51 ab53
|
||||
" \n\t" // ab65 ab67 ab61 ab63
|
||||
" \n\t" // ab75 ) ab77 ) ab71 ) ab73 )
|
||||
"vmovaps %%ymm15, %%ymm7 \n\t"
|
||||
"vperm2f128 $0x30, %%ymm11, %%ymm15, %%ymm15 \n\t"
|
||||
"vperm2f128 $0x12, %%ymm11, %%ymm7, %%ymm11 \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm13, %%ymm7 \n\t"
|
||||
"vperm2f128 $0x30, %%ymm9, %%ymm13, %%ymm13 \n\t"
|
||||
"vperm2f128 $0x12, %%ymm9, %%ymm7, %%ymm9 \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm14, %%ymm7 \n\t"
|
||||
"vperm2f128 $0x30, %%ymm10, %%ymm14, %%ymm14 \n\t"
|
||||
"vperm2f128 $0x12, %%ymm10, %%ymm7, %%ymm10 \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm12, %%ymm7 \n\t"
|
||||
"vperm2f128 $0x30, %%ymm8, %%ymm12, %%ymm12 \n\t"
|
||||
"vperm2f128 $0x12, %%ymm8, %%ymm7, %%ymm8 \n\t"
|
||||
" \n\t" // ymm15: ymm13: ymm11: ymm9:
|
||||
" \n\t" // ( ab00 ( ab02 ( ab04 ( ab06
|
||||
" \n\t" // ab10 ab12 ab14 ab16
|
||||
" \n\t" // ab20 ab22 ab24 ab26
|
||||
" \n\t" // ab30 ab32 ab34 ab36
|
||||
" \n\t" // ab40 ab42 ab44 ab46
|
||||
" \n\t" // ab50 ab52 ab54 ab56
|
||||
" \n\t" // ab60 ab62 ab64 ab66
|
||||
" \n\t" // ab70 ) ab72 ) ab74 ) ab76 )
|
||||
" \n\t"
|
||||
" \n\t" // ymm14: ymm12: ymm10: ymm8:
|
||||
" \n\t" // ( ab01 ( ab03 ( ab05 ( ab07
|
||||
" \n\t" // ab11 ab13 ab15 ab17
|
||||
" \n\t" // ab21 ab23 ab25 ab27
|
||||
" \n\t" // ab31 ab33 ab35 ab37
|
||||
" \n\t" // ab41 ab43 ab45 ab47
|
||||
" \n\t" // ab51 ab53 ab55 ab57
|
||||
" \n\t" // ab61 ab63 ab65 ab67
|
||||
" \n\t" // ab71 ) ab73 ) ab75 ) ab77 )
|
||||
" \n\t"
|
||||
"movq %4, %%rax \n\t" // load address of alpha
|
||||
"movq %5, %%rbx \n\t" // load address of beta
|
||||
"vbroadcastss (%%rax), %%ymm0 \n\t" // load alpha and duplicate
|
||||
"vbroadcastss (%%rbx), %%ymm4 \n\t" // load beta and duplicate
|
||||
" \n\t"
|
||||
"vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" // scale by alpha
|
||||
"vmulps %%ymm0, %%ymm9, %%ymm9 \n\t"
|
||||
"vmulps %%ymm0, %%ymm10, %%ymm10 \n\t"
|
||||
"vmulps %%ymm0, %%ymm11, %%ymm11 \n\t"
|
||||
"vmulps %%ymm0, %%ymm12, %%ymm12 \n\t"
|
||||
"vmulps %%ymm0, %%ymm13, %%ymm13 \n\t"
|
||||
"vmulps %%ymm0, %%ymm14, %%ymm14 \n\t"
|
||||
"vmulps %%ymm0, %%ymm15, %%ymm15 \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"movq %7, %%rsi \n\t" // load rs_c
|
||||
"leaq (,%%rsi,4), %%rsi \n\t" // rsi = rs_c * sizeof(float)
|
||||
" \n\t"
|
||||
"leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*rs_c;
|
||||
" \n\t"
|
||||
"leaq (,%%rsi,2), %%r12 \n\t" // r12 = 2*rs_c;
|
||||
"leaq (%%r12,%%rsi,1), %%r13 \n\t" // r13 = 3*rs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // determine if
|
||||
" \n\t" // c % 32 == 0, AND
|
||||
" \n\t" // 4*cs_c % 32 == 0, AND
|
||||
" \n\t" // rs_c == 1
|
||||
" \n\t" // ie: aligned, ldim aligned, and
|
||||
" \n\t" // column-stored
|
||||
" \n\t"
|
||||
"cmpq $4, %%rsi \n\t" // set ZF if (4*rs_c) == 4.
|
||||
"sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 );
|
||||
"testq $31, %%rcx \n\t" // set ZF if (c & 31) is zero, i.e. c is 32-byte aligned.
|
||||
"setz %%bh \n\t" // bh = ( ZF == 1 ? 1 : 0 );
|
||||
"testq $31, %%rdi \n\t" // set ZF if ((4*cs_c) & 31) is zero.
|
||||
"setz %%al \n\t" // al = ( ZF == 1 ? 1 : 0 );
|
||||
" \n\t" // and(bl,bh) followed by
|
||||
" \n\t" // and(bh,al) will reveal result
|
||||
" \n\t"
|
||||
" \n\t" // now avoid loading C if beta == 0
|
||||
" \n\t"
|
||||
"vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero.
|
||||
"vucomiss %%xmm0, %%xmm4 \n\t" // set ZF if beta == 0.
|
||||
"je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // check if aligned/column-stored
|
||||
"andb %%bl, %%bh \n\t" // bh &= bl; ZF is set if the result is 0.
|
||||
"andb %%bh, %%al \n\t" // al &= bh; ZF is clear only if bl, bh and al were all 1.
|
||||
"jne .SCOLSTORED \n\t" // jump to column storage case
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".SGENSTORED: \n\t"
|
||||
" \n\t" // update c00:c70
|
||||
"vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
|
||||
"vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
|
||||
"vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
|
||||
"vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
|
||||
"vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
|
||||
"vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
|
||||
"vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
|
||||
"vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
|
||||
"vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
|
||||
"vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
|
||||
"vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
|
||||
" \n\t"
|
||||
"vfmaddps %%ymm15, %%ymm0, %%ymm4, %%ymm0\n\t" // scale by beta and add the gemm result,
|
||||
" \n\t"
|
||||
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
|
||||
"vmovss %%xmm2, (%%rdx) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
|
||||
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
"addq %%rdi, %%rdx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
" \n\t" // update c01:c71
|
||||
"vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
|
||||
"vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
|
||||
"vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
|
||||
"vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
|
||||
"vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
|
||||
"vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
|
||||
"vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
|
||||
"vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
|
||||
"vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
|
||||
"vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
|
||||
"vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
|
||||
" \n\t"
|
||||
"vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
|
||||
"vaddps %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result,
|
||||
" \n\t"
|
||||
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
|
||||
"vmovss %%xmm2, (%%rdx) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
|
||||
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
"addq %%rdi, %%rdx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // update c02:c72
|
||||
"vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
|
||||
"vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
|
||||
"vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
|
||||
"vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
|
||||
"vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
|
||||
"vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
|
||||
"vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
|
||||
"vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
|
||||
"vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
|
||||
"vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
|
||||
"vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
|
||||
" \n\t"
|
||||
"vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
|
||||
"vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result,
|
||||
" \n\t"
|
||||
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
|
||||
"vmovss %%xmm2, (%%rdx) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
|
||||
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
"addq %%rdi, %%rdx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // update c03:c73
|
||||
"vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
|
||||
"vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
|
||||
"vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
|
||||
"vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
|
||||
"vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
|
||||
"vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
|
||||
"vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
|
||||
"vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
|
||||
"vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
|
||||
"vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
|
||||
"vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
|
||||
" \n\t"
|
||||
"vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
|
||||
"vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result,
|
||||
" \n\t"
|
||||
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
|
||||
"vmovss %%xmm2, (%%rdx) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
|
||||
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
"addq %%rdi, %%rdx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // update c04:c74
|
||||
"vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
|
||||
"vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
|
||||
"vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
|
||||
"vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
|
||||
"vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
|
||||
"vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
|
||||
"vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
|
||||
"vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
|
||||
"vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
|
||||
"vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
|
||||
"vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
|
||||
" \n\t"
|
||||
"vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
|
||||
"vaddps %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result,
|
||||
" \n\t"
|
||||
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
|
||||
"vmovss %%xmm2, (%%rdx) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
|
||||
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
"addq %%rdi, %%rdx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // update c05:c75
|
||||
"vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
|
||||
"vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
|
||||
"vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
|
||||
"vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
|
||||
"vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
|
||||
"vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
|
||||
"vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
|
||||
"vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
|
||||
"vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
|
||||
"vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
|
||||
"vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
|
||||
" \n\t"
|
||||
"vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
|
||||
"vaddps %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result,
|
||||
" \n\t"
|
||||
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
|
||||
"vmovss %%xmm2, (%%rdx) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
|
||||
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
"addq %%rdi, %%rdx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // update c06:c76
|
||||
"vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
|
||||
"vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
|
||||
"vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
|
||||
"vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
|
||||
"vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
|
||||
"vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
|
||||
"vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
|
||||
"vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
|
||||
"vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
|
||||
"vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
|
||||
"vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
|
||||
" \n\t"
|
||||
"vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
|
||||
"vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result,
|
||||
" \n\t"
|
||||
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
|
||||
"vmovss %%xmm2, (%%rdx) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
|
||||
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
"addq %%rdi, %%rdx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // update c07:c77
|
||||
"vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
|
||||
"vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
|
||||
"vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
|
||||
"vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
|
||||
"vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
|
||||
"vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
|
||||
"vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
|
||||
"vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
|
||||
"vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
|
||||
"vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
|
||||
"vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
|
||||
" \n\t"
|
||||
"vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
|
||||
"vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result,
|
||||
" \n\t"
|
||||
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
|
||||
"vmovss %%xmm2, (%%rdx) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
|
||||
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"jmp .SDONE \n\t" // jump to end.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".SCOLSTORED: \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps (%%rcx), %%ymm0 \n\t" // load c00:c70,
|
||||
"vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
|
||||
"vaddps %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result,
|
||||
"vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
"vmovaps (%%rcx), %%ymm1 \n\t" // load c01:c71,
|
||||
"vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta,
|
||||
"vaddps %%ymm14, %%ymm1, %%ymm1 \n\t" // add the gemm result,
|
||||
"vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
"vmovaps (%%rcx), %%ymm0 \n\t" // load c02:c72,
|
||||
"vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
|
||||
"vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result,
|
||||
"vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
"vmovaps (%%rcx), %%ymm1 \n\t" // load c03:c73,
|
||||
"vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta,
|
||||
"vaddps %%ymm12, %%ymm1, %%ymm1 \n\t" // add the gemm result,
|
||||
"vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
"vmovaps (%%rcx), %%ymm0 \n\t" // load c04:c74,
|
||||
"vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
|
||||
"vaddps %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result,
|
||||
"vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
"vmovaps (%%rcx), %%ymm1 \n\t" // load c05:c75,
|
||||
"vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta,
|
||||
"vaddps %%ymm10, %%ymm1, %%ymm1 \n\t" // add the gemm result,
|
||||
"vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
"vmovaps (%%rcx), %%ymm0 \n\t" // load c06:c76,
|
||||
"vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
|
||||
"vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result,
|
||||
"vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
"vmovaps (%%rcx), %%ymm1 \n\t" // load c07:c77,
|
||||
"vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta,
|
||||
"vaddps %%ymm8, %%ymm1, %%ymm1 \n\t" // add the gemm result,
|
||||
"vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"jmp .SDONE \n\t" // jump to end.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".SBETAZERO: \n\t"
|
||||
" \n\t" // check if aligned/column-stored
|
||||
"andb %%bl, %%bh \n\t" // bh &= bl; ZF is set if the result is 0.
|
||||
"andb %%bh, %%al \n\t" // al &= bh; ZF is clear only if bl, bh and al were all 1.
|
||||
"jne .SCOLSTORBZ \n\t" // jump to column storage case
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".SGENSTORBZ: \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // update c00:c70
|
||||
"vmovapd %%ymm15, %%ymm0 \n\t"
|
||||
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
|
||||
"vmovss %%xmm2, (%%rdx) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
|
||||
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
"addq %%rdi, %%rdx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // update c01:c71
|
||||
"vmovapd %%ymm14, %%ymm0 \n\t"
|
||||
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
|
||||
"vmovss %%xmm2, (%%rdx) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
|
||||
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
"addq %%rdi, %%rdx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // update c02:c72
|
||||
"vmovapd %%ymm13, %%ymm0 \n\t"
|
||||
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
|
||||
"vmovss %%xmm2, (%%rdx) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
|
||||
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
"addq %%rdi, %%rdx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // update c03:c73
|
||||
"vmovapd %%ymm12, %%ymm0 \n\t"
|
||||
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
|
||||
"vmovss %%xmm2, (%%rdx) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
|
||||
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
"addq %%rdi, %%rdx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // update c04:c74
|
||||
"vmovapd %%ymm11, %%ymm0 \n\t"
|
||||
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
|
||||
"vmovss %%xmm2, (%%rdx) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
|
||||
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
"addq %%rdi, %%rdx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // update c05:c75
|
||||
"vmovapd %%ymm10, %%ymm0 \n\t"
|
||||
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
|
||||
"vmovss %%xmm2, (%%rdx) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
|
||||
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
"addq %%rdi, %%rdx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // update c06:c76
|
||||
"vmovapd %%ymm9, %%ymm0 \n\t"
|
||||
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
|
||||
"vmovss %%xmm2, (%%rdx) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
|
||||
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
|
||||
" \n\t"
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
"addq %%rdi, %%rdx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t" // update c07:c77
|
||||
"vmovapd %%ymm8, %%ymm0 \n\t"
|
||||
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
|
||||
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
|
||||
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
|
||||
"vmovss %%xmm2, (%%rdx) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
|
||||
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
|
||||
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
|
||||
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
|
||||
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"jmp .SDONE \n\t" // jump to end.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".SCOLSTORBZ: \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
"vmovaps %%ymm15, (%%rcx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
"vmovaps %%ymm14, (%%rcx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
"vmovaps %%ymm13, (%%rcx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
"vmovaps %%ymm12, (%%rcx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
"vmovaps %%ymm11, (%%rcx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
"vmovaps %%ymm10, (%%rcx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
"vmovaps %%ymm9, (%%rcx) \n\t" // and store back to memory.
|
||||
"addq %%rdi, %%rcx \n\t" // c += cs_c;
|
||||
" \n\t"
|
||||
"vmovaps %%ymm8, (%%rcx) \n\t" // and store back to memory.
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
" \n\t"
|
||||
".SDONE: \n\t"
|
||||
" \n\t"
|
||||
|
||||
: // output operands (none)
|
||||
: // input operands
|
||||
"m" (k_iter), // 0
|
||||
"m" (k_left), // 1
|
||||
"m" (a), // 2
|
||||
"m" (b), // 3
|
||||
"m" (alpha), // 4
|
||||
"m" (beta), // 5
|
||||
"m" (c), // 6
|
||||
"m" (rs_c), // 7
|
||||
"m" (cs_c)/*, // 8
|
||||
"m" (b_next), // 9
|
||||
"m" (a_next)*/ // 10
|
||||
: // register clobber list
|
||||
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
|
||||
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
|
||||
"xmm0", "xmm1", "xmm2", "xmm3",
|
||||
"xmm4", "xmm5", "xmm6", "xmm7",
|
||||
"xmm8", "xmm9", "xmm10", "xmm11",
|
||||
"xmm12", "xmm13", "xmm14", "xmm15",
|
||||
"memory"
|
||||
);
|
||||
}
|
||||
|
||||
// Undefine the KERNEL4x6_* helper macros so these names do not remain defined
// past this point. NOTE(review): presumably they are defined earlier in this
// file for use by the kernel(s) above -- confirm against the full source.
#undef KERNEL4x6_1
|
||||
#undef KERNEL4x6_2
|
||||
#undef KERNEL4x6_3
|
||||
|
||||
Reference in New Issue
Block a user