Merge branch 'master' into const_correctness

This commit is contained in:
Devin Matthews
2016-03-29 15:24:25 -05:00
32 changed files with 3006 additions and 1294 deletions

View File

@@ -15,20 +15,19 @@ env:
- RUN_TEST=0 BUILD_CONFIG="carrizo"
install:
- if [ "$CC" = "gcc" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi
- if [ "$CC" = "gcc" ]; then export CC="gcc-4.9"; fi
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- gcc-4.8
- g++-4.8
- gcc-4.9
- clang
script:
- ./configure $BUILD_CONFIG
- make CC=gcc-4.8
- make CC=gcc-4.9
- if [ $RUN_TEST -eq 1 ]; then make BLIS_ENABLE_TEST_OUTPUT=yes test; fi
- if [ $RUN_TEST -eq 1 ]; then ./build/check-test.sh ./output.testsuite; fi
- if [ $RUN_TEST -eq 1 ]; then ./build/check-test.sh ./output.testsuite; fi

View File

@@ -138,6 +138,51 @@ BASE_LIB_PATH := ./$(LIB_DIR)/$(CONFIG_NAME)
#
# --- Utility program definitions ----------------------------------------------
#
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
GREP := grep
XARGS := xargs
RANLIB := ranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Determine the compiler vendor --------------------------------------------
#
# Detect the compiler vendor from the compiler's own version banner. The whole
# detection is skipped when CC is empty, in which case CC_VENDOR is not set here.
ifneq ($(CC),)
# First attempt: query the compiler with '--version', discarding stderr.
VENDOR_STRING := $(shell $(CC) --version 2>/dev/null)
ifeq ($(VENDOR_STRING),)
# '--version' printed nothing on stdout, so retry with '-qversion'.
# NOTE(review): presumably this is the IBM XL spelling, given the 'IBM'
# keyword in the pattern below -- confirm.
VENDOR_STRING := $(shell $(CC) -qversion 2>/dev/null)
endif
ifeq ($(VENDOR_STRING),)
# Neither query produced any output; abort at parse time.
$(error Unable to determine compiler vendor.)
endif
# Pull every recognized vendor keyword out of the banner (grep -o prints each
# match separately) and keep only the first one. The match is case-sensitive,
# so a banner that only contains e.g. 'GCC' in upper case is not recognized.
CC_VENDOR := $(firstword $(shell echo '$(VENDOR_STRING)' | grep -Eo 'icc|gcc|clang|emcc|pnacl|IBM'))
ifeq ($(CC_VENDOR),)
# A banner was obtained but it contained none of the known keywords.
$(error Unable to determine compiler vendor.)
endif
endif
#
# --- Include makefile definitions file ----------------------------------------
#
@@ -159,6 +204,60 @@ endif
#
# --- Configuration-agnostic flags ---------------------------------------------
#
# Translate the requested THREADING_MODEL into compiler and linker flags, per
# compiler vendor. Within each vendor block, 'auto' is resolved to a concrete
# model first, so that the 'omp' / 'pthreads' checks that follow see the
# resolved value. A vendor or threading model not listed here leaves
# CTHREADFLAGS and LDFLAGS untouched by this section.

# gcc: 'auto' means OpenMP.
ifeq ($(CC_VENDOR),gcc)
ifeq ($(THREADING_MODEL),auto)
THREADING_MODEL := omp
endif
ifeq ($(THREADING_MODEL),omp)
CTHREADFLAGS := -fopenmp -DBLIS_ENABLE_OPENMP
LDFLAGS += -fopenmp
endif
ifeq ($(THREADING_MODEL),pthreads)
CTHREADFLAGS := -pthread -DBLIS_ENABLE_PTHREADS
LDFLAGS += -pthread
endif
endif
# icc: 'auto' means OpenMP; note the OpenMP switch is spelled '-openmp' here.
ifeq ($(CC_VENDOR),icc)
ifeq ($(THREADING_MODEL),auto)
THREADING_MODEL := omp
endif
ifeq ($(THREADING_MODEL),omp)
CTHREADFLAGS := -openmp -DBLIS_ENABLE_OPENMP
LDFLAGS += -openmp
endif
ifeq ($(THREADING_MODEL),pthreads)
CTHREADFLAGS := -pthread -DBLIS_ENABLE_PTHREADS
LDFLAGS += -pthread
endif
endif
# clang: 'auto' means pthreads; explicitly requesting OpenMP is a hard error.
ifeq ($(CC_VENDOR),clang)
ifeq ($(THREADING_MODEL),auto)
THREADING_MODEL := pthreads
endif
ifeq ($(THREADING_MODEL),omp)
$(error OpenMP is not supported with Clang.)
endif
ifeq ($(THREADING_MODEL),pthreads)
CTHREADFLAGS := -pthread -DBLIS_ENABLE_PTHREADS
LDFLAGS += -pthread
endif
endif
# Aggregate all of the flags into multiple groups: one for standard compilation,
# and one for each of the supported "special" compilation modes.
# CFLAGS_NOOPT holds everything except the optimization and vectorization
# flags (and includes the threading flags in CTHREADFLAGS). CFLAGS and
# CFLAGS_KERNELS both prepend CVECFLAGS and differ only in using COPTFLAGS
# versus CKOPTFLAGS. These are simple (:=) assignments, so every variable
# referenced on the right-hand side must already be set at this point.
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CTHREADFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
#
# --- Adjust verbosity level manually using make V=[0,1] -----------------------
#

View File

@@ -37,26 +37,32 @@ ifndef CONFIG_MK_INCLUDED
CONFIG_MK_INCLUDED := yes
# The name of the configuration sub-directory.
CONFIG_NAME := @config_name@
CONFIG_NAME := @config_name@
# The operating system name, which should be either 'Linux' or 'Darwin'.
OS_NAME := $(shell uname -s)
# The operating system name, which should be either 'Linux' or 'Darwin'.
OS_NAME := $(shell uname -s)
# The directory path to the top level of the source distribution.
DIST_PATH := @dist_path@
DIST_PATH := @dist_path@
# The level of debugging info to generate.
DEBUG_TYPE := @debug_type@
DEBUG_TYPE := @debug_type@
# The C compiler.
CC := @CC@
CC_VENDOR := @cc_vendor@
CC := @CC@
# The requested threading model.
THREADING_MODEL := @threading_model@
# The install prefix tells us where to install the libraries and the header
# file directory. Notice that we support the use of DESTDIR so that advanced
# users may install to a temporary location.
INSTALL_PREFIX := $(DESTDIR)@install_prefix@
# Variables corresponding to other configure-time options.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := @enable_verbose@
BLIS_ENABLE_STATIC_BUILD := @enable_static@
BLIS_ENABLE_DYNAMIC_BUILD := @enable_dynamic@
# end of ifndef CONFIG_MK_INCLUDED conditional block
endif

View File

@@ -38,62 +38,37 @@ MAKE_DEFS_MK_INCLUDED := yes
#
# --- Build definitions --------------------------------------------------------
#
# Variables corresponding to other configure-time options.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := yes
BLIS_ENABLE_STATIC_BUILD := yes
BLIS_ENABLE_DYNAMIC_BUILD := no
#
# --- Utility program definitions ----------------------------------------------
#
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
GREP := grep
XARGS := xargs
RANLIB := ranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Development tools definitions --------------------------------------------
#
# --- Determine the C compiler and related flags ---
ifeq ($(CC),)
CC := gcc
CC_VENDOR := gcc
endif
ifneq ($(CC_VENDOR),gcc)
$(error gcc is required for this configuration.)
endif
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 -O3 -mfloat-abi=hard -mfpu=vfpv3 -marm -march=armv7-a #-g
CMISCFLAGS := -std=c99 -mfloat-abi=hard
CPICFLAGS := -fPIC
CDBGFLAGS := #-g
CWARNFLAGS := -Wall
COPTFLAGS := -marm -march=armv7-a -mfpu=vfpv3 -O3 -mfloat-abi=hard #-g
CKOPTFLAGS := $(COPTFLAGS)
CVECFLAGS := #-msse3 # -mfpmath=sse
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O3
endif
CVECFLAGS := -mfpu=vfpv3 -marm -march=armv7-a
CKOPTFLAGS := $(COPTFLAGS)
# --- Determine the archiver and related flags ---
AR := ar

View File

@@ -51,13 +51,13 @@
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
//
#define BLIS_DEFAULT_MC_S 336
#define BLIS_DEFAULT_KC_S 336
#define BLIS_DEFAULT_NC_S 4096
#define BLIS_DEFAULT_MC_S 120 //1536 //336 //416 // 1280 //160 // 160 // 160 //2048 //336
#define BLIS_DEFAULT_KC_S 640 //1536 //336 //704 //1280 //672 //528 // 856 //2048 //528
#define BLIS_DEFAULT_NC_S 3072
#define BLIS_DEFAULT_MC_D 160
#define BLIS_DEFAULT_KC_D 304
#define BLIS_DEFAULT_NC_D 4096
#define BLIS_DEFAULT_MC_D 120 //1536 //160 //80 //176
#define BLIS_DEFAULT_KC_D 240 //1536 //304 //336 //368
#define BLIS_DEFAULT_NC_D 3072
#define BLIS_DEFAULT_MC_C 64
#define BLIS_DEFAULT_KC_C 128
@@ -69,11 +69,11 @@
// -- Register blocksizes --
#define BLIS_DEFAULT_MR_S 4
#define BLIS_DEFAULT_NR_S 4
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 12
#define BLIS_DEFAULT_MR_D 4
#define BLIS_DEFAULT_NR_D 4
#define BLIS_DEFAULT_MR_D 6
#define BLIS_DEFAULT_NR_D 8
#define BLIS_DEFAULT_MR_C 8
#define BLIS_DEFAULT_NR_C 4
@@ -132,6 +132,8 @@
//#define BLIS_PACKDIM_MR_Z (BLIS_DEFAULT_MR_Z + ...)
//#define BLIS_PACKDIM_NR_Z (BLIS_DEFAULT_NR_Z + ...)
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
@@ -146,8 +148,8 @@
// -- gemm --
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_4x4
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_8x12
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_6x8
// -- trsm-related --

View File

@@ -38,63 +38,37 @@ MAKE_DEFS_MK_INCLUDED := yes
#
# --- Build definitions --------------------------------------------------------
#
# Variables corresponding to other configure-time options.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := yes
BLIS_ENABLE_STATIC_BUILD := yes
BLIS_ENABLE_DYNAMIC_BUILD := no
#
# --- Utility program definitions ----------------------------------------------
#
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
GREP := grep
XARGS := xargs
RANLIB := ranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Development tools definitions --------------------------------------------
#
# --- Determine the C compiler and related flags ---
ifeq ($(CC),)
CC := gcc
CC_VENDOR := gcc
endif
ifneq ($(CC_VENDOR),gcc)
$(error gcc is required for this configuration.)
endif
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_GNU_SOURCE
CMISCFLAGS := -std=c99 -march=armv8-a+fp+simd -ftree-vectorize -O3 -fopenmp -mcpu=cortex-a57.cortex-a53 -mtune=cortex-a57.cortex-a53
CMISCFLAGS := -std=c99
CPICFLAGS := -fPIC
CDBGFLAGS := -g #-g3 -gdwarf-2
CWARNFLAGS := -Wall
COPTFLAGS := -march=armv8-a+fp+simd -ftree-vectorize -O3 -mcpu=cortex-a57.cortex-a53 -mtune=cortex-a57.cortex-a53
CKOPTFLAGS := $(COPTFLAGS)
CVECFLAGS := #-march=armv8-a+fp+simd -ftree-vectorize -O3 -mcpu=cortex-a57.cortex-a53 -mtune=cortex-a57.cortex-a53 #-march=armv8-a -O2 -mtune=cortex-a57 -mfpu=neon-fp-armv8
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O3 -ftree-vectorize -mtune=cortex-a57.cortex-a53
endif
CVECFLAGS := -march=armv8-a+fp+simd -mcpu=cortex-a57.cortex-a53
CKOPTFLAGS := $(COPTFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
@@ -103,7 +77,7 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
LDFLAGS := -lm -fopenmp
LDFLAGS := -lm

View File

@@ -38,45 +38,13 @@ MAKE_DEFS_MK_INCLUDED := yes
#
# --- Build definitions --------------------------------------------------------
#
# Variables corresponding to other configure-time options.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
BLIS_ENABLE_STATIC_BUILD := yes
BLIS_ENABLE_DYNAMIC_BUILD := no
#
# --- Utility program definitions ----------------------------------------------
#
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
GREP := grep
XARGS := xargs
RANLIB := ranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Development tools definitions --------------------------------------------
#
# --- Determine the C compiler and related flags ---
CC := /bgsys/drivers/ppcfloor/comm/gcc.legacy/bin/mpixlc_r
CC_VENDOR := IBM
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L \
@@ -89,13 +57,6 @@ COPTFLAGS := -O3
CKOPTFLAGS := $(COPTFLAGS)
CVECFLAGS := -qarch=qp -qtune=qp -qsimd=auto -qhot=level=1 -qprefetch -qunroll=yes -qnoipa
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru

View File

@@ -51,9 +51,9 @@
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
//
#define BLIS_DEFAULT_MC_S 256
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 8192
#define BLIS_DEFAULT_MC_S 128
#define BLIS_DEFAULT_KC_S 384
#define BLIS_DEFAULT_NC_S 4096
#define BLIS_DEFAULT_MC_D 1080
#define BLIS_DEFAULT_KC_D 120
@@ -70,7 +70,7 @@
// -- Register blocksizes --
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 4
#define BLIS_DEFAULT_NR_S 8
#define BLIS_DEFAULT_MR_D 4
#define BLIS_DEFAULT_NR_D 6
@@ -149,6 +149,7 @@
// -- gemm --
#define BLIS_SGEMM_UKERNEL bli_sgemm_8x8_FMA4
#define BLIS_DGEMM_UKERNEL bli_dgemm_4x6_FMA4
// -- trsm-related --

View File

@@ -1 +1 @@
../../kernels/x86_64/bulldozer
../../kernels/x86_64/bulldozer/

View File

@@ -38,62 +38,37 @@ MAKE_DEFS_MK_INCLUDED := yes
#
# --- Build definitions --------------------------------------------------------
#
# Variables corresponding to other configure-time options.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
BLIS_ENABLE_STATIC_BUILD := yes
BLIS_ENABLE_DYNAMIC_BUILD := no
#
# --- Utility program definitions ----------------------------------------------
#
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
GREP := grep
XARGS := xargs
RANLIB := ranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Development tools definitions --------------------------------------------
#
# --- Determine the C compiler and related flags ---
ifeq ($(CC),)
CC := gcc
CC_VENDOR := gcc
endif
ifneq ($(CC_VENDOR),gcc)
$(error gcc is required for this configuration.)
endif
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 -fopenmp
CMISCFLAGS := -std=c99
CPICFLAGS := -fPIC
CDBGFLAGS := -g
CWARNFLAGS := -Wall
COPTFLAGS := -O0 -malign-double -funroll-all-loops
CKOPTFLAGS := $(COPTFLAGS)
CVECFLAGS := -mavx -mfma -march=bdver2 -mfpmath=sse
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O2 -malign-double -funroll-all-loops
endif
CVECFLAGS := -mavx -mfma -march=bdver2 -mfpmath=sse
CKOPTFLAGS := $(COPTFLAGS)
# --- Determine the archiver and related flags ---
AR := ar

View File

@@ -36,9 +36,6 @@
#define BLIS_CONFIG_H
//#define BLIS_ENABLE_PTHREADS
#define BLIS_ENABLE_OPENMP
#define BLIS_SIMD_ALIGN_SIZE 16

View File

@@ -38,62 +38,37 @@ MAKE_DEFS_MK_INCLUDED := yes
#
# --- Build definitions --------------------------------------------------------
#
# Variables corresponding to other configure-time options.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
BLIS_ENABLE_STATIC_BUILD := yes
BLIS_ENABLE_DYNAMIC_BUILD := no
#
# --- Utility program definitions ----------------------------------------------
#
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
GREP := grep
XARGS := xargs
RANLIB := ranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Development tools definitions --------------------------------------------
#
# --- Determine the C compiler and related flags ---
ifeq ($(CC),)
CC := gcc
CC_VENDOR := gcc
endif
ifneq ($(CC_VENDOR),gcc)
$(error gcc is required for this configuration.)
endif
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 -fopenmp
CMISCFLAGS := -std=c99
CPICFLAGS := -fPIC
CDBGFLAGS := #-g
CWARNFLAGS := -Wall
COPTFLAGS := -O2 -mfpmath=sse -fomit-frame-pointer
CKOPTFLAGS := $(COPTFLAGS)
CVECFLAGS := -mavx -mfma -march=native
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O2 -fomit-frame-pointer
endif
CVECFLAGS := -mavx -mfma -march=bdver4 -mfpmath=sse
CKOPTFLAGS := $(COPTFLAGS)
# --- Determine the archiver and related flags ---
AR := ar
@@ -102,7 +77,7 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
LDFLAGS := -lm -fopenmp
LDFLAGS := -lm

View File

@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
#
# --- Build definitions --------------------------------------------------------
#
# Variables corresponding to other configure-time options.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
BLIS_ENABLE_STATIC_BUILD := yes
BLIS_ENABLE_DYNAMIC_BUILD := no
#
# --- Utility program definitions ----------------------------------------------
#
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
GREP := grep
XARGS := xargs
RANLIB := ranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Development tools definitions --------------------------------------------
#
@@ -97,22 +64,12 @@ endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -march=armv7-a -mfpu=neon -O2
endif
ifneq ($(DEBUG_TYPE),noopt)
CVECFLAGS := #-msse3 -march=native # -mfpmath=sse
COPTFLAGS := -O2
endif
CVECFLAGS := -march=armv7-a
CKOPTFLAGS := $(COPTFLAGS)
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru

View File

@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
#
# --- Build definitions --------------------------------------------------------
#
# Variables corresponding to other configure-time options.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
BLIS_ENABLE_STATIC_BUILD := yes
BLIS_ENABLE_DYNAMIC_BUILD := no
#
# --- Utility program definitions ----------------------------------------------
#
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
GREP := grep
XARGS := xargs
RANLIB := ranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Development tools definitions --------------------------------------------
#
@@ -97,22 +64,12 @@ endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -march=armv7-a -mfpu=neon -O2 -mfloat-abi=hard
endif
ifneq ($(DEBUG_TYPE),noopt)
CVECFLAGS := #-msse3 -march=native # -mfpmath=sse
COPTFLAGS := -O2
endif
CVECFLAGS := -march=armv7-a
CKOPTFLAGS := $(COPTFLAGS)
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru

View File

@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
#
# --- Build definitions --------------------------------------------------------
#
# Variables corresponding to other configure-time options.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
BLIS_ENABLE_STATIC_BUILD := yes
BLIS_ENABLE_DYNAMIC_BUILD := no
#
# --- Utility program definitions ----------------------------------------------
#
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
GREP := grep
XARGS := xargs
RANLIB := ranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Development tools definitions --------------------------------------------
#
@@ -80,13 +47,11 @@ ifeq ($(CC),)
CC := gcc
CC_VENDOR := gcc
endif
ifneq ($(CC_VENDOR),gcc)
$(error gcc is required for this configuration.)
endif
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 # -fopenmp -pg
CMISCFLAGS := -std=c99
CPICFLAGS := -fPIC
CWARNFLAGS := -Wall
@@ -97,21 +62,24 @@ endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O2 -mfpmath=sse -fomit-frame-pointer
endif
ifneq ($(DEBUG_TYPE),noopt)
CVECFLAGS := -msse3 -march=native
COPTFLAGS := -O2 -fomit-frame-pointer
endif
CKOPTFLAGS := $(COPTFLAGS)
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
ifeq ($(CC_VENDOR),gcc)
CVECFLAGS := -msse3 -march=corei7 -mfpmath=sse
else
ifeq ($(CC_VENDOR),icc)
CVECFLAGS := -xSSE4.2
else
ifeq ($(CC_VENDOR),clang)
CVECFLAGS := -msse3 -mfpmath=sse -march=corei7
else
$(error gcc, icc, or clang is required for this configuration.)
endif
endif
endif
# --- Determine the archiver and related flags ---
AR := ar

View File

@@ -38,45 +38,13 @@ MAKE_DEFS_MK_INCLUDED := yes
#
# --- Build definitions --------------------------------------------------------
#
# Variables corresponding to other configure-time options.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
BLIS_ENABLE_STATIC_BUILD := yes
BLIS_ENABLE_DYNAMIC_BUILD := no
#
# --- Utility program definitions ----------------------------------------------
#
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
GREP := grep
XARGS := xargs
RANLIB := emranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Development tools definitions --------------------------------------------
#
# --- Determine the C compiler and related flags ---
CC := emcc
CC_VENDOR := emcc
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
@@ -88,13 +56,6 @@ COPTFLAGS := -O2
CKOPTFLAGS := -O3
CVECFLAGS :=
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
# --- Determine the archiver and related flags ---
AR := emar
ARFLAGS := cru

View File

@@ -35,11 +35,6 @@
#ifndef BLIS_CONFIG_H
#define BLIS_CONFIG_H
// Enable multithreading via POSIX threads.
//#define BLIS_ENABLE_PTHREADS
// Enable multithreading via OpenMP.
#define BLIS_ENABLE_OPENMP

View File

@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
#
# --- Build definitions --------------------------------------------------------
#
# Variables corresponding to other configure-time options.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
BLIS_ENABLE_STATIC_BUILD := yes
BLIS_ENABLE_DYNAMIC_BUILD := no
#
# --- Utility program definitions ----------------------------------------------
#
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
GREP := grep
XARGS := xargs
RANLIB := ranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Development tools definitions --------------------------------------------
#
@@ -80,13 +47,11 @@ ifeq ($(CC),)
CC := gcc
CC_VENDOR := gcc
endif
ifneq ($(CC_VENDOR),gcc)
$(error gcc is required for this configuration.)
endif
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 -m64 -fopenmp # -fopenmp -pg
CMISCFLAGS := -std=c99 -m64
CPICFLAGS := -fPIC
CWARNFLAGS := -Wall
@@ -97,21 +62,24 @@ endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O3 -march=native
endif
ifneq ($(DEBUG_TYPE),noopt)
CVECFLAGS := -mavx2 -mfma -mfpmath=sse #-msse3 -march=native # -mfpmath=sse
COPTFLAGS := -O3
endif
CKOPTFLAGS := $(COPTFLAGS)
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
ifeq ($(CC_VENDOR),gcc)
CVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=core-avx2
else
ifeq ($(CC_VENDOR),icc)
CVECFLAGS := -xCORE-AVX2
else
ifeq ($(CC_VENDOR),clang)
CVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=core-avx2
else
$(error gcc, icc, or clang is required for this configuration.)
endif
endif
endif
# --- Determine the archiver and related flags ---
AR := ar
@@ -120,7 +88,7 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
LDFLAGS := -lm -fopenmp -lpthread
LDFLAGS := -lm

View File

@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
#
# --- Build definitions --------------------------------------------------------
#
# Variables corresponding to other configure-time options.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
BLIS_ENABLE_STATIC_BUILD := yes
BLIS_ENABLE_DYNAMIC_BUILD := no
#
# --- Utility program definitions ----------------------------------------------
#
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
GREP := grep
XARGS := xargs
RANLIB := ranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Development tools definitions --------------------------------------------
#
@@ -86,7 +53,7 @@ endif
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L -mabi=64
CMISCFLAGS := -std=c99 -fopenmp #-pg
CMISCFLAGS := -std=c99
CPICFLAGS := -fPIC
CWARNFLAGS := -Wall
@@ -97,22 +64,12 @@ endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O3 -march=loongson3a -mtune=loongson3a
endif
ifneq ($(DEBUG_TYPE),noopt)
CVECFLAGS := #-msse3 -march=native # -mfpmath=sse
COPTFLAGS := -O3 -mtune=loongson3a
endif
CVECFLAGS := -march=loongson3a
CKOPTFLAGS := $(COPTFLAGS)
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru

View File

@@ -39,7 +39,6 @@
#define BLIS_TREE_BARRIER
#define BLIS_TREE_BARRIER_ARITY 4
#define BLIS_ENABLE_OPENMP
#define BLIS_SIMD_ALIGN_SIZE 32

View File

@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
#
# --- Build definitions --------------------------------------------------------
#
# Variables corresponding to other configure-time options.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
BLIS_ENABLE_STATIC_BUILD := yes
BLIS_ENABLE_DYNAMIC_BUILD := no
#
# --- Utility program definitions ----------------------------------------------
#
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
GREP := grep
XARGS := xargs
RANLIB := ranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Development tools definitions --------------------------------------------
#
@@ -86,7 +53,7 @@ endif
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -mmic -fasm-blocks -std=c99 -openmp
CMISCFLAGS := -mmic -fasm-blocks -std=c99
CPICFLAGS := -fPIC
CWARNFLAGS := -Wall
@@ -100,19 +67,9 @@ else
COPTFLAGS := -O3
endif
ifneq ($(DEBUG_TYPE),noopt)
CVECFLAGS :=
endif
CKOPTFLAGS := $(COPTFLAGS)
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru
@@ -120,7 +77,7 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
LDFLAGS := -mmic -lm -openmp
LDFLAGS := -mmic -lm

View File

@@ -36,9 +36,6 @@
#define BLIS_CONFIG_H
//#define BLIS_ENABLE_PTHREADS
#define BLIS_ENABLE_OPENMP
#define BLIS_SIMD_ALIGN_SIZE 16

View File

@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
#
# --- Build definitions --------------------------------------------------------
#
# Variables corresponding to other configure-time options.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
BLIS_ENABLE_STATIC_BUILD := yes
BLIS_ENABLE_DYNAMIC_BUILD := no
#
# --- Utility program definitions ----------------------------------------------
#
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
GREP := grep
XARGS := xargs
RANLIB := ranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Development tools definitions --------------------------------------------
#
@@ -86,7 +53,7 @@ endif
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 -fopenmp
CMISCFLAGS := -std=c99
CPICFLAGS := -fPIC
CWARNFLAGS := -Wall
@@ -97,22 +64,12 @@ endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O2 -mfpmath=sse -fomit-frame-pointer
endif
ifneq ($(DEBUG_TYPE),noopt)
CVECFLAGS := -mavx -mfma -march=native
COPTFLAGS := -O2 -fomit-frame-pointer
endif
CVECFLAGS := -mavx -mfma -march=bdver2 -mfpmath=sse
CKOPTFLAGS := $(COPTFLAGS)
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru
@@ -120,7 +77,7 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
LDFLAGS := -lm -fopenmp
LDFLAGS := -lm

View File

@@ -38,45 +38,13 @@ MAKE_DEFS_MK_INCLUDED := yes
#
# --- Build definitions --------------------------------------------------------
#
# Variables corresponding to other configure-time options.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
BLIS_ENABLE_STATIC_BUILD := yes
BLIS_ENABLE_DYNAMIC_BUILD := no
#
# --- Utility program definitions ----------------------------------------------
#
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
GREP := grep
XARGS := xargs
RANLIB := pnacl-ranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Development tools definitions --------------------------------------------
#
# --- Determine the C compiler and related flags ---
CC := pnacl-clang
CC_VENDOR := pnacl-clang
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
@@ -88,13 +56,6 @@ COPTFLAGS := -O3
CKOPTFLAGS := $(COPTFLAGS) -ffast-math
CVECFLAGS :=
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
# --- Determine the archiver and related flags ---
AR := pnacl-ar
ARFLAGS := rcs

View File

@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
#
# --- Build definitions --------------------------------------------------------
#
# Variables corresponding to other configure-time options.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
BLIS_ENABLE_STATIC_BUILD := yes
BLIS_ENABLE_DYNAMIC_BUILD := no
#
# --- Utility program definitions ----------------------------------------------
#
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
GREP := grep
XARGS := xargs
RANLIB := ranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Development tools definitions --------------------------------------------
#
@@ -86,7 +53,7 @@ endif
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 -m64 -mcpu=power7 #-fopenmp -pg
CMISCFLAGS := -std=c99 -m64 -mcpu=power7
CPICFLAGS := -fPIC
CWARNFLAGS := -Wall
@@ -100,19 +67,9 @@ else
COPTFLAGS := -O3 -mtune=power7
endif
ifneq ($(DEBUG_TYPE),noopt)
CVECFLAGS := -mvsx
endif
CKOPTFLAGS := $(COPTFLAGS)
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru

View File

@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
#
# --- Build definitions --------------------------------------------------------
#
# Variables corresponding to other configure-time options.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
BLIS_ENABLE_STATIC_BUILD := yes
BLIS_ENABLE_DYNAMIC_BUILD := no
#
# --- Utility program definitions ----------------------------------------------
#
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
GREP := grep
XARGS := xargs
RANLIB := ranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Development tools definitions --------------------------------------------
#
@@ -86,7 +53,7 @@ endif
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 # -fopenmp -pg
CMISCFLAGS := -std=c99
CPICFLAGS := -fPIC
CWARNFLAGS := -Wall
@@ -100,19 +67,9 @@ else
COPTFLAGS := -O2
endif
ifneq ($(DEBUG_TYPE),noopt)
CVECFLAGS := #-msse3 -march=native # -mfpmath=sse
endif
CVECFLAGS :=
CKOPTFLAGS := $(COPTFLAGS)
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru

View File

@@ -35,12 +35,6 @@
#ifndef BLIS_CONFIG_H
#define BLIS_CONFIG_H
// Enable multithreading via POSIX threads.
//#define BLIS_ENABLE_PTHREADS
// Enable multithreading via OpenMP.
#define BLIS_ENABLE_OPENMP
#endif

View File

@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
#
# --- Build definitions --------------------------------------------------------
#
# Variables corresponding to other configure-time options.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
BLIS_ENABLE_STATIC_BUILD := yes
BLIS_ENABLE_DYNAMIC_BUILD := no
#
# --- Utility program definitions ----------------------------------------------
#
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
GREP := grep
XARGS := xargs
RANLIB := ranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Development tools definitions --------------------------------------------
#
@@ -80,13 +47,11 @@ ifeq ($(CC),)
CC := gcc
CC_VENDOR := gcc
endif
ifneq ($(CC_VENDOR),gcc)
$(error gcc is required for this configuration.)
endif
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 -m64 -fopenmp # -fopenmp -pg
CMISCFLAGS := -std=c99 -m64
CPICFLAGS := -fPIC
CWARNFLAGS := -Wall
@@ -97,21 +62,24 @@ endif
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O3 -march=native
endif
ifneq ($(DEBUG_TYPE),noopt)
CVECFLAGS := -mavx -mfpmath=sse #-msse3 -march=native # -mfpmath=sse
COPTFLAGS := -O3
endif
CKOPTFLAGS := $(COPTFLAGS)
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
# Choose the vectorization flags that match the detected compiler vendor.
# gcc and clang take the same GNU-style AVX options, icc uses its own -x
# spelling, and any other vendor stops the build with an error.
ifeq ($(CC_VENDOR),gcc)
CVECFLAGS := -mavx -mfpmath=sse -march=corei7-avx
else ifeq ($(CC_VENDOR),icc)
CVECFLAGS := -xAVX
else ifeq ($(CC_VENDOR),clang)
CVECFLAGS := -mavx -mfpmath=sse -march=corei7-avx
else
$(error gcc, icc, or clang is required for this configuration.)
endif
# --- Determine the archiver and related flags ---
AR := ar
@@ -120,7 +88,7 @@ ARFLAGS := cru
# --- Determine the linker and related flags ---
LINKER := $(CC)
SOFLAGS := -shared
LDFLAGS := -lm -fopenmp -lpthread
LDFLAGS := -lm

View File

@@ -38,39 +38,6 @@ MAKE_DEFS_MK_INCLUDED := yes
#
# --- Build definitions --------------------------------------------------------
#
# Variables corresponding to other configure-time options.
BLIS_ENABLE_VERBOSE_MAKE_OUTPUT := no
BLIS_ENABLE_STATIC_BUILD := yes
BLIS_ENABLE_DYNAMIC_BUILD := no
#
# --- Utility program definitions ----------------------------------------------
#
SH := /bin/sh
MV := mv
MKDIR := mkdir -p
RM_F := rm -f
RM_RF := rm -rf
SYMLINK := ln -sf
FIND := find
GREP := grep
XARGS := xargs
RANLIB := ranlib
INSTALL := install -c
# Used to refresh CHANGELOG.
GIT := git
GIT_LOG := $(GIT) log --decorate
#
# --- Development tools definitions --------------------------------------------
#
@@ -86,7 +53,7 @@ endif
# Enable IEEE Standard 1003.1-2004 (POSIX.1d).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 # -fopenmp -pg
CMISCFLAGS := -std=c99
CPICFLAGS := -fPIC
CWARNFLAGS := -Wall
@@ -100,19 +67,9 @@ else
COPTFLAGS := -O2
endif
ifneq ($(DEBUG_TYPE),noopt)
CVECFLAGS := #-msse3 -march=native # -mfpmath=sse
endif
CVECFLAGS := #-msse3 -march=core2 # -mfpmath=sse
CKOPTFLAGS := $(COPTFLAGS)
# Aggregate all of the flags into multiple groups: one for standard
# compilation, and one for each of the supported "special" compilation
# modes.
CFLAGS_NOOPT := $(CDBGFLAGS) $(CWARNFLAGS) $(CPICFLAGS) $(CMISCFLAGS) $(CPPROCFLAGS)
CFLAGS := $(COPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
CFLAGS_KERNELS := $(CKOPTFLAGS) $(CVECFLAGS) $(CFLAGS_NOOPT)
# --- Determine the archiver and related flags ---
AR := ar
ARFLAGS := cru

125
configure vendored
View File

@@ -73,6 +73,29 @@ print_usage()
echo " kept in the framework, otherwise optimization is"
echo " turned off."
echo " "
echo " --enable-verbose-make, --disable-verbose-make"
echo " "
echo " Enable (disabled by default) verbose compilation"
echo " output during make."
echo " "
echo " --disable-static, --enable-static"
echo " "
echo " Disable (enabled by default) building BLIS as a static"
echo " library. May be combined with --enable-shared."
echo " "
# Help text for the shared-library switch pair. The names listed here must
# match the long options handled by the parser (enable-shared/disable-shared).
echo " --enable-shared, --disable-shared"
echo " "
echo " Enable (disabled by default) building BLIS as a shared"
echo " library. May be combined with --enable-static."
echo " "
echo " -t MODEL, --enable-threading[=MODEL], --disable-threading"
echo " "
echo " Enable threading in the library, using threading model"
echo " MODEL={auto,omp,pthreads,no}. If MODEL=no or "
echo " --disable-threading is specified, threading will be"
echo " disabled. If MODEL=auto or is unspecified, a model"
echo " will be chosen automatically. The default is 'auto'."
echo " "
echo " -q, --quiet Suppress informational output. By default, configure"
echo " is verbose. (NOTE: -q is not yet implemented)"
echo " "
@@ -85,7 +108,7 @@ print_usage()
echo " Environment variables may also be specified as command line"
echo " options, e.g.:"
echo " "
echo " ./configure CC=gcc sandybridge"
echo " ./configure [options] CC=gcc sandybridge"
echo " "
echo " Note that not all compilers are compatible with a given"
echo " configuration."
@@ -164,8 +187,16 @@ main()
debug_type=''
debug_flag=''
# The threading flag.
threading_model='auto'
# Option variables.
quiet_flag=''
# Additional flags.
enable_verbose='no'
enable_static='yes'
enable_shared='no'
# The path to the auto-detection script.
auto_detect_sh="${build_dirpath}/auto-detect/auto-detect.sh"
@@ -190,7 +221,7 @@ main()
# Process our command line options.
while getopts ":hp:d:q-:" opt; do
while getopts ":hp:d:t:q-:" opt; do
case $opt in
-)
case "$OPTARG" in
@@ -212,6 +243,36 @@ main()
debug_flag=1
debug_type=${OPTARG#*=}
;;
disable-debug)
debug_flag=0
;;
enable-verbose-make)
enable_verbose='yes'
;;
disable-verbose-make)
enable_verbose='no'
;;
enable-static)
enable_static='yes'
;;
disable-static)
enable_static='no'
;;
enable-shared)
enable_shared='yes'
;;
disable-shared)
enable_shared='no'
;;
enable-threading)
threading_model='auto'
;;
enable-threading=*)
threading_model=${OPTARG#*=}
;;
disable-threading)
threading_model='no'
;;
*)
print_usage
;;
@@ -230,6 +291,9 @@ main()
q)
quiet_flag=1
;;
t)
threading_model=$OPTARG
;;
\?)
print_usage
;;
@@ -338,23 +402,45 @@ main()
debug_type='off'
echo "${script_name}: debug symbols disabled."
fi
# Check if the verbose make flag was specified.
if [ "x${enable_verbose}" = "xyes" ]; then
echo "${script_name}: enabling verbose make output, disable with 'make V=0'."
else
echo "${script_name}: disabling verbose make output, enable with 'make V=1'."
fi
# Check if the static lib flag was specified.
if [ "x${enable_static}" = "xyes" ]; then
echo "${script_name}: building BLIS as a static library."
fi
# Check if the shared lib flag was specified.
if [ "x${enable_shared}" = "xyes" ]; then
echo "${script_name}: building BLIS as a shared library."
fi
# Check if neither flag was specified.
if [ "x${enable_static}" = "xno" -a "x${enable_shared}" = "xno" ]; then
echo "Neither a shared nor static library build has been requested."
exit 1
fi
# Determine the compiler vendor if CC was specified.
if [ -n "$CC" ]; then
if $CC --version 2>/dev/null | grep -q 'pnacl-version'; then
cc_vendor='pnacl-clang'
else
cc_vendor=`$CC --version 2>/dev/null | grep -Eo 'icc|gcc|clang|emcc'`
fi
if [ -z "$cc_vendor" ]; then
cc_vendor=`$CC -qversion 2>/dev/null | grep -o 'IBM'`
fi
if [ -z "$cc_vendor" ]; then
echo Unable to determine compiler vendor.
exit 1
fi
cc_vendor=`echo $cc_vendor | { read first rest; echo $first; }`
# Check the threading model flag.
if [ "x${threading_model}" = "xauto" ]; then
echo "${script_name}: determining the threading model automatically."
elif [ "x${threading_model}" = "xomp" ]; then
echo "${script_name}: using OpenMP for threading."
elif [ "x${threading_model}" = "xpthreads" ]; then
echo "${script_name}: using Pthreads for threading."
elif [ "x${threading_model}" = "xno" ]; then
echo "${script_name}: threading is disabled."
else
echo "Unsupported threading model: ${threading_model}."
exit 1
fi
@@ -373,9 +459,12 @@ main()
| sed "s/@config_name@/${config_name}/g" \
| sed "s/@dist_path@/${dist_path_esc}/g" \
| sed "s/@CC@/${cc_esc}/g" \
| sed "s/@cc_vendor@/${cc_vendor}/g" \
| sed "s/@debug_type@/${debug_type}/g" \
| sed "s/@install_prefix@/${install_prefix_esc}/g" \
| sed "s/@enable_verbose@/${enable_verbose}/g" \
| sed "s/@enable_static@/${enable_static}/g" \
| sed "s/@enable_dynamic@/${enable_shared}/g" \
| sed "s/@threading_model@/${threading_model}/g" \
> "${config_mk_out_path}"

File diff suppressed because it is too large Load Diff

View File

@@ -34,6 +34,933 @@
#include "blis.h"
void bli_sgemm_8x8_FMA4(
dim_t k,
float* restrict alpha,
float* restrict a,
float* restrict b,
float* restrict beta,
float* restrict c, inc_t rs_c, inc_t cs_c,
auxinfo_t* data
)
{
dim_t k_iter = k / 4;
dim_t k_left = k % 4;
__asm__ volatile
(
" \n\t"
"movq %2, %%rax \n\t" // load address of a.
"movq %3, %%rbx \n\t" // load address of b.
" \n\t"
"vmovaps 0 * 32(%%rax), %%ymm0 \n\t" // initialize loop by pre-loading
"vmovsldup 0 * 32(%%rbx), %%ymm2 \n\t" // elements of a and b.
"vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
" \n\t"
"movq %6, %%rcx \n\t" // load address of c
"movq %8, %%rdi \n\t" // load cs_c
"leaq (,%%rdi,4), %%rdi \n\t" // cs_c *= sizeof(float)
"leaq (%%rcx,%%rdi,4), %%r10 \n\t" // load address of c + 4*cs_c;
" \n\t"
"leaq (%%rdi,%%rdi,2), %%r14 \n\t" // r14 = 3*cs_c;
"prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c
"prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c
"prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*cs_c
"prefetcht0 7 * 8(%%rcx,%%r14) \n\t" // prefetch c + 3*cs_c
"prefetcht0 7 * 8(%%r10) \n\t" // prefetch c + 4*cs_c
"prefetcht0 7 * 8(%%r10,%%rdi) \n\t" // prefetch c + 5*cs_c
"prefetcht0 7 * 8(%%r10,%%rdi,2) \n\t" // prefetch c + 6*cs_c
"prefetcht0 7 * 8(%%r10,%%r14) \n\t" // prefetch c + 7*cs_c
" \n\t"
"vxorps %%ymm8, %%ymm8, %%ymm8 \n\t"
"vxorps %%ymm9, %%ymm9, %%ymm9 \n\t"
"vxorps %%ymm10, %%ymm10, %%ymm10 \n\t"
"vxorps %%ymm11, %%ymm11, %%ymm11 \n\t"
"vxorps %%ymm12, %%ymm12, %%ymm12 \n\t"
"vxorps %%ymm13, %%ymm13, %%ymm13 \n\t"
"vxorps %%ymm14, %%ymm14, %%ymm14 \n\t"
"vxorps %%ymm15, %%ymm15, %%ymm15 \n\t"
" \n\t"
" \n\t"
"movq %0, %%rsi \n\t" // i = k_iter;
"testq %%rsi, %%rsi \n\t" // check i via logical AND.
"je .SCONSIDKLEFT \n\t" // if i == 0, jump to code that
" \n\t" // contains the k_left loop.
" \n\t"
".SLOOPKITER: \n\t" // MAIN LOOP
" \n\t"
" \n\t" // iteration 0
"prefetcht0 16 * 32(%%rax) \n\t"
"vfmaddps %%ymm15, %%ymm0, %%ymm2, %%ymm15\n\t"
"vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
"vmovshdup 0 * 32(%%rbx), %%ymm2 \n\t"
"vfmaddps %%ymm13, %%ymm0, %%ymm3, %%ymm13\n\t"
"vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
" \n\t"
"vmovaps 1 * 32(%%rax), %%ymm1 \n\t"
"vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
"vfmaddps %%ymm11, %%ymm0, %%ymm4, %%ymm11\n\t"
"vfmaddps %%ymm9, %%ymm0, %%ymm5, %%ymm9 \n\t"
" \n\t"
"vfmaddps %%ymm14, %%ymm0, %%ymm2, %%ymm14\n\t"
"vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
"vmovsldup 1 * 32(%%rbx), %%ymm2 \n\t"
"vfmaddps %%ymm12, %%ymm0, %%ymm3, %%ymm12\n\t"
"vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
" \n\t"
"vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
"vfmaddps %%ymm10, %%ymm0, %%ymm4, %%ymm10\n\t"
"vfmaddps %%ymm8, %%ymm0, %%ymm5, %%ymm8\n\t"
" \n\t"
" \n\t" // iteration 1
"vfmaddps %%ymm15, %%ymm1, %%ymm2, %%ymm15\n\t"
"vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
"vmovshdup 1 * 32(%%rbx), %%ymm2 \n\t"
"vfmaddps %%ymm13, %%ymm1, %%ymm3, %%ymm13\n\t"
"vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
" \n\t"
"vmovaps 2 * 32(%%rax), %%ymm0 \n\t"
"vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
"vfmaddps %%ymm11, %%ymm1, %%ymm4, %%ymm11\n\t"
"vfmaddps %%ymm9, %%ymm1, %%ymm5, %%ymm9\n\t"
" \n\t"
"vfmaddps %%ymm14, %%ymm1, %%ymm2, %%ymm14\n\t"
"vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
"vmovsldup 2 * 32(%%rbx), %%ymm2 \n\t"
"vfmaddps %%ymm12, %%ymm1, %%ymm3, %%ymm12\n\t"
"vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
" \n\t"
"vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
"vfmaddps %%ymm10, %%ymm1, %%ymm4, %%ymm10\n\t"
"vfmaddps %%ymm8, %%ymm1, %%ymm5, %%ymm8\n\t"
" \n\t"
" \n\t" // iteration 2
"prefetcht0 18 * 32(%%rax) \n\t"
"vfmaddps %%ymm15, %%ymm0, %%ymm2, %%ymm15\n\t"
"vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
"vmovshdup 2 * 32(%%rbx), %%ymm2 \n\t"
"vfmaddps %%ymm13, %%ymm0, %%ymm3, %%ymm13\n\t"
"vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
" \n\t"
"vmovaps 3 * 32(%%rax), %%ymm1 \n\t"
"addq $4 * 8 * 4, %%rax \n\t" // a += 4*8 (unroll x mr)
"vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
"vfmaddps %%ymm11, %%ymm0, %%ymm4, %%ymm11\n\t"
"vfmaddps %%ymm9, %%ymm0, %%ymm5, %%ymm9\n\t"
" \n\t"
"vfmaddps %%ymm14, %%ymm0, %%ymm2, %%ymm14\n\t"
"vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
"vmovsldup 3 * 32(%%rbx), %%ymm2 \n\t"
"vfmaddps %%ymm12, %%ymm0, %%ymm3, %%ymm12\n\t"
"vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
" \n\t"
"vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
"vfmaddps %%ymm10, %%ymm0, %%ymm4, %%ymm10\n\t"
"vfmaddps %%ymm8, %%ymm0, %%ymm5, %%ymm8\n\t"
" \n\t"
" \n\t" // iteration 3
"vfmaddps %%ymm15, %%ymm1, %%ymm2, %%ymm15\n\t"
"vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
"vmovshdup 3 * 32(%%rbx), %%ymm2 \n\t"
"addq $4 * 8 * 4, %%rbx \n\t" // b += 4*8 (unroll x nr)
"vfmaddps %%ymm13, %%ymm1, %%ymm3, %%ymm13\n\t"
"vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
" \n\t"
"vmovaps 0 * 32(%%rax), %%ymm0 \n\t"
"vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
"vfmaddps %%ymm11, %%ymm1, %%ymm4, %%ymm11\n\t"
"vfmaddps %%ymm9, %%ymm1, %%ymm5, %%ymm9\n\t"
" \n\t"
"vfmaddps %%ymm14, %%ymm1, %%ymm2, %%ymm14\n\t"
"vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
"vmovsldup 0 * 32(%%rbx), %%ymm2 \n\t"
"vfmaddps %%ymm12, %%ymm1, %%ymm3, %%ymm12\n\t"
"vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
" \n\t"
"vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
"vfmaddps %%ymm10, %%ymm1, %%ymm4, %%ymm10\n\t"
"vfmaddps %%ymm8, %%ymm1, %%ymm5, %%ymm8\n\t"
" \n\t"
" \n\t"
" \n\t"
"decq %%rsi \n\t" // i -= 1;
"jne .SLOOPKITER \n\t" // iterate again if i != 0.
" \n\t"
" \n\t"
" \n\t"
" \n\t"
".SCONSIDKLEFT: \n\t"
" \n\t"
"movq %1, %%rsi \n\t" // i = k_left;
"testq %%rsi, %%rsi \n\t" // check i via logical AND.
"je .SPOSTACCUM \n\t" // if i == 0, we're done; jump to end.
" \n\t" // else, we prepare to enter k_left loop.
" \n\t"
// Edge loop: handles the k_left = k % 4 remaining iterations, one k step
// per pass. On entry ymm0 holds the current 8 floats of a, and ymm2/ymm3
// hold the sldup'd row of b and its 0x4e permutation (pre-loaded before
// the main loop or at the end of its last iteration). Each pass performs
// the same eight accumulations (ymm8..ymm15) as one main-loop iteration,
// then advances a and b by 8 floats and counts rsi down to zero.
".SLOOPKLEFT: \n\t" // EDGE LOOP
" \n\t"
"prefetcht0 16 * 32(%%rax) \n\t"
"vfmaddps %%ymm15, %%ymm0, %%ymm2, %%ymm15\n\t"
"vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
"vmovshdup 0 * 32(%%rbx), %%ymm2 \n\t"
"vfmaddps %%ymm13, %%ymm0, %%ymm3, %%ymm13\n\t"
"vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
" \n\t"
"vmovaps 1 * 32(%%rax), %%ymm1 \n\t" // pre-load a for the next pass
"addq $8 * 1 * 4, %%rax \n\t" // a += 8 (1 x mr)
"vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
"vfmaddps %%ymm11, %%ymm0, %%ymm4, %%ymm11\n\t"
"vfmaddps %%ymm9, %%ymm0, %%ymm5, %%ymm9\n\t"
" \n\t"
"vfmaddps %%ymm14, %%ymm0, %%ymm2, %%ymm14\n\t"
"vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
"vmovsldup 1 * 32(%%rbx), %%ymm2 \n\t"
"addq $8 * 1 * 4, %%rbx \n\t" // b += 8 (1 x nr)
"vfmaddps %%ymm12, %%ymm0, %%ymm3, %%ymm12\n\t"
// Swap the 128-bit halves of the shdup-derived ymm3 into ymm5, exactly as
// the main loop does at this point. Without it, the ymm8 update below
// would reuse the ymm5 computed from the sldup-derived ymm3 above.
"vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
" \n\t"
"vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
"vfmaddps %%ymm10, %%ymm0, %%ymm4, %%ymm10\n\t"
"vfmaddps %%ymm8, %%ymm0, %%ymm5, %%ymm8\n\t"
"vmovaps %%ymm1, %%ymm0 \n\t" // next a becomes current a
" \n\t"
" \n\t"
"decq %%rsi \n\t" // i -= 1;
"jne .SLOOPKLEFT \n\t" // iterate again if i != 0.
" \n\t"
" \n\t"
".SPOSTACCUM: \n\t"
" \n\t" // ymm15: ymm13: ymm11: ymm9:
" \n\t" // ( ab00 ( ab02 ( ab04 ( ab06
" \n\t" // ab10 ab12 ab14 ab16
" \n\t" // ab22 ab20 ab26 ab24
" \n\t" // ab32 ab30 ab36 ab34
" \n\t" // ab44 ab46 ab40 ab42
" \n\t" // ab54 ab56 ab50 ab52
" \n\t" // ab66 ab64 ab62 ab60
" \n\t" // ab76 ) ab74 ) ab72 ) ab70 )
" \n\t"
" \n\t" // ymm14: ymm12: ymm10: ymm8:
" \n\t" // ( ab01 ( ab03 ( ab05 ( ab07
" \n\t" // ab11 ab13 ab15 ab17
" \n\t" // ab23 ab21 ab27 ab25
" \n\t" // ab33 ab31 ab37 ab35
" \n\t" // ab45 ab47 ab41 ab43
" \n\t" // ab55 ab57 ab51 ab53
" \n\t" // ab67 ab65 ab63 ab61
" \n\t" // ab77 ) ab75 ) ab73 ) ab71 )
"vmovaps %%ymm15, %%ymm7 \n\t"
"vshufps $0xe4, %%ymm13, %%ymm15, %%ymm15 \n\t"
"vshufps $0xe4, %%ymm7, %%ymm13, %%ymm13 \n\t"
" \n\t"
"vmovaps %%ymm11, %%ymm7 \n\t"
"vshufps $0xe4, %%ymm9, %%ymm11, %%ymm11 \n\t"
"vshufps $0xe4, %%ymm7, %%ymm9, %%ymm9 \n\t"
" \n\t"
"vmovaps %%ymm14, %%ymm7 \n\t"
"vshufps $0xe4, %%ymm12, %%ymm14, %%ymm14 \n\t"
"vshufps $0xe4, %%ymm7, %%ymm12, %%ymm12 \n\t"
" \n\t"
"vmovaps %%ymm10, %%ymm7 \n\t"
"vshufps $0xe4, %%ymm8, %%ymm10, %%ymm10 \n\t"
"vshufps $0xe4, %%ymm7, %%ymm8, %%ymm8 \n\t"
" \n\t" // ymm15: ymm13: ymm11: ymm9:
" \n\t" // ( ab00 ( ab02 ( ab04 ( ab06
" \n\t" // ab10 ab12 ab14 ab16
" \n\t" // ab20 ab22 ab24 ab26
" \n\t" // ab30 ab32 ab34 ab36
" \n\t" // ab44 ab46 ab40 ab42
" \n\t" // ab54 ab56 ab50 ab52
" \n\t" // ab64 ab66 ab60 ab62
" \n\t" // ab74 ) ab76 ) ab70 ) ab72 )
" \n\t"
" \n\t" // ymm14: ymm12: ymm10: ymm8:
" \n\t" // ( ab01 ( ab03 ( ab05 ( ab07
" \n\t" // ab11 ab13 ab15 ab17
" \n\t" // ab21 ab23 ab25 ab27
" \n\t" // ab31 ab33 ab35 ab37
" \n\t" // ab45 ab47 ab41 ab43
" \n\t" // ab55 ab57 ab51 ab53
" \n\t" // ab65 ab67 ab61 ab63
" \n\t" // ab75 ) ab77 ) ab71 ) ab73 )
"vmovaps %%ymm15, %%ymm7 \n\t"
"vperm2f128 $0x30, %%ymm11, %%ymm15, %%ymm15 \n\t"
"vperm2f128 $0x12, %%ymm11, %%ymm7, %%ymm11 \n\t"
" \n\t"
"vmovaps %%ymm13, %%ymm7 \n\t"
"vperm2f128 $0x30, %%ymm9, %%ymm13, %%ymm13 \n\t"
"vperm2f128 $0x12, %%ymm9, %%ymm7, %%ymm9 \n\t"
" \n\t"
"vmovaps %%ymm14, %%ymm7 \n\t"
"vperm2f128 $0x30, %%ymm10, %%ymm14, %%ymm14 \n\t"
"vperm2f128 $0x12, %%ymm10, %%ymm7, %%ymm10 \n\t"
" \n\t"
"vmovaps %%ymm12, %%ymm7 \n\t"
"vperm2f128 $0x30, %%ymm8, %%ymm12, %%ymm12 \n\t"
"vperm2f128 $0x12, %%ymm8, %%ymm7, %%ymm8 \n\t"
" \n\t" // ymm15: ymm13: ymm11: ymm9:
" \n\t" // ( ab00 ( ab02 ( ab04 ( ab06
" \n\t" // ab10 ab12 ab14 ab16
" \n\t" // ab20 ab22 ab24 ab26
" \n\t" // ab30 ab32 ab34 ab36
" \n\t" // ab40 ab42 ab44 ab46
" \n\t" // ab50 ab52 ab54 ab56
" \n\t" // ab60 ab62 ab64 ab66
" \n\t" // ab70 ) ab72 ) ab74 ) ab76 )
" \n\t"
" \n\t" // ymm14: ymm12: ymm10: ymm8:
" \n\t" // ( ab01 ( ab03 ( ab05 ( ab07
" \n\t" // ab11 ab13 ab15 ab17
" \n\t" // ab21 ab23 ab25 ab27
" \n\t" // ab31 ab33 ab35 ab37
" \n\t" // ab41 ab43 ab45 ab47
" \n\t" // ab51 ab53 ab55 ab57
" \n\t" // ab61 ab63 ab65 ab67
" \n\t" // ab71 ) ab73 ) ab75 ) ab77 )
" \n\t"
"movq %4, %%rax \n\t" // load address of alpha
"movq %5, %%rbx \n\t" // load address of beta
"vbroadcastss (%%rax), %%ymm0 \n\t" // load alpha and duplicate
"vbroadcastss (%%rbx), %%ymm4 \n\t" // load beta and duplicate
" \n\t"
"vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" // scale by alpha
"vmulps %%ymm0, %%ymm9, %%ymm9 \n\t"
"vmulps %%ymm0, %%ymm10, %%ymm10 \n\t"
"vmulps %%ymm0, %%ymm11, %%ymm11 \n\t"
"vmulps %%ymm0, %%ymm12, %%ymm12 \n\t"
"vmulps %%ymm0, %%ymm13, %%ymm13 \n\t"
"vmulps %%ymm0, %%ymm14, %%ymm14 \n\t"
"vmulps %%ymm0, %%ymm15, %%ymm15 \n\t"
" \n\t"
" \n\t"
" \n\t"
" \n\t"
"movq %7, %%rsi \n\t" // load rs_c
"leaq (,%%rsi,4), %%rsi \n\t" // rsi = rs_c * sizeof(float)
" \n\t"
"leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*rs_c;
" \n\t"
"leaq (,%%rsi,2), %%r12 \n\t" // r12 = 2*rs_c;
"leaq (%%r12,%%rsi,1), %%r13 \n\t" // r13 = 3*rs_c;
" \n\t"
" \n\t"
" \n\t" // determine if
" \n\t" // c % 32 == 0, AND
" \n\t" // 4*cs_c % 32 == 0, AND
" \n\t" // rs_c == 1
" \n\t" // ie: aligned, ldim aligned, and
" \n\t" // column-stored
" \n\t"
// Build three 0/1 byte flags (bl, bh, al), one per condition listed above;
// they are AND-ed together further down to pick the store path.
"cmpq $4, %%rsi \n\t" // set ZF if (4*rs_c) == 4.
"sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 );
"testq $31, %%rcx \n\t" // set ZF if (c & 31) == 0, i.e. c % 32 == 0.
"setz %%bh \n\t" // bh = ( ZF == 1 ? 1 : 0 );
"testq $31, %%rdi \n\t" // set ZF if ((4*cs_c) & 31) == 0.
"setz %%al \n\t" // al = ( ZF == 1 ? 1 : 0 );
" \n\t" // and(bl,bh) followed by
" \n\t" // and(bh,al) will reveal result
" \n\t"
" \n\t" // now avoid loading C if beta == 0
" \n\t"
"vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero.
"vucomiss %%xmm0, %%xmm4 \n\t" // set ZF if beta == 0.
"je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case
" \n\t"
" \n\t"
" \n\t" // check if aligned/column-stored
"andb %%bl, %%bh \n\t" // bh = bl & bh; ZF is set if the result is 0.
"andb %%bh, %%al \n\t" // al = bh & al; ZF is set if the result is 0.
"jne .SCOLSTORED \n\t" // ZF == 0 (all three flags were 1): jump to column storage case
" \n\t"
" \n\t"
".SGENSTORED: \n\t"
" \n\t" // update c00:c70
"vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
"vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
"vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
"vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
"vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
"vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
"vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
"vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
"vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
"vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
"vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
" \n\t"
"vfmaddps %%ymm15, %%ymm0, %%ymm4, %%ymm0\n\t" // scale by beta and add the gemm result,
" \n\t"
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
"vmovss %%xmm0, (%%rcx) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
"vmovss %%xmm2, (%%rdx) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
" \n\t"
"addq %%rdi, %%rcx \n\t" // c += cs_c;
"addq %%rdi, %%rdx \n\t" // c += cs_c;
" \n\t"
" \n\t" // update c01:c71
"vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
"vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
"vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
"vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
"vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
"vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
"vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
"vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
"vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
"vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
"vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
" \n\t"
"vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
"vaddps %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result,
" \n\t"
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
"vmovss %%xmm0, (%%rcx) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
"vmovss %%xmm2, (%%rdx) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
" \n\t"
"addq %%rdi, %%rcx \n\t" // c += cs_c;
"addq %%rdi, %%rdx \n\t" // c += cs_c;
" \n\t"
" \n\t"
" \n\t" // update c02:c72
"vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
"vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
"vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
"vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
"vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
"vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
"vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
"vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
"vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
"vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
"vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
" \n\t"
"vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
"vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result,
" \n\t"
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
"vmovss %%xmm0, (%%rcx) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
"vmovss %%xmm2, (%%rdx) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
" \n\t"
"addq %%rdi, %%rcx \n\t" // c += cs_c;
"addq %%rdi, %%rdx \n\t" // c += cs_c;
" \n\t"
" \n\t"
" \n\t" // update c03:c73
"vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
"vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
"vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
"vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
"vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
"vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
"vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
"vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
"vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
"vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
"vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
" \n\t"
"vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
"vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result,
" \n\t"
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
"vmovss %%xmm0, (%%rcx) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
"vmovss %%xmm2, (%%rdx) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
" \n\t"
"addq %%rdi, %%rcx \n\t" // c += cs_c;
"addq %%rdi, %%rdx \n\t" // c += cs_c;
" \n\t"
" \n\t"
" \n\t" // update c04:c74
"vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
"vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
"vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
"vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
"vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
"vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
"vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
"vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
"vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
"vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
"vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
" \n\t"
"vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
"vaddps %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result,
" \n\t"
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
"vmovss %%xmm0, (%%rcx) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
"vmovss %%xmm2, (%%rdx) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
" \n\t"
"addq %%rdi, %%rcx \n\t" // c += cs_c;
"addq %%rdi, %%rdx \n\t" // c += cs_c;
" \n\t"
" \n\t"
" \n\t" // update c05:c75
"vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
"vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
"vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
"vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
"vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
"vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
"vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
"vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
"vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
"vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
"vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
" \n\t"
"vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
"vaddps %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result,
" \n\t"
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
"vmovss %%xmm0, (%%rcx) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
"vmovss %%xmm2, (%%rdx) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
" \n\t"
"addq %%rdi, %%rcx \n\t" // c += cs_c;
"addq %%rdi, %%rdx \n\t" // c += cs_c;
" \n\t"
" \n\t"
" \n\t" // update c06:c76
"vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
"vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
"vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
"vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
"vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
"vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
"vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
"vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
"vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
"vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
"vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
" \n\t"
"vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
"vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result,
" \n\t"
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
"vmovss %%xmm0, (%%rcx) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
"vmovss %%xmm2, (%%rdx) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
" \n\t"
"addq %%rdi, %%rcx \n\t" // c += cs_c;
"addq %%rdi, %%rdx \n\t" // c += cs_c;
" \n\t"
" \n\t"
" \n\t" // update c07:c77
"vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
"vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
"vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
"vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
"vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
"vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
"vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
"vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
"vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
"vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
"vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
" \n\t"
"vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
"vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result,
" \n\t"
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
"vmovss %%xmm0, (%%rcx) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
"vmovss %%xmm2, (%%rdx) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
" \n\t"
" \n\t"
" \n\t"
"jmp .SDONE \n\t" // jump to end.
" \n\t"
" \n\t"
" \n\t"
".SCOLSTORED: \n\t"
" \n\t"
" \n\t"
"vmovaps (%%rcx), %%ymm0 \n\t" // load c00:c70,
"vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
"vaddps %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result,
"vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory.
"addq %%rdi, %%rcx \n\t" // c += cs_c;
" \n\t"
"vmovaps (%%rcx), %%ymm1 \n\t" // load c01:c71,
"vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta,
"vaddps %%ymm14, %%ymm1, %%ymm1 \n\t" // add the gemm result,
"vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory.
"addq %%rdi, %%rcx \n\t" // c += cs_c;
" \n\t"
"vmovaps (%%rcx), %%ymm0 \n\t" // load c02:c72,
"vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
"vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result,
"vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory.
"addq %%rdi, %%rcx \n\t" // c += cs_c;
" \n\t"
"vmovaps (%%rcx), %%ymm1 \n\t" // load c03:c73,
"vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta,
"vaddps %%ymm12, %%ymm1, %%ymm1 \n\t" // add the gemm result,
"vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory.
"addq %%rdi, %%rcx \n\t" // c += cs_c;
" \n\t"
"vmovaps (%%rcx), %%ymm0 \n\t" // load c04:c74,
"vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
"vaddps %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result,
"vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory.
"addq %%rdi, %%rcx \n\t" // c += cs_c;
" \n\t"
"vmovaps (%%rcx), %%ymm1 \n\t" // load c05:c75,
"vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta,
"vaddps %%ymm10, %%ymm1, %%ymm1 \n\t" // add the gemm result,
"vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory.
"addq %%rdi, %%rcx \n\t" // c += cs_c;
" \n\t"
"vmovaps (%%rcx), %%ymm0 \n\t" // load c06:c76,
"vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
"vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result,
"vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory.
"addq %%rdi, %%rcx \n\t" // c += cs_c;
" \n\t"
"vmovaps (%%rcx), %%ymm1 \n\t" // load c07:c77,
"vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta,
"vaddps %%ymm8, %%ymm1, %%ymm1 \n\t" // add the gemm result,
"vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory.
" \n\t"
" \n\t"
"jmp .SDONE \n\t" // jump to end.
" \n\t"
" \n\t"
" \n\t"
" \n\t"
".SBETAZERO: \n\t"
" \n\t" // check if aligned/column-stored
"andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1.
"andb %%bh, %%al \n\t" // set ZF if bh & al == 1.
"jne .SCOLSTORBZ \n\t" // jump to column storage case
" \n\t"
" \n\t"
" \n\t"
".SGENSTORBZ: \n\t"
" \n\t"
" \n\t" // update c00:c70
"vmovapd %%ymm15, %%ymm0 \n\t"
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
"vmovss %%xmm0, (%%rcx) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
"vmovss %%xmm2, (%%rdx) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
" \n\t"
"addq %%rdi, %%rcx \n\t" // c += cs_c;
"addq %%rdi, %%rdx \n\t" // c += cs_c;
" \n\t"
" \n\t"
" \n\t" // update c01:c71
"vmovapd %%ymm14, %%ymm0 \n\t"
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
"vmovss %%xmm0, (%%rcx) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
"vmovss %%xmm2, (%%rdx) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
" \n\t"
"addq %%rdi, %%rcx \n\t" // c += cs_c;
"addq %%rdi, %%rdx \n\t" // c += cs_c;
" \n\t"
" \n\t"
" \n\t" // update c02:c72
"vmovapd %%ymm13, %%ymm0 \n\t"
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
"vmovss %%xmm0, (%%rcx) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
"vmovss %%xmm2, (%%rdx) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
" \n\t"
"addq %%rdi, %%rcx \n\t" // c += cs_c;
"addq %%rdi, %%rdx \n\t" // c += cs_c;
" \n\t"
" \n\t"
" \n\t" // update c03:c73
"vmovapd %%ymm12, %%ymm0 \n\t"
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
"vmovss %%xmm0, (%%rcx) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
"vmovss %%xmm2, (%%rdx) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
" \n\t"
"addq %%rdi, %%rcx \n\t" // c += cs_c;
"addq %%rdi, %%rdx \n\t" // c += cs_c;
" \n\t"
" \n\t"
" \n\t" // update c04:c74
"vmovapd %%ymm11, %%ymm0 \n\t"
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
"vmovss %%xmm0, (%%rcx) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
"vmovss %%xmm2, (%%rdx) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
" \n\t"
"addq %%rdi, %%rcx \n\t" // c += cs_c;
"addq %%rdi, %%rdx \n\t" // c += cs_c;
" \n\t"
" \n\t"
" \n\t" // update c05:c75
"vmovapd %%ymm10, %%ymm0 \n\t"
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
"vmovss %%xmm0, (%%rcx) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
"vmovss %%xmm2, (%%rdx) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
" \n\t"
"addq %%rdi, %%rcx \n\t" // c += cs_c;
"addq %%rdi, %%rdx \n\t" // c += cs_c;
" \n\t"
" \n\t"
" \n\t" // update c06:c76
"vmovapd %%ymm9, %%ymm0 \n\t"
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
"vmovss %%xmm0, (%%rcx) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
"vmovss %%xmm2, (%%rdx) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
" \n\t"
"addq %%rdi, %%rcx \n\t" // c += cs_c;
"addq %%rdi, %%rdx \n\t" // c += cs_c;
" \n\t"
" \n\t"
" \n\t" // update c07:c77
"vmovapd %%ymm8, %%ymm0 \n\t"
"vextractf128 $1, %%ymm0, %%xmm2 \n\t"
"vmovss %%xmm0, (%%rcx) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
"vmovss %%xmm0, (%%rcx,%%r12) \n\t"
"vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
"vmovss %%xmm1, (%%rcx,%%r13) \n\t"
"vmovss %%xmm2, (%%rdx) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
"vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
"vmovss %%xmm2, (%%rdx,%%r12) \n\t"
"vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
"vmovss %%xmm3, (%%rdx,%%r13) \n\t"
" \n\t"
" \n\t"
"jmp .SDONE \n\t" // jump to end.
" \n\t"
" \n\t"
" \n\t"
".SCOLSTORBZ: \n\t"
" \n\t"
" \n\t"
"vmovaps %%ymm15, (%%rcx) \n\t" // and store back to memory.
"addq %%rdi, %%rcx \n\t" // c += cs_c;
" \n\t"
"vmovaps %%ymm14, (%%rcx) \n\t" // and store back to memory.
"addq %%rdi, %%rcx \n\t" // c += cs_c;
" \n\t"
"vmovaps %%ymm13, (%%rcx) \n\t" // and store back to memory.
"addq %%rdi, %%rcx \n\t" // c += cs_c;
" \n\t"
"vmovaps %%ymm12, (%%rcx) \n\t" // and store back to memory.
"addq %%rdi, %%rcx \n\t" // c += cs_c;
" \n\t"
"vmovaps %%ymm11, (%%rcx) \n\t" // and store back to memory.
"addq %%rdi, %%rcx \n\t" // c += cs_c;
" \n\t"
"vmovaps %%ymm10, (%%rcx) \n\t" // and store back to memory.
"addq %%rdi, %%rcx \n\t" // c += cs_c;
" \n\t"
"vmovaps %%ymm9, (%%rcx) \n\t" // and store back to memory.
"addq %%rdi, %%rcx \n\t" // c += cs_c;
" \n\t"
"vmovaps %%ymm8, (%%rcx) \n\t" // and store back to memory.
" \n\t"
" \n\t"
" \n\t"
" \n\t"
" \n\t"
".SDONE: \n\t"
" \n\t"
: // output operands (none)
: // input operands
"m" (k_iter), // 0
"m" (k_left), // 1
"m" (a), // 2
"m" (b), // 3
"m" (alpha), // 4
"m" (beta), // 5
"m" (c), // 6
"m" (rs_c), // 7
"m" (cs_c)/*, // 8
"m" (b_next), // 9
"m" (a_next)*/ // 10
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
}
// Undefine the KERNEL4x6_* helper macros so these names are no longer
// defined for any code that follows.
// NOTE(review): the matching #define sites are outside this view --
// presumably earlier in this file; confirm before reordering or removing.
#undef KERNEL4x6_1
#undef KERNEL4x6_2
#undef KERNEL4x6_3