Updates to haswell gemm micro-kernels.

Details:
- Added two new sets of [sd]gemm micro-kernels for haswell architectures:
  one that is 4x24 (s) / 4x12 (d) and one that is 6x16 (s) / 6x8 (d).
- Changed the haswell configuration to use the 6x16/6x8 micro-kernels
  by default.
- Updated various Makefiles, in test, test/3m4m, and testsuite.
This commit is contained in:
Field G. Van Zee
2016-05-04 17:22:56 -05:00
parent 0b01d355ae
commit c3a4d39d03
8 changed files with 2818 additions and 121 deletions

View File

@@ -60,12 +60,23 @@
#else
/*
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x6
#define BLIS_DEFAULT_MC_S 144
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 4080
#define BLIS_DEFAULT_MR_S 16
#define BLIS_DEFAULT_NR_S 6
*/
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16
#define BLIS_DEFAULT_MC_S 144
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 4080
#define BLIS_DEFAULT_MR_S 6
#define BLIS_DEFAULT_NR_S 16
#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
@@ -80,12 +91,24 @@
#else
/*
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x6
#define BLIS_DEFAULT_MC_D 72
#define BLIS_DEFAULT_KC_D 256
#define BLIS_DEFAULT_NC_D 4080
#define BLIS_DEFAULT_MR_D 8
#define BLIS_DEFAULT_NR_D 6
*/
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8
#define BLIS_DEFAULT_MC_D 72
#define BLIS_DEFAULT_KC_D 256
#define BLIS_DEFAULT_NC_D 4080
#define BLIS_DEFAULT_MR_D 6
#define BLIS_DEFAULT_NR_D 8
#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif

View File

@@ -322,9 +322,11 @@ void PASTEMAC(ch,varname) \
} \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c", MR, NR, c11, rs_c, cs_c, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
/*
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( gemm_ker_var2 )

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -124,7 +124,7 @@ void bli_sgemm_asm_16x6
" \n\t"
" \n\t"
" \n\t" // iteration 0
"prefetcht0 16 * 32(%%rax) \n\t"
"prefetcht0 128 * 4(%%rax) \n\t"
" \n\t"
"vbroadcastss 0 * 4(%%rbx), %%ymm2 \n\t"
"vbroadcastss 1 * 4(%%rbx), %%ymm3 \n\t"
@@ -150,8 +150,6 @@ void bli_sgemm_asm_16x6
"vmovaps -2 * 32(%%rax), %%ymm0 \n\t"
"vmovaps -1 * 32(%%rax), %%ymm1 \n\t"
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 1
"vbroadcastss 6 * 4(%%rbx), %%ymm2 \n\t"
"vbroadcastss 7 * 4(%%rbx), %%ymm3 \n\t"
@@ -177,10 +175,8 @@ void bli_sgemm_asm_16x6
"vmovaps 0 * 32(%%rax), %%ymm0 \n\t"
"vmovaps 1 * 32(%%rax), %%ymm1 \n\t"
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 2
"prefetcht0 20 * 32(%%rax) \n\t"
"prefetcht0 160 * 4(%%rax) \n\t"
" \n\t"
"vbroadcastss 12 * 4(%%rbx), %%ymm2 \n\t"
"vbroadcastss 13 * 4(%%rbx), %%ymm3 \n\t"
@@ -206,8 +202,6 @@ void bli_sgemm_asm_16x6
"vmovaps 2 * 32(%%rax), %%ymm0 \n\t"
"vmovaps 3 * 32(%%rax), %%ymm1 \n\t"
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 3
"vbroadcastss 18 * 4(%%rbx), %%ymm2 \n\t"
"vbroadcastss 19 * 4(%%rbx), %%ymm3 \n\t"
@@ -255,7 +249,7 @@ void bli_sgemm_asm_16x6
" \n\t"
".SLOOPKLEFT: \n\t" // EDGE LOOP
" \n\t"
"prefetcht0 16 * 32(%%rax) \n\t"
"prefetcht0 128 * 4(%%rax) \n\t"
" \n\t"
"vbroadcastss 0 * 4(%%rbx), %%ymm2 \n\t"
"vbroadcastss 1 * 4(%%rbx), %%ymm3 \n\t"
@@ -395,6 +389,7 @@ void bli_sgemm_asm_16x6
SGEMM_INPUT_GS_BETA_NZ
"vfmadd213ps %%ymm14, %%ymm3, %%ymm0 \n\t"
SGEMM_OUTPUT_GS_BETA_NZ
//"addq %%rdi, %%rcx \n\t" // c += cs_c;
" \n\t"
" \n\t"
"movq %%rdx, %%rcx \n\t" // rcx = c + 8*rs_c
@@ -433,6 +428,7 @@ void bli_sgemm_asm_16x6
SGEMM_INPUT_GS_BETA_NZ
"vfmadd213ps %%ymm15, %%ymm3, %%ymm0 \n\t"
SGEMM_OUTPUT_GS_BETA_NZ
//"addq %%rdi, %%rcx \n\t" // c += cs_c;
" \n\t"
" \n\t"
" \n\t"
@@ -496,11 +492,11 @@ void bli_sgemm_asm_16x6
"vmovaps (%%rcx), %%ymm0 \n\t"
"vfmadd213ps %%ymm14, %%ymm3, %%ymm0 \n\t"
"vmovaps %%ymm0, (%%rcx) \n\t"
"addq %%rdi, %%rcx \n\t"
//"addq %%rdi, %%rcx \n\t"
"vmovaps (%%rdx), %%ymm1 \n\t"
"vfmadd213ps %%ymm15, %%ymm3, %%ymm1 \n\t"
"vmovaps %%ymm1, (%%rdx) \n\t"
"addq %%rdi, %%rdx \n\t"
//"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
" \n\t"
@@ -546,6 +542,7 @@ void bli_sgemm_asm_16x6
" \n\t"
"vmovaps %%ymm14, %%ymm0 \n\t"
SGEMM_OUTPUT_GS_BETA_NZ
//"addq %%rdi, %%rcx \n\t" // c += cs_c;
" \n\t"
" \n\t"
"movq %%rdx, %%rcx \n\t" // rcx = c + 8*rs_c
@@ -578,6 +575,7 @@ void bli_sgemm_asm_16x6
" \n\t"
"vmovaps %%ymm15, %%ymm0 \n\t"
SGEMM_OUTPUT_GS_BETA_NZ
//"addq %%rdi, %%rcx \n\t" // c += cs_c;
" \n\t"
" \n\t"
" \n\t"
@@ -618,8 +616,9 @@ void bli_sgemm_asm_16x6
" \n\t"
" \n\t"
"vmovaps %%ymm14, (%%rcx) \n\t"
" \n\t"
//"addq %%rdi, %%rcx \n\t"
"vmovaps %%ymm15, (%%rdx) \n\t"
//"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
" \n\t"
@@ -737,7 +736,7 @@ void bli_dgemm_asm_8x6
" \n\t"
" \n\t"
" \n\t" // iteration 0
"prefetcht0 16 * 32(%%rax) \n\t"
"prefetcht0 64 * 8(%%rax) \n\t"
" \n\t"
"vbroadcastsd 0 * 8(%%rbx), %%ymm2 \n\t"
"vbroadcastsd 1 * 8(%%rbx), %%ymm3 \n\t"
@@ -763,8 +762,6 @@ void bli_dgemm_asm_8x6
"vmovaps -2 * 32(%%rax), %%ymm0 \n\t"
"vmovaps -1 * 32(%%rax), %%ymm1 \n\t"
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 1
"vbroadcastsd 6 * 8(%%rbx), %%ymm2 \n\t"
"vbroadcastsd 7 * 8(%%rbx), %%ymm3 \n\t"
@@ -790,10 +787,8 @@ void bli_dgemm_asm_8x6
"vmovaps 0 * 32(%%rax), %%ymm0 \n\t"
"vmovaps 1 * 32(%%rax), %%ymm1 \n\t"
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 2
"prefetcht0 20 * 32(%%rax) \n\t"
"prefetcht0 76 * 8(%%rax) \n\t"
" \n\t"
"vbroadcastsd 12 * 8(%%rbx), %%ymm2 \n\t"
"vbroadcastsd 13 * 8(%%rbx), %%ymm3 \n\t"
@@ -819,8 +814,6 @@ void bli_dgemm_asm_8x6
"vmovaps 2 * 32(%%rax), %%ymm0 \n\t"
"vmovaps 3 * 32(%%rax), %%ymm1 \n\t"
" \n\t"
" \n\t"
" \n\t"
" \n\t" // iteration 3
"vbroadcastsd 18 * 8(%%rbx), %%ymm2 \n\t"
"vbroadcastsd 19 * 8(%%rbx), %%ymm3 \n\t"
@@ -868,7 +861,7 @@ void bli_dgemm_asm_8x6
" \n\t"
".DLOOPKLEFT: \n\t" // EDGE LOOP
" \n\t"
"prefetcht0 16 * 32(%%rax) \n\t"
"prefetcht0 64 * 8(%%rax) \n\t"
" \n\t"
"vbroadcastsd 0 * 8(%%rbx), %%ymm2 \n\t"
"vbroadcastsd 1 * 8(%%rbx), %%ymm3 \n\t"
@@ -1008,6 +1001,7 @@ void bli_dgemm_asm_8x6
DGEMM_INPUT_GS_BETA_NZ
"vfmadd213pd %%ymm14, %%ymm3, %%ymm0 \n\t"
DGEMM_OUTPUT_GS_BETA_NZ
//"addq %%rdi, %%rcx \n\t" // c += cs_c;
" \n\t"
" \n\t"
"movq %%rdx, %%rcx \n\t" // rcx = c + 4*rs_c
@@ -1046,6 +1040,7 @@ void bli_dgemm_asm_8x6
DGEMM_INPUT_GS_BETA_NZ
"vfmadd213pd %%ymm15, %%ymm3, %%ymm0 \n\t"
DGEMM_OUTPUT_GS_BETA_NZ
//"addq %%rdi, %%rcx \n\t" // c += cs_c;
" \n\t"
" \n\t"
" \n\t"
@@ -1159,6 +1154,7 @@ void bli_dgemm_asm_8x6
" \n\t"
"vmovaps %%ymm14, %%ymm0 \n\t"
DGEMM_OUTPUT_GS_BETA_NZ
//"addq %%rdi, %%rcx \n\t" // c += cs_c;
" \n\t"
" \n\t"
"movq %%rdx, %%rcx \n\t" // rcx = c + 4*rs_c
@@ -1191,6 +1187,7 @@ void bli_dgemm_asm_8x6
" \n\t"
"vmovaps %%ymm15, %%ymm0 \n\t"
DGEMM_OUTPUT_GS_BETA_NZ
//"addq %%rdi, %%rcx \n\t" // c += cs_c;
" \n\t"
" \n\t"
" \n\t"
@@ -1231,8 +1228,9 @@ void bli_dgemm_asm_8x6
" \n\t"
" \n\t"
"vmovaps %%ymm14, (%%rcx) \n\t"
" \n\t"
//"addq %%rdi, %%rcx \n\t"
"vmovaps %%ymm15, (%%rdx) \n\t"
//"addq %%rdi, %%rdx \n\t"
" \n\t"
" \n\t"
" \n\t"

View File

@@ -55,15 +55,11 @@
# --- Makefile initialization --------------------------------------------------
#
# Define the name of the configuration file.
CONFIG_MK_FILE := config.mk
# Define the name of the common makefile fragment.
COMMON_MK_FILE := common.mk
# Define the name of the file containing build and architecture-specific
# makefile definitions.
MAKE_DEFS_FILE := make_defs.mk
# Locations of important files.
ROOT_PATH := ../..
# Important locations and directory names.
RELPATH := ../..
CONFIG_DIR := config
@@ -74,43 +70,26 @@ CONFIG_DIR := config
# Construct the path to the makefile configuration file that was generated by
# the configure script.
CONFIG_MK_PATH := $(ROOT_PATH)/$(CONFIG_MK_FILE)
COMMON_MK_PATH := $(RELPATH)/$(COMMON_MK_FILE)
# Include the configuration file.
-include $(CONFIG_MK_PATH)
# Include the common makefile fragment.
-include $(COMMON_MK_PATH)
# Detect whether we actually got the configuration file. If we didn't, then
# it is likely that the user has not yet generated it (via configure).
ifeq ($(strip $(CONFIG_MK_INCLUDED)),yes)
CONFIG_MK_PRESENT := yes
# Detect whether we actually got the common makefile fragment. If we didn't,
# then it is likely that the user has not yet generated it (via configure).
ifeq ($(strip $(COMMON_MK_INCLUDED)),yes)
COMMON_MK_PRESENT := yes
else
CONFIG_MK_PRESENT := no
COMMON_MK_PRESENT := no
endif
# Override the DIST_PATH value obtained from config.mk, since it is relative
# to the build directory.
DIST_PATH := ..
# Now we have access to CONFIG_NAME, which tells us which sub-directory of the
# config directory to use as our configuration.
CONFIG_PATH := $(ROOT_PATH)/$(CONFIG_DIR)/$(CONFIG_NAME)
#
# --- Include makefile definitions file ----------------------------------------
#
# Construct the path to the makefile definitions file residing inside of
# the configuration sub-directory.
MAKE_DEFS_MK_PATH := $(CONFIG_PATH)/$(MAKE_DEFS_FILE)
# Include the makefile definitions file.
-include $(MAKE_DEFS_MK_PATH)
# Detect whether we actually got the make definitions file. If we didn't, then
# it is likely that the configuration is invalid (or incomplete).
ifeq ($(strip $(MAKE_DEFS_MK_INCLUDED)),yes)
MAKE_DEFS_MK_PRESENT := yes
else
MAKE_DEFS_MK_PRESENT := no
endif
CONFIG_PATH := $(RELPATH)/$(CONFIG_DIR)/$(CONFIG_NAME)
@@ -127,7 +106,8 @@ BLIS_LIB := $(BLIS_LIB_PATH)/libblis.a
# BLAS library path(s). This is where the BLAS libraries reside.
HOME_LIB_PATH := $(HOME)/flame/lib
MKL_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64
#MKL_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64
MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64
ICC_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/intel64
ACML_LIB_PATH := $(HOME_LIB_PATH)/acml/5.3.1/gfortran64_fma4_int64/lib
ACMLP_LIB_PATH := $(HOME_LIB_PATH)/acml/5.3.1/gfortran64_fma4_mp_int64/lib
@@ -142,9 +122,10 @@ ATLAS_LIB := $(HOME_LIB_PATH)/libf77blas.a \
# MKL
MKL_LIB := -L$(MKL_LIB_PATH) \
-lmkl_sequential \
-lmkl_intel_lp64 \
-lmkl_core \
-lmkl_intel_ilp64
-lmkl_sequential \
-lpthread -lm -ldl
#MKLP_LIB := -L$(MKL_LIB_PATH) \
# -lmkl_intel_thread \
# -lmkl_core \
@@ -152,11 +133,12 @@ MKL_LIB := -L$(MKL_LIB_PATH) \
# -L$(ICC_LIB_PATH) \
# -liomp5
MKLP_LIB := -L$(MKL_LIB_PATH) \
-lmkl_gnu_thread \
-lmkl_intel_lp64 \
-lmkl_core \
-lmkl_intel_ilp64 \
-L$(ICC_LIB_PATH) \
-lgomp
-lmkl_gnu_thread \
-lpthread -lm -ldl
#-L$(ICC_LIB_PATH) \
#-lgomp
# ACML
ACML_LIB := -L$(ACML_LIB_PATH) \
@@ -227,9 +209,9 @@ STR_ST := -DTHR_STR=\"st\"
STR_MT := -DTHR_STR=\"mt\"
# Problem size specification
PDEF_ST := -DP_BEGIN=40 \
PDEF_ST := -DP_BEGIN=80 \
-DP_END=2000 \
-DP_INC=40
-DP_INC=80
PDEF_MT := -DP_BEGIN=80 \
-DP_END=4000 \

View File

@@ -54,15 +54,11 @@
# --- Makefile initialization --------------------------------------------------
#
# Define the name of the configuration file.
CONFIG_MK_FILE := config.mk
# Define the name of the common makefile fragment.
COMMON_MK_FILE := common.mk
# Define the name of the file containing build and architecture-specific
# makefile definitions.
MAKE_DEFS_FILE := make_defs.mk
# Locations of important files.
ROOT_PATH := ..
# Important locations and directory names.
RELPATH := ..
CONFIG_DIR := config
@@ -73,43 +69,26 @@ CONFIG_DIR := config
# Construct the path to the makefile configuration file that was generated by
# the configure script.
CONFIG_MK_PATH := $(ROOT_PATH)/$(CONFIG_MK_FILE)
COMMON_MK_PATH := $(RELPATH)/$(COMMON_MK_FILE)
# Include the configuration file.
-include $(CONFIG_MK_PATH)
# Include the common makefile fragment.
-include $(COMMON_MK_PATH)
# Detect whether we actually got the configuration file. If we didn't, then
# it is likely that the user has not yet generated it (via configure).
ifeq ($(strip $(CONFIG_MK_INCLUDED)),yes)
CONFIG_MK_PRESENT := yes
# Detect whether we actually got the common makefile fragment. If we didn't,
# then it is likely that the user has not yet generated it (via configure).
ifeq ($(strip $(COMMON_MK_INCLUDED)),yes)
COMMON_MK_PRESENT := yes
else
CONFIG_MK_PRESENT := no
COMMON_MK_PRESENT := no
endif
# Override the DIST_PATH value obtained from config.mk, since it is relative
# to the build directory.
DIST_PATH := ..
# Now we have access to CONFIG_NAME, which tells us which sub-directory of the
# config directory to use as our configuration.
CONFIG_PATH := $(ROOT_PATH)/$(CONFIG_DIR)/$(CONFIG_NAME)
#
# --- Include makefile definitions file ----------------------------------------
#
# Construct the path to the makefile definitions file residing inside of
# the configuration sub-directory.
MAKE_DEFS_MK_PATH := $(CONFIG_PATH)/$(MAKE_DEFS_FILE)
# Include the makefile definitions file.
-include $(MAKE_DEFS_MK_PATH)
# Detect whether we actually got the make definitions file. If we didn't, then
# it is likely that the configuration is invalid (or incomplete).
ifeq ($(strip $(MAKE_DEFS_MK_INCLUDED)),yes)
MAKE_DEFS_MK_PRESENT := yes
else
MAKE_DEFS_MK_PRESENT := no
endif
CONFIG_PATH := $(RELPATH)/$(CONFIG_DIR)/$(CONFIG_NAME)
@@ -126,7 +105,8 @@ BLIS_LIB := $(BLIS_LIB_PATH)/libblis.a
# BLAS library path(s). This is where the BLAS libraries reside.
BLAS_LIB_PATH := $(HOME)/flame/lib
MKL_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64
#MKL_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64
MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64
ESSL_LIB_PATH := $(HOME)/path/to/essl/changeme
# OpenBLAS
@@ -138,9 +118,10 @@ ATLAS_LIB := $(BLAS_LIB_PATH)/libf77blas.a \
# MKL
MKL_LIB := -L$(MKL_LIB_PATH) \
-lmkl_sequential \
-lmkl_intel_lp64 \
-lmkl_core \
-lmkl_intel_lp64
-lmkl_sequential \
-lpthread -lm -ldl
# ESSL
# Note: ESSL is named differently for SMP and/or BG
@@ -188,7 +169,7 @@ LDFLAGS += -lgfortran -lm -lpthread -fopenmp
# blis openblas atlas mkl mac essl
#
#all: blis openblas atlas mkl
all: blis openblas
all: blis openblas mkl
blis: test_gemv_blis.x \
test_ger_blis.x \

View File

@@ -46,7 +46,7 @@
.PHONY: all bin clean \
check-env check-env-mk check-env-fragments check-env-make-defs \
run run-amd64 run-x86 run-arm
run run-amd64 run-x86 run-arm
@@ -54,15 +54,15 @@
# --- Makefile initialization --------------------------------------------------
#
RELPATH := ..
# Define the name of the common makefile.
# Define the name of the common makefile fragment.
COMMON_MK_FILE := common.mk
# All makefile fragments in the tree will have this name.
FRAGMENT_MK := .fragment.mk
# Locations of important files.
# Important locations and directory names.
RELPATH := ..
CONFIG_DIR := config
FRAME_DIR := frame
LIB_DIR := lib
@@ -77,11 +77,11 @@ LIB_DIR := lib
# the configure script.
COMMON_MK_PATH := ../$(COMMON_MK_FILE)
# Include the configuration file.
# Include the common makefile fragment.
-include $(COMMON_MK_PATH)
# Detect whether we actually got the configuration file. If we didn't, then
# it is likely that the user has not yet generated it (via configure).
# Detect whether we actually got the common makefile fragment. If we didn't,
# then it is likely that the user has not yet generated it (via configure).
ifeq ($(strip $(COMMON_MK_INCLUDED)),yes)
COMMON_MK_PRESENT := yes
else