Overhauled test/3m4m Makefile and scripts.

Details:
- Rewrote much of Makefile to generate executables for single- and dual-
  socket multithreading as well as single-threaded. Each of the three
  can also use a different problem size range/increment, as is often
  appropriate when doubling/halving the number of threads.
- Rewrote runme.sh script to flexibly execute as many threading
  parameter scenarios as are given in the input parameter string
  (currently set within the script itself). The string also encodes
  the maximum problem size for each threading scenario, which is used
  to identify the executable to run. Also improved the "progress" output
  of the script to reduce redundant info and improve readability in
  terminals that are not especially wide.
- Minor updates to test_*.c source files.
- Updated matlab scripts according to changes made to the Makefile,
  test drivers, and runme.sh script, and renamed 'plot_all.m' to
  'runme.m'.
This commit is contained in:
Field G. Van Zee
2019-03-05 17:47:55 -06:00
parent 3bdab823fa
commit 9f1dbe572b
11 changed files with 727 additions and 1014 deletions

View File

@@ -46,8 +46,6 @@
#
.PHONY: all \
blis-gemm-st openblas-gemm-st mkl-gemm-st acml-gemm-st \
blis-gemm-mt openblas-gemm-mt mkl-gemm-mt acml-gemm-mt \
clean cleanx
@@ -94,21 +92,19 @@ endif
#BLIS_LIB := $(BLIS_LIB_PATH)/libblis.a
# BLAS library path(s). This is where the BLAS libraries reside.
HOME_LIB_PATH := $(HOME)/flame/lib
#MKL_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64
MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64
#MKL_LIB_PATH := ${MKLROOT}/lib/intel64
HOME_LIB_PATH := $(HOME)/flame/lib
#VENDOR_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64
MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64
#VENDOR_LIB_PATH := ${MKLROOT}/lib/intel64
#ICC_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/intel64
ACML_LIB_PATH := $(HOME_LIB_PATH)/acml/5.3.1/gfortran64_fma4_int64/lib
ACMLP_LIB_PATH := $(HOME_LIB_PATH)/acml/5.3.1/gfortran64_fma4_mp_int64/lib
# OpenBLAS
OPENBLAS_LIB := $(HOME_LIB_PATH)/libopenblas.a
OPENBLASP_LIB := $(HOME_LIB_PATH)/libopenblasp.a
# ATLAS
ATLAS_LIB := $(HOME_LIB_PATH)/libf77blas.a \
$(HOME_LIB_PATH)/libatlas.a
#ATLAS_LIB := $(HOME_LIB_PATH)/libf77blas.a \
# $(HOME_LIB_PATH)/libatlas.a
# MKL
MKL_LIB := -L$(MKL_LIB_PATH) \
@@ -130,13 +126,29 @@ MKLP_LIB := -L$(MKL_LIB_PATH) \
#-L$(ICC_LIB_PATH) \
#-lgomp
# ACML
ACML_LIB := -L$(ACML_LIB_PATH) \
-lgfortran -lm -lrt -ldl -lacml
ACMLP_LIB := -L$(ACMLP_LIB_PATH) \
-lgfortran -lm -lrt -ldl -lacml_mp
VENDOR_LIB := $(MKL_LIB)
VENDORP_LIB := $(MKLP_LIB)
#
# --- Problem size definitions -------------------------------------------------
#
# Single core (single-threaded)
PS_BEGIN := 40
PS_MAX := 2000
PS_INC := 40
# Single-socket (multithreaded)
P1_BEGIN := 120
P1_MAX := 6000
P1_INC := 120
# Dual-socket (multithreaded)
P2_BEGIN := 160
P2_MAX := 8000
P2_INC := 160
#
# --- General build definitions ------------------------------------------------
@@ -165,12 +177,6 @@ CFLAGS += -I$(TEST_SRC_PATH)
LIBBLIS_LINK := $(LIB_PATH)/$(LIBBLIS_L)
# Datatype
DT_S := -DDT=BLIS_FLOAT
DT_D := -DDT=BLIS_DOUBLE
DT_C := -DDT=BLIS_SCOMPLEX
DT_Z := -DDT=BLIS_DCOMPLEX
# Which library?
BLI_DEF := -DBLIS
BLA_DEF := -DBLAS
@@ -185,29 +191,25 @@ D1M := -DIND=BLIS_1M
DNAT := -DIND=BLIS_NAT
# Implementation string
STR_3MHW := -DSTR=\"3mhw\"
STR_3M1 := -DSTR=\"3m1\"
STR_4MHW := -DSTR=\"4mhw\"
STR_4M1B := -DSTR=\"4m1b\"
STR_4M1A := -DSTR=\"4m1a\"
STR_1M := -DSTR=\"1m\"
STR_NAT := -DSTR=\"asm\"
#STR_3MHW := -DSTR=\"3mhw\"
#STR_3M1 := -DSTR=\"3m1\"
#STR_4MHW := -DSTR=\"4mhw\"
#STR_4M1B := -DSTR=\"4m1b\"
#STR_4M1A := -DSTR=\"4m1a\"
#STR_1M := -DSTR=\"1m\"
STR_NAT := -DSTR=\"asm_blis\"
STR_OBL := -DSTR=\"openblas\"
STR_MKL := -DSTR=\"mkl\"
STR_ACML := -DSTR=\"acml\"
STR_VEN := -DSTR=\"vendor\"
# Single or multithreaded string
STR_ST := -DTHR_STR=\"st\"
STR_MT := -DTHR_STR=\"mt\"
STR_1S := -DTHR_STR=\"1s\"
STR_2S := -DTHR_STR=\"2s\"
# Problem size specification
PDEF_ST := -DP_BEGIN=56 \
-DP_END=2800 \
-DP_INC=56
PDEF_MT := -DP_BEGIN=160 \
-DP_END=8000 \
-DP_INC=160
PDEF_ST := -DP_BEGIN=$(PS_BEGIN) -DP_INC=$(PS_INC) -DP_MAX=$(PS_MAX)
PDEF_1S := -DP_BEGIN=$(P1_BEGIN) -DP_INC=$(P1_INC) -DP_MAX=$(P1_MAX)
PDEF_2S := -DP_BEGIN=$(P2_BEGIN) -DP_INC=$(P2_INC) -DP_MAX=$(P2_MAX)
@@ -215,340 +217,129 @@ PDEF_MT := -DP_BEGIN=160 \
# --- Targets/rules ------------------------------------------------------------
#
all: all-st all-mt
blis: blis-st blis-mt
openblas: openblas-st openblas-mt
mkl: mkl-st mkl-mt
all: all-st all-1s all-2s
blis: blis-st blis-1s blis-2s
openblas: openblas-st openblas-1s openblas-2s
vendor: vendor-st vendor-1s vendor-2s
mkl: vendor
armpl: vendor
all-st: blis-st openblas-st mkl-st
all-mt: blis-mt openblas-mt mkl-mt
all-1s: blis-1s openblas-1s mkl-1s
all-2s: blis-2s openblas-2s mkl-2s
blis-st: blis-nat-st
blis-mt: blis-nat-mt
blis-1s: blis-nat-1s
blis-2s: blis-nat-2s
blis-ind: blis-ind-st blis-ind-mt
blis-nat: blis-nat-st blis-nat-mt
#blis-ind: blis-ind-st blis-ind-mt
blis-nat: blis-nat-st blis-nat-1s blis-nat-2s
blis-ind-st: \
test_cgemm_3mhw_blis_st.x \
test_zgemm_3mhw_blis_st.x \
test_cgemm_3m1_blis_st.x \
test_zgemm_3m1_blis_st.x \
test_cgemm_4mhw_blis_st.x \
test_zgemm_4mhw_blis_st.x \
test_cgemm_4m1b_blis_st.x \
test_zgemm_4m1b_blis_st.x \
test_cgemm_4m1a_blis_st.x \
test_zgemm_4m1a_blis_st.x \
test_cgemm_1m_blis_st.x \
test_zgemm_1m_blis_st.x
# Define the datatypes, operations, and implementations.
DTS := s d c z
OPS := gemm hemm herk trmm trsm
IMPLS := asm_blis openblas vendor
blis-ind-mt: \
test_cgemm_3mhw_blis_mt.x \
test_zgemm_3mhw_blis_mt.x \
test_cgemm_3m1_blis_mt.x \
test_zgemm_3m1_blis_mt.x \
test_cgemm_4mhw_blis_mt.x \
test_zgemm_4mhw_blis_mt.x \
test_cgemm_4m1b_blis_mt.x \
test_zgemm_4m1b_blis_mt.x \
test_cgemm_4m1a_blis_mt.x \
test_zgemm_4m1a_blis_mt.x \
test_cgemm_1m_blis_mt.x \
test_zgemm_1m_blis_mt.x
# Define functions to construct object filenames from the datatypes and
# operations given an implementation. We define one function each for
# single-threaded (st), single-socket (1s), and dual-socket (2s) filenames.
# Each function takes the implementation name as $(1) and expands to one
# filename per datatype in DTS and operation in OPS, of the form
#   test_<dt><op>_<max problem size>_<impl>_<st|1s|2s>.o
# where the maximum problem size is PS_MAX, P1_MAX, or P2_MAX, respectively.
# These are recursively expanded (=) so that $(1) is bound at $(call) time.
get-st-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(PS_MAX)_$(1)_st.o))
get-1s-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(P1_MAX)_$(1)_1s.o))
get-2s-objs = $(foreach dt,$(DTS),$(foreach op,$(OPS),test_$(dt)$(op)_$(P2_MAX)_$(1)_2s.o))
blis-nat-st: \
test_sgemm_asm_blis_st.x \
test_dgemm_asm_blis_st.x \
test_cgemm_asm_blis_st.x \
test_zgemm_asm_blis_st.x \
test_shemm_asm_blis_st.x \
test_dhemm_asm_blis_st.x \
test_chemm_asm_blis_st.x \
test_zhemm_asm_blis_st.x \
test_sherk_asm_blis_st.x \
test_dherk_asm_blis_st.x \
test_cherk_asm_blis_st.x \
test_zherk_asm_blis_st.x \
test_strmm_asm_blis_st.x \
test_dtrmm_asm_blis_st.x \
test_ctrmm_asm_blis_st.x \
test_ztrmm_asm_blis_st.x \
test_strsm_asm_blis_st.x \
test_dtrsm_asm_blis_st.x \
test_ctrsm_asm_blis_st.x \
test_ztrsm_asm_blis_st.x
# Construct object and binary names for single-threaded, single-socket, and
# dual-socket files for BLIS, OpenBLAS, and a vendor library (e.g. MKL).
BLIS_NAT_ST_OBJS := $(call get-st-objs,asm_blis)
BLIS_NAT_ST_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_ST_OBJS))
BLIS_NAT_1S_OBJS := $(call get-1s-objs,asm_blis)
BLIS_NAT_1S_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_1S_OBJS))
BLIS_NAT_2S_OBJS := $(call get-2s-objs,asm_blis)
BLIS_NAT_2S_BINS := $(patsubst %.o,%.x,$(BLIS_NAT_2S_OBJS))
blis-nat-mt: \
test_sgemm_asm_blis_mt.x \
test_dgemm_asm_blis_mt.x \
test_cgemm_asm_blis_mt.x \
test_zgemm_asm_blis_mt.x \
test_shemm_asm_blis_mt.x \
test_dhemm_asm_blis_mt.x \
test_chemm_asm_blis_mt.x \
test_zhemm_asm_blis_mt.x \
test_sherk_asm_blis_mt.x \
test_dherk_asm_blis_mt.x \
test_cherk_asm_blis_mt.x \
test_zherk_asm_blis_mt.x \
test_strmm_asm_blis_mt.x \
test_dtrmm_asm_blis_mt.x \
test_ctrmm_asm_blis_mt.x \
test_ztrmm_asm_blis_mt.x \
test_strsm_asm_blis_mt.x \
test_dtrsm_asm_blis_mt.x \
test_ctrsm_asm_blis_mt.x \
test_ztrsm_asm_blis_mt.x
OPENBLAS_ST_OBJS := $(call get-st-objs,openblas)
OPENBLAS_ST_BINS := $(patsubst %.o,%.x,$(OPENBLAS_ST_OBJS))
OPENBLAS_1S_OBJS := $(call get-1s-objs,openblas)
OPENBLAS_1S_BINS := $(patsubst %.o,%.x,$(OPENBLAS_1S_OBJS))
OPENBLAS_2S_OBJS := $(call get-2s-objs,openblas)
OPENBLAS_2S_BINS := $(patsubst %.o,%.x,$(OPENBLAS_2S_OBJS))
openblas-st: \
test_sgemm_openblas_st.x \
test_dgemm_openblas_st.x \
test_cgemm_openblas_st.x \
test_zgemm_openblas_st.x \
test_shemm_openblas_st.x \
test_dhemm_openblas_st.x \
test_chemm_openblas_st.x \
test_zhemm_openblas_st.x \
test_sherk_openblas_st.x \
test_dherk_openblas_st.x \
test_cherk_openblas_st.x \
test_zherk_openblas_st.x \
test_strmm_openblas_st.x \
test_dtrmm_openblas_st.x \
test_ctrmm_openblas_st.x \
test_ztrmm_openblas_st.x \
test_strsm_openblas_st.x \
test_dtrsm_openblas_st.x \
test_ctrsm_openblas_st.x \
test_ztrsm_openblas_st.x
VENDOR_ST_OBJS := $(call get-st-objs,vendor)
VENDOR_ST_BINS := $(patsubst %.o,%.x,$(VENDOR_ST_OBJS))
VENDOR_1S_OBJS := $(call get-1s-objs,vendor)
VENDOR_1S_BINS := $(patsubst %.o,%.x,$(VENDOR_1S_OBJS))
VENDOR_2S_OBJS := $(call get-2s-objs,vendor)
VENDOR_2S_BINS := $(patsubst %.o,%.x,$(VENDOR_2S_OBJS))
openblas-mt: \
test_sgemm_openblas_mt.x \
test_dgemm_openblas_mt.x \
test_cgemm_openblas_mt.x \
test_zgemm_openblas_mt.x \
test_shemm_openblas_mt.x \
test_dhemm_openblas_mt.x \
test_chemm_openblas_mt.x \
test_zhemm_openblas_mt.x \
test_sherk_openblas_mt.x \
test_dherk_openblas_mt.x \
test_cherk_openblas_mt.x \
test_zherk_openblas_mt.x \
test_strmm_openblas_mt.x \
test_dtrmm_openblas_mt.x \
test_ctrmm_openblas_mt.x \
test_ztrmm_openblas_mt.x \
test_strsm_openblas_mt.x \
test_dtrsm_openblas_mt.x \
test_ctrsm_openblas_mt.x \
test_ztrsm_openblas_mt.x
# Define some targets associated with the above object/binary files.
blis-nat-st: $(BLIS_NAT_ST_BINS)
blis-nat-1s: $(BLIS_NAT_1S_BINS)
blis-nat-2s: $(BLIS_NAT_2S_BINS)
mkl-st: \
test_sgemm_mkl_st.x \
test_dgemm_mkl_st.x \
test_cgemm_mkl_st.x \
test_zgemm_mkl_st.x \
test_shemm_mkl_st.x \
test_dhemm_mkl_st.x \
test_chemm_mkl_st.x \
test_zhemm_mkl_st.x \
test_sherk_mkl_st.x \
test_dherk_mkl_st.x \
test_cherk_mkl_st.x \
test_zherk_mkl_st.x \
test_strmm_mkl_st.x \
test_dtrmm_mkl_st.x \
test_ctrmm_mkl_st.x \
test_ztrmm_mkl_st.x \
test_strsm_mkl_st.x \
test_dtrsm_mkl_st.x \
test_ctrsm_mkl_st.x \
test_ztrsm_mkl_st.x
openblas-st: $(OPENBLAS_ST_BINS)
openblas-1s: $(OPENBLAS_1S_BINS)
openblas-2s: $(OPENBLAS_2S_BINS)
mkl-mt: \
test_sgemm_mkl_mt.x \
test_dgemm_mkl_mt.x \
test_cgemm_mkl_mt.x \
test_zgemm_mkl_mt.x \
test_shemm_mkl_mt.x \
test_dhemm_mkl_mt.x \
test_chemm_mkl_mt.x \
test_zhemm_mkl_mt.x \
test_sherk_mkl_mt.x \
test_dherk_mkl_mt.x \
test_cherk_mkl_mt.x \
test_zherk_mkl_mt.x \
test_strmm_mkl_mt.x \
test_dtrmm_mkl_mt.x \
test_ctrmm_mkl_mt.x \
test_ztrmm_mkl_mt.x \
test_strsm_mkl_mt.x \
test_dtrsm_mkl_mt.x \
test_ctrsm_mkl_mt.x \
test_ztrsm_mkl_mt.x
vendor-st: $(VENDOR_ST_BINS)
vendor-1s: $(VENDOR_1S_BINS)
vendor-2s: $(VENDOR_2S_BINS)
mkl-st: vendor-st
mkl-1s: vendor-1s
mkl-2s: vendor-2s
armpl-st: vendor-st
armpl-1s: vendor-1s
armpl-2s: vendor-2s
# Mark the object files as intermediate so that make will remove them
# automatically after building the binaries on which they depend.
.INTERMEDIATE: $(BLIS_NAT_ST_OBJS) $(OPENBLAS_ST_OBJS) $(VENDOR_ST_OBJS)
.INTERMEDIATE: $(BLIS_NAT_1S_OBJS) $(OPENBLAS_1S_OBJS) $(VENDOR_1S_OBJS)
.INTERMEDIATE: $(BLIS_NAT_2S_OBJS) $(OPENBLAS_2S_OBJS) $(VENDOR_2S_OBJS)
# --Object file rules --
$(TEST_OBJ_PATH)/%.o: $(TEST_SRC_PATH)/%.c
$(CC) $(CFLAGS) -c $< -o $@
#$(TEST_OBJ_PATH)/%.o: $(TEST_SRC_PATH)/%.c
# $(CC) $(CFLAGS) -c $< -o $@
# blis 3mhw
test_z%_3mhw_blis_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_Z) $(BLI_DEF) $(D3MHW) $(STR_3MHW) $(STR_ST) -c $< -o $@
# A function to return the datatype cpp macro def from the datatype
# character. Given a datatype character as $(1) (e.g. 'd'), it expands to
# -DDT=bli_<char>type (e.g. -DDT=bli_dtype).
# NOTE(review): presumably bli_[sdcz]type are names that resolve to a BLIS
# datatype value when the test drivers are compiled -- confirm against the
# BLIS headers.
get-dt-cpp = -DDT=bli_$(1)type
test_c%_3mhw_blis_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLI_DEF) $(D3MHW) $(STR_3MHW) $(STR_ST) -c $< -o $@
# A function to return other cpp macros that help the test driver
# identify the implementation. Given an implementation name as $(1):
#   - a name containing "blis" yields $(STR_NAT) $(BLI_DEF);
#   - otherwise, a name containing "openblas" yields $(STR_OBL) $(BLA_DEF);
#   - anything else is treated as the vendor library and yields
#     $(STR_VEN) $(BLA_DEF).
# The matching is by substring ($(findstring)), so "asm_blis" selects the
# first case. The result is passed through $(strip) to remove the extra
# whitespace introduced by the line continuations.
get-bl-cpp = $(strip \
$(if $(findstring blis,$(1)),$(STR_NAT) $(BLI_DEF),\
$(if $(findstring openblas,$(1)),$(STR_OBL) $(BLA_DEF),\
$(STR_VEN) $(BLA_DEF))))
test_z%_3mhw_blis_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLI_DEF) $(D3MHW) $(STR_3MHW) $(STR_MT) -c $< -o $@
# Template for the rule that compiles one single-threaded (st) test driver
# object file. Instantiate it with $(eval $(call make-st-rule,dt,op,impl)):
#   $(1): datatype character (e.g. one of $(DTS))
#   $(2): operation name (e.g. one of $(OPS)); selects the source test_$(2).c
#   $(3): implementation name (e.g. one of $(IMPLS))
# The source prerequisite is derived from $(2) rather than from a loop
# variable of the caller, so the template is correct no matter what the
# caller names its iteration variables. The automatic variables are written
# as $$< and $$@ so that they survive the $(call) expansion and are only
# evaluated when the recipe runs. The object also depends on Makefile so
# that changes to the problem sizes or flags trigger a recompile.
define make-st-rule
test_$(1)$(2)_$(PS_MAX)_$(3)_st.o: test_$(2).c Makefile
	$(CC) $(CFLAGS) $(PDEF_ST) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3)) $(DNAT) $(STR_ST) -c $$< -o $$@
endef
test_c%_3mhw_blis_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLI_DEF) $(D3MHW) $(STR_3MHW) $(STR_MT) -c $< -o $@
# Template for the rule that compiles one single-socket (1s) multithreaded
# test driver object file. Instantiate it with
# $(eval $(call make-1s-rule,dt,op,impl)):
#   $(1): datatype character (e.g. one of $(DTS))
#   $(2): operation name (e.g. one of $(OPS)); selects the source test_$(2).c
#   $(3): implementation name (e.g. one of $(IMPLS))
# The source prerequisite is derived from $(2) rather than from a loop
# variable of the caller, so the template is correct no matter what the
# caller names its iteration variables. The automatic variables are written
# as $$< and $$@ so that they survive the $(call) expansion and are only
# evaluated when the recipe runs.
define make-1s-rule
test_$(1)$(2)_$(P1_MAX)_$(3)_1s.o: test_$(2).c Makefile
	$(CC) $(CFLAGS) $(PDEF_1S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3)) $(DNAT) $(STR_1S) -c $$< -o $$@
endef
# blis 3m1
test_z%_3m1_blis_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_Z) $(BLI_DEF) $(D3M1) $(STR_3M1) $(STR_ST) -c $< -o $@
# Template for the rule that compiles one dual-socket (2s) multithreaded
# test driver object file. Instantiate it with
# $(eval $(call make-2s-rule,dt,op,impl)):
#   $(1): datatype character (e.g. one of $(DTS))
#   $(2): operation name (e.g. one of $(OPS)); selects the source test_$(2).c
#   $(3): implementation name (e.g. one of $(IMPLS))
# The source prerequisite is derived from $(2) rather than from a loop
# variable of the caller, so the template is correct no matter what the
# caller names its iteration variables. The automatic variables are written
# as $$< and $$@ so that they survive the $(call) expansion and are only
# evaluated when the recipe runs.
define make-2s-rule
test_$(1)$(2)_$(P2_MAX)_$(3)_2s.o: test_$(2).c Makefile
	$(CC) $(CFLAGS) $(PDEF_2S) $(call get-dt-cpp,$(1)) $(call get-bl-cpp,$(3)) $(DNAT) $(STR_2S) -c $$< -o $$@
endef
test_c%_3m1_blis_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLI_DEF) $(D3M1) $(STR_3M1) $(STR_ST) -c $< -o $@
# Instantiate make-st-rule once for every combination of datatype (DTS),
# operation (OPS), and implementation (IMPLS). Each $(call) produces the text
# of one explicit object-file rule, and $(eval) parses that text as makefile
# syntax so the rule becomes part of this Makefile.
$(foreach dt,$(DTS), \
$(foreach op,$(OPS), \
$(foreach im,$(IMPLS),$(eval $(call make-st-rule,$(dt),$(op),$(im))))))
test_z%_3m1_blis_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLI_DEF) $(D3M1) $(STR_3M1) $(STR_MT) -c $< -o $@
# Instantiate make-1s-rule once for every combination of datatype (DTS),
# operation (OPS), and implementation (IMPLS). Each $(call) produces the text
# of one explicit object-file rule, and $(eval) parses that text as makefile
# syntax so the rule becomes part of this Makefile.
$(foreach dt,$(DTS), \
$(foreach op,$(OPS), \
$(foreach im,$(IMPLS),$(eval $(call make-1s-rule,$(dt),$(op),$(im))))))
test_c%_3m1_blis_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLI_DEF) $(D3M1) $(STR_3M1) $(STR_MT) -c $< -o $@
# blis 4mhw
test_z%_4mhw_blis_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_Z) $(BLI_DEF) $(D4MHW) $(STR_4MHW) $(STR_ST) -c $< -o $@
test_c%_4mhw_blis_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLI_DEF) $(D4MHW) $(STR_4MHW) $(STR_ST) -c $< -o $@
test_z%_4mhw_blis_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLI_DEF) $(D4MHW) $(STR_4MHW) $(STR_MT) -c $< -o $@
test_c%_4mhw_blis_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLI_DEF) $(D4MHW) $(STR_4MHW) $(STR_MT) -c $< -o $@
# blis 4m1b
test_z%_4m1b_blis_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_Z) $(BLI_DEF) $(D4M1B) $(STR_4M1B) $(STR_ST) -c $< -o $@
test_c%_4m1b_blis_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLI_DEF) $(D4M1B) $(STR_4M1B) $(STR_ST) -c $< -o $@
test_z%_4m1b_blis_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLI_DEF) $(D4M1B) $(STR_4M1B) $(STR_MT) -c $< -o $@
test_c%_4m1b_blis_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLI_DEF) $(D4M1B) $(STR_4M1B) $(STR_MT) -c $< -o $@
# blis 4m1a
test_z%_4m1a_blis_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_Z) $(BLI_DEF) $(D4M1A) $(STR_4M1A) $(STR_ST) -c $< -o $@
test_c%_4m1a_blis_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLI_DEF) $(D4M1A) $(STR_4M1A) $(STR_ST) -c $< -o $@
test_z%_4m1a_blis_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLI_DEF) $(D4M1A) $(STR_4M1A) $(STR_MT) -c $< -o $@
test_c%_4m1a_blis_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLI_DEF) $(D4M1A) $(STR_4M1A) $(STR_MT) -c $< -o $@
# blis 1m
test_z%_1m_blis_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_Z) $(BLI_DEF) $(D1M) $(STR_1M) $(STR_ST) -c $< -o $@
test_c%_1m_blis_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLI_DEF) $(D1M) $(STR_1M) $(STR_ST) -c $< -o $@
test_z%_1m_blis_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLI_DEF) $(D1M) $(STR_1M) $(STR_MT) -c $< -o $@
test_c%_1m_blis_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLI_DEF) $(D1M) $(STR_1M) $(STR_MT) -c $< -o $@
# blis asm
test_d%_asm_blis_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_D) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@
test_s%_asm_blis_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_S) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@
test_z%_asm_blis_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_Z) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@
test_c%_asm_blis_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_ST) -c $< -o $@
test_d%_asm_blis_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@
test_s%_asm_blis_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_S) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@
test_z%_asm_blis_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@
test_c%_asm_blis_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLI_DEF) $(DNAT) $(STR_NAT) $(STR_MT) -c $< -o $@
# openblas
test_d%_openblas_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_D) $(BLA_DEF) $(DNAT) $(STR_OBL) $(STR_ST) -c $< -o $@
test_s%_openblas_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_S) $(BLA_DEF) $(DNAT) $(STR_OBL) $(STR_ST) -c $< -o $@
test_z%_openblas_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_Z) $(BLA_DEF) $(DNAT) $(STR_OBL) $(STR_ST) -c $< -o $@
test_c%_openblas_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLA_DEF) $(DNAT) $(STR_OBL) $(STR_ST) -c $< -o $@
test_d%_openblas_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) $(BLA_DEF) $(DNAT) $(STR_OBL) $(STR_MT) -c $< -o $@
test_s%_openblas_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_S) $(BLA_DEF) $(DNAT) $(STR_OBL) $(STR_MT) -c $< -o $@
test_z%_openblas_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLA_DEF) $(DNAT) $(STR_OBL) $(STR_MT) -c $< -o $@
test_c%_openblas_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLA_DEF) $(DNAT) $(STR_OBL) $(STR_MT) -c $< -o $@
# mkl
test_d%_mkl_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_D) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_ST) -c $< -o $@
test_s%_mkl_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_S) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_ST) -c $< -o $@
test_z%_mkl_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_Z) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_ST) -c $< -o $@
test_c%_mkl_st.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_ST) -c $< -o $@
test_d%_mkl_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_MT) -c $< -o $@
test_s%_mkl_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_S) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_MT) -c $< -o $@
test_z%_mkl_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_MT) -c $< -o $@
test_c%_mkl_mt.o: test_%.c Makefile
$(CC) $(CFLAGS) $(PDEF_MT) $(DT_C) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_MT) -c $< -o $@
# Instantiate make-2s-rule once for every combination of datatype (DTS),
# operation (OPS), and implementation (IMPLS). Each $(call) produces the text
# of one explicit object-file rule, and $(eval) parses that text as makefile
# syntax so the rule becomes part of this Makefile.
$(foreach dt,$(DTS), \
$(foreach op,$(OPS), \
$(foreach im,$(IMPLS),$(eval $(call make-2s-rule,$(dt),$(op),$(im))))))
# -- Executable file rules --
@@ -558,23 +349,34 @@ test_c%_mkl_mt.o: test_%.c Makefile
# compatibility layer. This prevents BLIS from inadvertently getting called
# for the BLAS routines we are trying to test with.
test_%_openblas_st.x: test_%_openblas_st.o $(LIBBLIS_LINK)
$(LINKER) $< $(OPENBLAS_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
test_%_$(PS_MAX)_openblas_st.x: test_%_$(PS_MAX)_openblas_st.o $(LIBBLIS_LINK)
$(LINKER) $(strip $< $(OPENBLAS_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_openblas_mt.x: test_%_openblas_mt.o $(LIBBLIS_LINK)
$(LINKER) $< $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
test_%_$(P1_MAX)_openblas_1s.x: test_%_$(P1_MAX)_openblas_1s.o $(LIBBLIS_LINK)
$(LINKER) $(strip $< $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_mkl_st.x: test_%_mkl_st.o $(LIBBLIS_LINK)
$(LINKER) $< $(MKL_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
test_%_$(P2_MAX)_openblas_2s.x: test_%_$(P2_MAX)_openblas_2s.o $(LIBBLIS_LINK)
$(LINKER) $(strip $< $(OPENBLASP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_mkl_mt.x: test_%_mkl_mt.o $(LIBBLIS_LINK)
$(LINKER) $< $(MKLP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@
test_%_blis_st.x: test_%_blis_st.o $(LIBBLIS_LINK)
$(LINKER) $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@
test_%_$(PS_MAX)_vendor_st.x: test_%_$(PS_MAX)_vendor_st.o $(LIBBLIS_LINK)
$(LINKER) $(strip $< $(VENDOR_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_blis_mt.x: test_%_blis_mt.o $(LIBBLIS_LINK)
$(LINKER) $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@
# Link the multithreaded vendor-library test drivers. Both the single-socket
# (1s) and dual-socket (2s) binaries link against $(VENDORP_LIB); they differ
# only in the maximum problem size (P1_MAX vs. P2_MAX) embedded in the file
# name. $(LIBBLIS_LINK) is both a prerequisite and a link input, listed after
# the vendor library on the command line. $(strip) collapses the whitespace
# in the assembled command line.
test_%_$(P1_MAX)_vendor_1s.x: test_%_$(P1_MAX)_vendor_1s.o $(LIBBLIS_LINK)
$(LINKER) $(strip $< $(VENDORP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_$(P2_MAX)_vendor_2s.x: test_%_$(P2_MAX)_vendor_2s.o $(LIBBLIS_LINK)
$(LINKER) $(strip $< $(VENDORP_LIB) $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
# Link the BLIS (asm_blis) test drivers for the single-threaded (st),
# single-socket (1s), and dual-socket (2s) cases. These link only against
# $(LIBBLIS_LINK) plus $(LDFLAGS); no other BLAS library is named. The three
# rules differ only in the maximum problem size and threading suffix embedded
# in the file name. $(strip) collapses the whitespace in the assembled
# command line.
test_%_$(PS_MAX)_asm_blis_st.x: test_%_$(PS_MAX)_asm_blis_st.o $(LIBBLIS_LINK)
$(LINKER) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_$(P1_MAX)_asm_blis_1s.x: test_%_$(P1_MAX)_asm_blis_1s.o $(LIBBLIS_LINK)
$(LINKER) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
test_%_$(P2_MAX)_asm_blis_2s.x: test_%_$(P2_MAX)_asm_blis_2s.o $(LIBBLIS_LINK)
$(LINKER) $(strip $< $(LIBBLIS_LINK) $(LDFLAGS) -o $@)
# -- Clean rules --

View File

@@ -1,9 +0,0 @@
% tx2
plot_panel_4x5(2.2,8,1, '../results/tx2/st', 'tx2', 'ARMPL'); close; clear all;
plot_panel_4x5(2.2,8,28,'../results/tx2/jc4ic7','tx2_jc4ic7','ARMPL'); close; clear all;
plot_panel_4x5(2.2,8,56,'../results/tx2/jc8ic7','tx2_jc8ic7','ARMPL'); close; clear all;
% skx
plot_panel_4x5(2.0,32,1,'../results/skx/st/20190218','skx','MKL'); close; clear all;
plot_panel_4x5(2.0,32,26,'../results/skx/jc2ic13/20190218','skx_jc2ic13','MKL'); close; clear all;
plot_panel_4x5(2.0,32,52,'../results/skx/jc4ic13/20190218','skx_jc4ic13','MKL'); close; clear all;

View File

@@ -99,7 +99,7 @@ vend_ln = line( x_axis( :, 1 ), data_vend( :, flopscol ) / nth, ...
xlim( ax1, [x_begin x_end] );
ylim( ax1, [y_begin y_end] );
if x_end == 10000 || x_end == 8000
if x_end == 10000 || x_end == 8000 || x_end == 6000
x_tick2 = x_end - 2000;
x_tick1 = x_tick2/2;
xticks( ax1, [ x_tick1 x_tick2 ] );

View File

@@ -1,6 +1,7 @@
function r_val = plot_panel_4x5( cfreq, ...
dflopspercycle, ...
nth, ...
thr_str, ...
dirpath, ...
arch_str, ...
vend_str )
@@ -12,18 +13,12 @@ function r_val = plot_panel_4x5( cfreq, ...
% results.
filetemp_blis = '%s/output_%s_%s_asm_blis.m';
filetemp_open = '%s/output_%s_%s_openblas.m';
filetemp_mkl = '%s/output_%s_%s_mkl.m';
filetemp_vend = '%s/output_%s_%s_vendor.m';
% Create a variable name "template" for the variables contained in the
% files outlined above.
vartemp = 'data_%s_%s_%s( :, : )';
if nth == 1
thr_str = 'st';
else
thr_str = 'mt';
end
% Define the datatypes and operations we will be plotting.
dts = [ 's' 'd' 'c' 'z' ];
ops( 1, : ) = 'gemm';
@@ -63,26 +58,26 @@ for opi = 1:n_opnames
% Construct filenames for the data files from templates.
file_blis = sprintf( filetemp_blis, dirpath, thr_str, opname );
file_open = sprintf( filetemp_open, dirpath, thr_str, opname );
file_mkl = sprintf( filetemp_mkl, dirpath, thr_str, opname );
file_vend = sprintf( filetemp_vend, dirpath, thr_str, opname );
% Load the data files.
%str = sprintf( ' Loading %s', file_blis ); disp(str);
run( file_blis )
%str = sprintf( ' Loading %s', file_open ); disp(str);
run( file_open )
%str = sprintf( ' Loading %s', file_mkl ); disp(str);
run( file_mkl )
%str = sprintf( ' Loading %s', file_vend ); disp(str);
run( file_vend )
% Construct variable names for the variables in the data files.
var_blis = sprintf( vartemp, thr_str, opname, 'asm_blis' );
var_open = sprintf( vartemp, thr_str, opname, 'openblas' );
var_vend = sprintf( vartemp, thr_str, opname, 'mkl' );
var_vend = sprintf( vartemp, thr_str, opname, 'vendor' );
% Use eval() to instantiate the variable names constructed above,
% copying each to a simplified name.
data_blis = eval( var_blis ); % e.g. data_st_sgemm_asm_blis( :, : );
data_open = eval( var_open ); % e.g. data_st_sgemm_openblas( :, : );
data_vend = eval( var_vend ); % e.g. data_st_sgemm_mkl( :, : );
data_vend = eval( var_vend ); % e.g. data_st_sgemm_vendor( :, : );
% Plot one result in an m x n grid of plots, via the subplot()
% function.

9
test/3m4m/matlab/runme.m Normal file
View File

@@ -0,0 +1,9 @@
% tx2
plot_panel_4x5(2.2,8,1, 'st','../results/tx2/st', 'tx2', 'ARMPL'); close; clear all;
plot_panel_4x5(2.2,8,28,'1s','../results/tx2/jc4ic7','tx2_jc4ic7','ARMPL'); close; clear all;
plot_panel_4x5(2.2,8,56,'2s','../results/tx2/jc8ic7','tx2_jc8ic7','ARMPL'); close; clear all;
% skx
plot_panel_4x5(2.0,32,1, 'st','../results/skx/st/20190218', 'skx', 'MKL'); close; clear all;
plot_panel_4x5(2.0,32,26,'1s','../results/skx/jc2ic13/20190218','skx_jc2ic13','MKL'); close; clear all;
plot_panel_4x5(2.0,32,52,'2s','../results/skx/jc4ic13/20190218','skx_jc4ic13','MKL'); close; clear all;

View File

@@ -3,37 +3,33 @@
# File prefixes.
exec_root="test"
out_root="output"
delay=0.1
#sys="blis"
#sys="stampede2"
sys="lonestar5"
#sys="lonestar5"
#sys="ul252"
sys="ul264"
# Bind threads to processors.
#export OMP_PROC_BIND=true
#export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 16 18 20 22 1 3 5 7 9 11 13 15 17 19 21 23"
#export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103"
# Modify LD_LIBRARY_PATH.
if [ ${sys} = "blis" ]; then
export GOMP_CPU_AFFINITY="0 1 2 3"
jc_nt=1 # 5th loop
ic_nt=4 # 3rd loop
jr_nt=1 # 2nd loop
ir_nt=1 # 1st loop
nt=4
threads="jc2ic2jr1_4000
jc2ic2jr1_6000"
elif [ ${sys} = "stampede2" ]; then
echo "Need to set GOMP_CPU_AFFINITY."
exit 1
jc_nt=4 # 5th loop
ic_nt=12 # 3rd loop
jr_nt=1 # 2nd loop
ir_nt=1 # 1st loop
nt=48
threads="jc4ic6jr1_6000
jc4ic12jr1_8000"
elif [ ${sys} = "lonestar5" ]; then
@@ -42,148 +38,115 @@ elif [ ${sys} = "lonestar5" ]; then
# A hack to use libiomp5 with gcc.
#export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/apps/intel/16.0.1.150/compilers_and_libraries_2016.1.150/linux/compiler/lib/intel64"
# runner-up:
#jc_nt=6 # 5th loop
#ic_nt=4 # 3rd loop
#jr_nt=1 # 2nd loop
jc_nt=2 # 5th loop
ic_nt=3 # 3rd loop
jr_nt=2 # 2nd loop
ir_nt=1 # 1st loop
nt=12
threads="jc2ic3jr2_6000
jc4ic3jr2_8000"
elif [ ${sys} = "ul252" ]; then
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64"
#export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103"
export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51"
#jc_nt=4 # 5th loop
jc_nt=2 # 5th loop
ic_nt=13 # 3rd loop
jr_nt=1 # 2nd loop
ir_nt=1 # 1st loop
#nt=52
nt=26
threads="jc2ic13jr1_6000
jc4ic13jr1_8000"
elif [ ${sys} = "ul264" ]; then
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64"
export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63"
threads="jc1ic8jr4_6000
jc2ic8jr4_8000"
fi
echo "Setting BLIS threading params for ${sys}: jc${jc_nt}ic${ic_nt}jr${jr_nt}."
# Datatypes to test.
test_dts="d s z c"
# Operations to test.
test_ops="gemm hemm herk trmm trsm"
test_ops="gemm"
# Implementations to test.
impls="all"
#impls="other"
#impls="blis"
if [ "${impls}" = "blis" ]; then
test_impls="asm_blis"
elif [ "${impls}" = "other" ]; then
test_impls="openblas vendor"
else
test_impls="openblas asm_blis vendor"
fi
# Save a copy of GOMP_CPU_AFFINITY so that if we have to unset it, we can
# restore the value.
GOMP_CPU_AFFINITYsave=${GOMP_CPU_AFFINITY}
# Threadedness to test.
threads="mt"
threads_r="mt"
#threads="st"
#threads_r="st"
# Datatypes to test.
dts=""
dts_r=""
dts="z c"
dts_r="d s"
# Operations to test.
l3_ops="gemm hemm herk trmm trsm"
test_ops="${l3_ops}"
test_ops_r="${l3_ops}"
# Complex domain implementations to test.
#test_impls="3mhw_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis 1m_blis"
#test_impls="openblas mkl asm_blis"
# Implementations to test.
impls="allasm"
if [ ${impls} = "allasm" ]; then
test_impls_r="openblas asm_blis mkl"
test_impls="openblas asm_blis mkl"
elif [ ${impls} = "comp" ]; then
test_impls_r="openblas mkl"
test_impls="openblas mkl"
elif [ ${impls} = "blis" ]; then
test_impls_r="asm_blis"
test_impls="asm_blis"
fi
# First perform real test cases.
for th in ${threads_r}; do
for dt in ${dts_r}; do
for im in ${test_impls_r}; do
for op in ${test_ops_r}; do
# Set the number of threads according to th.
if [ ${th} = "mt" ]; then
export BLIS_JC_NT=${jc_nt}
export BLIS_IC_NT=${ic_nt}
export BLIS_JR_NT=${jr_nt}
export BLIS_IR_NT=${ir_nt}
export OPENBLAS_NUM_THREADS=${nt}
export MKL_NUM_THREADS=${nt}
export nt_use=${nt}
# Unset GOMP_CPU_AFFINITY for OpenBLAS.
if [ ${im} = "openblas" ]; then
unset GOMP_CPU_AFFINITY
else
export GOMP_CPU_AFFINITY="${GOMP_CPU_AFFINITYsave}"
fi
else
export BLIS_JC_NT=1
export BLIS_IC_NT=1
export BLIS_JR_NT=1
export BLIS_IR_NT=1
export OPENBLAS_NUM_THREADS=1
export MKL_NUM_THREADS=1
export nt_use=1
fi
# Construct the name of the test executable.
exec_name="${exec_root}_${dt}${op}_${im}_${th}.x"
# Construct the name of the output file.
out_file="${out_root}_${th}_${dt}${op}_${im}.m"
echo "Running (nt = ${nt_use}) ./${exec_name} > ${out_file}"
# Run executable.
./${exec_name} > ${out_file}
sleep 1
done
done
done
done
# Now perform complex test cases.
for th in ${threads}; do
for dt in ${dts}; do
# Start with one way of parallelism in each loop. We will now begin
# parsing the 'th' variable to update one or more of these threading
# parameters.
jc_nt=1; pc_nt=1; ic_nt=1; jr_nt=1; ir_nt=1
# Strip everything before and after the underscore so that what remains
# is the problem size and threading parameter string, respectively.
psize=${th##*_}; thinfo=${th%%_*}
# Identify each threading parameter and insert a space before it.
thsep=$(echo -e ${thinfo} | sed -e "s/\([jip][cr]\)/ \1/g" )
nt=1
for loopnum in ${thsep}; do
# Given the current string, which identifies a loop and the
# number of ways of parallelism for that loop, strip out
# the ways and loop separately to identify each.
loop=$(echo -e ${loopnum} | sed -e "s/[0-9]//g" )
num=$(echo -e ${loopnum} | sed -e "s/[a-z]//g" )
# Construct a string that we can evaluate to set the number
# of ways of parallelism for the current loop.
loop_nt_eq_num="${loop}_nt=${num}"
# Update the total number of threads.
nt=$(expr ${nt} \* ${num})
# Evaluate the string to assign the ways to the variable.
eval ${loop_nt_eq_num}
done
echo "Switching to: jc${jc_nt} pc${pc_nt} ic${ic_nt} jr${jr_nt} ir${ir_nt} (nt = ${nt}) p_max${psize}"
for dt in ${test_dts}; do
for im in ${test_impls}; do
for op in ${test_ops}; do
# Find the threading suffix by probing the executable.
binname=$(ls ${exec_root}_${dt}${op}_${psize}_${im}_*.x)
suf_ext=${binname##*_}
suf=${suf_ext%%.*}
#echo "found file: ${binname} with suffix ${suf}"
# Set the number of threads according to th.
if [ ${th} = "mt" ]; then
if [ "${suf}" = "1s" ] || [ "${suf}" = "2s" ]; then
export BLIS_JC_NT=${jc_nt}
export BLIS_PC_NT=${pc_nt}
export BLIS_IC_NT=${ic_nt}
export BLIS_JR_NT=${jr_nt}
export BLIS_IR_NT=${ir_nt}
@@ -191,9 +154,11 @@ for th in ${threads}; do
export MKL_NUM_THREADS=${nt}
export nt_use=${nt}
# Unset GOMP_CPU_AFFINITY for OpenBLAS.
# Multithreaded OpenBLAS seems to have a problem running
# properly if GOMP_CPU_AFFINITY is set. So we temporarily
# unset it here if we are about to execute OpenBLAS, but
# otherwise restore it.
if [ ${im} = "openblas" ]; then
unset GOMP_CPU_AFFINITY
else
export GOMP_CPU_AFFINITY="${GOMP_CPU_AFFINITYsave}"
@@ -201,6 +166,7 @@ for th in ${threads}; do
else
export BLIS_JC_NT=1
export BLIS_PC_NT=1
export BLIS_IC_NT=1
export BLIS_JR_NT=1
export BLIS_IR_NT=1
@@ -210,19 +176,21 @@ for th in ${threads}; do
fi
# Construct the name of the test executable.
exec_name="${exec_root}_${dt}${op}_${im}_${th}.x"
exec_name="${exec_root}_${dt}${op}_${psize}_${im}_${suf}.x"
# Construct the name of the output file.
out_file="${out_root}_${th}_${dt}${op}_${im}.m"
out_file="${out_root}_${suf}_${dt}${op}_${im}.m"
echo "Running (nt = ${nt_use}) ./${exec_name} > ${out_file}"
#echo "Running (nt = ${nt_use}) ./${exec_name} > ${out_file}"
echo "Running ./${exec_name} > ${out_file}"
# Run executable.
./${exec_name} > ${out_file}
#./${exec_name} > ${out_file}
sleep 1
sleep ${delay}
done
done
done
done

View File

@@ -44,7 +44,7 @@ int main( int argc, char** argv )
obj_t alpha, beta;
dim_t m, n, k;
dim_t p;
dim_t p_begin, p_end, p_inc;
dim_t p_begin, p_max, p_inc;
int m_input, n_input, k_input;
ind_t ind;
num_t dt;
@@ -70,7 +70,7 @@ int main( int argc, char** argv )
ind = IND;
p_begin = P_BEGIN;
p_end = P_END;
p_max = P_MAX;
p_inc = P_INC;
m_input = -1;
@@ -116,12 +116,9 @@ int main( int argc, char** argv )
// Begin with initializing the last entry to zero so that
// matlab allocates space for the entire array once up-front.
for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ;
#ifdef BLIS
printf( "data_%s_%cgemm_%s_blis", THR_STR, dt_ch, STR );
#else
printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
#endif
for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;
printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin + 1)/p_inc + 1,
( unsigned long )0,
@@ -129,7 +126,7 @@ int main( int argc, char** argv )
( unsigned long )0, 0.0 );
for ( p = p_begin; p <= p_end; p += p_inc )
for ( p = p_begin; p <= p_max; p += p_inc )
{
if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
@@ -157,7 +154,6 @@ int main( int argc, char** argv )
bli_setsc( (2.0/1.0), 0.0, &alpha );
bli_setsc( (1.0/1.0), 0.0, &beta );
bli_copym( &c, &c_save );
#if 0 //def BLIS
@@ -173,7 +169,6 @@ int main( int argc, char** argv )
dtime = bli_clock();
#ifdef PRINT
bli_printm( "a", &a, "%4.1f", "" );
bli_printm( "b", &b, "%4.1f", "" );
@@ -190,107 +185,106 @@ int main( int argc, char** argv )
#else
if ( bli_is_float( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
float* alphap = bli_obj_buffer( &alpha );
float* ap = bli_obj_buffer( &a );
float* bp = bli_obj_buffer( &b );
float* betap = bli_obj_buffer( &beta );
float* cp = bli_obj_buffer( &c );
if ( bli_is_float( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
float* alphap = bli_obj_buffer( &alpha );
float* ap = bli_obj_buffer( &a );
float* bp = bli_obj_buffer( &b );
float* betap = bli_obj_buffer( &beta );
float* cp = bli_obj_buffer( &c );
sgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_double( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
double* alphap = bli_obj_buffer( &alpha );
double* ap = bli_obj_buffer( &a );
double* bp = bli_obj_buffer( &b );
double* betap = bli_obj_buffer( &beta );
double* cp = bli_obj_buffer( &c );
sgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_double( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
double* alphap = bli_obj_buffer( &alpha );
double* ap = bli_obj_buffer( &a );
double* bp = bli_obj_buffer( &b );
double* betap = bli_obj_buffer( &beta );
double* cp = bli_obj_buffer( &c );
dgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_scomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
scomplex* alphap = bli_obj_buffer( &alpha );
scomplex* ap = bli_obj_buffer( &a );
scomplex* bp = bli_obj_buffer( &b );
scomplex* betap = bli_obj_buffer( &beta );
scomplex* cp = bli_obj_buffer( &c );
dgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_scomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
scomplex* alphap = bli_obj_buffer( &alpha );
scomplex* ap = bli_obj_buffer( &a );
scomplex* bp = bli_obj_buffer( &b );
scomplex* betap = bli_obj_buffer( &beta );
scomplex* cp = bli_obj_buffer( &c );
cgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_dcomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
dcomplex* alphap = bli_obj_buffer( &alpha );
dcomplex* ap = bli_obj_buffer( &a );
dcomplex* bp = bli_obj_buffer( &b );
dcomplex* betap = bli_obj_buffer( &beta );
dcomplex* cp = bli_obj_buffer( &c );
cgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_dcomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
dcomplex* alphap = bli_obj_buffer( &alpha );
dcomplex* ap = bli_obj_buffer( &a );
dcomplex* bp = bli_obj_buffer( &b );
dcomplex* betap = bli_obj_buffer( &beta );
dcomplex* cp = bli_obj_buffer( &c );
zgemm_( &f77_transa,
//zgemm3m_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
zgemm_( &f77_transa,
&f77_transb,
&mm,
&nn,
&kk,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
#endif
#ifdef PRINT
@@ -298,7 +292,6 @@ int main( int argc, char** argv )
exit(1);
#endif
dtime_save = bli_clock_min_diff( dtime_save, dtime );
}
@@ -306,11 +299,7 @@ int main( int argc, char** argv )
if ( bli_is_complex( dt ) ) gflops *= 4.0;
#ifdef BLIS
printf( "data_%s_%cgemm_%s_blis", THR_STR, dt_ch, STR );
#else
printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
#endif
printf( "data_%s_%cgemm_%s", THR_STR, dt_ch, STR );
printf( "( %2lu, 1:4 ) = [ %4lu %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin + 1)/p_inc + 1,
( unsigned long )m,

View File

@@ -44,7 +44,7 @@ int main( int argc, char** argv )
obj_t alpha, beta;
dim_t m, n;
dim_t p;
dim_t p_begin, p_end, p_inc;
dim_t p_begin, p_max, p_inc;
int m_input, n_input;
ind_t ind;
num_t dt;
@@ -70,7 +70,7 @@ int main( int argc, char** argv )
ind = IND;
p_begin = P_BEGIN;
p_end = P_END;
p_max = P_MAX;
p_inc = P_INC;
m_input = -1;
@@ -115,19 +115,16 @@ int main( int argc, char** argv )
// Begin with initializing the last entry to zero so that
// matlab allocates space for the entire array once up-front.
for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ;
#ifdef BLIS
printf( "data_%s_%chemm_%s_blis", THR_STR, dt_ch, STR );
#else
printf( "data_%s_%chemm_%s", THR_STR, dt_ch, STR );
#endif
for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;
printf( "data_%s_%chemm_%s", THR_STR, dt_ch, STR );
printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin + 1)/p_inc + 1,
( unsigned long )0,
( unsigned long )0, 0.0 );
for ( p = p_begin; p <= p_end; p += p_inc )
for ( p = p_begin; p <= p_max; p += p_inc )
{
if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
@@ -161,7 +158,6 @@ int main( int argc, char** argv )
bli_setsc( (2.0/1.0), 0.0, &alpha );
bli_setsc( (1.0/1.0), 0.0, &beta );
bli_copym( &c, &c_save );
#if 0 //def BLIS
@@ -177,7 +173,6 @@ int main( int argc, char** argv )
dtime = bli_clock();
#ifdef PRINT
bli_printm( "a", &a, "%4.1f", "" );
bli_printm( "b", &b, "%4.1f", "" );
@@ -195,98 +190,98 @@ int main( int argc, char** argv )
#else
if ( bli_is_float( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
float* alphap = bli_obj_buffer( &alpha );
float* ap = bli_obj_buffer( &a );
float* bp = bli_obj_buffer( &b );
float* betap = bli_obj_buffer( &beta );
float* cp = bli_obj_buffer( &c );
if ( bli_is_float( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
float* alphap = bli_obj_buffer( &alpha );
float* ap = bli_obj_buffer( &a );
float* bp = bli_obj_buffer( &b );
float* betap = bli_obj_buffer( &beta );
float* cp = bli_obj_buffer( &c );
ssymm_( &f77_side,
&f77_uploa,
&mm,
&nn,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_double( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
double* alphap = bli_obj_buffer( &alpha );
double* ap = bli_obj_buffer( &a );
double* bp = bli_obj_buffer( &b );
double* betap = bli_obj_buffer( &beta );
double* cp = bli_obj_buffer( &c );
ssymm_( &f77_side,
&f77_uploa,
&mm,
&nn,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_double( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
double* alphap = bli_obj_buffer( &alpha );
double* ap = bli_obj_buffer( &a );
double* bp = bli_obj_buffer( &b );
double* betap = bli_obj_buffer( &beta );
double* cp = bli_obj_buffer( &c );
dsymm_( &f77_side,
&f77_uploa,
&mm,
&nn,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_scomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
scomplex* alphap = bli_obj_buffer( &alpha );
scomplex* ap = bli_obj_buffer( &a );
scomplex* bp = bli_obj_buffer( &b );
scomplex* betap = bli_obj_buffer( &beta );
scomplex* cp = bli_obj_buffer( &c );
dsymm_( &f77_side,
&f77_uploa,
&mm,
&nn,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_scomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
scomplex* alphap = bli_obj_buffer( &alpha );
scomplex* ap = bli_obj_buffer( &a );
scomplex* bp = bli_obj_buffer( &b );
scomplex* betap = bli_obj_buffer( &beta );
scomplex* cp = bli_obj_buffer( &c );
chemm_( &f77_side,
&f77_uploa,
&mm,
&nn,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_dcomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
dcomplex* alphap = bli_obj_buffer( &alpha );
dcomplex* ap = bli_obj_buffer( &a );
dcomplex* bp = bli_obj_buffer( &b );
dcomplex* betap = bli_obj_buffer( &beta );
dcomplex* cp = bli_obj_buffer( &c );
chemm_( &f77_side,
&f77_uploa,
&mm,
&nn,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
else if ( bli_is_dcomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int nn = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldb = bli_obj_col_stride( &b );
f77_int ldc = bli_obj_col_stride( &c );
dcomplex* alphap = bli_obj_buffer( &alpha );
dcomplex* ap = bli_obj_buffer( &a );
dcomplex* bp = bli_obj_buffer( &b );
dcomplex* betap = bli_obj_buffer( &beta );
dcomplex* cp = bli_obj_buffer( &c );
zhemm_( &f77_side,
&f77_uploa,
&mm,
&nn,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
zhemm_( &f77_side,
&f77_uploa,
&mm,
&nn,
alphap,
ap, &lda,
bp, &ldb,
betap,
cp, &ldc );
}
#endif
#ifdef PRINT
@@ -294,7 +289,6 @@ int main( int argc, char** argv )
exit(1);
#endif
dtime_save = bli_clock_min_diff( dtime_save, dtime );
}
@@ -305,11 +299,7 @@ int main( int argc, char** argv )
if ( bli_is_complex( dt ) ) gflops *= 4.0;
#ifdef BLIS
printf( "data_%s_%chemm_%s_blis", THR_STR, dt_ch, STR );
#else
printf( "data_%s_%chemm_%s", THR_STR, dt_ch, STR );
#endif
printf( "data_%s_%chemm_%s", THR_STR, dt_ch, STR );
printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin + 1)/p_inc + 1,
( unsigned long )m,

View File

@@ -46,7 +46,7 @@ int main( int argc, char** argv )
obj_t alpha, beta;
dim_t m, k;
dim_t p;
dim_t p_begin, p_end, p_inc;
dim_t p_begin, p_max, p_inc;
int m_input, k_input;
ind_t ind;
num_t dt, dt_real;
@@ -73,7 +73,7 @@ int main( int argc, char** argv )
ind = IND;
p_begin = P_BEGIN;
p_end = P_END;
p_max = P_MAX;
p_inc = P_INC;
m_input = -1;
@@ -118,19 +118,16 @@ int main( int argc, char** argv )
// Begin with initializing the last entry to zero so that
// matlab allocates space for the entire array once up-front.
for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ;
#ifdef BLIS
printf( "data_%s_%cherk_%s_blis", THR_STR, dt_ch, STR );
#else
printf( "data_%s_%cherk_%s", THR_STR, dt_ch, STR );
#endif
for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;
printf( "data_%s_%cherk_%s", THR_STR, dt_ch, STR );
printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin + 1)/p_inc + 1,
( unsigned long )0,
( unsigned long )0, 0.0 );
for ( p = p_begin; p <= p_end; p += p_inc )
for ( p = p_begin; p <= p_max; p += p_inc )
{
if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
@@ -162,7 +159,6 @@ int main( int argc, char** argv )
bli_setsc( (2.0/1.0), 0.0, &alpha );
bli_setsc( (1.0/1.0), 0.0, &beta );
bli_copym( &c, &c_save );
#if 0 //def BLIS
@@ -176,10 +172,8 @@ int main( int argc, char** argv )
{
bli_copym( &c_save, &c );
dtime = bli_clock();
#ifdef PRINT
bli_printm( "a", &a, "%4.1f", "" );
bli_printm( "c", &c, "%4.1f", "" );
@@ -194,86 +188,86 @@ int main( int argc, char** argv )
#else
if ( bli_is_float( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
float* alphap = bli_obj_buffer( &alpha );
float* ap = bli_obj_buffer( &a );
float* betap = bli_obj_buffer( &beta );
float* cp = bli_obj_buffer( &c );
if ( bli_is_float( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
float* alphap = bli_obj_buffer( &alpha );
float* ap = bli_obj_buffer( &a );
float* betap = bli_obj_buffer( &beta );
float* cp = bli_obj_buffer( &c );
ssyrk_( &f77_uploc,
&f77_transa,
&mm,
&kk,
alphap,
ap, &lda,
betap,
cp, &ldc );
}
else if ( bli_is_double( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
double* alphap = bli_obj_buffer( &alpha );
double* ap = bli_obj_buffer( &a );
double* betap = bli_obj_buffer( &beta );
double* cp = bli_obj_buffer( &c );
ssyrk_( &f77_uploc,
&f77_transa,
&mm,
&kk,
alphap,
ap, &lda,
betap,
cp, &ldc );
}
else if ( bli_is_double( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
double* alphap = bli_obj_buffer( &alpha );
double* ap = bli_obj_buffer( &a );
double* betap = bli_obj_buffer( &beta );
double* cp = bli_obj_buffer( &c );
dsyrk_( &f77_uploc,
&f77_transa,
&mm,
&kk,
alphap,
ap, &lda,
betap,
cp, &ldc );
}
else if ( bli_is_scomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
float* alphap = bli_obj_buffer( &alpha );
scomplex* ap = bli_obj_buffer( &a );
float* betap = bli_obj_buffer( &beta );
scomplex* cp = bli_obj_buffer( &c );
dsyrk_( &f77_uploc,
&f77_transa,
&mm,
&kk,
alphap,
ap, &lda,
betap,
cp, &ldc );
}
else if ( bli_is_scomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
float* alphap = bli_obj_buffer( &alpha );
scomplex* ap = bli_obj_buffer( &a );
float* betap = bli_obj_buffer( &beta );
scomplex* cp = bli_obj_buffer( &c );
cherk_( &f77_uploc,
&f77_transa,
&mm,
&kk,
alphap,
ap, &lda,
betap,
cp, &ldc );
}
else if ( bli_is_dcomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
double* alphap = bli_obj_buffer( &alpha );
dcomplex* ap = bli_obj_buffer( &a );
double* betap = bli_obj_buffer( &beta );
dcomplex* cp = bli_obj_buffer( &c );
cherk_( &f77_uploc,
&f77_transa,
&mm,
&kk,
alphap,
ap, &lda,
betap,
cp, &ldc );
}
else if ( bli_is_dcomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width_after_trans( &a );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
double* alphap = bli_obj_buffer( &alpha );
dcomplex* ap = bli_obj_buffer( &a );
double* betap = bli_obj_buffer( &beta );
dcomplex* cp = bli_obj_buffer( &c );
zherk_( &f77_uploc,
&f77_transa,
&mm,
&kk,
alphap,
ap, &lda,
betap,
cp, &ldc );
}
zherk_( &f77_uploc,
&f77_transa,
&mm,
&kk,
alphap,
ap, &lda,
betap,
cp, &ldc );
}
#endif
#ifdef PRINT
@@ -281,7 +275,6 @@ int main( int argc, char** argv )
exit(1);
#endif
dtime_save = bli_clock_min_diff( dtime_save, dtime );
}
@@ -289,11 +282,7 @@ int main( int argc, char** argv )
if ( bli_is_complex( dt ) ) gflops *= 4.0;
#ifdef BLIS
printf( "data_%s_%cherk_%s_blis", THR_STR, dt_ch, STR );
#else
printf( "data_%s_%cherk_%s", THR_STR, dt_ch, STR );
#endif
printf( "data_%s_%cherk_%s", THR_STR, dt_ch, STR );
printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin + 1)/p_inc + 1,
( unsigned long )m,

View File

@@ -46,7 +46,7 @@ int main( int argc, char** argv )
obj_t alpha;
dim_t m, n;
dim_t p;
dim_t p_begin, p_end, p_inc;
dim_t p_begin, p_max, p_inc;
int m_input, n_input;
ind_t ind;
num_t dt;
@@ -76,7 +76,7 @@ int main( int argc, char** argv )
ind = IND;
p_begin = P_BEGIN;
p_end = P_END;
p_max = P_MAX;
p_inc = P_INC;
m_input = -1;
@@ -133,19 +133,16 @@ int main( int argc, char** argv )
// Begin with initializing the last entry to zero so that
// matlab allocates space for the entire array once up-front.
for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ;
#ifdef BLIS
printf( "data_%s_%ctrmm_%s_blis", THR_STR, dt_ch, STR );
#else
printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, STR );
#endif
for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;
printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, STR );
printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin + 1)/p_inc + 1,
( unsigned long )0,
( unsigned long )0, 0.0 );
for ( p = p_begin; p <= p_end; p += p_inc )
for ( p = p_begin; p <= p_max; p += p_inc )
{
if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
@@ -188,10 +185,8 @@ int main( int argc, char** argv )
{
bli_copym( &c_save, &c );
dtime = bli_clock();
#ifdef PRINT
bli_printm( "a", &a, "%4.1f", "" );
bli_printm( "c", &c, "%4.1f", "" );
@@ -206,86 +201,86 @@ int main( int argc, char** argv )
#else
if ( bli_is_float( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
float* alphap = bli_obj_buffer( &alpha );
float* ap = bli_obj_buffer( &a );
float* cp = bli_obj_buffer( &c );
if ( bli_is_float( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
float* alphap = bli_obj_buffer( &alpha );
float* ap = bli_obj_buffer( &a );
float* cp = bli_obj_buffer( &c );
strmm_( &f77_side,
&f77_uploa,
&f77_transa,
&f77_diaga,
&mm,
&kk,
alphap,
ap, &lda,
cp, &ldc );
}
else if ( bli_is_double( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
double* alphap = bli_obj_buffer( &alpha );
double* ap = bli_obj_buffer( &a );
double* cp = bli_obj_buffer( &c );
strmm_( &f77_side,
&f77_uploa,
&f77_transa,
&f77_diaga,
&mm,
&kk,
alphap,
ap, &lda,
cp, &ldc );
}
else if ( bli_is_double( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
double* alphap = bli_obj_buffer( &alpha );
double* ap = bli_obj_buffer( &a );
double* cp = bli_obj_buffer( &c );
dtrmm_( &f77_side,
&f77_uploa,
&f77_transa,
&f77_diaga,
&mm,
&kk,
alphap,
ap, &lda,
cp, &ldc );
}
else if ( bli_is_scomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
scomplex* alphap = bli_obj_buffer( &alpha );
scomplex* ap = bli_obj_buffer( &a );
scomplex* cp = bli_obj_buffer( &c );
dtrmm_( &f77_side,
&f77_uploa,
&f77_transa,
&f77_diaga,
&mm,
&kk,
alphap,
ap, &lda,
cp, &ldc );
}
else if ( bli_is_scomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
scomplex* alphap = bli_obj_buffer( &alpha );
scomplex* ap = bli_obj_buffer( &a );
scomplex* cp = bli_obj_buffer( &c );
ctrmm_( &f77_side,
&f77_uploa,
&f77_transa,
&f77_diaga,
&mm,
&kk,
alphap,
ap, &lda,
cp, &ldc );
}
else if ( bli_is_dcomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
dcomplex* alphap = bli_obj_buffer( &alpha );
dcomplex* ap = bli_obj_buffer( &a );
dcomplex* cp = bli_obj_buffer( &c );
ctrmm_( &f77_side,
&f77_uploa,
&f77_transa,
&f77_diaga,
&mm,
&kk,
alphap,
ap, &lda,
cp, &ldc );
}
else if ( bli_is_dcomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
dcomplex* alphap = bli_obj_buffer( &alpha );
dcomplex* ap = bli_obj_buffer( &a );
dcomplex* cp = bli_obj_buffer( &c );
ztrmm_( &f77_side,
&f77_uploa,
&f77_transa,
&f77_diaga,
&mm,
&kk,
alphap,
ap, &lda,
cp, &ldc );
}
ztrmm_( &f77_side,
&f77_uploa,
&f77_transa,
&f77_diaga,
&mm,
&kk,
alphap,
ap, &lda,
cp, &ldc );
}
#endif
#ifdef PRINT
@@ -293,7 +288,6 @@ int main( int argc, char** argv )
exit(1);
#endif
dtime_save = bli_clock_min_diff( dtime_save, dtime );
}
@@ -304,11 +298,7 @@ int main( int argc, char** argv )
if ( bli_is_complex( dt ) ) gflops *= 4.0;
#ifdef BLIS
printf( "data_%s_%ctrmm_%s_blis", THR_STR, dt_ch, STR );
#else
printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, STR );
#endif
printf( "data_%s_%ctrmm_%s", THR_STR, dt_ch, STR );
printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin + 1)/p_inc + 1,
( unsigned long )m,

View File

@@ -46,7 +46,7 @@ int main( int argc, char** argv )
obj_t alpha;
dim_t m, n;
dim_t p;
dim_t p_begin, p_end, p_inc;
dim_t p_begin, p_max, p_inc;
int m_input, n_input;
ind_t ind;
num_t dt;
@@ -76,7 +76,7 @@ int main( int argc, char** argv )
ind = IND;
p_begin = P_BEGIN;
p_end = P_END;
p_max = P_MAX;
p_inc = P_INC;
m_input = -1;
@@ -133,19 +133,16 @@ int main( int argc, char** argv )
// Begin with initializing the last entry to zero so that
// matlab allocates space for the entire array once up-front.
for ( p = p_begin; p + p_inc <= p_end; p += p_inc ) ;
#ifdef BLIS
printf( "data_%s_%ctrsm_%s_blis", THR_STR, dt_ch, STR );
#else
printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR );
#endif
for ( p = p_begin; p + p_inc <= p_max; p += p_inc ) ;
printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR );
printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin + 1)/p_inc + 1,
( unsigned long )0,
( unsigned long )0, 0.0 );
for ( p = p_begin; p <= p_end; p += p_inc )
for ( p = p_begin; p <= p_max; p += p_inc )
{
if ( m_input < 0 ) m = p / ( dim_t )abs(m_input);
@@ -192,10 +189,8 @@ int main( int argc, char** argv )
{
bli_copym( &c_save, &c );
dtime = bli_clock();
#ifdef PRINT
bli_printm( "a", &a, "%4.1f", "" );
bli_printm( "c", &c, "%4.1f", "" );
@@ -210,86 +205,86 @@ int main( int argc, char** argv )
#else
if ( bli_is_float( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
float* alphap = bli_obj_buffer( &alpha );
float* ap = bli_obj_buffer( &a );
float* cp = bli_obj_buffer( &c );
if ( bli_is_float( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
float* alphap = bli_obj_buffer( &alpha );
float* ap = bli_obj_buffer( &a );
float* cp = bli_obj_buffer( &c );
strsm_( &f77_side,
&f77_uploa,
&f77_transa,
&f77_diaga,
&mm,
&kk,
alphap,
ap, &lda,
cp, &ldc );
}
else if ( bli_is_double( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
double* alphap = bli_obj_buffer( &alpha );
double* ap = bli_obj_buffer( &a );
double* cp = bli_obj_buffer( &c );
strsm_( &f77_side,
&f77_uploa,
&f77_transa,
&f77_diaga,
&mm,
&kk,
alphap,
ap, &lda,
cp, &ldc );
}
else if ( bli_is_double( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
double* alphap = bli_obj_buffer( &alpha );
double* ap = bli_obj_buffer( &a );
double* cp = bli_obj_buffer( &c );
dtrsm_( &f77_side,
&f77_uploa,
&f77_transa,
&f77_diaga,
&mm,
&kk,
alphap,
ap, &lda,
cp, &ldc );
}
else if ( bli_is_scomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
scomplex* alphap = bli_obj_buffer( &alpha );
scomplex* ap = bli_obj_buffer( &a );
scomplex* cp = bli_obj_buffer( &c );
dtrsm_( &f77_side,
&f77_uploa,
&f77_transa,
&f77_diaga,
&mm,
&kk,
alphap,
ap, &lda,
cp, &ldc );
}
else if ( bli_is_scomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
scomplex* alphap = bli_obj_buffer( &alpha );
scomplex* ap = bli_obj_buffer( &a );
scomplex* cp = bli_obj_buffer( &c );
ctrsm_( &f77_side,
&f77_uploa,
&f77_transa,
&f77_diaga,
&mm,
&kk,
alphap,
ap, &lda,
cp, &ldc );
}
else if ( bli_is_dcomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
dcomplex* alphap = bli_obj_buffer( &alpha );
dcomplex* ap = bli_obj_buffer( &a );
dcomplex* cp = bli_obj_buffer( &c );
ctrsm_( &f77_side,
&f77_uploa,
&f77_transa,
&f77_diaga,
&mm,
&kk,
alphap,
ap, &lda,
cp, &ldc );
}
else if ( bli_is_dcomplex( dt ) )
{
f77_int mm = bli_obj_length( &c );
f77_int kk = bli_obj_width( &c );
f77_int lda = bli_obj_col_stride( &a );
f77_int ldc = bli_obj_col_stride( &c );
dcomplex* alphap = bli_obj_buffer( &alpha );
dcomplex* ap = bli_obj_buffer( &a );
dcomplex* cp = bli_obj_buffer( &c );
ztrsm_( &f77_side,
&f77_uploa,
&f77_transa,
&f77_diaga,
&mm,
&kk,
alphap,
ap, &lda,
cp, &ldc );
}
ztrsm_( &f77_side,
&f77_uploa,
&f77_transa,
&f77_diaga,
&mm,
&kk,
alphap,
ap, &lda,
cp, &ldc );
}
#endif
#ifdef PRINT
@@ -297,7 +292,6 @@ int main( int argc, char** argv )
exit(1);
#endif
dtime_save = bli_clock_min_diff( dtime_save, dtime );
}
@@ -308,11 +302,7 @@ int main( int argc, char** argv )
if ( bli_is_complex( dt ) ) gflops *= 4.0;
#ifdef BLIS
printf( "data_%s_%ctrsm_%s_blis", THR_STR, dt_ch, STR );
#else
printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR );
#endif
printf( "data_%s_%ctrsm_%s", THR_STR, dt_ch, STR );
printf( "( %2lu, 1:3 ) = [ %4lu %4lu %7.2f ];\n",
( unsigned long )(p - p_begin + 1)/p_inc + 1,
( unsigned long )m,