diff --git a/frame/ind/oapi/bli_l3_3m4m1m_oapi.c b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c index 36281f543..b99ebda39 100644 --- a/frame/ind/oapi/bli_l3_3m4m1m_oapi.c +++ b/frame/ind/oapi/bli_l3_3m4m1m_oapi.c @@ -65,11 +65,18 @@ void PASTEMAC(opname,imeth) \ \ /* A temporary hack to easily specify the 1m algorithm (block-panel or panel-block). */ \ +/* if ( PASTEMAC(opname,imeth) == bli_gemm1m ) \ { \ bli_gemm1mbp( alpha, a, b, beta, c ); \ return; \ } \ + else if ( PASTEMAC(opname,imeth) == bli_gemm3m1 ) \ + { \ + bli_gemm1mpb( alpha, a, b, beta, c ); \ + return; \ + } \ +*/ \ \ /* Initialize a local context if the one provided is NULL. */ \ bli_cntx_init_local_if2( cname, imeth, dt, cntx, cntx_p ); \ diff --git a/test/3m4m/Makefile b/test/3m4m/Makefile index 0b13b8eb1..433e745a7 100644 --- a/test/3m4m/Makefile +++ b/test/3m4m/Makefile @@ -107,8 +107,9 @@ BLIS_LIB := $(BLIS_LIB_PATH)/libblis.a # BLAS library path(s). This is where the BLAS libraries reside. HOME_LIB_PATH := $(HOME)/flame/lib #MKL_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64 -MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64 -ICC_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/intel64 +#MKL_LIB_PATH := $(HOME)/intel/mkl/lib/intel64 +MKL_LIB_PATH := ${MKLROOT}/lib/intel64 +#ICC_LIB_PATH := /opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/intel64 ACML_LIB_PATH := $(HOME_LIB_PATH)/acml/5.3.1/gfortran64_fma4_int64/lib ACMLP_LIB_PATH := $(HOME_LIB_PATH)/acml/5.3.1/gfortran64_fma4_mp_int64/lib @@ -168,7 +169,7 @@ CFLAGS += -I$(BLIS_INC_PATH) -I$(TEST_SRC_PATH) #-I$(ACML_INC_PATH) LINKER := $(CC) LDFLAGS := #-L/home/00146/field/gnu/gcc-4.8.2/lib64 -LDFLAGS += -lgfortran -lm -lpthread -fopenmp +LDFLAGS += -lgfortran -lm -lrt -lpthread -fopenmp # Datatype @@ -211,13 +212,13 @@ STR_ST := -DTHR_STR=\"st\" STR_MT := -DTHR_STR=\"mt\" # Problem size specification -PDEF_ST := -DP_BEGIN=100 \ - -DP_END=1000 \ - -DP_INC=100 - -PDEF_MT := -DP_BEGIN=100 \ +PDEF_ST := 
-DP_BEGIN=40 \ -DP_END=2000 \ - -DP_INC=100 + -DP_INC=40 + +PDEF_MT := -DP_BEGIN=200 \ + -DP_END=10000 \ + -DP_INC=200 @@ -296,6 +297,8 @@ openblas-gemm-st: \ test_zgemm_openblas_st.x openblas-gemm-mt: \ + test_sgemm_openblas_mt.x \ + test_dgemm_openblas_mt.x \ test_cgemm_openblas_mt.x \ test_zgemm_openblas_mt.x @@ -306,6 +309,8 @@ mkl-gemm-st: \ test_zgemm_mkl_st.x mkl-gemm-mt: \ + test_sgemm_mkl_mt.x \ + test_dgemm_mkl_mt.x \ test_cgemm_mkl_mt.x \ test_zgemm_mkl_mt.x @@ -316,6 +321,8 @@ acml-gemm-st: \ test_zgemm_acml_st.x acml-gemm-mt: \ + test_sgemm_acml_mt.x \ + test_dgemm_acml_mt.x \ test_cgemm_acml_mt.x \ test_zgemm_acml_mt.x @@ -468,6 +475,12 @@ test_z%_openblas_st.o: test_%.c test_c%_openblas_st.o: test_%.c $(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLA_DEF) $(DNAT) $(STR_OBL) $(STR_ST) -c $< -o $@ +test_d%_openblas_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) $(BLA_DEF) $(DNAT) $(STR_OBL) $(STR_MT) -c $< -o $@ + +test_s%_openblas_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_S) $(BLA_DEF) $(DNAT) $(STR_OBL) $(STR_MT) -c $< -o $@ + test_z%_openblas_mt.o: test_%.c $(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLA_DEF) $(DNAT) $(STR_OBL) $(STR_MT) -c $< -o $@ @@ -487,6 +500,12 @@ test_z%_mkl_st.o: test_%.c test_c%_mkl_st.o: test_%.c $(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_ST) -c $< -o $@ +test_d%_mkl_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_MT) -c $< -o $@ + +test_s%_mkl_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_S) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_MT) -c $< -o $@ + test_z%_mkl_mt.o: test_%.c $(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLA_DEF) $(DNAT) $(STR_MKL) $(STR_MT) -c $< -o $@ @@ -506,6 +525,12 @@ test_z%_acml_st.o: test_%.c test_c%_acml_st.o: test_%.c $(CC) $(CFLAGS) $(PDEF_ST) $(DT_C) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_ST) -c $< -o $@ +test_d%_acml_mt.o: test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_D) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_MT) -c $< -o $@ + +test_s%_acml_mt.o: 
test_%.c + $(CC) $(CFLAGS) $(PDEF_MT) $(DT_S) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_MT) -c $< -o $@ + test_z%_acml_mt.o: test_%.c $(CC) $(CFLAGS) $(PDEF_MT) $(DT_Z) $(BLA_DEF) $(DNAT) $(STR_ACML) $(STR_MT) -c $< -o $@ diff --git a/test/3m4m/runme.sh b/test/3m4m/runme.sh index 794f0ba00..3f5d89023 100755 --- a/test/3m4m/runme.sh +++ b/test/3m4m/runme.sh @@ -4,17 +4,21 @@ exec_root="test" out_root="output" -sys="blis" +#sys="blis" #sys="stampede" +sys="lonestar" #sys="wahlberg" # Bind threads to processors. #export OMP_PROC_BIND=true #export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15" #export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7" -export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7" +#export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7" #export GOMP_CPU_AFFINITY="0 2 4 6 1 3 5 7" #export GOMP_CPU_AFFINITY="0 4 1 5 2 6 3 7" +#export GOMP_CPU_AFFINITY="0 1 4 5 8 9 12 13 16 17 20 21 24 25 28 29 32 33 36 37 40 41 44 45" +#export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 16 18 20 22 1 3 5 7 9 11 13 15 17 19 21 23" +export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23" # Modify LD_LIBRARY_PATH. if [ ${sys} = "blis" ]; then @@ -26,6 +30,11 @@ elif [ ${sys} = "stampede" ]; then # A hack to use libiomp5 with gcc. export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/intel64" +elif [ ${sys} = "lonestar" ]; then + + # A hack to use libiomp5 with gcc. 
+ export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/apps/intel/16.0.1.150/compilers_and_libraries_2016.1.150/linux/compiler/lib/intel64" + elif [ ${sys} = "wahlberg" ]; then export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HOME/flame/lib/acml/5.3.1/gfortran64_int64/lib" @@ -49,6 +58,14 @@ elif [ ${sys} = "stampede" ]; then ir_nt=1 # 1st loop nt=16 +elif [ ${sys} = "lonestar" ]; then + + jc_nt=2 # 5th loop + ic_nt=12 # 3rd loop + jr_nt=1 # 2nd loop + ir_nt=1 # 1st loop + nt=24 + elif [ ${sys} = "wahlberg" ]; then jc_nt=1 # 5th loop @@ -59,8 +76,10 @@ elif [ ${sys} = "wahlberg" ]; then fi # Threadedness to test. -threads="st mt" # st mt" -threads_r="st mt" # mt" +#threads="mt" +#threads_r="mt" +threads="st" +threads_r="st" # Datatypes to test. dts="z c" @@ -82,6 +101,14 @@ elif [ ${sys} = "stampede" ]; then test_impls="openblas mkl asm_blis 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis 1m_blis" #test_impls="openblas mkl asm_blis" +elif [ ${sys} = "lonestar" ]; then + + test_impls="asm_blis 4mhw_blis 4m1a_blis 1m_blis 3m1_blis" + #test_impls="1m_blis 3m1_blis" + #test_impls="4m1a_blis" + #test_impls="mkl" + #test_impls="openblas mkl asm_blis" + elif [ ${sys} = "wahlberg" ]; then test_impls="openblas acml asm_blis 3mhw_blis 3m3_blis 3m2_blis 3m1_blis 4mhw_blis 4m1b_blis 4m1a_blis 1m_blis" @@ -90,7 +117,8 @@ fi # Real domain implementations to test. #test_impls_r="openblas mkl asm_blis" -test_impls_r="openblas asm_blis" +test_impls_r="asm_blis" +#test_impls_r="" # First perform real test cases. for th in ${threads_r}; do @@ -112,10 +140,11 @@ for th in ${threads_r}; do # Unset GOMP_CPU_AFFINITY for MKL when using mkl_intel_thread. 
#if [ ${im} = "mkl" ]; then - # + # export GOMP_CPU_AFFINITY="" + # export MKL_NUM_THREADS=${nt} #else - # export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" + # export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23" #fi else @@ -124,7 +153,6 @@ for th in ${threads_r}; do export BLIS_JR_NT=1 export BLIS_IR_NT=1 export OMP_NUM_THREADS=1 - #export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" fi # Construct the name of the test executable. @@ -165,10 +193,10 @@ for th in ${threads}; do # Unset GOMP_CPU_AFFINITY for MKL when using mkl_intel_thread. #if [ ${im} = "mkl" ]; then - # + # export GOMP_CPU_AFFINITY="" #else - # export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" + # export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23" #fi else @@ -177,7 +205,6 @@ for th in ${threads}; do export BLIS_JR_NT=1 export BLIS_IR_NT=1 export OMP_NUM_THREADS=1 - #export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" fi # Construct the name of the test executable. diff --git a/test/3m4m/test_gemm.c b/test/3m4m/test_gemm.c index 7b16f584f..1f9ea036c 100644 --- a/test/3m4m/test_gemm.c +++ b/test/3m4m/test_gemm.c @@ -49,6 +49,7 @@ int main( int argc, char** argv ) dim_t p; dim_t p_begin, p_end, p_inc; int m_input, n_input, k_input; + ind_t ind; num_t dt; char dt_ch; int r, n_repeats; @@ -70,6 +71,8 @@ int main( int argc, char** argv ) dt = DT; + ind = IND; + p_begin = P_BEGIN; p_end = P_END; p_inc = P_INC; @@ -78,12 +81,21 @@ int main( int argc, char** argv ) n_input = -1; k_input = -1; -#if 0 + + // Suppress compiler warnings about unused variable 'ind'. + ( void )ind; + +#if 1 cntx_t cntx; + ind_t ind_mod = ind; + + // A hack to use 3m1 as 1mpb (with 1m as 1mbp). + if ( ind == BLIS_3M1 ) ind_mod = BLIS_1M; + // Initialize a context for the current induced method and datatype. 
- bli_gemmind_cntx_init( IND, dt, &cntx ); + bli_gemmind_cntx_init( ind_mod, dt, &cntx ); // Set k to the kc blocksize for the current datatype. k_input = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, &cntx ); @@ -157,7 +169,7 @@ int main( int argc, char** argv ) #ifdef BLIS bli_ind_disable_all_dt( dt ); - bli_ind_enable_dt( IND, dt ); + bli_ind_enable_dt( ind, dt ); #endif dtime_save = DBL_MAX;