mirror of
https://github.com/amd/blis.git
synced 2026-05-11 17:50:00 +00:00
Details: - Use compile-time implementations of Eigen in test_gemm.c via new EIGEN cpp macro, defined on command line. (Linking to Eigen's BLAS library is not necessary.) However, as of Eigen 3.3.7, Eigen only parallelizes the gemm operation and not hemm, herk, trmm, trsm, or any other level-3 operation. - Fixed a bug in trmm and trsm drivers whereby the wrong function (bli_does_trans()) was being called to determine whether the object for matrix A should be created for a left- or right-side case. This was corrected by changing the function to bli_is_left(), as is done in the hemm driver. - Added support for running Eigen test drivers from runme.sh.
231 lines
6.3 KiB
Bash
Executable File
231 lines
6.3 KiB
Bash
Executable File
#!/bin/bash
|
|
|
|
# File pefixes.
|
|
exec_root="test"
|
|
out_root="output"
|
|
delay=0.1
|
|
|
|
sys="blis"
|
|
#sys="stampede2"
|
|
#sys="lonestar5"
|
|
#sys="ul252"
|
|
#sys="ul264"
|
|
|
|
# Bind threads to processors.
|
|
#export OMP_PROC_BIND=true
|
|
#export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 16 18 20 22 1 3 5 7 9 11 13 15 17 19 21 23"
|
|
#export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103"
|
|
|
|
if [ ${sys} = "blis" ]; then
|
|
|
|
export GOMP_CPU_AFFINITY="0 1 2 3"
|
|
|
|
threads="jc1ic1jr1_2400
|
|
jc2ic3jr2_6000
|
|
jc4ic3jr2_8000"
|
|
|
|
elif [ ${sys} = "stampede2" ]; then
|
|
|
|
echo "Need to set GOMP_CPU_AFFINITY."
|
|
exit 1
|
|
|
|
threads="jc1ic1jr1_2400
|
|
jc4ic6jr1_6000
|
|
jc4ic12jr1_8000"
|
|
|
|
elif [ ${sys} = "lonestar5" ]; then
|
|
|
|
export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23"
|
|
|
|
# A hack to use libiomp5 with gcc.
|
|
#export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/apps/intel/16.0.1.150/compilers_and_libraries_2016.1.150/linux/compiler/lib/intel64"
|
|
|
|
threads="jc1ic1jr1_2400
|
|
jc2ic3jr2_6000
|
|
jc4ic3jr2_8000"
|
|
|
|
elif [ ${sys} = "ul252" ]; then
|
|
|
|
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64"
|
|
export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51"
|
|
|
|
threads="jc1ic1jr1_2400
|
|
jc2ic13jr1_6000
|
|
jc4ic13jr1_8000"
|
|
|
|
elif [ ${sys} = "ul264" ]; then
|
|
|
|
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64"
|
|
export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63"
|
|
|
|
threads="jc1ic1jr1_2400
|
|
jc1ic8jr4_6000
|
|
jc2ic8jr4_8000"
|
|
|
|
fi
|
|
|
|
# Datatypes to test.
|
|
test_dts="d s z c"
|
|
|
|
# Operations to test.
|
|
test_ops="gemm hemm herk trmm trsm"
|
|
|
|
# Implementations to test.
|
|
#impls="blis"
|
|
#impls="other"
|
|
impls="eigen"
|
|
#impls="all"
|
|
|
|
if [ "${impls}" = "blis" ]; then
|
|
|
|
test_impls="asm_blis"
|
|
|
|
elif [ "${impls}" = "eigen" ]; then
|
|
|
|
test_impls="eigen"
|
|
|
|
elif [ "${impls}" = "other" ]; then
|
|
|
|
test_impls="openblas vendor"
|
|
|
|
elif [ "${impls}" = "eigen" ]; then
|
|
|
|
test_impls="eigen"
|
|
|
|
else
|
|
|
|
test_impls="openblas asm_blis vendor"
|
|
fi
|
|
|
|
# Save a copy of GOMP_CPU_AFFINITY so that if we have to unset it, we can
|
|
# restore the value.
|
|
GOMP_CPU_AFFINITYsave=${GOMP_CPU_AFFINITY}
|
|
|
|
|
|
# First perform real test cases.
|
|
for th in ${threads}; do
|
|
|
|
# Start with one way of parallelism in each loop. We will now begin
|
|
# parsing the 'th' variable to update one or more of these threading
|
|
# parameters.
|
|
jc_nt=1; pc_nt=1; ic_nt=1; jr_nt=1; ir_nt=1
|
|
|
|
# Strip everything before and after the underscore so that what remains
|
|
# is the problem size and threading parameter string, respectively.
|
|
psize=${th##*_}; thinfo=${th%%_*}
|
|
|
|
# Identify each threading parameter and insert a space before it.
|
|
thsep=$(echo -e ${thinfo} | sed -e "s/\([jip][cr]\)/ \1/g" )
|
|
|
|
nt=1
|
|
|
|
for loopnum in ${thsep}; do
|
|
|
|
# Given the current string, which identifies a loop and the
|
|
# number of ways of parallelism for that loop, strip out
|
|
# the ways and loop separately to identify each.
|
|
loop=$(echo -e ${loopnum} | sed -e "s/[0-9]//g" )
|
|
num=$(echo -e ${loopnum} | sed -e "s/[a-z]//g" )
|
|
|
|
# Construct a string that we can evaluate to set the number
|
|
# of ways of parallelism for the current loop.
|
|
loop_nt_eq_num="${loop}_nt=${num}"
|
|
|
|
# Update the total number of threads.
|
|
nt=$(expr ${nt} \* ${num})
|
|
|
|
# Evaluate the string to assign the ways to the variable.
|
|
eval ${loop_nt_eq_num}
|
|
|
|
done
|
|
|
|
echo "Switching to: jc${jc_nt} pc${pc_nt} ic${ic_nt} jr${jr_nt} ir${ir_nt} (nt = ${nt}) p_max${psize}"
|
|
|
|
|
|
for dt in ${test_dts}; do
|
|
|
|
for im in ${test_impls}; do
|
|
|
|
for op in ${test_ops}; do
|
|
|
|
# Eigen does not support multithreading for hemm, herk, trmm,
|
|
# or trsm. So if we're getting ready to execute an Eigen driver
|
|
# for one of these operations and nt > 1, we skip this test.
|
|
if [ "${im}" = "eigen" ] && \
|
|
[ "${op}" != "gemm" ] && \
|
|
[ "${nt}" != "1" ]; then
|
|
continue;
|
|
fi
|
|
|
|
# Find the threading suffix by probing the executable.
|
|
binname=$(ls ${exec_root}_${dt}${op}_${psize}_${im}_*.x)
|
|
suf_ext=${binname##*_}
|
|
suf=${suf_ext%%.*}
|
|
|
|
#echo "found file: ${binname} with suffix ${suf}"
|
|
|
|
# Set the number of threads according to th.
|
|
if [ "${suf}" = "1s" ] || [ "${suf}" = "2s" ]; then
|
|
|
|
# Set the threading parameters based on the implementation
|
|
# that we are preparing to run.
|
|
if [ "${im}" = "asm_blis" ]; then
|
|
unset OMP_NUM_THREADS
|
|
export BLIS_JC_NT=${jc_nt}
|
|
export BLIS_PC_NT=${pc_nt}
|
|
export BLIS_IC_NT=${ic_nt}
|
|
export BLIS_JR_NT=${jr_nt}
|
|
export BLIS_IR_NT=${ir_nt}
|
|
elif [ "${im}" = "openblas" ]; then
|
|
unset OMP_NUM_THREADS
|
|
export OPENBLAS_NUM_THREADS=${nt}
|
|
elif [ "${im}" = "eigen" ]; then
|
|
export OMP_NUM_THREADS=${nt}
|
|
elif [ "${im}" = "vendor" ]; then
|
|
unset OMP_NUM_THREADS
|
|
export MKL_NUM_THREADS=${nt}
|
|
fi
|
|
export nt_use=${nt}
|
|
|
|
# Multithreaded OpenBLAS seems to have a problem running
|
|
# properly if GOMP_CPU_AFFINITY is set. So we temporarily
|
|
# unset it here if we are about to execute OpenBLAS, but
|
|
# otherwise restore it.
|
|
if [ ${im} = "openblas" ]; then
|
|
unset GOMP_CPU_AFFINITY
|
|
else
|
|
export GOMP_CPU_AFFINITY="${GOMP_CPU_AFFINITYsave}"
|
|
fi
|
|
else
|
|
|
|
export BLIS_JC_NT=1
|
|
export BLIS_PC_NT=1
|
|
export BLIS_IC_NT=1
|
|
export BLIS_JR_NT=1
|
|
export BLIS_IR_NT=1
|
|
export OMP_NUM_THREADS=1
|
|
export OPENBLAS_NUM_THREADS=1
|
|
export MKL_NUM_THREADS=1
|
|
export nt_use=1
|
|
fi
|
|
|
|
# Construct the name of the test executable.
|
|
exec_name="${exec_root}_${dt}${op}_${psize}_${im}_${suf}.x"
|
|
|
|
# Construct the name of the output file.
|
|
out_file="${out_root}_${suf}_${dt}${op}_${im}.m"
|
|
|
|
#echo "Running (nt = ${nt_use}) ./${exec_name} > ${out_file}"
|
|
echo "Running ./${exec_name} > ${out_file}"
|
|
|
|
# Run executable.
|
|
./${exec_name} > ${out_file}
|
|
|
|
sleep ${delay}
|
|
|
|
done
|
|
done
|
|
done
|
|
done
|
|
|