Files
blis/test/3/runme.sh
Nicholai Tukanov b426f9e04e POWER9 DGEMM (#355)
Implemented and registered power9 dgemm ukernel.

Details:
- Implemented 12x6 dgemm microkernel for power9. This microkernel 
  assumes that elements of B have been duplicated/broadcast during the
  packing step. The microkernel uses a column orientation for its 
  microtile vector registers and thus implements column storage and 
  general stride IO cases. (A row storage IO case via in-register
  transposition may be added at a future date.) It should be noted that 
  we recommend using this microkernel with gcc and *not* xlc, as issues 
  with the latter cropped up during development, including but not 
  limited to slightly incompatible vector register mnemonics in the GNU 
  extended inline assembly clobber list.
2019-11-01 17:57:03 -05:00

230 lines
6.3 KiB
Bash
Executable File

#!/bin/bash

# Filename prefixes: the test drivers are named "${exec_root}_*.x" and their
# output is redirected to "${out_root}_*.m".
exec_root="test"
out_root="output"

# Argument passed to 'sleep' after each driver run.
delay=0.1

# The system being benchmarked. This selects the CPU affinity list and the
# threading configurations set up below.
sys="blis"
#sys="stampede2"
#sys="lonestar5"
#sys="ul252"
#sys="ul264"

# Bind threads to processors.
#export OMP_PROC_BIND=true
#export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 16 18 20 22 1 3 5 7 9 11 13 15 17 19 21 23"
#export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103"

# Per-system setup. 'threads' holds one configuration per line, each of the
# form "<threading parameter string>_<problem size>", for example
# "jc2ic3jr2_6000". An unrecognized 'sys' leaves 'threads' unset.
case "${sys}" in
  blis)
    export GOMP_CPU_AFFINITY="0 1 2 3"
    threads=$'jc1ic1jr1_2400\njc2ic3jr2_6000\njc4ic3jr2_8000'
    ;;
  stampede2)
    echo "Need to set GOMP_CPU_AFFINITY."
    exit 1
    # Not reached while the 'exit' above is in place; kept as the intended
    # configuration once an affinity list has been filled in.
    threads=$'jc1ic1jr1_2400\njc4ic6jr1_6000\njc4ic12jr1_8000'
    ;;
  lonestar5)
    export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23"
    # A hack to use libiomp5 with gcc.
    #export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/apps/intel/16.0.1.150/compilers_and_libraries_2016.1.150/linux/compiler/lib/intel64"
    threads=$'jc1ic1jr1_2400\njc2ic3jr2_6000\njc4ic3jr2_8000'
    ;;
  ul252)
    export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64"
    export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51"
    threads=$'jc1ic1jr1_2400\njc2ic13jr1_6000\njc4ic13jr1_8000'
    ;;
  ul264)
    export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64"
    export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63"
    threads=$'jc1ic1jr1_2400\njc1ic8jr4_6000\njc2ic8jr4_8000'
    ;;
esac
# Datatypes to test. Available: d s z c
test_dts="d"

# Operations to test. Available: gemm hemm herk trmm trsm
# NOTE: '#' only starts a comment at the beginning of a word. A '#' placed
# directly after a closing quote becomes part of the value, and any quote
# character in the would-be comment then opens an unterminated string.
test_ops="gemm"

# Implementations to test. "blis", "eigen", and "other" select the subsets
# below; any other value (e.g. "all") selects openblas, asm_blis, and vendor.
#impls="all"
#impls="other"
impls="blis"

case "${impls}" in
  blis)  test_impls="asm_blis" ;;
  eigen) test_impls="eigen" ;;
  other) test_impls="openblas vendor" ;;
  *)     test_impls="openblas asm_blis vendor" ;;
esac
# Save a copy of GOMP_CPU_AFFINITY so that if we have to unset it, we can
# restore the value.
GOMP_CPU_AFFINITYsave=${GOMP_CPU_AFFINITY}

#######################################
# Decode one threading configuration string such as "jc2ic3jr2_6000".
# Globals:
#   psize  (written) - the text after the last underscore (problem size)
#   jc_nt, pc_nt, ic_nt, jr_nt, ir_nt (written) - ways of parallelism for
#          each loop; loops not named in the string are left at 1
#   nt     (written) - product of all ways, i.e. the total thread count
# Arguments:
#   $1 - configuration string "<loop><ways>[<loop><ways>...]_<psize>"
# Returns:
#   0 on success; 1 (with a message on stderr) if a loop name or count is
#   not recognized.
#######################################
parse_thread_spec() {
  local spec=$1
  local thinfo thsep loopnum loop num
  local -r loop_re='^(jc|pc|ic|jr|ir)$'
  local -r num_re='^[0-9]+$'

  # Start with one way of parallelism in each loop.
  jc_nt=1; pc_nt=1; ic_nt=1; jr_nt=1; ir_nt=1
  nt=1

  # Strip everything before and after the underscore so that what remains
  # is the problem size and threading parameter string, respectively.
  psize=${spec##*_}
  thinfo=${spec%%_*}

  # Identify each threading parameter and insert a space before it so the
  # string can be word-split into "<loop><ways>" tokens.
  thsep=$(sed -e 's/\([jip][cr]\)/ \1/g' <<<"${thinfo}")

  for loopnum in ${thsep}; do
    # Separate the loop name from its number of ways.
    loop=${loopnum//[0-9]/}
    num=${loopnum//[a-z]/}

    if [[ ! ${loop} =~ ${loop_re} || ! ${num} =~ ${num_re} ]]; then
      echo "Skipping malformed threading configuration '${spec}'." >&2
      return 1
    fi

    # Assign the ways to the matching <loop>_nt variable (no eval needed).
    printf -v "${loop}_nt" '%s' "${num}"

    # Update the total number of threads. The 10# prefix keeps counts with
    # leading zeros from being read as octal.
    nt=$(( nt * 10#${num} ))
  done
}

#######################################
# Run every selected test driver for every threading configuration.
# Globals:
#   threads, test_dts, test_impls, test_ops (read) - whitespace-separated
#          lists that are iterated over
#   exec_root, out_root, delay (read) - filename prefixes and sleep argument
#   GOMP_CPU_AFFINITYsave (read) - value restored into GOMP_CPU_AFFINITY
#   BLIS_*_NT, OMP_NUM_THREADS, OPENBLAS_NUM_THREADS, MKL_NUM_THREADS,
#   GOMP_CPU_AFFINITY, nt_use (exported or unset per run)
# Outputs:
#   Progress messages on stdout; each driver's stdout is written to
#   "${out_root}_<suf>_<dt><op>_<im>.m" in the current directory.
#######################################
run_all_tests() {
  local -a candidates

  # First perform real test cases.
  for th in ${threads}; do
    parse_thread_spec "${th}" || continue

    echo "Switching to: jc${jc_nt} pc${pc_nt} ic${ic_nt} jr${jr_nt} ir${ir_nt} (nt = ${nt}) p_max${psize}"

    for dt in ${test_dts}; do
      for im in ${test_impls}; do
        for op in ${test_ops}; do

          # Eigen does not support multithreading for hemm, herk, trmm,
          # or trsm. So if we're getting ready to execute an Eigen driver
          # for one of these operations and nt > 1, we skip this test.
          if [[ "${im}" == "eigen" && "${op}" != "gemm" && "${nt}" != "1" ]]; then
            continue
          fi

          # Find the threading suffix by probing for the executable with a
          # glob. An unmatched glob stays literal, so test for existence and
          # skip rather than truncating an output file for a missing driver.
          candidates=( "${exec_root}_${dt}${op}_${psize}_${im}_"*.x )
          if [[ ! -e "${candidates[0]}" ]]; then
            echo "Skipping ${exec_root}_${dt}${op}_${psize}_${im}_*.x: no matching executable found." >&2
            continue
          fi

          # Exactly one match is expected; if there are several, the last
          # one determines the suffix.
          binname=${candidates[${#candidates[@]}-1]}
          suf_ext=${binname##*_}
          suf=${suf_ext%%.*}
          #echo "found file: ${binname} with suffix ${suf}"

          # Set the number of threads according to th. Drivers with a "1s"
          # or "2s" suffix get the multithreaded settings; any other suffix
          # is run with every threading knob set to 1.
          if [[ "${suf}" == "1s" || "${suf}" == "2s" ]]; then

            # Set the threading parameters based on the implementation
            # that we are preparing to run.
            case "${im}" in
              asm_blis)
                unset OMP_NUM_THREADS
                export BLIS_JC_NT=${jc_nt}
                export BLIS_PC_NT=${pc_nt}
                export BLIS_IC_NT=${ic_nt}
                export BLIS_JR_NT=${jr_nt}
                export BLIS_IR_NT=${ir_nt}
                ;;
              openblas)
                unset OMP_NUM_THREADS
                export OPENBLAS_NUM_THREADS=${nt}
                ;;
              eigen)
                export OMP_NUM_THREADS=${nt}
                ;;
              vendor)
                unset OMP_NUM_THREADS
                export MKL_NUM_THREADS=${nt}
                ;;
            esac
            export nt_use=${nt}

            # Multithreaded OpenBLAS seems to have a problem running
            # properly if GOMP_CPU_AFFINITY is set. So we temporarily
            # unset it here if we are about to execute OpenBLAS, but
            # otherwise restore it.
            if [[ "${im}" == "openblas" ]]; then
              unset GOMP_CPU_AFFINITY
            else
              export GOMP_CPU_AFFINITY="${GOMP_CPU_AFFINITYsave}"
            fi
          else
            export BLIS_JC_NT=1
            export BLIS_PC_NT=1
            export BLIS_IC_NT=1
            export BLIS_JR_NT=1
            export BLIS_IR_NT=1
            export OMP_NUM_THREADS=1
            export OPENBLAS_NUM_THREADS=1
            export MKL_NUM_THREADS=1
            export nt_use=1
          fi

          # Construct the name of the test executable.
          exec_name="${exec_root}_${dt}${op}_${psize}_${im}_${suf}.x"

          # Construct the name of the output file.
          out_file="${out_root}_${suf}_${dt}${op}_${im}.m"

          #echo "Running (nt = ${nt_use}) ./${exec_name} > ${out_file}"
          echo "Running ./${exec_name} > ${out_file}"

          # Run executable.
          "./${exec_name}" > "${out_file}"

          sleep "${delay}"
        done
      done
    done
  done
}

run_all_tests