mirror of
https://github.com/amd/blis.git
synced 2026-04-20 15:48:50 +00:00
Implemented and registered power9 dgemm ukernel. Details: - Implemented 12x6 dgemm microkernel for power9. This microkernel assumes that elements of B have been duplicated/broadcast during the packing step. The microkernel uses a column orientation for its microtile vector registers and thus implements column storage and general stride IO cases. (A row storage IO case via in-register transposition may be added at a future date.) It should be noted that we recommend using this microkernel with gcc and *not* xlc, as issues with the latter cropped up during development, including but not limited to slightly incompatible vector register mnemonics in the GNU extended inline assembly clobber list.
230 lines
6.3 KiB
Bash
Executable File
230 lines
6.3 KiB
Bash
Executable File
#!/bin/bash
|
|
|
|
# File pefixes.
|
|
exec_root="test"
|
|
out_root="output"
|
|
delay=0.1
|
|
|
|
sys="blis"
|
|
#sys="stampede2"
|
|
#sys="lonestar5"
|
|
#sys="ul252"
|
|
#sys="ul264"
|
|
|
|
# Bind threads to processors.
|
|
#export OMP_PROC_BIND=true
|
|
#export GOMP_CPU_AFFINITY="0 2 4 6 8 10 12 14 16 18 20 22 1 3 5 7 9 11 13 15 17 19 21 23"
|
|
#export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103"
|
|
|
|
if [ ${sys} = "blis" ]; then
|
|
|
|
export GOMP_CPU_AFFINITY="0 1 2 3"
|
|
|
|
threads="jc1ic1jr1_2400
|
|
jc2ic3jr2_6000
|
|
jc4ic3jr2_8000"
|
|
|
|
elif [ ${sys} = "stampede2" ]; then
|
|
|
|
echo "Need to set GOMP_CPU_AFFINITY."
|
|
exit 1
|
|
|
|
threads="jc1ic1jr1_2400
|
|
jc4ic6jr1_6000
|
|
jc4ic12jr1_8000"
|
|
|
|
elif [ ${sys} = "lonestar5" ]; then
|
|
|
|
export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23"
|
|
|
|
# A hack to use libiomp5 with gcc.
|
|
#export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/apps/intel/16.0.1.150/compilers_and_libraries_2016.1.150/linux/compiler/lib/intel64"
|
|
|
|
threads="jc1ic1jr1_2400
|
|
jc2ic3jr2_6000
|
|
jc4ic3jr2_8000"
|
|
|
|
elif [ ${sys} = "ul252" ]; then
|
|
|
|
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64"
|
|
export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51"
|
|
|
|
threads="jc1ic1jr1_2400
|
|
jc2ic13jr1_6000
|
|
jc4ic13jr1_8000"
|
|
|
|
elif [ ${sys} = "ul264" ]; then
|
|
|
|
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/field/intel/mkl/lib/intel64"
|
|
export GOMP_CPU_AFFINITY="0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63"
|
|
|
|
threads="jc1ic1jr1_2400
|
|
jc1ic8jr4_6000
|
|
jc2ic8jr4_8000"
|
|
|
|
fi
|
|
|
|
# Datatypes to test.
|
|
test_dts="d " #s z c"
|
|
|
|
# Operations to test.
|
|
test_ops="gemm "#hemm herk trmm trsm"
|
|
|
|
# Implementations to test.
|
|
#impls="all"
|
|
#impls="other"
|
|
impls="blis"
|
|
|
|
if [ "${impls}" = "blis" ]; then
|
|
|
|
test_impls="asm_blis"
|
|
|
|
elif [ "${impls}" = "eigen" ]; then
|
|
|
|
test_impls="eigen"
|
|
|
|
elif [ "${impls}" = "other" ]; then
|
|
|
|
test_impls="openblas vendor"
|
|
|
|
elif [ "${impls}" = "eigen" ]; then
|
|
|
|
test_impls="eigen"
|
|
|
|
else
|
|
|
|
test_impls="openblas asm_blis vendor"
|
|
fi
|
|
|
|
# Save a copy of GOMP_CPU_AFFINITY so that if we have to unset it, we can
|
|
# restore the value.
|
|
GOMP_CPU_AFFINITYsave=${GOMP_CPU_AFFINITY}
|
|
|
|
|
|
# First perform real test cases.
|
|
for th in ${threads}; do
|
|
|
|
# Start with one way of parallelism in each loop. We will now begin
|
|
# parsing the 'th' variable to update one or more of these threading
|
|
# parameters.
|
|
jc_nt=1; pc_nt=1; ic_nt=1; jr_nt=1; ir_nt=1
|
|
|
|
# Strip everything before and after the underscore so that what remains
|
|
# is the problem size and threading parameter string, respectively.
|
|
psize=${th##*_}; thinfo=${th%%_*}
|
|
|
|
# Identify each threading parameter and insert a space before it.
|
|
thsep=$(echo -e ${thinfo} | sed -e "s/\([jip][cr]\)/ \1/g" )
|
|
|
|
nt=1
|
|
|
|
for loopnum in ${thsep}; do
|
|
|
|
# Given the current string, which identifies a loop and the
|
|
# number of ways of parallelism for that loop, strip out
|
|
# the ways and loop separately to identify each.
|
|
loop=$(echo -e ${loopnum} | sed -e "s/[0-9]//g" )
|
|
num=$(echo -e ${loopnum} | sed -e "s/[a-z]//g" )
|
|
|
|
# Construct a string that we can evaluate to set the number
|
|
# of ways of parallelism for the current loop.
|
|
loop_nt_eq_num="${loop}_nt=${num}"
|
|
|
|
# Update the total number of threads.
|
|
nt=$(expr ${nt} \* ${num})
|
|
|
|
# Evaluate the string to assign the ways to the variable.
|
|
eval ${loop_nt_eq_num}
|
|
|
|
done
|
|
|
|
echo "Switching to: jc${jc_nt} pc${pc_nt} ic${ic_nt} jr${jr_nt} ir${ir_nt} (nt = ${nt}) p_max${psize}"
|
|
|
|
|
|
for dt in ${test_dts}; do
|
|
|
|
for im in ${test_impls}; do
|
|
|
|
for op in ${test_ops}; do
|
|
|
|
# Eigen does not support multithreading for hemm, herk, trmm,
|
|
# or trsm. So if we're getting ready to execute an Eigen driver
|
|
# for one of these operations and nt > 1, we skip this test.
|
|
if [ "${im}" = "eigen" ] && \
|
|
[ "${op}" != "gemm" ] && \
|
|
[ "${nt}" != "1" ]; then
|
|
continue;
|
|
fi
|
|
|
|
# Find the threading suffix by probing the executable.
|
|
binname=$(ls ${exec_root}_${dt}${op}_${psize}_${im}_*.x)
|
|
suf_ext=${binname##*_}
|
|
suf=${suf_ext%%.*}
|
|
|
|
#echo "found file: ${binname} with suffix ${suf}"
|
|
|
|
# Set the number of threads according to th.
|
|
if [ "${suf}" = "1s" ] || [ "${suf}" = "2s" ]; then
|
|
|
|
# Set the threading parameters based on the implementation
|
|
# that we are preparing to run.
|
|
if [ "${im}" = "asm_blis" ]; then
|
|
unset OMP_NUM_THREADS
|
|
export BLIS_JC_NT=${jc_nt}
|
|
export BLIS_PC_NT=${pc_nt}
|
|
export BLIS_IC_NT=${ic_nt}
|
|
export BLIS_JR_NT=${jr_nt}
|
|
export BLIS_IR_NT=${ir_nt}
|
|
elif [ "${im}" = "openblas" ]; then
|
|
unset OMP_NUM_THREADS
|
|
export OPENBLAS_NUM_THREADS=${nt}
|
|
elif [ "${im}" = "eigen" ]; then
|
|
export OMP_NUM_THREADS=${nt}
|
|
elif [ "${im}" = "vendor" ]; then
|
|
unset OMP_NUM_THREADS
|
|
export MKL_NUM_THREADS=${nt}
|
|
fi
|
|
export nt_use=${nt}
|
|
|
|
# Multithreaded OpenBLAS seems to have a problem running
|
|
# properly if GOMP_CPU_AFFINITY is set. So we temporarily
|
|
# unset it here if we are about to execute OpenBLAS, but
|
|
# otherwise restore it.
|
|
if [ ${im} = "openblas" ]; then
|
|
unset GOMP_CPU_AFFINITY
|
|
else
|
|
export GOMP_CPU_AFFINITY="${GOMP_CPU_AFFINITYsave}"
|
|
fi
|
|
else
|
|
|
|
export BLIS_JC_NT=1
|
|
export BLIS_PC_NT=1
|
|
export BLIS_IC_NT=1
|
|
export BLIS_JR_NT=1
|
|
export BLIS_IR_NT=1
|
|
export OMP_NUM_THREADS=1
|
|
export OPENBLAS_NUM_THREADS=1
|
|
export MKL_NUM_THREADS=1
|
|
export nt_use=1
|
|
fi
|
|
|
|
# Construct the name of the test executable.
|
|
exec_name="${exec_root}_${dt}${op}_${psize}_${im}_${suf}.x"
|
|
|
|
# Construct the name of the output file.
|
|
out_file="${out_root}_${suf}_${dt}${op}_${im}.m"
|
|
|
|
#echo "Running (nt = ${nt_use}) ./${exec_name} > ${out_file}"
|
|
echo "Running ./${exec_name} > ${out_file}"
|
|
|
|
# Run executable.
|
|
./${exec_name} > ${out_file}
|
|
|
|
sleep ${delay}
|
|
|
|
done
|
|
done
|
|
done
|
|
done
|
|
|