mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 10:09:41 +00:00
Reorganize project folders (#6)
This commit is contained in:
17
script/check_copyright_year.sh
Executable file
17
script/check_copyright_year.sh
Executable file
@@ -0,0 +1,17 @@
|
||||
#!/bin/bash
# For every file given on the command line that contains a "Copyright (c)"
# notice, verify that the notice also mentions the current year.
# Exit status: 1 if at least one file fails the check, 0 otherwise.

current_year=$(date +%Y)
exit_code=0

# "$@" and "$file" are quoted so that paths containing spaces or glob
# characters are checked as-is instead of being split or expanded.
for file in "$@"; do
    if grep -q -e "Copyright (c)" -- "$file"
    then
        if ! grep -q -e "Copyright (c).*$current_year" -- "$file"
        then
            echo "ERROR: File $file has a copyright notice without the current year ($current_year)."
            exit_code=1
        fi
    fi
done

exit $exit_code
|
||||
2
script/clang-format-overwrite.sh
Executable file
2
script/clang-format-overwrite.sh
Executable file
@@ -0,0 +1,2 @@
|
||||
# Reformat in place every C/C++/HIP/OpenCL source below the current directory,
# skipping the deps/ and build/ trees. The explicit -print keeps the pruned
# directories themselves out of the list, and -type f keeps other directories out.
find . \( -name deps -o -name build \) -prune -o -type f \( -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' \) -print | xargs -P 16 -I{} -t sh -c 'clang-format-12 -i -style=file {}'
# Reformat the files git reports as changed (deleted ones excluded). The
# regex is anchored to the file extension; the previous "\\.cpp|hpp|inc"
# matched "hpp" or "inc" anywhere in the path, e.g. every file under include/.
git status --porcelain | awk '$1 != "D" && $2 ~ /\.(cpp|hpp|inc)$/ {print $2}' | xargs -P 16 -I{} -t sh -c 'clang-format-12 -i -style=file {}'
|
||||
27
script/cmake-ck-dev.sh
Executable file
27
script/cmake-ck-dev.sh
Executable file
@@ -0,0 +1,27 @@
|
||||
#!/bin/bash
# Configure a developer build (BUILD_DEV=ON) in the current directory.
# Usage: cmake-ck-dev.sh <source-dir> [<gpu-targets> [extra cmake args...]]

if [ $# -lt 1 ] ; then
    echo "Usage: $0 <source-dir> [<gpu-targets> [extra cmake args...]]" >&2
    exit 1
fi

# Remove leftovers of a previous configuration.
rm -f CMakeCache.txt
rm -f *.cmake
rm -rf CMakeFiles

MY_PROJECT_SOURCE=$1

if [ $# -ge 2 ] ; then
    GPU_TARGETS=$2
    shift 2
    # Keep the remaining arguments in an array so that an argument containing
    # spaces reaches cmake as a single word.
    REST_ARGS=("$@")
else
    GPU_TARGETS="gfx908;gfx90a;gfx942"
    REST_ARGS=()
fi

cmake \
-D CMAKE_PREFIX_PATH=/opt/rocm/ \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_CXX_FLAGS="-std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \
-D CMAKE_BUILD_TYPE=Release \
-D BUILD_DEV=ON \
-D GPU_TARGETS="$GPU_TARGETS" \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
-D USE_BITINT_EXTENSION_INT4=OFF \
"${REST_ARGS[@]}" \
"${MY_PROJECT_SOURCE}"
|
||||
28
script/cmake-ck-release.sh
Executable file
28
script/cmake-ck-release.sh
Executable file
@@ -0,0 +1,28 @@
|
||||
#!/bin/bash
# Configure a release build (BUILD_DEV=OFF) in the current directory.
# Usage: cmake-ck-release.sh <source-dir> [<gpu-targets> [extra cmake args...]]

if [ $# -lt 1 ] ; then
    echo "Usage: $0 <source-dir> [<gpu-targets> [extra cmake args...]]" >&2
    exit 1
fi

# Remove leftovers of a previous configuration.
rm -f CMakeCache.txt
rm -f *.cmake
rm -rf CMakeFiles

MY_PROJECT_SOURCE=$1

if [ $# -ge 2 ] ; then
    GPU_TARGETS=$2
    shift 2
    # Keep the remaining arguments in an array so that an argument containing
    # spaces reaches cmake as a single word.
    REST_ARGS=("$@")
else
    GPU_TARGETS="gfx908;gfx90a;gfx942"
    REST_ARGS=()
fi

cmake \
-D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_CXX_FLAGS="-O3" \
-D CMAKE_BUILD_TYPE=Release \
-D BUILD_DEV=OFF \
-D GPU_TARGETS="$GPU_TARGETS" \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
-D USE_BITINT_EXTENSION_INT4=OFF \
"${REST_ARGS[@]}" \
"${MY_PROJECT_SOURCE}"
|
||||
|
||||
413
script/convert_miopen_driver_to_profiler.py
Normal file
413
script/convert_miopen_driver_to_profiler.py
Normal file
@@ -0,0 +1,413 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
# Convert miopen driver command to ck Profiler
|
||||
# Example: python3 ../script/convert_miopen_driver_to_profiler.py
|
||||
# /opt/rocm/bin/MIOpenDriver conv -n 32 -c 64 -H 28 -W 28 -k 64 -y 3 -x 3
|
||||
# -p 1 -q 1 -u 2 -v 2 -l 1 -j 1 -m conv -g 32 -F 1 -t 1
|
||||
|
||||
import argparse
|
||||
import subprocess
|
||||
|
||||
|
||||
def init_const_args(args):
    """Attach the fixed ckProfiler settings to ``args`` in place.

    Sets the profiler executable path, the initialisation method and the
    tensor-logging switch; nothing is returned.
    """
    fixed_settings = (
        ('ck_profiler_cmd', '../build/bin/ckProfiler'),
        # use decimal values
        ('init_method', 2),
        # don't print tensor values
        ('log_value', 0),
    )
    for attr_name, attr_value in fixed_settings:
        setattr(args, attr_name, attr_value)
|
||||
|
||||
|
||||
def run_ck_profiler_cmd(cmd):
    """Echo the ckProfiler command line and then execute it.

    ``cmd`` is a list of string arguments; it is handed to
    ``subprocess.run`` unchanged and the process result is not inspected.
    """
    print("ckProfiler command:")
    # Each argument is followed by one space, so the echoed line keeps its
    # trailing space.
    print("".join(piece + " " for piece in cmd))
    subprocess.run(cmd)
|
||||
|
||||
|
||||
def parse_layouts(args):
    """Translate the MIOpen input layout into the ckProfiler layout code.

    Reads ``args.in_layout`` and ``args.ck_profier_op`` and stores the
    numeric code in ``args.layout``.

    Exits with status 1 when the operation has no code for the layout, or
    when the layout is not one of the recognised names. The latter case used
    to fall through silently and fail later with an AttributeError on
    ``args.layout``.
    """
    if args.in_layout in ("NCW", "NCHW", "NCDHW"):
        codes_by_op = {
            "grouped_conv_bwd_weight": 4,
            "grouped_conv_fwd": 3,
            "grouped_conv_bwd_data": 3,
        }
    elif args.in_layout in ("NWC", "NHWC", "NDHWC"):
        codes_by_op = {
            "grouped_conv_bwd_weight": 2,
            "grouped_conv_bwd_data": 1,
            "grouped_conv_fwd": 1,
        }
    else:
        print('Not supported layout (supported: NCW, NCHW, NCDHW, NWC, NHWC,'
              ' NDHWC)')
        raise SystemExit(1)

    if args.ck_profier_op not in codes_by_op:
        print('Not supported layout for this op')
        raise SystemExit(1)
    args.layout = codes_by_op[args.ck_profier_op]
|
||||
|
||||
|
||||
def parse_data_type(args):
    """Replace ``args.data_type`` with the ckProfiler code for the current op.

    The code depends on both the MIOpen data type name and
    ``args.ck_profier_op``. Because ``args.data_type`` is overwritten with
    the numeric code, the original name is remembered in
    ``args.miopen_data_type``; otherwise a second call for another operation
    on the same namespace would see the number from the first call and keep
    it, even where that operation needs a different code.

    Unknown type names, and operations without an entry, leave
    ``args.data_type`` unchanged. int8 with ``grouped_conv_bwd_data`` is
    rejected with exit status 1.
    """
    codes = {
        "fp32": {
            "grouped_conv_bwd_weight": 0,
            "grouped_conv_bwd_data": 0,
            "grouped_conv_fwd": 0,
        },
        "fp16": {
            "grouped_conv_bwd_weight": 1,
            "grouped_conv_bwd_data": 1,
            "grouped_conv_fwd": 1,
        },
        "int8": {
            "grouped_conv_bwd_weight": 4,
            "grouped_conv_fwd": 3,
        },
        "bfp16": {
            "grouped_conv_bwd_weight": 5,
            "grouped_conv_bwd_data": 2,
            "grouped_conv_fwd": 2,
        },
    }

    # A string means the caller supplied a (new) type name; anything else is
    # a code written by an earlier call, so fall back to the remembered name.
    if isinstance(args.data_type, str):
        args.miopen_data_type = args.data_type
    type_name = getattr(args, "miopen_data_type", args.data_type)

    if type_name == "int8" and args.ck_profier_op == "grouped_conv_bwd_data":
        print('Not supported data type for grouped_conv_bwd_data')
        raise SystemExit(1)

    op_codes = codes.get(type_name, {})
    if args.ck_profier_op in op_codes:
        args.data_type = op_codes[args.ck_profier_op]
|
||||
|
||||
|
||||
def add_conv_params_to_cmd(args, cmd):
    """Append the spatial convolution parameters of ``args`` to ``cmd``.

    For the active axes (w / h,w / d,h,w for 1, 2 and 3 spatial dimensions)
    the values are appended, as strings, in the order: filter sizes, input
    sizes, strides, dilations, then the pad values twice. ``cmd`` is
    modified in place. Any other ``args.spatial_dim`` prints a message and
    exits with status 1.
    """
    axes_by_dim = {1: ("w",), 2: ("h", "w"), 3: ("d", "h", "w")}
    if args.spatial_dim not in axes_by_dim:
        print('Not supported spatial dim (supported: 1, 2, 3)')
        exit(1)

    axes = axes_by_dim[args.spatial_dim]
    # "pad_" is listed twice on purpose: the same pad value is emitted twice.
    for prefix in ("fil_", "in_", "conv_stride_", "dilation_", "pad_", "pad_"):
        cmd.extend(str(getattr(args, prefix + axis)) for axis in axes)
|
||||
|
||||
|
||||
def run_ck_grouped_conv_fwd(args):
    """Build the ckProfiler ``grouped_conv_fwd`` command from ``args`` and run it.

    Sets ``args.ck_profier_op`` and ``args.index_type`` and lets the parse
    helpers fill in the data-type and layout codes before the command is
    assembled.
    """
    args.ck_profier_op = "grouped_conv_fwd"
    parse_data_type(args)
    parse_layouts(args)
    # use int32 by default
    args.index_type = 0

    leading_fields = (
        args.ck_profiler_cmd, args.ck_profier_op,
        args.data_type, args.layout, args.index_type,
        args.verify, args.init_method,
        args.log_value, args.time,
        args.spatial_dim, args.group_count,
        args.batchsize, args.out_channels,
        args.in_channels,
    )
    cmd = [str(field) for field in leading_fields]
    add_conv_params_to_cmd(args, cmd)

    run_ck_profiler_cmd(cmd)
|
||||
|
||||
|
||||
def run_ck_grouped_conv_bwd_data(args):
    """Build the ckProfiler ``grouped_conv_bwd_data`` command from ``args`` and run it.

    Sets ``args.ck_profier_op`` and ``args.split_k_value``; the split-K value
    is appended after the convolution parameters.
    """
    args.ck_profier_op = "grouped_conv_bwd_data"
    parse_data_type(args)
    parse_layouts(args)
    # Test all split K value from the list {1, 2, 4, 8, 32, 64, 128}
    args.split_k_value = -1

    leading_fields = (
        args.ck_profiler_cmd, args.ck_profier_op,
        args.data_type, args.layout,
        args.verify, args.init_method,
        args.log_value, args.time,
        args.spatial_dim, args.group_count,
        args.batchsize, args.out_channels,
        args.in_channels,
    )
    cmd = [str(field) for field in leading_fields]
    add_conv_params_to_cmd(args, cmd)
    cmd.append(str(args.split_k_value))

    run_ck_profiler_cmd(cmd)
|
||||
|
||||
|
||||
def run_ck_grouped_conv_bwd_weight(args):
    """Build the ckProfiler ``grouped_conv_bwd_weight`` command from ``args`` and run it.

    Sets ``args.ck_profier_op`` and ``args.split_k_value``; the split-K value
    is appended after the convolution parameters.
    """
    args.ck_profier_op = "grouped_conv_bwd_weight"
    parse_data_type(args)
    parse_layouts(args)
    # Test all split K value from the list {1, 2, 4, 8, 32, 64, 128}
    args.split_k_value = -1

    leading_fields = (
        args.ck_profiler_cmd, args.ck_profier_op,
        args.data_type, args.layout,
        args.verify, args.init_method,
        args.log_value, args.time,
        args.spatial_dim, args.group_count,
        args.batchsize, args.out_channels,
        args.in_channels,
    )
    cmd = [str(field) for field in leading_fields]
    add_conv_params_to_cmd(args, cmd)
    cmd.append(str(args.split_k_value))

    run_ck_profiler_cmd(cmd)
|
||||
|
||||
# Get name of miopen driver, remove it from unknown
|
||||
def process_miopen_driver_name(args, unknown):
    """Derive ``args.data_type`` from the MIOpen driver name found in ``unknown``.

    The first driver name present (checked in the order convint8, convbfp16,
    convfp16, conv) selects the data type and is removed from ``unknown``.
    If none is present a message is printed and the script exits with
    status 1.
    """
    driver_to_type = (
        ("convint8", 'int8'),
        ("convbfp16", 'bfp16'),
        ("convfp16", 'fp16'),
        ("conv", 'fp32'),
    )
    for driver_name, type_name in driver_to_type:
        if driver_name in unknown:
            args.data_type = type_name
            unknown.remove(driver_name)
            return

    print('Not supported driver (supported: conv, convfp16, convint8,'
          ' convbfp16).')
    exit(1)
|
||||
|
||||
|
||||
def run_ck_profiler(args):
    """Run the ckProfiler passes selected by ``args.forw``.

    The channel counts in ``args`` are first divided by the group count
    (in place); then the forward, backward-data and backward-weight runs are
    started, in that order, for the ``forw`` values that select them.
    """
    # MIOpen get number of channel per all groups, CK profiler get number of
    # channel per group
    args.in_channels = int(args.in_channels / args.group_count)
    args.out_channels = int(args.out_channels / args.group_count)

    passes = (
        ((0, 1, 3, 5), run_ck_grouped_conv_fwd),
        ((0, 2, 3, 6), run_ck_grouped_conv_bwd_data),
        ((0, 4, 5, 6), run_ck_grouped_conv_bwd_weight),
    )
    for selecting_values, run_pass in passes:
        if args.forw in selecting_values:
            run_pass(args)
|
||||
|
||||
|
||||
# One row per supported MIOpenDriver option:
# (long flag, short flag, value type, default, help text).
_CONV_OPTIONS = (
    ("-in_layout", "-I", str, "NCHW",
     "Input Layout (Default=NCHW for 2d conv, NCDHW for 3d conv)"),
    ("-forw", "-F", int, 0,
     "Flag enables fwd, bwd, wrw convolutions"
     "\n0 fwd+bwd+wrw (default)"
     "\n1 fwd only"
     "\n2 bwd only"
     "\n4 wrw only"
     "\n3 fwd+bwd"
     "\n5 fwd+wrw"
     "\n6 bwd+wrw"),
    ("-spatial_dim", "-_", int, 2,
     "convolution spatial dimension (Default-2)"),
    ("-batchsize", "-n", int, 100, "Mini-batch size (Default=100)"),
    ("-in_channels", "-c", int, 3, "Number of Input Channels (Default=3)"),
    ("-in_d", "-!", int, 32, "Input Depth (Default=32)"),
    ("-in_h", "-H", int, 32, "Input Height (Default=32)"),
    ("-in_w", "-W", int, 32, "Input Width (Default=32)"),
    ("-out_channels", "-k", int, 32,
     "Number of Output Channels (Default=32)"),
    ("-fil_d", "-@", int, 3, "Filter Depth (Default=3)"),
    ("-fil_h", "-y", int, 3, "Filter Height (Default=3)"),
    ("-fil_w", "-x", int, 3, "Filter Width (Default=3)"),
    ("-conv_stride_d", "-#", int, 1,
     "Convolution Stride for Depth (Default=1)"),
    ("-conv_stride_h", "-u", int, 1,
     "Convolution Stride for Height (Default=1)"),
    ("-conv_stride_w", "-v", int, 1,
     "Convolution Stride for Width (Default=1)"),
    # The pad defaults were 1 although the help text documents 0; they now
    # match the documented value, so an omitted pad flag means "no padding".
    ("-pad_d", "-$", int, 0, "Zero Padding for Depth (Default=0)"),
    ("-pad_h", "-p", int, 0, "Zero Padding for Height (Default=0)"),
    ("-pad_w", "-q", int, 0, "Zero Padding for Width (Default=0)"),
    ("-verify", "-V", int, 1, "Verify Each Layer (Default=1)"),
    ("-time", "-t", int, 0, "Time Each Layer (Default=0)"),
    ("-dilation_d", "-^", int, 1, "Dilation of Filter Depth (Default=1)"),
    ("-dilation_h", "-l", int, 1, "Dilation of Filter Height (Default=1)"),
    ("-dilation_w", "-j", int, 1, "Dilation of Filter Width (Default=1)"),
    ("-group_count", "-g", int, 1, "Number of Groups (Default=1)"),
)


def build_arg_parser():
    """Return the argument parser that accepts MIOpenDriver conv options.

    Every option is optional and is registered under both its long and its
    short spelling; the attribute name on the parsed namespace is the long
    flag without the leading dash.
    """
    parser = argparse.ArgumentParser(
        prog="converter",
        description="Convert miopen driver command to ck Profiler"
                    "\nExample: python3 "
                    "../script/convert_miopen_driver_to_profiler.py "
                    "/opt/rocm/bin/MIOpenDriver conv -n 32 -c 64 -H 28 -W 28 "
                    "-k 64 -y 3 -x 3 -p 1 -q 1 -u 1 -v 1 -l 1 -j 1 -m conv -g "
                    "32 -F 1 -t 1",
    )
    for long_flag, short_flag, value_type, default, help_text in _CONV_OPTIONS:
        parser.add_argument(
            long_flag,
            short_flag,
            default=default,
            type=value_type,
            required=False,
            help=help_text
        )
    return parser


if __name__ == "__main__":
    # Options the parser does not know (including the driver path and the
    # driver name) end up in `unknown`.
    args, unknown = build_arg_parser().parse_known_args()
    init_const_args(args)
    process_miopen_driver_name(args, unknown)
    print("Ignored args:")
    print(unknown)
    run_ck_profiler(args)
|
||||
20
script/count_vgpr.sh
Executable file
20
script/count_vgpr.sh
Executable file
@@ -0,0 +1,20 @@
|
||||
#!/bin/bash
# For every register index 0..255, print how many lines of the given file
# mention the registers sN, vN and aN (either directly, as "[N", or as the
# upper end of a "[M:N]" range). Lines with a non-zero count are marked
# with " *".
# Usage: count_vgpr.sh <file>
FILE=$1

for num in {0..255}
do
    base_pattern="(\[?${num}\b|\[\d*:${num}\])"
    spattern="s${base_pattern}"
    vpattern="v${base_pattern}"
    apattern="a${base_pattern}"
    # The patterns contain [, ? and *, so they must be quoted to keep the
    # shell from treating them as filename globs.
    scount=$(grep -P -e "$spattern" -- "$FILE" | wc -l)
    vcount=$(grep -P -e "$vpattern" -- "$FILE" | wc -l)
    acount=$(grep -P -e "$apattern" -- "$FILE" | wc -l)
    printf 'v%s %s, s%s %s, a%s %s' \
        "$num" "$vcount" "$num" "$scount" "$num" "$acount"
    if [[ $scount -ne 0 || $vcount -ne 0 || $acount -ne 0 ]]; then
        echo -n " *"
    fi
    echo ""
done
|
||||
7
script/hip_fatbin_insert
Normal file
7
script/hip_fatbin_insert
Normal file
@@ -0,0 +1,7 @@
|
||||
/* Collect all input .hipFatBinSegment sections into one output section
   that is inserted after .bss. */
SECTIONS {
.hipFatBinSegment : { *(.hipFatBinSegment) }
} INSERT AFTER .bss

/* Collect all input .hip_fatbin sections into one output section that is
   inserted after the .hipFatBinSegment section defined above. */
SECTIONS {
.hip_fatbin : { *(.hip_fatbin) }
} INSERT AFTER .hipFatBinSegment
|
||||
25
script/hipclang_opt.sh
Executable file
25
script/hipclang_opt.sh
Executable file
@@ -0,0 +1,25 @@
|
||||
# Disassemble an LLVM bitcode file and produce .ll / .s files after each of
# the inline, sroa and O3 passes (code generation targets gfx906).
# Usage: hipclang_opt.sh <bitcode-file>

if [ $# -lt 1 ]; then
    echo "Usage: $0 <bitcode-file>" >&2
    exit 1
fi

# -f: do not complain when there is nothing left over from a previous run.
rm -f *.ll *.s

BC_FILE=$1

/opt/rocm/llvm/bin/llvm-dis "$BC_FILE" -o original.ll
/opt/rocm/llvm/bin/opt -S -inline -inline-threshold=104857 original.ll > inline.ll
/opt/rocm/llvm/bin/opt -S -sroa inline.ll > sroa.ll
/opt/rocm/llvm/bin/opt -S -O3 sroa.ll > o3.ll

/opt/rocm/llvm/bin/llc -mcpu=gfx906 original.ll
/opt/rocm/llvm/bin/llc -mcpu=gfx906 inline.ll
/opt/rocm/llvm/bin/llc -mcpu=gfx906 sroa.ll
/opt/rocm/llvm/bin/llc -mcpu=gfx906 o3.ll
|
||||
|
||||
#/opt/rocm/llvm/bin/opt -S -O3 -sroa inline.ll > o3.ll
|
||||
#/opt/rocm/llvm/bin/opt -S -O3 -sroa o3.ll > o3_2.ll
|
||||
#/opt/rocm/llvm/bin/opt -S -O3 -sroa o3_2.ll > o3_3.ll
|
||||
#/opt/rocm/llvm/bin/opt -S -O3 -sroa o3_3.ll > o3_4.ll
|
||||
|
||||
#/opt/rocm/llvm/bin/llc -mcpu=gfx908 opt.ll
|
||||
#/opt/rocm/llvm/bin/llc -mcpu=gfx908 inline.ll
|
||||
#/opt/rocm/llvm/bin/llc -mcpu=gfx908 o3.ll
|
||||
#/opt/rocm/llvm/bin/llc -mcpu=gfx908 o3_2.ll
|
||||
#/opt/rocm/llvm/bin/llc -mcpu=gfx908 o3_3.ll
|
||||
#/opt/rocm/llvm/bin/llc -mcpu=gfx908 o3_4.ll
|
||||
20
script/install_precommit.sh
Executable file
20
script/install_precommit.sh
Executable file
@@ -0,0 +1,20 @@
|
||||
#!/bin/bash

# Run the command given as arguments. On failure, report it and terminate
# the whole script with the command's exit status.
run_and_check() {
    "$@"
    status=$?
    if [ "$status" -eq 0 ]; then
        return 0
    fi
    echo "Error with \"$*\": Exited with status $status"
    exit "$status"
}

echo "I: Installing tools required for pre-commit checks..."
run_and_check apt install clang-format-12

echo "I: Installing pre-commit itself..."
run_and_check pip3 install pre-commit
run_and_check pre-commit install

echo "I: Installation successful."
|
||||
382
script/process_perf_data.py
Normal file
382
script/process_perf_data.py
Normal file
@@ -0,0 +1,382 @@
|
||||
#!/usr/bin/env python3
|
||||
import os, io, argparse, datetime
|
||||
#import numpy as np
|
||||
import sqlalchemy
|
||||
from sqlalchemy.types import NVARCHAR, Float, Integer
|
||||
from sqlalchemy import text
|
||||
import pymysql
|
||||
import pandas as pd
|
||||
from sshtunnel import SSHTunnelForwarder
|
||||
|
||||
def print_to_string(*args, **kwargs):
    """Return the text that ``print(*args, **kwargs)`` would have written."""
    with io.StringIO() as buffer:
        print(*args, file=buffer, **kwargs)
        return buffer.getvalue()
|
||||
|
||||
def parse_args():
    """Parse the command line and resolve the list of log files to process.

    The single positional argument is either a log file or a directory. For
    a directory, every entry whose name contains 'log' is collected (joined
    with the directory path); otherwise the argument itself is the only
    file. The list is stored on the returned namespace as ``files``.
    """
    parser = argparse.ArgumentParser(description='Parse results from tf benchmark runs')
    parser.add_argument('filename', type=str, help='Log file to prase or directory containing log files')
    args = parser.parse_args()

    target = args.filename
    if os.path.isdir(target):
        args.files = [os.path.join(target, entry)
                      for entry in os.listdir(target)
                      if 'log' in entry]
    else:
        args.files = [target]
    return args
|
||||
|
||||
def get_log_params(logfile):
    """Extract the run metadata recorded in a performance log.

    Each line is searched for a set of marker strings; for most markers the
    third whitespace-separated token of the line is the value. The ROCm
    version is cut out of the second token of the 'InstalledDir' line
    (the text between '/opt/rocm-' and '/llvm/bin'). If a marker occurs more
    than once, the last occurrence wins. Marker lines with too few tokens
    are ignored instead of raising IndexError.

    Returns the tuple (branch_name, node_id, gpu_arch, compute_units,
    rocm_vers, hip_vers, environment). Values not found keep their
    defaults: ' ' for the strings and 0 for compute_units.
    """
    print("logfile=",logfile)
    params = {
        'branch_name': ' ',
        'node_id': ' ',
        'gpu_arch': ' ',
        'hip_vers': ' ',
        'compute_units': 0,
        'environment': ' ',
        'rocm_vers': ' ',
    }
    # (marker searched for in the line, parameter taken from the third token)
    third_token_markers = (
        ('Branch name', 'branch_name'),
        ('On branch', 'branch_name'),
        ('Node name', 'node_id'),
        ('GPU_arch', 'gpu_arch'),
        ('HIP version', 'hip_vers'),
        ('Compute Unit', 'compute_units'),
        ('Environment type', 'environment'),
    )
    rocm_prefix = '/opt/rocm-'

    # The with-block closes the log file; it used to be left open.
    with open(logfile) as log:
        for line in log:
            fields = line.split()
            for marker, key in third_token_markers:
                if marker in line and len(fields) > 2:
                    params[key] = fields[2]
            if 'InstalledDir' in line and len(fields) > 1:
                path = fields[1]
                start = path.find(rocm_prefix) + len(rocm_prefix)
                params['rocm_vers'] = path[start:path.rfind('/llvm/bin')]

    return (params['branch_name'], params['node_id'], params['gpu_arch'],
            params['compute_units'], params['rocm_vers'],
            params['hip_vers'], params['environment'])
|
||||
|
||||
def parse_logfile(logfile):
    """Return the list of performance figures found in ``logfile``.

    The kind of benchmark is recognised from substrings of the log file
    name, which decides which lines are read and which token holds the
    figure:

    * 'perf_gemm' (but not 'gemm_bilinear'): 'Best Perf' lines, token 33;
      the figures are returned sorted by the test description (tokens
      5..29 joined together).
    * 'conv_fwd' / 'conv_bwd': lines containing 'tflops:', token 1.
    * 'resnet50', 'batched_gemm', 'grouped_gemm', 'gemm_bilinear',
      'reduction': 'Best Perf' lines, token 4.
    * 'onnx_gemm': 'Best Perf' lines, token 33.
    * 'splitK_gemm' / 'mixed_gemm': 'Best Perf' lines, token 36.
    * 'perf_fmha', 'perf_tile_gemm_basic', 'perf_tile_gemm_mem_pipeline':
      lines containing 'TFlops', the token preceding 'TFlops,'.

    Any other file name yields an empty list. Values are returned as the
    strings found in the log.
    """
    def tokenized_lines(marker):
        # Tokens of every line containing `marker`; the with-block closes
        # the log file, which used to be left open.
        with open(logfile) as log:
            return [line.split() for line in log if marker in line]

    if 'perf_gemm' in logfile and 'gemm_bilinear' not in logfile:
        tests = []
        tflops = []
        for lst in tokenized_lines('Best Perf'):
            # Token 33 is the tflops value, so at least 34 tokens are
            # needed; the former ">= 33" check raised IndexError for a
            # line of exactly 33 tokens.
            if len(lst) >= 34:
                tests.append(''.join(lst[5:30]))
                tflops.append(lst[33])
                if len(lst) < 37:  # the kernel name is missing
                    print("warning: incomplete line:",lst)
            else:  # even the tflops are not available
                print("Error in ckProfiler output!")
                print("warning: incomplete line=",lst)
        # sort results by test description
        return [x for _, x in sorted(zip(tests, tflops))]

    # parse conv_fwd and conv_bwd performance tests:
    if 'conv_fwd' in logfile or 'conv_bwd' in logfile:
        return [lst[1] for lst in tokenized_lines('tflops:')]

    # parse all other performance tests:
    best_perf_index = None
    if any(tag in logfile for tag in ('resnet50', 'batched_gemm',
                                      'grouped_gemm', 'gemm_bilinear',
                                      'reduction')):
        best_perf_index = 4
    elif 'onnx_gemm' in logfile:
        best_perf_index = 33
    elif 'splitK_gemm' in logfile or 'mixed_gemm' in logfile:
        best_perf_index = 36
    if best_perf_index is not None:
        return [lst[best_perf_index] for lst in tokenized_lines('Best Perf')]

    if any(tag in logfile for tag in ('perf_fmha', 'perf_tile_gemm_basic',
                                      'perf_tile_gemm_mem_pipeline')):
        # dict(zip(lst[1:], lst)) maps every token to the token before it,
        # so the lookup returns the number printed in front of 'TFlops,'.
        return [dict(zip(lst[1:], lst))['TFlops,']
                for lst in tokenized_lines('TFlops')]

    return []
|
||||
|
||||
|
||||
def get_baseline(table, connection):
    """Read the most recent 'develop' rows of ``table`` into a DataFrame.

    Selects every row whose Datetime equals the latest Datetime recorded
    for Branch_ID 'develop'.
    """
    statement = ("SELECT * from {0} WHERE Datetime = "
                 "(SELECT MAX(Datetime) FROM {0} where Branch_ID='develop' );"
                 ).format(table)
    return pd.read_sql(text(statement), connection)
|
||||
|
||||
def store_new_test_result(table_name, test_results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, connection):
    """Append one row with the run metadata and test results to ``table_name``.

    The row consists of the metadata columns (with the current time as
    'Datetime') followed by one column per entry of ``testlist`` holding the
    matching value of ``test_results``. Always returns 0.
    """
    meta_columns = ['Branch_ID','Node_ID','GPU_arch','Compute Units','ROCM_version','HIP_version','Environment','Datetime']
    meta_values = [
        str(branch_name),
        str(node_id),
        str(gpu_arch),
        compute_units,
        str(rocm_vers),
        str(hip_vers),
        str(environment),
        str(datetime.datetime.now()),
    ]
    meta_frame = pd.DataFrame(data=[meta_values], columns=meta_columns)
    results_frame = pd.DataFrame(data=[test_results], columns=testlist)
    row = pd.concat([meta_frame, results_frame], axis=1)
    row.to_sql(table_name, connection, if_exists='append', index=False)
    return 0
|
||||
|
||||
def compare_test_to_baseline(baseline,test,testlist):
    """Compare new results with the baseline and report regressions.

    ``baseline`` is a DataFrame whose first row, restricted to the
    ``testlist`` columns, holds the reference values; ``test`` holds the new
    values in the same order. A test counts as a regression when the
    reference exceeds 1.01 times the new value. The mean ratio new/reference
    (summed over tests with a positive reference, divided by the number of
    tests) is printed as well.

    Returns 1 if any regression was found, otherwise 0 (also when the
    baseline is empty).
    """
    if baseline.empty:
        print("could not find a baseline")
        return 0

    reference = baseline[testlist].to_numpy(dtype='float')[0]
    found_regression = 0
    ratio_sum = 0
    for idx, ref_value in enumerate(reference):
        measured = float(test[idx])
        # success criterion:
        if ref_value > 1.01*measured:
            print("test # ",idx,"shows regression by {:.3f}%".format(
                (measured-ref_value)/ref_value*100))
            found_regression = 1
        if ref_value > 0:
            ratio_sum = ratio_sum + measured/ref_value
    if found_regression == 0:
        print("no regressions found")
    mean_ratio = ratio_sum/len(reference)
    print("average performance relative to baseline:",mean_ratio)
    return found_regression
|
||||
|
||||
'''
|
||||
def post_test_params(tlist,connection):
|
||||
sorted_dtypes = [x for _,x in sorted(zip(tests,dtype))]
|
||||
sorted_alayout = [x for _,x in sorted(zip(tests,alayout))]
|
||||
sorted_blayout = [x for _,x in sorted(zip(tests,blayout))]
|
||||
sorted_M = [x for _,x in sorted(zip(tests,M))]
|
||||
sorted_N = [x for _,x in sorted(zip(tests,N))]
|
||||
sorted_K = [x for _,x in sorted(zip(tests,K))]
|
||||
sorted_StrideA = [x for _,x in sorted(zip(tests,StrideA))]
|
||||
sorted_StrideB = [x for _,x in sorted(zip(tests,StrideB))]
|
||||
sorted_StrideC = [x for _,x in sorted(zip(tests,StrideC))]
|
||||
ck_gemm_params=[tlist,sorted_dtypes,sorted_alayout,sorted_blayout,
|
||||
sorted_M,sorted_N,sorted_K,sorted_StrideA,sorted_StrideB,
|
||||
sorted_StrideC]
|
||||
df=pd.DataFrame(np.transpose(ck_gemm_params),columns=['Test_number','Data_type',
|
||||
'Alayout','BLayout','M','N','K', 'StrideA','StrideB','StrideC'])
|
||||
print(df)
|
||||
|
||||
dtypes = {
|
||||
'Test_number': Integer(),
|
||||
'Data_type': NVARCHAR(length=5),
|
||||
'Alayout': NVARCHAR(length=12),
|
||||
'Blayout': NVARCHAR(length=12),
|
||||
'M': Integer(),
|
||||
'N': Integer(),
|
||||
'K': Integer(),
|
||||
'StrideA': Integer(),
|
||||
'StrideB': Integer(),
|
||||
'StrideC': Integer()
|
||||
}
|
||||
df.to_sql("ck_gemm_test_params",connection,if_exists='replace',index=False, dtype=dtypes)
|
||||
'''
|
||||
|
||||
# (substring looked for in the log file name, results table) pairs, in the
# order in which they are checked; when several match, the last one wins.
_RESULT_TABLES = (
    ('perf_gemm', 'ck_gemm_tflops'),
    ('batched_gemm', 'ck_batched_gemm_tflops'),
    ('grouped_gemm', 'ck_grouped_gemm_tflops'),
    ('perf_conv_fwd', 'ck_conv_fwd_tflops'),
    ('perf_conv_bwd_data', 'ck_conv_bwd_data_tflops'),
    ('grouped_conv_fwd', 'ck_grouped_conv_fwd_tflops'),
    ('grouped_conv_bwd_data', 'ck_grouped_conv_bwd_data_tflops'),
    ('grouped_conv_bwd_weight', 'ck_grouped_conv_bwd_weight_tflops'),
    ('gemm_bilinear', 'ck_gemm_bilinear_tflops'),
    ('reduction', 'ck_reduction_GBps'),
    ('resnet50_N4', 'ck_resnet50_N4_tflops'),
    ('resnet50_N256', 'ck_resnet50_N256_tflops'),
    ('onnx_gemm', 'ck_onnx_gemm_tflops'),
    ('splitK_gemm', 'ck_splitK_gemm_tflops'),
    ('mixed_gemm', 'ck_mixed_gemm_tflops'),
    ('fmha_fwd', 'ck_fmha_fwd_tflops'),
    ('fmha_bwd', 'ck_fmha_bwd_tflops'),
    ('gemm_basic_fp16', 'ck_tile_gemm_basic_fp16_tflops'),
    ('gemm_mem_pipeline_fp16', 'ck_tile_gemm_mem_pipeline_fp16_tflops'),
    ('gemm_basic_bf16', 'ck_tile_gemm_basic_bf16_tflops'),
    ('gemm_mem_pipeline_bf16', 'ck_tile_gemm_mem_pipeline_bf16_tflops'),
    ('gemm_basic_fp8', 'ck_tile_gemm_basic_fp8_tflops'),
    ('gemm_mem_pipeline_fp8', 'ck_tile_gemm_mem_pipeline_fp8_tflops'),
    ('gemm_basic_bf8', 'ck_tile_gemm_basic_bf8_tflops'),
    ('gemm_mem_pipeline_bf8', 'ck_tile_gemm_mem_pipeline_bf8_tflops'),
)


def results_table_for(filename, num_results):
    """Return ``(table_name, column_names)`` for the given log file name.

    The table is chosen from substrings of ``filename``. The resnet50 tables
    always use the 49 columns Layer1..Layer49; every other table uses
    Test1..Test<num_results>.

    Raises ValueError when the file name matches no known benchmark.
    Previously that case failed later with an unbound ``table_name``.
    """
    table_name = None
    for tag, candidate in _RESULT_TABLES:
        if tag not in filename:
            continue
        # plain gemm logs must not be confused with gemm_bilinear logs
        if tag == 'perf_gemm' and 'gemm_bilinear' in filename:
            continue
        table_name = candidate
    if table_name is None:
        raise ValueError("no results table known for log file: " + filename)

    if 'resnet50' in table_name:
        columns = ["Layer%i" % i for i in range(1, 50)]
    else:
        columns = ["Test%i" % i for i in range(1, num_results + 1)]
    return table_name, columns


def main():
    """Parse a performance log, store the results and compare with the baseline.

    The database is reached through an SSH tunnel; credentials and hosts are
    read from the environment variables dbuser, dbpassword, ck_perf_db,
    dbsship, dbsshuser, dbsshport and dbsshpassword.

    Returns the regression flag produced by compare_test_to_baseline.
    """
    args = parse_args()
    if not args.files:
        raise SystemExit("no log files found in " + args.filename)

    #parse the test parameters from the logfile
    # NOTE(review): indentation was lost in the copy this was rebuilt from;
    # the loop is taken to cover only this call, so when a directory is
    # given only the last file is processed further - confirm.
    for filename in args.files:
        branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment = get_log_params(filename)

    print("Branch name:",branch_name)
    print("Node name:",node_id)
    print("GPU_arch:",gpu_arch)
    print("Compute units:",compute_units)
    print("ROCM_version:",rocm_vers)
    print("HIP_version:",hip_vers)
    print("Environment:",environment)
    #parse results, get the Tflops value for "Best Perf" kernels
    results = parse_logfile(filename)

    print("Number of tests:",len(results))
    # Resolve the table before opening the tunnel, so that an unknown log
    # name fails early.
    table_name, testlist = results_table_for(filename, len(results))

    sql_hostname = '127.0.0.1'
    sql_username = os.environ["dbuser"]
    sql_password = os.environ["dbpassword"]
    sql_main_database = os.environ["ck_perf_db"]
    sql_port = 3306
    ssh_host = os.environ["dbsship"]
    ssh_user = os.environ["dbsshuser"]
    ssh_port = int(os.environ["dbsshport"])
    ssh_pass = os.environ["dbsshpassword"]

    with SSHTunnelForwarder(
            (ssh_host, ssh_port),
            ssh_username=ssh_user,
            ssh_password=ssh_pass,
            remote_bind_address=(sql_hostname, sql_port)) as tunnel:

        sqlEngine = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}:{3}/{4}'.
            format(sql_username, sql_password, sql_hostname, tunnel.local_bind_port, sql_main_database))
        conn = sqlEngine.connect()
        try:
            # The baseline is read before the new row is appended.
            tflops_base = get_baseline(table_name, conn)
            store_new_test_result(table_name, results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, sqlEngine)
        finally:
            # close the connection even if reading or storing failed
            conn.close()

    #compare the results to the baseline if baseline exists
    return compare_test_to_baseline(tflops_base, results, testlist)
|
||||
|
||||
# Entry point: run main() only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
54
script/process_perf_data.sh
Executable file
54
script/process_perf_data.sh
Executable file
@@ -0,0 +1,54 @@
|
||||
#!/bin/bash
#
# Feed the performance logs found in the current directory to
# process_perf_data.py, one log per invocation.
#
# Python packages needed to run this script:
#   pip3 install --upgrade pip
#   pip3 install sqlalchemy pymysql pandas sshtunnel
#
# Some environment variables must also be set up in order to post the new test
# results to the database and compare them to the baseline.
# Please contact Illia.Silin@amd.com for more details.

# Process a log only when it exists in the current directory.
# $1 - log file name (passed to python without the leading "./").
process_if_present() {
    if [ -e "./$1" ]; then
        python3 process_perf_data.py $1
    fi
}

# Logs that are always processed.
for log in perf_gemm.log perf_onnx_gemm.log perf_resnet50_N256.log perf_resnet50_N4.log; do
    python3 process_perf_data.py $log
done

# Optional per-architecture ONNX GEMM logs.
for arch in gfx10 gfx11 gfx12; do
    process_if_present perf_onnx_gemm_${arch}.log
done

# Optional FMHA logs: forward then backward for each architecture.
for arch in gfx942 gfx90a; do
    for direction in fwd bwd; do
        process_if_present perf_fmha_${direction}_${arch}.log
    done
done

# Optional ck_tile GEMM memory-pipeline logs.
for gpu in gfx90a gfx942; do
    for dtype in fp16 bf16 fp8 bf8; do
        process_if_present perf_tile_gemm_mem_pipeline_${dtype}_${gpu}.log
    done
done
|
||||
63
script/process_qa_data.sh
Executable file
63
script/process_qa_data.sh
Executable file
@@ -0,0 +1,63 @@
|
||||
#!/bin/bash
#
# Feed the QA performance logs found in the current directory to
# process_perf_data.py, one log per invocation.
#
# Python packages needed to run this script:
#   pip3 install --upgrade pip
#   pip3 install sqlalchemy pymysql pandas sshtunnel
#
# Some environment variables must also be set up in order to post the new test
# results to the database and compare them to the baseline.
# Please contact Illia.Silin@amd.com for more details.

# Process a log only when it exists in the current directory.
# $1 - log file name (passed to python without the leading "./").
process_if_present() {
    if [ -e "./$1" ]; then
        python3 process_perf_data.py $1
    fi
}

# Logs that are always processed, in this order.
for log in \
    perf_gemm.log \
    perf_resnet50_N256.log \
    perf_resnet50_N4.log \
    perf_batched_gemm.log \
    perf_grouped_gemm.log \
    perf_grouped_conv_fwd.log \
    perf_grouped_conv_bwd_data.log \
    perf_grouped_conv_bwd_weight.log \
    perf_gemm_bilinear.log \
    perf_reduction.log \
    perf_splitK_gemm.log \
    perf_onnx_gemm.log \
    perf_mixed_gemm.log
do
    python3 process_perf_data.py $log
done

# Optional per-architecture ONNX GEMM logs.
for arch in gfx10 gfx11 gfx12; do
    process_if_present perf_onnx_gemm_${arch}.log
done

# Optional FMHA logs: forward then backward for each architecture.
for arch in gfx942 gfx90a; do
    for direction in fwd bwd; do
        process_if_present perf_fmha_${direction}_${arch}.log
    done
done

# Optional ck_tile GEMM memory-pipeline logs.
for gpu in gfx90a gfx942; do
    for dtype in fp16 bf16 fp8 bf8; do
        process_if_present perf_tile_gemm_mem_pipeline_${dtype}_${gpu}.log
    done
done
|
||||
37
script/profile_batched_gemm.sh
Executable file
37
script/profile_batched_gemm.sh
Executable file
@@ -0,0 +1,37 @@
|
||||
#!/bin/bash
# Run ckProfiler over a fixed list of batched-GEMM problem sizes.
# Usage: profile_batched_gemm.sh <op> <datatype> <layout> <verify> <init> <log> <time>

## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"

OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7

# Invoke the profiler with the common leading arguments followed by one problem:
#   M N K StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount
# The common arguments are intentionally left unquoted, as in every direct call.
run_case() {
    $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME "$@"
}

# Default (-1) strides.
run_case  960 1024 1024 -1 -1 -1 -1 -1 -1 8
run_case 1920 2048 2048 -1 -1 -1 -1 -1 -1 8
run_case 3840 4096 4096 -1 -1 -1 -1 -1 -1 4
run_case 7680 8192 8192 -1 -1 -1 -1 -1 -1 2

# Explicit strides equal to the matrix dimension.
run_case 1024 1024 1024 1024 1024 1024 -1 -1 -1 8
run_case 2048 2048 2048 2048 2048 2048 -1 -1 -1 8
run_case 4096 4096 4096 4096 4096 4096 -1 -1 -1 4
run_case 8192 8192 8192 8192 8192 8192 -1 -1 -1 2

# Strides padded by 32 elements.
run_case 1024 1024 1024 1056 1056 1056 -1 -1 -1 8
run_case 2048 2048 2048 2080 2080 2080 -1 -1 -1 8
run_case 4096 4096 4096 4128 4128 4128 -1 -1 -1 4
run_case 8192 8192 8192 8224 8224 8224 -1 -1 -1 2

# Strides padded by 64 elements.
run_case 1024 1024 1024 1088 1088 1088 -1 -1 -1 8
run_case 2048 2048 2048 2112 2112 2112 -1 -1 -1 8
run_case 4096 4096 4096 4160 4160 4160 -1 -1 -1 4
run_case 8192 8192 8192 8256 8256 8256 -1 -1 -1 2
|
||||
58
script/profile_gemm.sh
Executable file
58
script/profile_gemm.sh
Executable file
@@ -0,0 +1,58 @@
|
||||
#!/bin/bash
# Run ckProfiler over a fixed list of GEMM problem sizes.
# Usage: profile_gemm.sh <op> <datatype> <layout> <verify> <init> <log> <time>

## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
echo $DRIVER

OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7

# Invoke the profiler with the common leading arguments followed by one problem:
#   M N K StrideA StrideB StrideC
# The common arguments are intentionally left unquoted, as in every direct call.
run_case() {
    $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME "$@"
}

# 120 CU
run_case  960 1024 1024 -1 -1 -1
run_case  960 2048 2048 -1 -1 -1
run_case 1920 1024 2048 -1 -1 -1
run_case 1920 2048 2048 -1 -1 -1
run_case 3840 4096 4096 -1 -1 -1
run_case 7680 8192 8192 -1 -1 -1

# 104 CU
run_case  832 1024 1024 -1 -1 -1
run_case  832 2048 2048 -1 -1 -1
run_case 1664 1024 2048 -1 -1 -1
run_case 1664 2048 2048 -1 -1 -1
run_case 3328 4096 4096 -1 -1 -1
run_case 6656 8192 8192 -1 -1 -1

# 110 CU
run_case 1280 1408 1024 -1 -1 -1
run_case 1280 2816 2048 -1 -1 -1
run_case 2560 1408 2048 -1 -1 -1
run_case 2560 2816 2048 -1 -1 -1
run_case 5120 5632 4096 -1 -1 -1
run_case 7040 8192 8192 -1 -1 -1

# testing different strides: equal to the dimension
run_case 1024 1024 1024 1024 1024 1024
run_case 2048 2048 2048 2048 2048 2048
run_case 4096 4096 4096 4096 4096 4096
run_case 8192 8192 8192 8192 8192 8192

# strides padded by 32 elements
run_case 1024 1024 1024 1056 1056 1056
run_case 2048 2048 2048 2080 2080 2080
run_case 4096 4096 4096 4128 4128 4128
run_case 8192 8192 8192 8224 8224 8224

# strides padded by 64 elements
run_case 1024 1024 1024 1088 1088 1088
run_case 2048 2048 2048 2112 2112 2112
run_case 4096 4096 4096 4160 4160 4160
run_case 8192 8192 8192 8256 8256 8256
|
||||
41
script/profile_gemm_bilinear.sh
Executable file
41
script/profile_gemm_bilinear.sh
Executable file
@@ -0,0 +1,41 @@
|
||||
#!/bin/bash
# Run ckProfiler over a fixed list of bilinear-GEMM problem sizes.
# Usage: profile_gemm_bilinear.sh <op> <datatype> <layout> <verify> <init> <log> <time>

## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"

OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7

# Invoke the profiler with the common leading arguments followed by one problem:
#   M N K StrideA StrideB StrideD StrideE Alpha Beta
# The common arguments are intentionally left unquoted, as in every direct call.
run_case() {
    $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME "$@"
}

# All strides left at -1.
run_case  960 1024 1024 -1 -1 -1 -1 1 1
run_case 1920 2048 2048 -1 -1 -1 -1 1 1
run_case 3840 4096 4096 -1 -1 -1 -1 1 1
run_case 7680 8192 8192 -1 -1 -1 -1 1 1

# Same sizes with StrideD = 0.
run_case  960 1024 1024 -1 -1 0 -1 1 1
run_case 1920 2048 2048 -1 -1 0 -1 1 1
run_case 3840 4096 4096 -1 -1 0 -1 1 1
run_case 7680 8192 8192 -1 -1 0 -1 1 1

# Sizes that are multiples of 1000, StrideD = 0.
run_case 1000 1000 1000 -1 -1 0 -1 1 1
run_case 2000 2000 2000 -1 -1 0 -1 1 1
run_case 4000 4000 4000 -1 -1 0 -1 1 1
run_case 8000 8000 8000 -1 -1 0 -1 1 1

# Strides padded by 32 elements.
run_case 1024 1024 1024 1056 1056 1056 1056 1 1
run_case 2048 2048 2048 2080 2080 2080 2080 1 1
run_case 4096 4096 4096 4128 4128 4128 4128 1 1
run_case 8192 8192 8192 8224 8224 8224 8224 1 1

# Strides padded by 64 elements.
run_case 1024 1024 1024 1088 1088 1088 1088 1 1
run_case 2048 2048 2048 2112 2112 2112 2112 1 1
run_case 4096 4096 4096 4160 4160 4160 4160 1 1
run_case 8192 8192 8192 8256 8256 8256 8256 1 1
|
||||
38
script/profile_grouped_conv_bwd_data.sh
Executable file
38
script/profile_grouped_conv_bwd_data.sh
Executable file
@@ -0,0 +1,38 @@
|
||||
#!/bin/bash
# Run ckProfiler over the Resnet50 layer list for grouped convolution (backward data).
# Usage: profile_grouped_conv_bwd_data.sh <op> <datatype> <layout> <verify> <init> <log> <time> <N>

## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"

OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7

N=$8

# Profile one 2D layer with G = 1 and batch size $N.
# Arguments: K C Y X Hi Wi Strides(2) Dilations(2) LeftPads(2) RightPads(2)
# The common arguments are intentionally left unquoted, as in every direct call.
run_layer() {
    $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N "$@"
}

# Resnet50
#########  K___ C___ Y X Hi_ Wi_ Strides Dilations LeftPads RightPads
run_layer  256 1024 1 1  14  14  1 1  1 1  0 0  0 0
run_layer  512 1024 1 1  14  14  1 1  1 1  0 0  0 0
run_layer  128  128 3 3  28  28  1 1  1 1  1 1  1 1
run_layer  512  128 1 1  28  28  1 1  1 1  0 0  0 0
run_layer  128  128 3 3  56  56  2 2  1 1  1 1  1 1
run_layer  512 2048 1 1   7   7  1 1  1 1  0 0  0 0
run_layer 1024  256 1 1  14  14  1 1  1 1  0 0  0 0
run_layer  256  256 3 3  14  14  1 1  1 1  1 1  1 1
run_layer  256  256 3 3  28  28  2 2  1 1  1 1  1 1
run_layer  128  256 1 1  56  56  1 1  1 1  0 0  0 0
run_layer   64  256 1 1  56  56  1 1  1 1  0 0  0 0
run_layer  512  512 3 3  14  14  2 2  1 1  1 1  1 1
run_layer  128  512 1 1  28  28  1 1  1 1  0 0  0 0
run_layer  256  512 1 1  28  28  1 1  1 1  0 0  0 0
run_layer 2048  512 1 1   7   7  1 1  1 1  0 0  0 0
run_layer  512  512 3 3   7   7  1 1  1 1  1 1  1 1
run_layer  256   64 1 1  56  56  1 1  1 1  0 0  0 0
run_layer   64   64 1 1  56  56  1 1  1 1  0 0  0 0
run_layer   64   64 3 3  56  56  1 1  1 1  1 1  1 1
run_layer   64    3 7 7 224 224  2 2  1 1  3 3  3 3
|
||||
39
script/profile_grouped_conv_bwd_weight.sh
Executable file
39
script/profile_grouped_conv_bwd_weight.sh
Executable file
@@ -0,0 +1,39 @@
|
||||
#!/bin/bash
# Run ckProfiler over the Resnet50 layer list for grouped convolution (backward weight).
# Usage: profile_grouped_conv_bwd_weight.sh <op> <datatype> <layout> <verify> <init> <log> <time> <N> <SplitK>

## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"

OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7

N=$8
SplitK=$9

# Profile one 2D layer with G = 1 and batch size $N; $SplitK is appended last.
# Arguments: K C Y X Hi Wi Strides(2) Dilations(2) LeftPads(2) RightPads(2)
# Variables are intentionally left unquoted, as in every direct call, so an
# unset SplitK simply drops out of the command line.
run_layer() {
    $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N "$@" $SplitK
}

# Resnet50
#########  K___ C___ Y X Hi_ Wi_ Strides Dilations LeftPads RightPads
run_layer  256 1024 1 1  14  14  1 1  1 1  0 0  0 0
run_layer  512 1024 1 1  14  14  1 1  1 1  0 0  0 0
run_layer  128  128 3 3  28  28  1 1  1 1  1 1  1 1
run_layer  512  128 1 1  28  28  1 1  1 1  0 0  0 0
run_layer  128  128 3 3  56  56  2 2  1 1  1 1  1 1
run_layer  512 2048 1 1   7   7  1 1  1 1  0 0  0 0
run_layer 1024  256 1 1  14  14  1 1  1 1  0 0  0 0
run_layer  256  256 3 3  14  14  1 1  1 1  1 1  1 1
run_layer  256  256 3 3  28  28  2 2  1 1  1 1  1 1
run_layer  128  256 1 1  56  56  1 1  1 1  0 0  0 0
run_layer   64  256 1 1  56  56  1 1  1 1  0 0  0 0
run_layer  512  512 3 3  14  14  2 2  1 1  1 1  1 1
run_layer  128  512 1 1  28  28  1 1  1 1  0 0  0 0
run_layer  256  512 1 1  28  28  1 1  1 1  0 0  0 0
run_layer 2048  512 1 1   7   7  1 1  1 1  0 0  0 0
run_layer  512  512 3 3   7   7  1 1  1 1  1 1  1 1
run_layer  256   64 1 1  56  56  1 1  1 1  0 0  0 0
run_layer   64   64 1 1  56  56  1 1  1 1  0 0  0 0
run_layer   64   64 3 3  56  56  1 1  1 1  1 1  1 1
run_layer   64    3 7 7 224 224  2 2  1 1  3 3  3 3
|
||||
39
script/profile_grouped_conv_fwd.sh
Executable file
39
script/profile_grouped_conv_fwd.sh
Executable file
@@ -0,0 +1,39 @@
|
||||
#!/bin/bash
# Run ckProfiler over the Resnet50 layer list for grouped convolution (forward).
# Usage: profile_grouped_conv_fwd.sh <op> <datatype> <layout> <indextype> <verify> <init> <log> <time> <N>

## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"

OP=$1
DATATYPE=$2
LAYOUT=$3
INDEXTYPE=$4
VERIFY=$5
INIT=$6
LOG=$7
TIME=$8

N=$9

# Profile one 2D layer with G = 1 and batch size $N.
# Arguments: K C Y X Hi Wi Strides(2) Dilations(2) LeftPads(2) RightPads(2)
# NOTE: this script receives layout ($3) before index type ($4), but the
# profiler is called with index type first and layout second.
# The common arguments are intentionally left unquoted, as in every direct call.
run_layer() {
    $DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N "$@"
}

# Resnet50
#########  K___ C___ Y X Hi_ Wi_ Strides Dilations LeftPads RightPads
run_layer  256 1024 1 1  14  14  1 1  1 1  0 0  0 0
run_layer  512 1024 1 1  14  14  1 1  1 1  0 0  0 0
run_layer  128  128 3 3  28  28  1 1  1 1  1 1  1 1
run_layer  512  128 1 1  28  28  1 1  1 1  0 0  0 0
run_layer  128  128 3 3  56  56  2 2  1 1  1 1  1 1
run_layer  512 2048 1 1   7   7  1 1  1 1  0 0  0 0
run_layer 1024  256 1 1  14  14  1 1  1 1  0 0  0 0
run_layer  256  256 3 3  14  14  1 1  1 1  1 1  1 1
run_layer  256  256 3 3  28  28  2 2  1 1  1 1  1 1
run_layer  128  256 1 1  56  56  1 1  1 1  0 0  0 0
run_layer   64  256 1 1  56  56  1 1  1 1  0 0  0 0
run_layer  512  512 3 3  14  14  2 2  1 1  1 1  1 1
run_layer  128  512 1 1  28  28  1 1  1 1  0 0  0 0
run_layer  256  512 1 1  28  28  1 1  1 1  0 0  0 0
run_layer 2048  512 1 1   7   7  1 1  1 1  0 0  0 0
run_layer  512  512 3 3   7   7  1 1  1 1  1 1  1 1
run_layer  256   64 1 1  56  56  1 1  1 1  0 0  0 0
run_layer   64   64 1 1  56  56  1 1  1 1  0 0  0 0
run_layer   64   64 3 3  56  56  1 1  1 1  1 1  1 1
run_layer   64    3 7 7 224 224  2 2  1 1  3 3  3 3
|
||||
20
script/profile_grouped_conv_fwd_outelementop.sh
Executable file
20
script/profile_grouped_conv_fwd_outelementop.sh
Executable file
@@ -0,0 +1,20 @@
|
||||
#!/bin/bash
# Run ckProfiler for two 3D grouped forward convolutions with an output element-wise op.
# Usage: profile_grouped_conv_fwd_outelementop.sh <op> <datatype> <outelementop> <layout> <verify> <init> <log> <time> <N>

## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"

OP=$1
DATATYPE=$2
OUTELEMENTOP=$3
LAYOUT=$4
VERIFY=$5
INIT=$6
LOG=$7
TIME=$8

N=$9

# Both cases share every parameter except the channel count, which is used for
# K and C alike (96, then 192).
# Argument layout after the common prefix:
#   Ndims G N K C  Z Y X  Di Hi Wi  Sz Sy Sx  Dz Dy Dx  LeftPz LeftPy LeftPx  RightPz RightPy RightPx
for channels in 96 192; do
    $DRIVER $OP $DATATYPE $OUTELEMENTOP $LAYOUT $VERIFY $INIT $LOG $TIME \
        3 32 $N $channels $channels  3 3 3  28 28 28  1 1 1  1 1 1  1 1 1  1 1 1
done
|
||||
18
script/profile_grouped_gemm.sh
Executable file
18
script/profile_grouped_gemm.sh
Executable file
@@ -0,0 +1,18 @@
|
||||
#!/bin/bash
# Run ckProfiler over a fixed list of grouped-GEMM problems.
# Usage: profile_grouped_gemm.sh <op> <datatype> <layout> <verify> <init> <log> <time>

## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"

OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7

# Invoke the profiler with the common leading arguments followed by one problem;
# each argument is a comma-separated list:
#   Ms Ns Ks StrideAs StrideBs StrideCs
# The common arguments are intentionally left unquoted, as in every direct call.
run_case() {
    $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME "$@"
}

######## Ms______________ Ns______________ Ks_____________ StrideAs___________ StrideBs___________ StrideCs___________
run_case 256,512,1024,768 128,256,384,1024 128,192,256,512 1024,1025,1044,1026 1024,1024,1024,1024 1025,1024,1028,1024
run_case 512,768,2048,128 128,256,384,1024 128,192,256,512 1024,1025,2053,1026 1024,1024,1024,1024 1025,1024,2054,1024
run_case 256,512,1024,768 512,256,768,1024 128,192,256,512 1024,1045,1034,1026 1024,1024,1024,1024 1025,1063,1028,1024
run_case 512,768,4096,768 128,768,512,2048 128,192,256,512 1024,1027,4096,2050 1024,1024,1024,2048 1025,1024,4099,2049
|
||||
52
script/profile_mixed_gemm.sh
Executable file
52
script/profile_mixed_gemm.sh
Executable file
@@ -0,0 +1,52 @@
|
||||
#!/bin/bash
# Run ckProfiler over a grid of mixed-GEMM problem sizes.
# Usage: profile_mixed_gemm.sh <op> <datatype> <layout> <verify> <init> <log> <time> <KBatch>

## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
echo $DRIVER

OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7
KBatch=$8

# The problem list is the full M x N x K grid, visited with M outermost and
# K innermost; all strides are left at -1 and $KBatch is passed last.
# Argument layout after the common prefix:
#   M N K StrideA StrideB StrideC KBatch
# Variables are intentionally left unquoted, so an unset KBatch drops out of
# the command line.
for m in 16 2048 8192; do
    for n in 16 2048 8192; do
        for k in 1024 8192 65536; do
            $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME $m $n $k -1 -1 -1 $KBatch
        done
    done
done
|
||||
|
||||
31
script/profile_onnx_gemm.sh
Executable file
31
script/profile_onnx_gemm.sh
Executable file
@@ -0,0 +1,31 @@
|
||||
#!/bin/bash
# Run ckProfiler over the GEMM kernel benchmarks used by ONNX.
# Usage: profile_onnx_gemm.sh <op> <datatype> <layout> <verify> <init> <log> <time>

## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
echo $DRIVER

OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7

# Every (N, K) pair below is profiled first with M = 384 and then with
# M = 24576; all strides are left at -1.
# Argument layout after the common prefix:
#   M N K StrideA StrideB StrideC
for m in 384 24576; do
    for n_k in "768 768" "768 2304" "768 3072" "3072 768" \
               "1024 1024" "1024 3072" "1024 4096" "4096 1024"; do
        # $n_k is deliberately unquoted so it splits into the N and K arguments.
        $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME $m $n_k -1 -1 -1
    done
done
|
||||
|
||||
43
script/profile_permute_scale.sh
Executable file
43
script/profile_permute_scale.sh
Executable file
@@ -0,0 +1,43 @@
|
||||
#!/bin/bash
# Run ckProfiler for permute-scale over 1D to 6D tensors.
# Usage: profile_permute_scale.sh <op> <datatype> <verify> <init> <log> <time>

## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
echo $DRIVER

OP=$1
DATATYPE=$2
VERIFY=$3
INIT=$4
LOG=$5
TIME=$6

# Invoke the profiler with the common leading arguments followed by one case:
#   dims... in_strides_order... out_strides_order...
# The common arguments are intentionally left unquoted, as in every direct call.
run_case() {
    $DRIVER $OP $DATATYPE $VERIFY $INIT $LOG $TIME "$@"
}

# 1D
run_case 67108864  0  0

# 2D
run_case 8192 8192  0 1  1 0
run_case 8192 8192  1 0  0 1

# 3D
run_case 8 1024 8192  0 1 2  2 1 0
run_case 8 1024 8192  2 1 0  0 1 2

# 4D
run_case 8 2 512 8192  0 1 2 3  3 2 1 0
run_case 8 2 512 8192  3 2 1 0  0 1 2 3

# 5D
run_case 8 2 2 256 8192  0 1 2 3 4  4 3 2 1 0
run_case 8 2 2 256 8192  4 3 2 1 0  0 1 2 3 4

# 6D
run_case 8 2 2 2 128 8192  0 1 2 3 4 5  5 4 3 2 1 0
run_case 8 2 2 2 128 8192  5 4 3 2 1 0  0 1 2 3 4 5
|
||||
|
||||
78
script/profile_reduce_no_index.sh
Executable file
78
script/profile_reduce_no_index.sh
Executable file
@@ -0,0 +1,78 @@
|
||||
#!/bin/bash
# Usage: profile_reduce_no_index.sh <verify> <init> <nrepeat> [precision]
#   <verify>    forwarded to the profiler as "-v <verify>"
#   <init>      forwarded unchanged
#   <nrepeat>   forwarded unchanged
#   [precision] profiler precision flag; may be empty. Flags referenced by this
#               script: --half, --double, --int8, --bf16
DRIVER="../build/bin/ckProfiler"
VERIFY="-v $1"
INIT=$2
NREPEAT=$3
PRECISION=$4

# Choose the "-C" (acctype) option that accompanies the precision flag.
# A quoted case statement is used instead of `[ -n $PRECISION ]`: the unquoted
# test is always true for an empty value and fails with "too many arguments"
# if the value contains whitespace. Any other precision leaves ACCTYPE unset.
case "$PRECISION" in
    --half|--bf16) ACCTYPE="-C 1" ;;
    --int8)        ACCTYPE="-C 2" ;;
esac
|
||||
|
||||
#### 0 - ADD, 5 - AVG, 7 - NORM2
# Default: run ADD and AVG.
Operations="0 5"

#### for --int8 and --half only 5 - AVG is run
# Quoted case statement replaces the unquoted `[ -n $PRECISION ]` guard, which
# never actually tested for emptiness.
case "$PRECISION" in
    --int8|--half) Operations=5 ;;
esac
|
||||
|
||||
## for generic validation
# One pass per entry of $Operations. Command tracing (set -x) is enabled only
# around the profiler runs and switched off again at the end of each pass.
for reduce_op in $Operations
do
    set -x
    ####### datatype   layout          reduce dims  op             acctype  verify  init  repeats
    $DRIVER reduce $PRECISION -D 64,4,280,82 -R 0,1,2,3 -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 64,4,280,82 -R 0       -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 64,4,280,82 -R 1       -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 64,4,280,82 -R 2       -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 64,4,280,82 -R 3       -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 64,4,280,82 -R 0,1,2   -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 64,4,280,82 -R 1,2,3   -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 64,4,280,82 -R 0,2,3   -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 64,4,280,82 -R 0,1,3   -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 256,22960   -R 0       -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 256,22960   -R 1       -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 4,1469440   -R 0       -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 4,1469440   -R 1       -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    set +x
done
|
||||
|
||||
#### 0 - ADD, 5 - AVG, 7 - NORM2
# The performance pass runs AVG only.
Operations=5

## for performance evaluation (resnet50 NHWC => C)
# Every case reduces dimensions 0,1,2 of a 4-D shape; tracing is on only
# around the profiler runs.
for reduce_op in $Operations
do
    set -x
    ####### datatype   layout             reduce dims  op             acctype  verify  init  repeats
    $DRIVER reduce $PRECISION -D 256,14,14,1024 -R 0,1,2 -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 256,28,28,128  -R 0,1,2 -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 256,58,58,128  -R 0,1,2 -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 256,7,7,2048   -R 0,1,2 -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 256,14,14,256  -R 0,1,2 -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 256,30,30,256  -R 0,1,2 -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 256,56,56,256  -R 0,1,2 -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 256,16,16,512  -R 0,1,2 -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 256,28,28,512  -R 0,1,2 -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 256,7,7,512    -R 0,1,2 -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 256,56,56,64   -R 0,1,2 -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 256,230,230,3  -R 0,1,2 -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 128,14,14,1024 -R 0,1,2 -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 128,28,28,128  -R 0,1,2 -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 128,58,58,128  -R 0,1,2 -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 128,7,7,2048   -R 0,1,2 -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 128,14,14,256  -R 0,1,2 -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 128,30,30,256  -R 0,1,2 -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 128,56,56,256  -R 0,1,2 -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 128,16,16,512  -R 0,1,2 -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 128,28,28,512  -R 0,1,2 -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 128,7,7,512    -R 0,1,2 -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    $DRIVER reduce $PRECISION -D 128,56,56,64   -R 0,1,2 -O $reduce_op $ACCTYPE $VERIFY $INIT $NREPEAT
    set +x
done
|
||||
|
||||
70
script/profile_reduce_with_index.sh
Executable file
70
script/profile_reduce_with_index.sh
Executable file
@@ -0,0 +1,70 @@
|
||||
#!/bin/bash
# Usage: profile_reduce_with_index.sh <verify> <init> <nrepeat> [precision]
#   <verify>    forwarded to the profiler as "-v <verify>"
#   <init>      forwarded unchanged
#   <nrepeat>   forwarded unchanged
#   [precision] profiler precision flag, e.g. --half, --double, --int8, --bf16
DRIVER="../build/bin/ckProfiler"
VERIFY="-v $1"
INIT=$2
NREPEAT=$3
PRECISION=$4

#### 2 - MIN, 3 - MAX, 4 - AMAX
Operations="2 4"

## for generic validation
# Each operation is run with "-I 0" and with "-I 1". Command tracing is
# enabled only around the profiler runs.
for reduce_op in $Operations
do
    for with_index in 0 1
    do
        set -x
        ####### datatype   layout          reduce dims  op             use index      verify  init  repeats
        $DRIVER reduce $PRECISION -D 64,4,280,82 -R 0,1,2,3 -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 64,4,280,82 -R 0       -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 64,4,280,82 -R 1       -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 64,4,280,82 -R 2       -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 64,4,280,82 -R 3       -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 64,4,280,82 -R 0,1,2   -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 64,4,280,82 -R 1,2,3   -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 64,4,280,82 -R 0,2,3   -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 64,4,280,82 -R 0,1,3   -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 256,22960   -R 0       -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 256,22960   -R 1       -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 4,1469440   -R 0       -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 4,1469440   -R 1       -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        set +x
    done
done

# The performance pass runs operation 2 (MIN) only.
Operations=2

## for performance evaluation (resnet50 NHWC => C)
for reduce_op in $Operations
do
    for with_index in 0 1
    do
        set -x
        ####### datatype   layout             reduce dims  op             use index      verify  init  repeats
        $DRIVER reduce $PRECISION -D 256,14,14,1024 -R 0,1,2 -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 256,28,28,128  -R 0,1,2 -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 256,58,58,128  -R 0,1,2 -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 256,7,7,2048   -R 0,1,2 -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 256,14,14,256  -R 0,1,2 -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 256,30,30,256  -R 0,1,2 -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 256,56,56,256  -R 0,1,2 -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 256,16,16,512  -R 0,1,2 -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 256,28,28,512  -R 0,1,2 -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 256,7,7,512    -R 0,1,2 -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 256,56,56,64   -R 0,1,2 -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 256,230,230,3  -R 0,1,2 -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 128,14,14,1024 -R 0,1,2 -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 128,28,28,128  -R 0,1,2 -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 128,58,58,128  -R 0,1,2 -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 128,7,7,2048   -R 0,1,2 -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 128,14,14,256  -R 0,1,2 -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 128,30,30,256  -R 0,1,2 -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 128,56,56,256  -R 0,1,2 -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 128,16,16,512  -R 0,1,2 -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 128,28,28,512  -R 0,1,2 -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 128,7,7,512    -R 0,1,2 -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        $DRIVER reduce $PRECISION -D 128,56,56,64   -R 0,1,2 -O $reduce_op -I $with_index $VERIFY $INIT $NREPEAT
        set +x
    done
done
|
||||
|
||||
69
script/profile_resnet50.sh
Executable file
69
script/profile_resnet50.sh
Executable file
@@ -0,0 +1,69 @@
|
||||
#!/bin/bash
# Profiles a fixed sequence of ResNet50 convolution shapes with ckProfiler.
# Usage: profile_resnet50.sh <op> <datatype> <in_layout> <wei_layout> <out_layout>
#                            <verify> <init> <log> <time> <N>
# NOTE: <op> is stored in OP but not used below; every run names its operation
# explicitly (conv_fwd_bias_relu or conv_fwd_bias_relu_add).

## GPU visibility
export HIP_VISIBLE_DEVICES=0

DRIVER="../build/bin/ckProfiler"

OP=$1
DATATYPE=$2
IN_LAYOUT=$3
WEI_LAYOUT=$4
OUT_LAYOUT=$5
VERIFY=$6
INIT=$7
LOG=$8
TIME=$9
N=${10}

# Run conv_fwd_bias_relu with the common arguments plus one shape:
#   K C Y X Hi Wi Strides Dilations LeftPads RightPads
bias_relu() {
    $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N "$@"
}

# Same as bias_relu, for the conv_fwd_bias_relu_add operation.
bias_relu_add() {
    $DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N "$@"
}

# Resnet50
############# K___ C___ Y X Hi_ Wi_ Strides Dilations LeftPads RightPads
bias_relu       64    3 7 7 224 224   2 2     1 1       3 3      3 3
bias_relu       64   64 1 1  56  56   1 1     1 1       0 0      0 0
bias_relu       64   64 3 3  56  56   1 1     1 1       1 1      1 1
bias_relu      256   64 1 1  56  56   1 1     1 1       0 0      0 0
bias_relu       64  256 1 1  56  56   1 1     1 1       0 0      0 0
bias_relu       64   64 3 3  56  56   1 1     1 1       1 1      1 1
bias_relu_add  256   64 1 1  56  56   1 1     1 1       0 0      0 0
bias_relu       64  256 1 1  56  56   1 1     1 1       0 0      0 0
bias_relu       64   64 3 3  56  56   1 1     1 1       1 1      1 1
bias_relu_add  256   64 1 1  56  56   1 1     1 1       0 0      0 0
bias_relu      128  256 1 1  56  56   1 1     1 1       0 0      0 0
bias_relu      128  128 3 3  56  56   2 2     1 1       1 1      1 1
bias_relu      512  128 1 1  28  28   1 1     1 1       0 0      0 0
bias_relu      128  512 1 1  28  28   1 1     1 1       0 0      0 0
bias_relu      128  128 3 3  28  28   1 1     1 1       1 1      1 1
bias_relu_add  512  128 1 1  28  28   1 1     1 1       0 0      0 0
bias_relu      128  512 1 1  28  28   1 1     1 1       0 0      0 0
bias_relu      128  128 3 3  28  28   1 1     1 1       1 1      1 1
bias_relu_add  512  128 1 1  28  28   1 1     1 1       0 0      0 0
bias_relu      128  512 1 1  28  28   1 1     1 1       0 0      0 0
bias_relu      128  128 3 3  28  28   1 1     1 1       1 1      1 1
bias_relu_add  512  128 1 1  28  28   1 1     1 1       0 0      0 0
bias_relu      256  512 1 1  28  28   1 1     1 1       0 0      0 0
bias_relu      256  256 3 3  28  28   2 2     1 1       1 1      1 1
bias_relu     1024  256 1 1  14  14   1 1     1 1       0 0      0 0
bias_relu      256 1024 1 1  14  14   1 1     1 1       0 0      0 0
bias_relu      256  256 3 3  14  14   1 1     1 1       1 1      1 1
bias_relu_add 1024  256 1 1  14  14   1 1     1 1       0 0      0 0
bias_relu      256 1024 1 1  14  14   1 1     1 1       0 0      0 0
bias_relu      256  256 3 3  14  14   1 1     1 1       1 1      1 1
bias_relu_add 1024  256 1 1  14  14   1 1     1 1       0 0      0 0
bias_relu      256 1024 1 1  14  14   1 1     1 1       0 0      0 0
bias_relu      256  256 3 3  14  14   1 1     1 1       1 1      1 1
bias_relu_add 1024  256 1 1  14  14   1 1     1 1       0 0      0 0
bias_relu      256 1024 1 1  14  14   1 1     1 1       0 0      0 0
bias_relu      256  256 3 3  14  14   1 1     1 1       1 1      1 1
bias_relu_add 1024  256 1 1  14  14   1 1     1 1       0 0      0 0
bias_relu      256 1024 1 1  14  14   1 1     1 1       0 0      0 0
bias_relu      256  256 3 3  14  14   1 1     1 1       1 1      1 1
bias_relu_add 1024  256 1 1  14  14   1 1     1 1       0 0      0 0
bias_relu      512 1024 1 1  14  14   1 1     1 1       0 0      0 0
bias_relu      512  512 3 3  14  14   2 2     1 1       1 1      1 1
bias_relu     2048  512 1 1   7   7   1 1     1 1       0 0      0 0
bias_relu      512 2048 1 1   7   7   1 1     1 1       0 0      0 0
bias_relu      512  512 3 3   7   7   1 1     1 1       1 1      1 1
bias_relu_add 2048  512 1 1   7   7   1 1     1 1       0 0      0 0
bias_relu      512 2048 1 1   7   7   1 1     1 1       0 0      0 0
bias_relu      512  512 3 3   7   7   1 1     1 1       1 1      1 1
bias_relu_add 2048  512 1 1   7   7   1 1     1 1       0 0      0 0
|
||||
41
script/profile_splitK_gemm.sh
Executable file
41
script/profile_splitK_gemm.sh
Executable file
@@ -0,0 +1,41 @@
|
||||
#!/bin/bash
# Runs ckProfiler over a fixed list of split-K GEMM problem sizes.
# Usage: profile_splitK_gemm.sh <op> <datatype> <layout> <verify> <init> <log> <time> <KBatch>

## GPU visibility
export HIP_VISIBLE_DEVICES=0

DRIVER="../build/bin/ckProfiler"
echo $DRIVER

OP=$1
DATATYPE=$2
LAYOUT=$3
VERIFY=$4
INIT=$5
LOG=$6
TIME=$7
KBatch=$8

# Run one case: common arguments, then "M N K StrideA StrideB StrideC" taken
# from the helper's arguments, then the KBatch value.
splitk_case() {
    $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME "$@" $KBatch
}

# 120 CU
########### M___ N___ K___ StrideA StrideB StrideC
splitk_case  960 1024 1024   -1      -1      -1
splitk_case  960 2048 2048   -1      -1      -1

# 104 CU
splitk_case  832 1024 1024   -1      -1      -1
splitk_case  832 2048 2048   -1      -1      -1

# 110 CU
splitk_case 1280 1408 1024   -1      -1      -1
splitk_case 1280 2816 2048   -1      -1      -1

# testing different strides
splitk_case 1024 1024 1024  1024    1024    1024
splitk_case 2048 2048 2048  2048    2048    2048

splitk_case 1024 1024 1024  1056    1056    1056
splitk_case 2048 2048 2048  2080    2080    2080

splitk_case 1024 1024 1024  1088    1088    1088
splitk_case 2048 2048 2048  2112    2112    2112
|
||||
10
script/redis-cli.conf
Normal file
10
script/redis-cli.conf
Normal file
@@ -0,0 +1,10 @@
|
||||
fips = no
|
||||
setuid = root
|
||||
setgid = root
|
||||
pid = /var/run/stunnel.pid
|
||||
debug = 7
|
||||
options = NO_SSLv2
|
||||
options = NO_SSLv3
|
||||
[redis-cli]
|
||||
client = yes
|
||||
accept = 127.0.0.1:6379
|
||||
151
script/run_full_performance_tests.sh
Executable file
151
script/run_full_performance_tests.sh
Executable file
@@ -0,0 +1,151 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
|
||||
# you would also need to set up some environment variables in order to
|
||||
# post your new test results to the database and compare them to the baseline
|
||||
# please contact Illia.Silin@amd.com for more details
|
||||
#
|
||||
# run the script as "./run_full_performance_tests.sh <verification> <tag for your test environment> <branch name> <node name>"
|
||||
# input arguments:
|
||||
# verification = 0 : do not verify result correctness on CPU
|
||||
# = 1 : verify correctness on CPU (may take a long time)
|
||||
# environment tag : a string describing the specifics of your test environment
|
||||
# branch name : name of the branch in git repo (git status | grep -e 'On branch')
|
||||
# node name : $hostname
|
||||
|
||||
# Capture the four positional arguments and export them.
verify=$1
env_type=$2
branch=$3
host_name=$4
export verify env_type branch host_name

# Echo the settings for this run.
echo 'Verification: ' $verify
echo 'Environment type: ' $env_type
echo 'Branch name: ' $branch
echo 'Host name: ' $host_name
|
||||
# Start a fresh log file and write the run header into it.
# Arguments:
#   $1 - log file path (an existing file is removed first)
#   $2 - environment type
#   $3 - branch name
#   $4 - node name
# Header lines, in order: branch, node name, "GPU_arch: " followed by the
# rocminfo "Name:" lines that mention gfx, the rocminfo "Compute Unit:" lines,
# the hipcc "HIP version" line, the environment type, and the amdclang++
# "InstalledDir" line.
# Returns 1 without touching anything when no log file path is given.
function print_log_header(){
    local log_file="$1" env_tag="$2" branch_name="$3" node_name="$4"

    if [ -z "$log_file" ]; then
        echo "print_log_header: missing log file path" >&2
        return 1
    fi

    # All expansions are quoted so that paths or tags with spaces or glob
    # characters cannot cause an "ambiguous redirect" or remove the wrong file.
    rm -f -- "$log_file"
    {
        echo 'On branch ' "$branch_name"
        echo 'Node name: ' "$node_name"
        #get GPU_arch and number of compute units from rocminfo
        echo -n "GPU_arch: "
        rocminfo | grep "Name:" | grep "gfx"
        rocminfo | grep "Compute Unit:"
        hipcc --version | grep -e 'HIP version'
        echo 'Environment type: ' "$env_tag"
        /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir'
    } > "$log_file"
}
|
||||
|
||||
# Every suite below follows the same pattern: name the log file, write its
# header with print_log_header, then append the combined stdout/stderr of each
# profiling script run to that log via tee.
# NOTE(review): the two numeric arguments after the operation name look like
# datatype/layout selectors - confirm against the individual profile_*.sh scripts.

#run gemm tests
export gemm_log="perf_gemm.log"
print_log_header $gemm_log $env_type $branch $host_name
for second_arg in 0 1 2 3; do
    for first_arg in 0 1 2 3; do
        ./profile_gemm.sh gemm $first_arg $second_arg $verify 1 0 1 2>&1 | tee -a $gemm_log
    done
done

#run batched_gemm tests
export batched_gemm_log="perf_batched_gemm.log"
print_log_header $batched_gemm_log $env_type $branch $host_name
for first_arg in 0 1 2 3; do
    for second_arg in 0 1 2 3; do
        ./profile_batched_gemm.sh batched_gemm $first_arg $second_arg $verify 1 0 1 2>&1 | tee -a $batched_gemm_log
    done
done

#run grouped_gemm tests
export grouped_gemm_log="perf_grouped_gemm.log"
print_log_header $grouped_gemm_log $env_type $branch $host_name
for second_arg in 0 1 2 3; do
    ./profile_grouped_gemm.sh grouped_gemm 1 $second_arg $verify 1 0 1 2>&1 | tee -a $grouped_gemm_log
done

#run GEMM+Bilinear tests
export gemm_bilinear_log="perf_gemm_bilinear.log"
print_log_header $gemm_bilinear_log $env_type $branch $host_name
for second_arg in 0 1 2 3; do
    ./profile_gemm_bilinear.sh gemm_bilinear 1 $second_arg $verify 1 0 1 2>&1 | tee -a $gemm_bilinear_log
done

#run grouped_fwd tests
export grouped_conv_fwd_log="perf_grouped_conv_fwd.log"
print_log_header $grouped_conv_fwd_log $env_type $branch $host_name
for first_arg in 0 1 2; do
    ./profile_grouped_conv_fwd.sh grouped_conv_fwd $first_arg 1 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_log
done

#run grouped_bwd_data tests
export grouped_conv_bwd_data_log="perf_grouped_conv_bwd_data.log"
print_log_header $grouped_conv_bwd_data_log $env_type $branch $host_name
for first_arg in 0 1 2; do
    ./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data $first_arg 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log
done

#run grouped_bwd_weight tests
export grouped_conv_bwd_weight_log="perf_grouped_conv_bwd_weight.log"
print_log_header $grouped_conv_bwd_weight_log $env_type $branch $host_name
for first_arg in 0 1 2; do
    ./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight $first_arg 2 $verify 1 0 1 256 1 2>&1 | tee -a $grouped_conv_bwd_weight_log
done
# one extra run with the final argument set to 4 instead of 1
./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 1 2 $verify 1 0 1 256 4 2>&1 | tee -a $grouped_conv_bwd_weight_log

#run resnet50 tests
export resnet256_log="perf_resnet50_N256.log"
print_log_header $resnet256_log $env_type $branch $host_name
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 1 0 1 256 2>&1 | tee -a $resnet256_log
export resnet4_log="perf_resnet50_N4.log"
print_log_header $resnet4_log $env_type $branch $host_name
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 1 0 1 4 2>&1 | tee -a $resnet4_log

#run reduction tests
export reduction_log="perf_reduction.log"
print_log_header $reduction_log $env_type $branch $host_name
for reduce_script in ./profile_reduce_with_index.sh ./profile_reduce_no_index.sh; do
    $reduce_script $verify 2 10 --half 2>&1 | tee -a $reduction_log
done

#run splitK_gemm tests, first correctness verification, then performance
export splitK_gemm_log="perf_splitK_gemm.log"
print_log_header $splitK_gemm_log $env_type $branch $host_name
for first_arg in 0 1; do
    for second_arg in 0 1 2 3; do
        ./profile_splitK_gemm.sh gemm_splitk $first_arg $second_arg $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
    done
done

#run ONNX gemm tests
export onnx_log="perf_onnx_gemm.log"
print_log_header $onnx_log $env_type $branch $host_name
for first_arg in 0 1; do
    ./profile_onnx_gemm.sh gemm $first_arg 0 $verify 1 0 1 2>&1 | tee -a $onnx_log
done

#run mixed fp16/fp8 and fp8/fp16 gemm tests
export mixed_gemm_log="perf_mixed_gemm.log"
print_log_header $mixed_gemm_log $env_type $branch $host_name
for first_arg in 4 5; do
    ./profile_mixed_gemm.sh gemm_splitk $first_arg 0 $verify 2 0 1 16 2>&1 | tee -a $mixed_gemm_log
done
|
||||
41
script/run_gemm_performance_tests.sh
Executable file
41
script/run_gemm_performance_tests.sh
Executable file
@@ -0,0 +1,41 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
|
||||
# run the script as "./run_gemm_performance_tests.sh <verification> <tag for your test environment> <branch name> <node name> <arch>"
|
||||
# input arguments:
|
||||
# verification = 0 : do not verify result correctness on CPU
|
||||
# = 1 : verify correctness on CPU (may take a long time)
|
||||
# environment tag : a string describing the specifics of your test environment
|
||||
# branch name : name of the branch in git repo (git status | grep -e 'On branch')
|
||||
# node name : $hostname
|
||||
# arch : GPU architecture, e.g. "gfx9" or "gfx1100"
|
||||
|
||||
# Capture the five positional arguments and export them.
verify=$1
env_type=$2
branch=$3
host_name=$4
arch=$5
export verify env_type branch host_name arch

# Echo the settings for this run.
echo 'Verification: ' $verify
echo 'Environment type: ' $env_type
echo 'Branch name: ' $branch
echo 'Host name: ' $host_name
echo 'GPU architecture: ' $arch
|
||||
|
||||
# Start a fresh log file and write the run header into it.
# Arguments:
#   $1 - log file path (an existing file is removed first)
#   $2 - environment type
#   $3 - branch name
#   $4 - node name
# Header lines, in order: branch, node name, "GPU_arch: " followed by the
# rocminfo "Name:" lines that mention gfx, the rocminfo "Compute Unit:" lines,
# the hipcc "HIP version" line, the environment type, and the amdclang++
# "InstalledDir" line.
# Returns 1 without touching anything when no log file path is given.
function print_log_header(){
    local log_file="$1" env_tag="$2" branch_name="$3" node_name="$4"

    if [ -z "$log_file" ]; then
        echo "print_log_header: missing log file path" >&2
        return 1
    fi

    # All expansions are quoted so that paths or tags with spaces or glob
    # characters cannot cause an "ambiguous redirect" or remove the wrong file.
    rm -f -- "$log_file"
    {
        echo 'On branch ' "$branch_name"
        echo 'Node name: ' "$node_name"
        #get GPU_arch and number of compute units from rocminfo
        echo -n "GPU_arch: "
        rocminfo | grep "Name:" | grep "gfx"
        rocminfo | grep "Compute Unit:"
        hipcc --version | grep -e 'HIP version'
        echo 'Environment type: ' "$env_tag"
        /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir'
    } > "$log_file"
}
|
||||
|
||||
#run ONNX gemm tests
# The log name carries the GPU architecture; both runs append their combined
# stdout/stderr to it via tee.
export onnx_log="perf_onnx_gemm_$arch.log"
print_log_header $onnx_log $env_type $branch $host_name
for first_arg in 0 1; do
    ./profile_onnx_gemm.sh gemm $first_arg 0 $verify 1 0 1 2>&1 | tee -a $onnx_log
done
|
||||
66
script/run_performance_tests.sh
Executable file
66
script/run_performance_tests.sh
Executable file
@@ -0,0 +1,66 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
|
||||
# run the script as "./run_performance_tests.sh <verification> <tag for your test environment> <branch name> <node name>"
|
||||
# input arguments:
|
||||
# verification = 0 : do not verify result correctness on CPU
|
||||
# = 1 : verify correctness on CPU (may take a long time)
|
||||
# environment tag : a string describing the specifics of your test environment
|
||||
# branch name : name of the branch in git repo (git status | grep -e 'On branch')
|
||||
# node name : $hostname
|
||||
|
||||
# Read the four positional arguments, export them, and echo each one back.
verify=$1
env_type=$2
branch=$3
host_name=$4
export verify env_type branch host_name

echo 'Verification: ' $verify
echo 'Environment type: ' $env_type
echo 'Branch name: ' $branch
echo 'Host name: ' $host_name
|
||||
|
||||
# Start a fresh log file and write a header describing the test environment.
# Arguments:
#   $1 - path of the log file; an existing file is removed first
#   $2 - environment type tag
#   $3 - branch name
#   $4 - node name
# Every expansion is quoted so that a path or tag containing spaces or glob
# characters is used verbatim instead of being word-split.
function print_log_header(){
	local log_file="$1"
	# '--' keeps a path that starts with '-' from being parsed as an option
	rm -f -- "$log_file"
	echo 'On branch ' "$3" &> "$log_file"
	echo 'Node name: ' "$4" >> "$log_file"
	#get GPU_arch and number of compute units from rocminfo
	echo -n "GPU_arch: " >> "$log_file"
	rocminfo | grep "Name:" | grep "gfx" >> "$log_file"
	rocminfo | grep "Compute Unit:" >> "$log_file"
	hipcc --version | grep -e 'HIP version' >> "$log_file"
	echo 'Environment type: ' "$2" >> "$log_file"
	/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> "$log_file"
}
|
||||
|
||||
#run gemm tests
export gemm_log="perf_gemm.log"
print_log_header $gemm_log $env_type $branch $host_name
# Same 16 invocations, in the same order, as an explicit list would give:
# the third profiler argument advances in the outer loop, the second in the inner loop.
for gemm_arg3 in 0 1 2 3; do
	for gemm_arg2 in 0 1 2 3; do
		./profile_gemm.sh gemm $gemm_arg2 $gemm_arg3 $verify 1 0 1 | tee -a $gemm_log
	done
done

#run ONNX gemm tests
export onnx_log="perf_onnx_gemm.log"
print_log_header $onnx_log $env_type $branch $host_name
# stderr is merged into the log for the ONNX runs only
for onnx_arg2 in 0 1; do
	./profile_onnx_gemm.sh gemm $onnx_arg2 0 $verify 1 0 1 2>&1 | tee -a $onnx_log
done

#run resnet50 tests
# one log per value of the final argument (256, then 4)
export resnet256_log="perf_resnet50_N256.log"
print_log_header $resnet256_log $env_type $branch $host_name
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 1 0 1 256 | tee -a $resnet256_log

export resnet4_log="perf_resnet50_N4.log"
print_log_header $resnet4_log $env_type $branch $host_name
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 1 0 1 4 | tee -a $resnet4_log
|
||||
56
script/sccache_wrapper.sh
Executable file
56
script/sccache_wrapper.sh
Executable file
@@ -0,0 +1,56 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
COMPILERS_HASH_DIR=${COMPILERS_HASH_DIR:-"/tmp/.sccache"}
|
||||
SCCACHE_EXTRAFILES=${SCCACHE_EXTRAFILES:-"${COMPILERS_HASH_DIR}/rocm_compilers_hash_file"}
|
||||
SCCACHE_BIN=${SCCACHE_BIN:-"${SCCACHE_INSTALL_LOCATION}/sccache"}
|
||||
ENFORCE_REDIS="false"
|
||||
# Consume leading wrapper options and stop at the first argument that is not one of them.
#   --enforce_redis : sets ENFORCE_REDIS to "true"
#   --no-hipcc      : accepted and discarded
while [ -n "$1" ]; do
    if [ "$1" = "--enforce_redis" ]; then
        ENFORCE_REDIS="true"
        shift
    elif [ "$1" = "--no-hipcc" ]; then
        shift
    else
        break
    fi
done
|
||||
# Append a description of the ROCm toolchain (clang and clang-offload-bundler
# version strings, hipcc md5sum, combined md5sum of the device-lib bitcode files)
# to $SCCACHE_EXTRAFILES, then print the resulting file.
setup_rocm_compilers_hash_file() {
    mkdir -p "$COMPILERS_HASH_DIR"

    # md5sum prints "<hash>  <path>"; keep only the hash field.
    HIPCC_MD5="$(md5sum "${ROCM_PATH}/bin/hipcc")"
    HIPCC_HASH_VALUE="${HIPCC_MD5%% *}"

    # Hash every bitcode file, sort the per-file lines, then hash that listing.
    pushd "${ROCM_PATH}/amdgcn/bitcode"
    DEVICELIBS_BITCODES_MD5="$(find . -type f -exec md5sum {} \; | sort | md5sum)"
    popd
    DEVICELIBS_BITCODES_HASH_VALUE="${DEVICELIBS_BITCODES_MD5%% *}"

    # MD5 checksums of clang and clang-offload-bundler cannot be used since they will keep changing
    # if the ROCM_PATH changes, ie; for every mainline build.
    # This is because ROCM_PATH gets encoded into the clang/clang-offload-bundler binaries as part
    # of RPATH.
    # The versions themselves contain the commit hash of the compiler repo at the time of building.
    # Hence, this should be a viable alternative to using the binary checksum itself.
    CLANG_VERSION="$("${ROCM_PATH}/llvm/bin/clang" --version | head -n 1)"
    CLANG_OFFLOAD_BUNDLER_VERSION="$("${ROCM_PATH}/llvm/bin/clang-offload-bundler" --version | head -n 1)"

    # printf re-applies the format to each successive pair of arguments,
    # so this emits the four "label: value" lines in one call.
    printf '%s: %s\n' \
        'clang version' "${CLANG_VERSION}" \
        'clang-offload-bundler version' "${CLANG_OFFLOAD_BUNDLER_VERSION}" \
        'hipcc md5sum' "${HIPCC_HASH_VALUE}" \
        'devicelibs bitcode md5sum' "${DEVICELIBS_BITCODES_HASH_VALUE}" \
        | tee -a "$SCCACHE_EXTRAFILES"

    echo "sccache-wrapper: compilers hash file set up at ${SCCACHE_EXTRAFILES}"
    cat "$SCCACHE_EXTRAFILES"
}
|
||||
# When redis is enforced, stop unless SCCACHE_REDIS is set (exit 10)
# and the server answers a ping with PONG (exit 20).
if [ "${ENFORCE_REDIS}" == "true" ]; then
    if [ -z "${SCCACHE_REDIS}" ]; then
        echo "SCCACHE_REDIS not set. Not wrapping compilers with sccache."
        exit 10
    fi

    # '|| true' keeps 'set -e' from aborting on a redis-cli failure;
    # the check on the response below decides what happens instead.
    response=$(redis-cli -u ${SCCACHE_REDIS} ping) || true
    if [ "${response}" != "PONG" ]; then
        echo "Redis server unreachable. Not wrapping compilers with sccache."
        exit 20
    fi
fi

# Record the toolchain description, then report the sccache version and start its server.
setup_rocm_compilers_hash_file
$SCCACHE_BIN --version
$SCCACHE_BIN --start-server
|
||||
|
||||
110
script/test_convnd_fwd.sh
Normal file
110
script/test_convnd_fwd.sh
Normal file
@@ -0,0 +1,110 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# set -e
|
||||
|
||||
DIM1=False
|
||||
DIM2=True
|
||||
DIM3=False
|
||||
DATE=220317
|
||||
GIT_HASH=4e6dfda
|
||||
LOG_DIR=${DATE}_${GIT_HASH}
|
||||
SUFFIX=${GIT_HASH}
|
||||
|
||||
|
||||
#--------------------------------------------------------------------------
|
||||
# Commandline arguments parsing
|
||||
# like: cmd -key[--key] value
|
||||
#--------------------------------------------------------------------------
|
||||
|
||||
# Walk the argument list: dimension switches set DIM1/DIM2/DIM3 to True,
# -s/--suffix appends "_<value>" to SUFFIX, and anything else is collected
# in POSITIONAL and restored as the positional parameters afterwards.
POSITIONAL=()
while [[ $# -gt 0 ]]; do
    key="$1"
    shift # past argument; from here on "$1" is whatever followed the key

    case $key in
        -d1|--d1)
            DIM1=True
            echo "DIM1: ${DIM1}"
            ;;
        -d2|--d2)
            DIM2=True
            echo "DIM2: ${DIM2}"
            ;;
        -d3|--d3)
            DIM3=True
            echo "DIM3: ${DIM3}"
            ;;
        -all|--all)
            DIM1=True
            DIM2=True
            DIM3=True
            echo "DIM1: ${DIM1}"
            echo "DIM2: ${DIM2}"
            echo "DIM3: ${DIM3}"
            ;;
        -s|--suffix)
            SUFFIX="${SUFFIX}_$1"
            echo "SUFFIX: ${SUFFIX}"
            shift # past value
            ;;
        *) # unknown option
            POSITIONAL+=("$key") # save it in an array for later
            ;;
    esac
done
set -- "${POSITIONAL[@]}" # restore positional parameters
|
||||
|
||||
#--------------------------------------------------------------------------
|
||||
|
||||
# NUMACTL="numactl --cpunodebind=1 --membind=1"
|
||||
NUMACTL=
|
||||
# ENV_CONF=
|
||||
GPU=gfx908
|
||||
PROF_ITER_COUNT=10000
|
||||
LOG_DIR_PATH=../log/${LOG_DIR}
|
||||
set -x
|
||||
|
||||
#-------------------------------------------------------------------------------
# Run one convolution forward test binary and mirror its output into a log file.
#   $1 - dimension label used in the banner and the binary name (1d, 2d, 3d)
#   $2 - layout tag used in the banner and the log file name (nwc, nhwc, ndhwc)
# Reads LOG_DIR_PATH, NUMACTL, SUFFIX and GPU from the enclosing script.
#-------------------------------------------------------------------------------
run_conv_fwd_test() {
    local dim="$1"
    local layout="$2"
    local cmd="./../build/bin/test_conv${dim}_fwd"

    # Paths are quoted because SUFFIX can carry a user-supplied value (-s/--suffix).
    mkdir -p "${LOG_DIR_PATH}"
    echo ">>>>>>>> RUN test conv${dim} ${layout} <<<<<<<<<<"
    # NUMACTL stays unquoted on purpose: it is either empty (expands to nothing)
    # or a command prefix that must be split into its own words.
    ${NUMACTL} "${cmd}" 2>&1 \
        | tee "${LOG_DIR_PATH}/test_conv${dim}_fwd_${layout}_${SUFFIX}_${GPU}.log"
}

#-------------------------------------------------------------------------------
# 1D
#-------------------------------------------------------------------------------

if [[ "${DIM1}" == "True" ]]; then
    run_conv_fwd_test 1d nwc
fi

#-------------------------------------------------------------------------------
# 2D
#-------------------------------------------------------------------------------

if [[ "${DIM2}" == "True" ]]; then
    run_conv_fwd_test 2d nhwc
fi

#-------------------------------------------------------------------------------
# 3D
#-------------------------------------------------------------------------------

if [[ "${DIM3}" == "True" ]]; then
    run_conv_fwd_test 3d ndhwc
fi
|
||||
63
script/test_reduce_no_index.sh
Executable file
63
script/test_reduce_no_index.sh
Executable file
@@ -0,0 +1,63 @@
|
||||
#!/bin/bash
|
||||
|
||||
## The following will be used for CI
##
## Runs bin/test_reduce_no_index for every combination of data type and set of
## reduced dimensions, in the same order as an explicit list (all reduce sets
## for one data type, then the next data type). The script exits non-zero if
## any invocation fails, so a CI job can detect the failure.

set -x

## data type codes passed to the test binary:
##   0 = float, 6 = float64, 1 = float16, 3 = int8_t, 5 = bfloat16
data_types=(0 6 1 3 5)

## values passed to -R
reduce_dims=("0,1,2,3" "0,1,2" "0,1,3" "0,2,3" "1,2,3" "0" "1" "2" "3")

failed=0
for data_type in "${data_types[@]}"; do
    for dims in "${reduce_dims[@]}"; do
        # keep going after a failure so the log covers every configuration
        bin/test_reduce_no_index -D 64,4,280,82 -R "$dims" "$data_type" 2 || failed=1
    done
done

set +x

exit $failed
|
||||
|
||||
1
script/uninstall_precommit.sh
Executable file
1
script/uninstall_precommit.sh
Executable file
@@ -0,0 +1 @@
|
||||
pre-commit uninstall
|
||||
Reference in New Issue
Block a user