diff --git a/Jenkinsfile b/Jenkinsfile index 23821bd886..d990685289 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -352,6 +352,8 @@ def runCKProfiler(Map conf=[:]){ archiveArtifacts "perf_conv_bwd_data_${gpu_arch}.log" archiveArtifacts "perf_gemm_bilinear_${gpu_arch}.log" archiveArtifacts "perf_reduction_${gpu_arch}.log" + archiveArtifacts "perf_splitK_gemm_${gpu_arch}.log" + archiveArtifacts "perf_onnx_gemm_${gpu_arch}.log" // stash perf files to master stash name: "perf_gemm_${gpu_arch}.log" stash name: "perf_resnet50_N256_${gpu_arch}.log" @@ -362,6 +364,8 @@ def runCKProfiler(Map conf=[:]){ stash name: "perf_conv_bwd_data_${gpu_arch}.log" stash name: "perf_gemm_bilinear_${gpu_arch}.log" stash name: "perf_reduction_${gpu_arch}.log" + stash name: "perf_splitK_gemm_${gpu_arch}.log" + stash name: "perf_onnx_gemm_${gpu_arch}.log" //we will process results on the master node } else{ @@ -442,6 +446,8 @@ def process_results(Map conf=[:]){ unstash "perf_conv_bwd_data_${gpu_arch}.log" unstash "perf_gemm_bilinear_${gpu_arch}.log" unstash "perf_reduction_${gpu_arch}.log" + unstash "perf_splitK_gemm_${gpu_arch}.log" + unstash "perf_onnx_gemm_${gpu_arch}.log" sh "./process_qa_data.sh ${gpu_arch}" } else{ diff --git a/script/process_perf_data.py b/script/process_perf_data.py index b5f210e006..de1703cfc3 100644 --- a/script/process_perf_data.py +++ b/script/process_perf_data.py @@ -127,11 +127,16 @@ def parse_logfile(logfile): lst=line.split() res.append(lst[1]) #parse all other performance tests: - elif 'resnet50' or 'batched_gemm' or 'grouped_gemm' or 'conv_bwd_data' or 'gemm_bilinear' or 'reduction' in logfile: + elif 'resnet50' in logfile or 'batched_gemm' in logfile or 'grouped_gemm' in logfile or 'conv_bwd_data' in logfile or 'gemm_bilinear' in logfile or 'reduction' in logfile: for line in open(logfile): if 'Best Perf' in line: lst=line.split() res.append(lst[4]) + elif 'onnx_gemm' in logfile or 'splitK_gemm' in logfile: + for line in open(logfile): + if 'Best Perf' in line: + lst=line.split() + res.append(lst[33]) return res @@ -281,6 +286,14 @@ def main(): for i in range(1,50): testlist.append("Layer%i"%i) table_name="ck_resnet50_N256_tflops" + if 'onnx_gemm' in filename: + for i in range(1,len(results)+1): + testlist.append("Test%i"%i) + table_name="ck_onnx_gemm_tflops" + if 'splitK_gemm' in filename: + for i in range(1,len(results)+1): + testlist.append("Test%i"%i) + table_name="ck_splitK_gemm_tflops" tflops_base = get_baseline(table_name,conn) store_new_test_result(table_name, results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, conn) diff --git a/script/process_qa_data.sh b/script/process_qa_data.sh index fb2dbd5bb5..917305e916 100755 --- a/script/process_qa_data.sh +++ b/script/process_qa_data.sh @@ -2,8 +2,8 @@ # # in order to run this script you'd need the following python packages: -pip3 install --upgrade pip -pip3 install sqlalchemy pymysql pandas sshtunnel +#pip3 install --upgrade pip +#pip3 install sqlalchemy pymysql pandas sshtunnel # you would also need to set up some environment variables in order to # post your new test results to the database and compare them to the baseline @@ -20,3 +20,5 @@ python3 process_perf_data.py perf_conv_fwd_"$gpu_arch".log python3 process_perf_data.py perf_conv_bwd_data_"$gpu_arch".log python3 process_perf_data.py perf_gemm_bilinear_"$gpu_arch".log python3 process_perf_data.py perf_reduction_"$gpu_arch".log +python3 process_perf_data.py perf_splitK_gemm_"$gpu_arch".log +python3 process_perf_data.py perf_onnx_gemm_"$gpu_arch".log diff --git a/script/profile_onnx_gemm.sh b/script/profile_onnx_gemm.sh new file mode 100755 index 0000000000..c2721e7f59 --- /dev/null +++ b/script/profile_onnx_gemm.sh @@ -0,0 +1,31 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +echo $DRIVER +OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +TIME=$7 +# GEMM kernel benchmarks used by ONNX +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideC + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 384 768 768 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 384 768 2304 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 384 768 3072 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 384 3072 768 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 384 1024 1024 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 384 1024 3072 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 384 1024 4096 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 384 4096 1024 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 24576 768 768 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 24576 768 2304 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 24576 768 3072 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 24576 3072 768 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 24576 1024 1024 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 24576 1024 3072 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 24576 1024 4096 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 24576 4096 1024 -1 -1 -1 + diff --git a/script/run_full_performance_tests.sh b/script/run_full_performance_tests.sh index be90d84c78..10b16ea114 100755 --- a/script/run_full_performance_tests.sh +++ b/script/run_full_performance_tests.sh @@ -40,99 +40,103 @@ function print_log_header(){ #run gemm tests export gemm_log="perf_gemm_${gpu_arch}.log" print_log_header $gemm_log $env_type $branch $host_name -./profile_gemm.sh gemm 0 0 $verify 1 0 1 | tee -a $gemm_log -./profile_gemm.sh gemm 1 0 $verify 1 0 1 | tee -a $gemm_log -./profile_gemm.sh gemm 2 0 $verify 1 0 1 | tee -a $gemm_log -./profile_gemm.sh gemm 3 0 $verify 1 0 1 | tee -a $gemm_log -./profile_gemm.sh gemm 0 1 $verify 1 0 1 | tee -a $gemm_log -./profile_gemm.sh gemm 1 1 $verify 1 0 1 | tee -a $gemm_log -./profile_gemm.sh gemm 2 1 $verify 1 0 1 | tee -a $gemm_log -./profile_gemm.sh gemm 3 1 $verify 1 0 1 | tee -a $gemm_log -./profile_gemm.sh gemm 0 2 $verify 1 0 1 | tee -a $gemm_log -./profile_gemm.sh gemm 1 2 $verify 1 0 1 | tee -a $gemm_log -./profile_gemm.sh gemm 2 2 $verify 1 0 1 | tee -a $gemm_log -./profile_gemm.sh gemm 3 2 $verify 1 0 1 | tee -a $gemm_log -./profile_gemm.sh gemm 0 3 $verify 1 0 1 | tee -a $gemm_log -./profile_gemm.sh gemm 1 3 $verify 1 0 1 | tee -a $gemm_log -./profile_gemm.sh gemm 2 3 $verify 1 0 1 | tee -a $gemm_log -./profile_gemm.sh gemm 3 3 $verify 1 0 1 | tee -a $gemm_log +./profile_gemm.sh gemm 0 0 $verify 1 0 1 2>&1 | tee -a $gemm_log +./profile_gemm.sh gemm 1 0 $verify 1 0 1 2>&1 | tee -a $gemm_log +./profile_gemm.sh gemm 2 0 $verify 1 0 1 2>&1 | tee -a $gemm_log +./profile_gemm.sh gemm 3 0 $verify 1 0 1 2>&1 | tee -a $gemm_log +./profile_gemm.sh gemm 0 1 $verify 1 0 1 2>&1 | tee -a $gemm_log +./profile_gemm.sh gemm 1 1 $verify 1 0 1 2>&1 | tee -a $gemm_log +./profile_gemm.sh gemm 2 1 $verify 1 0 1 2>&1 | tee -a $gemm_log +./profile_gemm.sh gemm 3 1 $verify 1 0 1 2>&1 | tee -a $gemm_log +./profile_gemm.sh gemm 0 2 $verify 1 0 1 2>&1 | tee -a $gemm_log +./profile_gemm.sh gemm 1 2 $verify 1 0 1 2>&1 | tee -a $gemm_log +./profile_gemm.sh gemm 2 2 $verify 1 0 1 2>&1 | tee -a $gemm_log +./profile_gemm.sh gemm 3 2 $verify 1 0 1 2>&1 | tee -a $gemm_log +./profile_gemm.sh gemm 0 3 $verify 1 0 1 2>&1 | tee -a $gemm_log +./profile_gemm.sh gemm 1 3 $verify 1 0 1 2>&1 | tee -a $gemm_log +./profile_gemm.sh gemm 2 3 $verify 1 0 1 2>&1 | tee -a $gemm_log +./profile_gemm.sh gemm 3 3 $verify 1 0 1 2>&1 | tee -a $gemm_log #run batched_gemm tests export batched_gemm_log="perf_batched_gemm_${gpu_arch}.log" print_log_header $batched_gemm_log $env_type $branch $host_name -./profile_batched_gemm.sh batched_gemm 0 0 $verify 1 0 1 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 0 1 $verify 1 0 1 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 0 2 $verify 1 0 1 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 0 3 $verify 1 0 1 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 1 0 $verify 1 0 1 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 1 1 $verify 1 0 1 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 1 2 $verify 1 0 1 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 1 3 $verify 1 0 1 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 2 0 $verify 1 0 1 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 2 1 $verify 1 0 1 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 2 2 $verify 1 0 1 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 2 3 $verify 1 0 1 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 3 0 $verify 1 0 1 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 3 1 $verify 1 0 1 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 3 2 $verify 1 0 1 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 3 3 $verify 1 0 1 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 0 0 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 0 1 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 0 2 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 0 3 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 1 0 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 1 1 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 1 2 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 1 3 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 2 0 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 2 1 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 2 2 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 2 3 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 3 0 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 3 1 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 3 2 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 3 3 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log #run grouped_gemm tests export grouped_gemm_log="perf_grouped_gemm_${gpu_arch}.log" print_log_header $grouped_gemm_log $env_type $branch $host_name -./profile_grouped_gemm.sh grouped_gemm 1 0 $verify 1 0 1 | tee -a $grouped_gemm_log -./profile_grouped_gemm.sh grouped_gemm 1 1 $verify 1 0 1 | tee -a $grouped_gemm_log -./profile_grouped_gemm.sh grouped_gemm 1 2 $verify 1 0 1 | tee -a $grouped_gemm_log -./profile_grouped_gemm.sh grouped_gemm 1 3 $verify 1 0 1 | tee -a $grouped_gemm_log +./profile_grouped_gemm.sh grouped_gemm 1 0 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_log +./profile_grouped_gemm.sh grouped_gemm 1 1 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_log +./profile_grouped_gemm.sh grouped_gemm 1 2 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_log +./profile_grouped_gemm.sh grouped_gemm 1 3 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_log #run GEMM+Bilinear tests export gemm_bilinear_log="perf_gemm_bilinear_${gpu_arch}.log" print_log_header $gemm_bilinear_log $env_type $branch $host_name -./profile_gemm_bilinear.sh gemm_bilinear 1 0 $verify 1 0 1 | tee -a $gemm_bilinear_log -./profile_gemm_bilinear.sh gemm_bilinear 1 1 $verify 1 0 1 | tee -a $gemm_bilinear_log -./profile_gemm_bilinear.sh gemm_bilinear 1 2 $verify 1 0 1 | tee -a $gemm_bilinear_log -./profile_gemm_bilinear.sh gemm_bilinear 1 3 $verify 1 0 1 | tee -a $gemm_bilinear_log +./profile_gemm_bilinear.sh gemm_bilinear 1 0 $verify 1 0 1 2>&1 | tee -a $gemm_bilinear_log +./profile_gemm_bilinear.sh gemm_bilinear 1 1 $verify 1 0 1 2>&1 | tee -a $gemm_bilinear_log +./profile_gemm_bilinear.sh gemm_bilinear 1 2 $verify 1 0 1 2>&1 | tee -a $gemm_bilinear_log +./profile_gemm_bilinear.sh gemm_bilinear 1 3 $verify 1 0 1 2>&1 | tee -a $gemm_bilinear_log #run conv_fwd tests export conv_fwd_log="perf_conv_fwd_${gpu_arch}.log" print_log_header $conv_fwd_log $env_type $branch $host_name -./profile_conv_fwd.sh conv_fwd 0 1 $verify 1 0 1 256 | tee -a $conv_fwd_log -./profile_conv_fwd.sh conv_fwd 1 1 $verify 1 0 1 256 | tee -a $conv_fwd_log -./profile_conv_fwd.sh conv_fwd 2 1 $verify 1 0 1 256 | tee -a $conv_fwd_log -./profile_conv_fwd.sh conv_fwd 3 1 $verify 1 0 1 256 | tee -a $conv_fwd_log +./profile_conv_fwd.sh conv_fwd 0 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log +./profile_conv_fwd.sh conv_fwd 1 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log +./profile_conv_fwd.sh conv_fwd 2 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log +./profile_conv_fwd.sh conv_fwd 3 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log #run conv_bwd_data tests export conv_bwd_data_log="perf_conv_bwd_data_${gpu_arch}.log" print_log_header $conv_bwd_data_log $env_type $branch $host_name -./profile_conv_bwd_data.sh conv_bwd_data 0 1 $verify 1 0 1 256 | tee -a $conv_bwd_data_log -./profile_conv_bwd_data.sh conv_bwd_data 1 1 $verify 1 0 1 256 | tee -a $conv_bwd_data_log -./profile_conv_bwd_data.sh conv_bwd_data 2 1 $verify 1 0 1 256 | tee -a $conv_bwd_data_log -./profile_conv_bwd_data.sh conv_bwd_data 3 1 $verify 1 0 1 256 | tee -a $conv_bwd_data_log +./profile_conv_bwd_data.sh conv_bwd_data 0 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log +./profile_conv_bwd_data.sh conv_bwd_data 1 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log +./profile_conv_bwd_data.sh conv_bwd_data 2 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log +./profile_conv_bwd_data.sh conv_bwd_data 3 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log #run resnet50 tests export resnet256_log="perf_resnet50_N256_${gpu_arch}.log" print_log_header $resnet256_log $env_type $branch $host_name -./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 1 0 1 256 | tee -a $resnet256_log +./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 1 0 1 256 2>&1 | tee -a $resnet256_log export resnet4_log="perf_resnet50_N4_${gpu_arch}.log" print_log_header $resnet4_log $env_type $branch $host_name -./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 1 0 1 4 | tee -a $resnet4_log +./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 1 0 1 4 2>&1 | tee -a $resnet4_log #run reduction tests export reduction_log="perf_reduction_${gpu_arch}.log" print_log_header $reduction_log $env_type $branch $host_name -./profile_reduce_with_index.sh $verify 2 10 --half | tee -a $reduction_log -./profile_reduce_no_index.sh $verify 2 10 --half | tee -a $reduction_log +./profile_reduce_with_index.sh $verify 2 10 --half 2>&1 | tee -a $reduction_log +./profile_reduce_no_index.sh $verify 2 10 --half 2>&1 | tee -a $reduction_log #run splitK_gemm tests export splitK_gemm_log="perf_splitK_gemm_${gpu_arch}.log" print_log_header $splitK_gemm_log $env_type $branch $host_name +./profile_splitK_gemm.sh gemm_splitk 0 0 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log +./profile_splitK_gemm.sh gemm_splitk 0 1 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log +./profile_splitK_gemm.sh gemm_splitk 0 2 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log +./profile_splitK_gemm.sh gemm_splitk 0 3 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log +./profile_splitK_gemm.sh gemm_splitk 1 0 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log +./profile_splitK_gemm.sh gemm_splitk 1 1 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log +./profile_splitK_gemm.sh gemm_splitk 1 2 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log +./profile_splitK_gemm.sh gemm_splitk 1 3 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log -../script/profile_splitK_gemm.sh gemm_splitk 0 0 $verify 1 0 1 4 | tee -a $splitK_gemm_log -../script/profile_splitK_gemm.sh gemm_splitk 0 1 $verify 1 0 1 4 | tee -a $splitK_gemm_log -../script/profile_splitK_gemm.sh gemm_splitk 0 2 $verify 1 0 1 4 | tee -a $splitK_gemm_log -../script/profile_splitK_gemm.sh gemm_splitk 0 3 $verify 1 0 1 4 | tee -a $splitK_gemm_log - -../script/profile_splitK_gemm.sh gemm_splitk 1 0 $verify 1 0 1 4 | tee -a $splitK_gemm_log -../script/profile_splitK_gemm.sh gemm_splitk 1 1 $verify 1 0 1 4 | tee -a $splitK_gemm_log -../script/profile_splitK_gemm.sh gemm_splitk 1 2 $verify 1 0 1 4 | tee -a $splitK_gemm_log -../script/profile_splitK_gemm.sh gemm_splitk 1 3 $verify 1 0 1 4 | tee -a $splitK_gemm_log +#run ONNX gemm tests +export onnx_log="perf_onnx_gemm_${gpu_arch}.log" +print_log_header $onnx_log $env_type $branch $host_name +./profile_onnx_gemm.sh gemm 0 0 $verify 2 0 1 2>&1 | tee -a $onnx_log +./profile_onnx_gemm.sh gemm 1 0 $verify 2 0 1 2>&1 | tee -a $onnx_log