Add full QA with verification option, few other changes. (#331)

* add verify flag and update scripts * replace old check_error function with the new check_err * fix syntax * remove blank spaces * remove empty line * add check_err for tensors * fix syntax * replace tensors with vectors in check_err calls * fix syntax * remove blank spaces * fix syntax * add new line at end of file * disable conv2d_bwd_weight test, add gpu check * set check_gpu using export * check GPU using runShell * add definition of runShell * fix script syntax * reduce the number of threads, add full qa option * run processing scripts in bash * fix the branch and host names in performance scripts, add chronos * replace parameterizedCron with cron * archive the perf log files * try to fix git call * pass branch and host names as arguments into scripts * fix script arguments * fix script arguments * process results on master * fix pipeline * add definition of gpu_arch * run processing scripts in docker * fix the brackets * add agent master for the processing stage * get rid of show_node_info call on master * try using mici label instead of master, disable MI100 tests for now * fix syntax * simplify container for results processing * remove node(master) from the process_results stage * put all stages in original order * change the agent label from master to mici for gfx908 [ROCm/composable_kernel commit: d8415a96b3]
2026-05-14 02:02:46 +00:00 · 2022-07-21 13:25:46 -07:00
parent cdb627bf1b
commit 6f6ae03ad8
16 changed files with 464 additions and 330 deletions
--- a/script/clang-format-overwrite.sh
+++ b/script/clang-format-overwrite.sh
--- a/script/process_perf_data.py
+++ b/script/process_perf_data.py
@@ -85,7 +85,6 @@ def parse_logfile(logfile):
        for line in open(logfile):
            if 'Best Perf' in line:
                lst=line.split()
-                print("len(lst)=",len(lst),"lst:",lst)
                if len(lst)>=37: #the line is complete
                    tests.append(glue.join(lst[5:30]))
                    kernels.append(glue.join(lst[37:]))
@@ -293,4 +292,4 @@ def main():
    return regression

 if __name__ == '__main__':
-    main()
+    main()
--- a/script/process_perf_data.sh
+++ b/script/process_perf_data.sh
@@ -0,0 +1,16 @@
+#!/bin/bash 
+#
+# in order to run this script you'd need the following python packages:
+
+pip3 install --upgrade pip
+pip3 install sqlalchemy pymysql pandas sshtunnel
+
+# you would also need to set up some environment variables in order to 
+# post your new test results to the database and compare them to the baseline
+# please contact Illia.Silin@amd.com for more details
+
+# Process the logs written by the run_*performance_tests.sh scripts; $1 is the GPU arch suffix in their file names (resnet50 batch sizes are N256 and N4).
+gpu_arch=$1
+python3 process_perf_data.py perf_gemm_"$gpu_arch".log
+python3 process_perf_data.py perf_resnet50_N256_"$gpu_arch".log
+python3 process_perf_data.py perf_resnet50_N4_"$gpu_arch".log
--- a/script/process_qa_data.sh
+++ b/script/process_qa_data.sh
@@ -0,0 +1,22 @@
+#!/bin/bash 
+#
+# in order to run this script you'd need the following python packages:
+
+pip3 install --upgrade pip
+pip3 install sqlalchemy pymysql pandas sshtunnel
+
+# you would also need to set up some environment variables in order to 
+# post your new test results to the database and compare them to the baseline
+# please contact Illia.Silin@amd.com for more details
+
+# Process every log written by run_full_performance_tests.sh; $1 is the GPU arch suffix in the log file names (resnet50 batch sizes are N256 and N4).
+gpu_arch=$1
+python3 process_perf_data.py perf_gemm_"$gpu_arch".log
+python3 process_perf_data.py perf_resnet50_N256_"$gpu_arch".log
+python3 process_perf_data.py perf_resnet50_N4_"$gpu_arch".log
+python3 process_perf_data.py perf_batched_gemm_"$gpu_arch".log
+python3 process_perf_data.py perf_grouped_gemm_"$gpu_arch".log
+python3 process_perf_data.py perf_fwd_conv_"$gpu_arch".log
+python3 process_perf_data.py perf_bwd_conv_"$gpu_arch".log
+python3 process_perf_data.py perf_fusion_"$gpu_arch".log
+python3 process_perf_data.py perf_reduction_"$gpu_arch".log
--- a/script/profile_batched_gemm.sh
+++ b/script/profile_batched_gemm.sh
@@ -11,26 +11,34 @@ INIT=$5
 LOG=$6
 REPEAT=$7
 
-########  op  datatype  layout  verify  init  log  repeat  M___ N___ K___  StrideA StrideB StrideC   BatchCount
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT  960  1024 1024       -1     -1      -1            8
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1920  2048 2048       -1     -1      -1            8
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3840  4096 4096       -1     -1      -1            4
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 7680  8192 8192       -1     -1      -1            2
+OP=$1
+DATATYPE=$2
+LAYOUT=$3
+VERIFY=$4
+INIT=$5
+LOG=$6
+REPEAT=$7
 
-#######  op  datatype  layout  verify  init  log  repeat  M___ N___ K___  StrideA StrideB StrideC   BatchCount
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024  1024 1024     1024    1024    1024           8
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048  2048 2048     2048    2048    2048           8
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096  4096 4096     4096    4096    4096           4
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192  8192 8192     8192    8192    8192           2
+########  op  datatype  layout  verify  init  log  repeat  M___ N___ K___  StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT  960  1024 1024       -1     -1      -1            -1           -1           -1          8
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1920  2048 2048       -1     -1      -1            -1           -1           -1          8
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3840  4096 4096       -1     -1      -1            -1           -1           -1          4
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 7680  8192 8192       -1     -1      -1            -1           -1           -1          2
 
-#######  op  datatype  layout  verify  init  log  repeat  M___ N___ K___  StrideA StrideB StrideC   BatchCount
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024  1024 1024     1056    1056    1056           8
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048  2048 2048     2080    2080    2080           8
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096  4096 4096     4128    4128    4128           4
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192  8192 8192     8224    8224    8224           2
+ #######  op  datatype  layout  verify  init  log  repeat  M___ N___ K___  StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024  1024 1024     1024    1024    1024           -1           -1           -1          8
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048  2048 2048     2048    2048    2048           -1           -1           -1          8
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096  4096 4096     4096    4096    4096           -1           -1           -1          4
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192  8192 8192     8192    8192    8192           -1           -1           -1          2
 
-#######  op  datatype  layout  verify  init  log  repeat  M___ N___ K___  StrideA StrideB StrideC    BatchCount
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024  1024 1024     1088    1088    1088           8
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048  2048 2048     2112    2112    2112           8
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096  4096 4096     4160    4160    4160           4
-$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192  8192 8192     8256    8256    8256           2
+ #######  op  datatype  layout  verify  init  log  repeat  M___ N___ K___  StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024  1024 1024     1056    1056    1056           -1           -1           -1          8
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048  2048 2048     2080    2080    2080           -1           -1           -1          8
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096  4096 4096     4128    4128    4128           -1           -1           -1          4
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192  8192 8192     8224    8224    8224           -1           -1           -1          2
+ 
+ #######  op  datatype  layout  verify  init  log  repeat  M___ N___ K___  StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024  1024 1024     1088    1088    1088           -1           -1           -1          8
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048  2048 2048     2112    2112    2112           -1           -1           -1          8
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096  4096 4096     4160    4160    4160           -1           -1           -1          4
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192  8192 8192     8256    8256    8256           -1           -1           -1          2
--- a/script/profile_gemm_bilinear.sh
+++ b/script/profile_gemm_bilinear.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+## GPU visibility
+export HIP_VISIBLE_DEVICES=0
+DRIVER="../build/bin/ckProfiler"
+OP=$1
+DATATYPE=$2
+LAYOUT=$3
+VERIFY=$4
+INIT=$5
+LOG=$6
+TIME=$7
+ 
+########  op  datatype  layout  verify  init  log  time  M___ N___ K___  StrideA StrideB StrideD StrideE Alpha Beta
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME  960  1024 1024       -1      -1      -1      -1     1    1
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1920  2048 2048       -1      -1      -1      -1     1    1
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 3840  4096 4096       -1      -1      -1      -1     1    1
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 7680  8192 8192       -1      -1      -1      -1     1    1
+ 
+########  op  datatype  layout  verify  init  log  time  M___ N___ K___  StrideA StrideB StrideD StrideE Alpha Beta
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME  960  1024 1024       -1      -1       0      -1     1    1
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1920  2048 2048       -1      -1       0      -1     1    1
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 3840  4096 4096       -1      -1       0      -1     1    1
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 7680  8192 8192       -1      -1       0      -1     1    1
+ 
+########  op  datatype  layout  verify  init  log  time  M___ N___ K___  StrideA StrideB StrideD StrideE Alpha Beta
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1000  1000 1000       -1      -1       0      -1     1    1
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2000  2000 2000       -1      -1       0      -1     1    1
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4000  4000 4000       -1      -1       0      -1     1    1
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8000  8000 8000       -1      -1       0      -1     1    1
+ 
+########  op  datatype  layout  verify  init  log  time  M___ N___ K___  StrideA StrideB StrideD StrideE Alpha Beta
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024  1024 1024     1056    1056    1056    1056     1    1
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048  2048 2048     2080    2080    2080    2080     1    1
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4096  4096 4096     4128    4128    4128    4128     1    1
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8192  8192 8192     8224    8224    8224    8224     1    1
+ 
+########  op  datatype  layout  verify  init  log  time  M___ N___ K___  StrideA StrideB StrideD StrideE Alpha Beta
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024  1024 1024     1088    1088    1088    1088     1    1
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048  2048 2048     2112    2112    2112    2112     1    1
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4096  4096 4096     4160    4160    4160    4160     1    1
+ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8192  8192 8192     8256    8256    8256    8256     1    1
--- a/script/run_full_performance_tests.sh
+++ b/script/run_full_performance_tests.sh
@@ -1,124 +1,124 @@
 #!/bin/bash 
 #
 # in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
-# and make sure the following python packages are installed in your environment:
-
-pip3 install --upgrade pip
-pip3 install sqlalchemy pymysql pandas sshtunnel
-
 # you would also need to set up some environment variables in order to 
 # post your new test results to the database and compare them to the baseline
 # please contact Illia.Silin@amd.com for more details
 #
-# run the script as "./run_full_performance_tests.sh <tag for your test environment>
-
-#get the test environment type:
-export env_type=$1
-echo 'Environment type ' $env_type
+# run the script as "./run_full_performance_tests.sh <verification> <tag for your test environment> <gpu_arch> <branch name> <node name>"
+# input arguments: 
+# verification = 0 : do not verify result correctness on CPU
+#              = 1 : verify correctness on CPU (may take a long time)
+# environment tag  : a string describing the specifics of your test environment
+# gpu_arch         : a string for GPU architecture, e.g. "gfx908" or "gfx90a".
+# branch name      : name of the branch in git repo (git status | grep -e 'On branch')
+# node name        : $hostname

+#get the command line arguments:
+export verify=$1
+echo 'Verification: ' $verify
+export env_type=$2
+echo 'Environment type: ' $env_type
+export gpu_arch=$3
+echo 'GPU architecture: ' $gpu_arch
+export branch=$4
+echo 'Branch name: ' $branch
+export host_name=$5
+echo 'Host name: ' $host_name
 function print_log_header(){
 	rm -f $1;
-	git status | grep -e 'On branch' > $1;
-	echo -n 'Node name: ' >>$1; hostname >> $1;
+	echo 'On branch ' $3 &> $1;
+	echo 'Node name: ' $4 >> $1;
 	#get GPU_arch and number of compute units from rocminfo
 	echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1;
 	rocminfo | grep "Compute Unit:" >> $1;
 	hipcc --version | grep -e 'HIP version'  >> $1;
-	echo 'Environment type: ' $2 >>$1;
+	echo 'Environment type: ' $2 >> $1;
 	/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1;
 }

 #run gemm tests
-export gemm_log="perf_gemm.log"
-print_log_header $gemm_log $env_type
-./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a $gemm_log
-python3 process_perf_data.py $gemm_log
+export gemm_log="perf_gemm_${gpu_arch}.log"
+print_log_header $gemm_log $env_type $branch $host_name
+./profile_gemm.sh gemm 0 0 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 1 0 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 2 0 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 3 0 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 0 1 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 1 1 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 2 1 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 3 1 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 0 2 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 1 2 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 2 2 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 3 2 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 0 3 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 1 3 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 2 3 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 3 3 $verify 1 0 5 | tee -a $gemm_log

 #run resnet50 tests
-export resnet256_log="perf_resnet50_N256.log"
-print_log_header $resnet256_log $env_type
-./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a $resnet256_log
-python3 process_perf_data.py $resnet256_log
-export resnet4_log="perf_resnet50_N4.log"
-print_log_header $resnet4_log $env_type
-./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a $resnet4_log
-python3 process_perf_data.py $resnet4_log
+export resnet256_log="perf_resnet50_N256_${gpu_arch}.log"
+print_log_header $resnet256_log $env_type $branch $host_name
+./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 2 0 1 256 | tee -a $resnet256_log
+export resnet4_log="perf_resnet50_N4_${gpu_arch}.log"
+print_log_header $resnet4_log $env_type $branch $host_name
+./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 2 0 1 4 | tee -a $resnet4_log

 #run batched_gemm tests
-export batched_gemm_log="perf_batched_gemm.log"
-print_log_header $batched_gemm_log $env_type
-./profile_batched_gemm.sh batched_gemm 0 0 0 2 0 5 | tee -a $batched_gemm_log
-./profile_batched_gemm.sh batched_gemm 0 1 0 2 0 5 | tee -a $batched_gemm_log
-./profile_batched_gemm.sh batched_gemm 0 2 0 2 0 5 | tee -a $batched_gemm_log
-./profile_batched_gemm.sh batched_gemm 0 3 0 2 0 5 | tee -a $batched_gemm_log
-./profile_batched_gemm.sh batched_gemm 1 0 0 2 0 5 | tee -a $batched_gemm_log
-./profile_batched_gemm.sh batched_gemm 1 1 0 2 0 5 | tee -a $batched_gemm_log
-./profile_batched_gemm.sh batched_gemm 1 2 0 2 0 5 | tee -a $batched_gemm_log
-./profile_batched_gemm.sh batched_gemm 1 3 0 2 0 5 | tee -a $batched_gemm_log
-./profile_batched_gemm.sh batched_gemm 2 0 0 2 0 5 | tee -a $batched_gemm_log
-./profile_batched_gemm.sh batched_gemm 2 1 0 2 0 5 | tee -a $batched_gemm_log
-./profile_batched_gemm.sh batched_gemm 2 2 0 2 0 5 | tee -a $batched_gemm_log
-./profile_batched_gemm.sh batched_gemm 2 3 0 2 0 5 | tee -a $batched_gemm_log
-./profile_batched_gemm.sh batched_gemm 3 0 0 2 0 5 | tee -a $batched_gemm_log
-./profile_batched_gemm.sh batched_gemm 3 1 0 2 0 5 | tee -a $batched_gemm_log
-./profile_batched_gemm.sh batched_gemm 3 2 0 2 0 5 | tee -a $batched_gemm_log
-./profile_batched_gemm.sh batched_gemm 3 3 0 2 0 5 | tee -a $batched_gemm_log
-python3 process_perf_data.py $batched_gemm_log
+export batched_gemm_log="perf_batched_gemm_${gpu_arch}.log"
+print_log_header $batched_gemm_log $env_type $branch $host_name
+./profile_batched_gemm.sh batched_gemm 0 0 $verify 2 0 5 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 0 1 $verify 2 0 5 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 0 2 $verify 2 0 5 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 0 3 $verify 2 0 5 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 1 0 $verify 2 0 5 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 1 1 $verify 2 0 5 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 1 2 $verify 2 0 5 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 1 3 $verify 2 0 5 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 2 0 $verify 2 0 5 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 2 1 $verify 2 0 5 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 2 2 $verify 2 0 5 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 2 3 $verify 2 0 5 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 3 0 $verify 2 0 5 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 3 1 $verify 2 0 5 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 3 2 $verify 2 0 5 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 3 3 $verify 2 0 5 | tee -a $batched_gemm_log

 #run grouped_gemm tests
-export grouped_gemm_log="perf_grouped_gemm.log"
-print_log_header $grouped_gemm_log $env_type
-./profile_grouped_gemm.sh grouped_gemm 1 0 0 2 0 5 | tee -a $grouped_gemm_log
-./profile_grouped_gemm.sh grouped_gemm 1 1 0 2 0 5 | tee -a $grouped_gemm_log
-./profile_grouped_gemm.sh grouped_gemm 1 2 0 2 0 5 | tee -a $grouped_gemm_log
-./profile_grouped_gemm.sh grouped_gemm 1 3 0 2 0 5 | tee -a $grouped_gemm_log
-python3 process_perf_data.py $grouped_gemm_log
+export grouped_gemm_log="perf_grouped_gemm_${gpu_arch}.log"
+print_log_header $grouped_gemm_log $env_type $branch $host_name
+./profile_grouped_gemm.sh grouped_gemm 1 0 $verify 2 0 5 | tee -a $grouped_gemm_log
+./profile_grouped_gemm.sh grouped_gemm 1 1 $verify 2 0 5 | tee -a $grouped_gemm_log
+./profile_grouped_gemm.sh grouped_gemm 1 2 $verify 2 0 5 | tee -a $grouped_gemm_log
+./profile_grouped_gemm.sh grouped_gemm 1 3 $verify 2 0 5 | tee -a $grouped_gemm_log

 #run fwd_conv tests
-export fwd_conv_log="perf_fwd_conv.log"
-print_log_header $fwd_conv_log $env_type
-./profile_conv.sh conv_fwd 0 1 0 2 0 5 2 256 | tee -a $fwd_conv_log
-./profile_conv.sh conv_fwd 1 1 0 2 0 5 2 256 | tee -a $fwd_conv_log
-./profile_conv.sh conv_fwd 2 1 0 2 0 5 2 256 | tee -a $fwd_conv_log
-./profile_conv.sh conv_fwd 3 1 0 2 0 5 2 256 | tee -a $fwd_conv_log
-python3 process_perf_data.py $fwd_conv_log
+export fwd_conv_log="perf_fwd_conv_${gpu_arch}.log"
+print_log_header $fwd_conv_log $env_type $branch $host_name
+./profile_conv.sh conv_fwd 0 1 $verify 2 0 5 2 256 | tee -a $fwd_conv_log
+./profile_conv.sh conv_fwd 1 1 $verify 2 0 5 2 256 | tee -a $fwd_conv_log
+./profile_conv.sh conv_fwd 2 1 $verify 2 0 5 2 256 | tee -a $fwd_conv_log
+./profile_conv.sh conv_fwd 3 1 $verify 2 0 5 2 256 | tee -a $fwd_conv_log

 #run bwd_conv tests
-export bwd_conv_log="perf_bwd_conv.log"
-print_log_header $bwd_conv_log $env_type
-./profile_conv.sh conv2d_bwd_data 0 1 1 1 0 2 0 5 128 | tee -a $bwd_conv_log
-./profile_conv.sh conv2d_bwd_data 1 1 1 1 0 2 0 5 128 | tee -a $bwd_conv_log
-./profile_conv.sh conv2d_bwd_data 2 1 1 1 0 2 0 5 128 | tee -a $bwd_conv_log
-./profile_conv.sh conv2d_bwd_data 3 1 1 1 0 2 0 5 128 | tee -a $bwd_conv_log
-python3 process_perf_data.py $bwd_conv_log
+export bwd_conv_log="perf_bwd_conv_${gpu_arch}.log"
+print_log_header $bwd_conv_log $env_type $branch $host_name
+./profile_conv.sh conv2d_bwd_data 0 1 1 1 $verify 2 0 5 128 | tee -a $bwd_conv_log
+./profile_conv.sh conv2d_bwd_data 1 1 1 1 $verify 2 0 5 128 | tee -a $bwd_conv_log
+./profile_conv.sh conv2d_bwd_data 2 1 1 1 $verify 2 0 5 128 | tee -a $bwd_conv_log
+./profile_conv.sh conv2d_bwd_data 3 1 1 1 $verify 2 0 5 128 | tee -a $bwd_conv_log

 #run fusion tests
-export fusion_log="perf_fusion.log"
-print_log_header $fusion_log $env_type
-./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 0 0 2 0 5 | tee -a $fusion_log
-./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 1 0 2 0 5 | tee -a $fusion_log
-./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 2 0 2 0 5 | tee -a $fusion_log
-./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 3 0 2 0 5 | tee -a $fusion_log
-python3 process_perf_data.py $fusion_log
+export fusion_log="perf_fusion_${gpu_arch}.log"
+print_log_header $fusion_log $env_type $branch $host_name
+./profile_gemm_bilinear.sh gemm_bilinear 1 0 $verify 2 0 1 | tee -a $fusion_log
+./profile_gemm_bilinear.sh gemm_bilinear 1 1 $verify 2 0 1 | tee -a $fusion_log
+./profile_gemm_bilinear.sh gemm_bilinear 1 2 $verify 2 0 1 | tee -a $fusion_log
+./profile_gemm_bilinear.sh gemm_bilinear 1 3 $verify 2 0 1 | tee -a $fusion_log

 #run reduction tests
-export reduction_log="perf_reduction.log"
-print_log_header $reduction_log $env_type
-./profile_reduce_with_index.sh 0 2 10 --half | tee -a $reduction_log
-./profile_reduce_no_index.sh 0 2 10 --half | tee -a $reduction_log
-python3 process_perf_data.py $reduction_log
+export reduction_log="perf_reduction_${gpu_arch}.log"
+print_log_header $reduction_log $env_type $branch $host_name
+./profile_reduce_with_index.sh $verify 2 10 --half | tee -a $reduction_log
+./profile_reduce_no_index.sh $verify 2 10 --half | tee -a $reduction_log
--- a/script/run_performance_tests.sh
+++ b/script/run_performance_tests.sh
@@ -1,59 +1,62 @@
 #!/bin/bash 
 #
 # in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
-# and make sure the following python packages are installed in your environment:
+# run the script as "./run_performance_tests.sh <verification> <tag for your test environment> <gpu_arch> <branch name> <node name>"
+# input arguments: 
+# verification = 0 : do not verify result correctness on CPU
+#              = 1 : verify correctness on CPU (may take a long time)
+# environment tag  : a string describing the specifics of your test environment
+# gpu_arch         : a string for GPU architecture, e.g. "gfx908" or "gfx90a".
+# branch name      : name of the branch in git repo (git status | grep -e 'On branch')
+# node name        : $hostname

-pip3 install --upgrade pip
-pip3 install sqlalchemy pymysql pandas sshtunnel
-
-# you would also need to set up some environment variables in order to 
-# post your new test results to the database and compare them to the baseline
-# please contact Illia.Silin@amd.com for more details
-#
-# run the script as "./run_performance_tests.sh <tag for your test environment>
-
-#get the test environment type:
-export env_type=$1
-echo 'Environment type ' $env_type
+#get the command line arguments:
+export verify=$1
+echo 'Verification: ' $verify
+export env_type=$2
+echo 'Environment type: ' $env_type
+export gpu_arch=$3
+echo 'GPU architecture: ' $gpu_arch
+export branch=$4
+echo 'Branch name: ' $branch
+export host_name=$5
+echo 'Host name: ' $host_name

 function print_log_header(){
 	rm -f $1;
-	git status | grep -e 'On branch' > $1;
-	echo -n 'Node name: ' >>$1; hostname >> $1;
+	echo 'On branch ' $3 &> $1;
+	echo 'Node name: ' $4 >> $1;
 	#get GPU_arch and number of compute units from rocminfo
 	echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1;
 	rocminfo | grep "Compute Unit:" >> $1;
 	hipcc --version | grep -e 'HIP version'  >> $1;
-	echo 'Environment type: ' $2 >>$1;
+	echo 'Environment type: ' $2 >> $1;
 	/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1;
 }
 #run gemm tests
-export gemm_log="perf_gemm.log"
-print_log_header $gemm_log $env_type
-./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a $gemm_log
-./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a $gemm_log
-python3 process_perf_data.py $gemm_log
+export gemm_log="perf_gemm_${gpu_arch}.log"
+print_log_header $gemm_log $env_type $branch $host_name
+./profile_gemm.sh gemm 0 0 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 1 0 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 2 0 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 3 0 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 0 1 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 1 1 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 2 1 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 3 1 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 0 2 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 1 2 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 2 2 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 3 2 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 0 3 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 1 3 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 2 3 $verify 1 0 5 | tee -a $gemm_log
+./profile_gemm.sh gemm 3 3 $verify 1 0 5 | tee -a $gemm_log

 #run resnet50 test
-export resnet256_log="perf_resnet50_N256.log"
-print_log_header $resnet256_log $env_type
-./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a $resnet256_log
-python3 process_perf_data.py $resnet256_log
-export resnet4_log="perf_resnet50_N4.log"
-print_log_header $resnet4_log $env_type
-./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a $resnet4_log
-python3 process_perf_data.py $resnet4_log
+export resnet256_log="perf_resnet50_N256_${gpu_arch}.log"
+print_log_header $resnet256_log $env_type $branch $host_name
+./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 2 0 1 256 | tee -a $resnet256_log
+export resnet4_log="perf_resnet50_N4_${gpu_arch}.log"
+print_log_header $resnet4_log $env_type $branch $host_name
+./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 2 0 1 4 | tee -a $resnet4_log