mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
Add full QA with verification option, few other changes. (#331)
* add verify flag and update scripts
* replace old check_error function with the new check_err
* fix syntax
* remove blank spaces
* remove empty line
* add check_err for tensors
* fix syntax
* replace tensors with vectors in check_err calls
* fix syntax
* remove blank spaces
* fix syntax
* add new line at end of file
* disable conv2d_bwd_weight test, add gpu check
* set check_gpu using export
* check GPU using runShell
* add definition of runShell
* fix script syntax
* reduce the number of threads, add full qa option
* run processing scripts in bash
* fix the branch and host names in performance scripts, add chronos
* replace parameterizedCron with cron
* archive the perf log files
* try to fix git call
* pass branch and host names as arguments into scripts
* fix script arguments
* fix script arguments
* process results on master
* fix pipeline
* add definition of gpu_arch
* run processing scripts in docker
* fix the brackets
* add agent master for the processing stage
* get rid of show_node_info call on master
* try using mici label instead of master, disable MI100 tests for now
* fix syntax
* simplify container for results processing
* remove node(master) from the process_results stage
* put all stages in original order
* change the agent label from master to mici for gfx908
[ROCm/composable_kernel commit: d8415a96b3]
This commit is contained in:
265
Jenkinsfile
vendored
265
Jenkinsfile
vendored
@@ -11,6 +11,12 @@ def show_node_info() {
|
||||
"""
|
||||
}
|
||||
|
||||
def runShell(String command){
|
||||
def responseCode = sh returnStatus: true, script: "${command} &> tmp.txt"
|
||||
def output = readFile(file: "tmp.txt")
|
||||
return (output != "")
|
||||
}
|
||||
|
||||
def cmake_build(Map conf=[:]){
|
||||
|
||||
def compiler = conf.get("compiler","/opt/rocm/bin/hipcc")
|
||||
@@ -60,7 +66,7 @@ def cmake_build(Map conf=[:]){
|
||||
"""
|
||||
def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ")
|
||||
// reduce parallelism when compiling, clang uses too much memory
|
||||
def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(( \$(nproc) / 1 )) ${config_targets}")
|
||||
def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(( \$(nproc) / 2 )) ${config_targets}")
|
||||
def execute_cmd = conf.get("execute_cmd", "")
|
||||
|
||||
def cmd = conf.get("cmd", """
|
||||
@@ -113,7 +119,14 @@ def buildHipClangJob(Map conf=[:]){
|
||||
retimage = docker.build("${image}", dockerArgs + '.')
|
||||
withDockerContainer(image: image, args: dockerOpts) {
|
||||
timeout(time: 5, unit: 'MINUTES'){
|
||||
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
|
||||
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
|
||||
if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
|
||||
echo "GPU not found"
|
||||
throw e
|
||||
}
|
||||
else{
|
||||
echo "GPU is OK"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -125,7 +138,14 @@ def buildHipClangJob(Map conf=[:]){
|
||||
retimage = docker.build("${image}", dockerArgs + " --no-cache .")
|
||||
withDockerContainer(image: image, args: dockerOpts) {
|
||||
timeout(time: 5, unit: 'MINUTES'){
|
||||
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
|
||||
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo |tee clinfo.log'
|
||||
if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
|
||||
echo "GPU not found"
|
||||
throw e
|
||||
}
|
||||
else{
|
||||
echo "GPU is OK"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -133,7 +153,14 @@ def buildHipClangJob(Map conf=[:]){
|
||||
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
|
||||
timeout(time: 5, unit: 'HOURS')
|
||||
{
|
||||
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
|
||||
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
|
||||
if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
|
||||
echo "GPU not found"
|
||||
throw e
|
||||
}
|
||||
else{
|
||||
echo "GPU is OK"
|
||||
}
|
||||
cmake_build(conf)
|
||||
}
|
||||
}
|
||||
@@ -145,7 +172,6 @@ def reboot(){
|
||||
build job: 'reboot-slaves', propagate: false , parameters: [string(name: 'server', value: "${env.NODE_NAME}"),]
|
||||
}
|
||||
|
||||
|
||||
def buildHipClangJobAndReboot(Map conf=[:]){
|
||||
try{
|
||||
buildHipClangJob(conf)
|
||||
@@ -162,7 +188,6 @@ def buildHipClangJobAndReboot(Map conf=[:]){
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def runCKProfiler(Map conf=[:]){
|
||||
show_node_info()
|
||||
|
||||
@@ -189,7 +214,6 @@ def runCKProfiler(Map conf=[:]){
|
||||
}
|
||||
|
||||
def variant = env.STAGE_NAME
|
||||
|
||||
def retimage
|
||||
|
||||
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
|
||||
@@ -197,7 +221,14 @@ def runCKProfiler(Map conf=[:]){
|
||||
retimage = docker.build("${image}", dockerArgs + '.')
|
||||
withDockerContainer(image: image, args: dockerOpts) {
|
||||
timeout(time: 5, unit: 'MINUTES'){
|
||||
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
|
||||
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
|
||||
if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
|
||||
echo "GPU not found"
|
||||
throw e
|
||||
}
|
||||
else{
|
||||
echo "GPU is OK"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -209,89 +240,69 @@ def runCKProfiler(Map conf=[:]){
|
||||
retimage = docker.build("${image}", dockerArgs + " --no-cache .")
|
||||
withDockerContainer(image: image, args: dockerOpts) {
|
||||
timeout(time: 5, unit: 'MINUTES'){
|
||||
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
|
||||
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
|
||||
if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
|
||||
echo "GPU not found"
|
||||
throw e
|
||||
}
|
||||
else{
|
||||
echo "GPU is OK"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
|
||||
timeout(time: 5, unit: 'HOURS')
|
||||
timeout(time: 24, unit: 'HOURS')
|
||||
{
|
||||
cmake_build(conf)
|
||||
dir("script"){
|
||||
//run gemm performance tests
|
||||
def gemm_log = "perf_gemm_${gpu_arch}.log"
|
||||
sh "rm -f ${gemm_log}"
|
||||
sh "echo Branch name: ${env.BRANCH_NAME} > ${gemm_log}"
|
||||
sh "echo Node name: ${NODE_NAME} >> ${gemm_log}"
|
||||
sh "echo GPU_arch name: ${gpu_arch} >> ${gemm_log}"
|
||||
sh "rocminfo | grep 'Compute Unit:' >> ${gemm_log} "
|
||||
sh "hipcc --version | grep -e 'HIP version' >> ${gemm_log}"
|
||||
if (params.USE_9110){
|
||||
sh "echo Environment type: CI_9110 >> ${gemm_log}"
|
||||
if (params.RUN_FULL_QA){
|
||||
def qa_log = "qa_${gpu_arch}.log"
|
||||
if (params.USE_9110){
|
||||
sh "./run_full_performance_tests.sh 1 QA_9110 ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
|
||||
}
|
||||
else{
|
||||
sh "./run_full_performance_tests.sh 1 QA_release ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
|
||||
}
|
||||
archiveArtifacts "perf_gemm_${gpu_arch}.log"
|
||||
archiveArtifacts "perf_resnet50_N256_${gpu_arch}.log"
|
||||
archiveArtifacts "perf_resnet50_N4_${gpu_arch}.log"
|
||||
archiveArtifacts "perf_bathced_gemm_${gpu_arch}.log"
|
||||
archiveArtifacts "perf_grouped_gemm_${gpu_arch}.log"
|
||||
archiveArtifacts "perf_fwd_conv_${gpu_arch}.log"
|
||||
archiveArtifacts "perf_bwd_conv_${gpu_arch}.log"
|
||||
archiveArtifacts "perf_fusion_${gpu_arch}.log"
|
||||
archiveArtifacts "perf_reduction_${gpu_arch}.log"
|
||||
// stash perf files to master
|
||||
stash name: "perf_gemm_${gpu_arch}.log"
|
||||
stash name: "perf_resnet50_N256_${gpu_arch}.log"
|
||||
stash name: "perf_resnet50_N4_${gpu_arch}.log"
|
||||
stash name: "perf_bathced_gemm_${gpu_arch}.log"
|
||||
stash name: "perf_grouped_gemm_${gpu_arch}.log"
|
||||
stash name: "perf_fwd_conv_${gpu_arch}.log"
|
||||
stash name: "perf_bwd_conv_${gpu_arch}.log"
|
||||
stash name: "perf_fusion_${gpu_arch}.log"
|
||||
stash name: "perf_reduction_${gpu_arch}.log"
|
||||
//we will process results on the master node
|
||||
}
|
||||
else{
|
||||
sh "echo Environment type: CI_release >> ${gemm_log}"
|
||||
if (params.USE_9110){
|
||||
sh "./run_performance_tests.sh 0 CI_9110 ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
|
||||
}
|
||||
else{
|
||||
sh "./run_performance_tests.sh 0 CI_release ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}"
|
||||
}
|
||||
archiveArtifacts "perf_gemm_${gpu_arch}.log"
|
||||
archiveArtifacts "perf_resnet50_N256_${gpu_arch}.log"
|
||||
archiveArtifacts "perf_resnet50_N4_${gpu_arch}.log"
|
||||
// stash perf files to master
|
||||
stash name: "perf_gemm_${gpu_arch}.log"
|
||||
stash name: "perf_resnet50_N256_${gpu_arch}.log"
|
||||
stash name: "perf_resnet50_N4_${gpu_arch}.log"
|
||||
//we will process the results on the master node
|
||||
}
|
||||
sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${gemm_log}"
|
||||
sh "./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a ${gemm_log}"
|
||||
sh "./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a ${gemm_log}"
|
||||
sh "./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a ${gemm_log}"
|
||||
sh "./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a ${gemm_log}"
|
||||
sh "./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a ${gemm_log}"
|
||||
sh "./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a ${gemm_log}"
|
||||
sh "./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a ${gemm_log}"
|
||||
sh "./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a ${gemm_log}"
|
||||
sh "./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a ${gemm_log}"
|
||||
sh "./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a ${gemm_log}"
|
||||
sh "./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a ${gemm_log}"
|
||||
sh "./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a ${gemm_log}"
|
||||
sh "./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a ${gemm_log}"
|
||||
sh "./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a ${gemm_log}"
|
||||
sh "./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a ${gemm_log}"
|
||||
sh "./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a ${gemm_log}"
|
||||
//results will be parsed, stored, and analyzed within the python script
|
||||
//the script will return 0 if the performance criteria are met
|
||||
//or return 1 if the criteria are not met
|
||||
archiveArtifacts "${gemm_log}"
|
||||
sh "python3 process_perf_data.py ${gemm_log} "
|
||||
//run resnet50 test
|
||||
def resnet256_log = "perf_resnet50_N256_${gpu_arch}.log"
|
||||
sh "rm -f ${resnet256_log}"
|
||||
sh "echo Branch name: ${env.BRANCH_NAME} > ${resnet256_log}"
|
||||
sh "echo Node name: ${NODE_NAME} >> ${resnet256_log}"
|
||||
sh "echo GPU_arch name: ${gpu_arch} >> ${resnet256_log}"
|
||||
sh "rocminfo | grep 'Compute Unit:' >> ${resnet256_log} "
|
||||
sh "hipcc --version | grep -e 'HIP version' >> ${resnet256_log}"
|
||||
if (params.USE_9110){
|
||||
sh "echo Environment type: CI_9110 >> ${resnet256_log}"
|
||||
}
|
||||
else{
|
||||
sh "echo Environment type: CI_release >> ${resnet256_log}"
|
||||
}
|
||||
sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet256_log}"
|
||||
//first run tests with N=256
|
||||
sh "./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a ${resnet256_log}"
|
||||
archiveArtifacts "${resnet256_log}"
|
||||
sh "python3 process_perf_data.py ${resnet256_log} "
|
||||
//then run with N=4
|
||||
def resnet4_log = "perf_resnet50_N4_${gpu_arch}.log"
|
||||
sh "rm -f ${resnet4_log}"
|
||||
sh "echo Branch name: ${env.BRANCH_NAME} > ${resnet4_log}"
|
||||
sh "echo Node name: ${NODE_NAME} >> ${resnet4_log}"
|
||||
sh "echo GPU_arch name: ${gpu_arch} >> ${resnet4_log}"
|
||||
sh "rocminfo | grep 'Compute Unit:' >> ${resnet4_log} "
|
||||
sh "hipcc --version | grep -e 'HIP version' >> ${resnet4_log}"
|
||||
if (params.USE_9110){
|
||||
sh "echo Environment type: CI_9110 >> ${resnet4_log}"
|
||||
}
|
||||
else{
|
||||
sh "echo Environment type: CI_release >> ${resnet4_log}"
|
||||
}
|
||||
sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet4_log}"
|
||||
sh "./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a ${resnet4_log}"
|
||||
archiveArtifacts "${resnet4_log}"
|
||||
sh "python3 process_perf_data.py ${resnet4_log} "
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -299,7 +310,6 @@ def runCKProfiler(Map conf=[:]){
|
||||
return retimage
|
||||
}
|
||||
|
||||
|
||||
def runPerfTest(Map conf=[:]){
|
||||
try{
|
||||
runCKProfiler(conf)
|
||||
@@ -316,8 +326,76 @@ def runPerfTest(Map conf=[:]){
|
||||
}
|
||||
}
|
||||
|
||||
def process_results(Map conf=[:]){
|
||||
env.HSA_ENABLE_SDMA=0
|
||||
checkout scm
|
||||
def image = "composable_kernels"
|
||||
def prefixpath = "/opt/rocm"
|
||||
def gpu_arch = conf.get("gpu_arch", "gfx908")
|
||||
|
||||
// Jenkins is complaining about the render group
|
||||
def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
|
||||
if (conf.get("enforce_xnack_on", false)) {
|
||||
dockerOpts = dockerOpts + " --env HSA_XNACK=1"
|
||||
}
|
||||
def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='release' "
|
||||
|
||||
def variant = env.STAGE_NAME
|
||||
def retimage
|
||||
|
||||
gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
|
||||
try {
|
||||
retimage = docker.build("${image}", dockerArgs + '.')
|
||||
}
|
||||
catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
|
||||
echo "The job was cancelled or aborted"
|
||||
throw e
|
||||
}
|
||||
}
|
||||
|
||||
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
|
||||
timeout(time: 1, unit: 'HOURS'){
|
||||
try{
|
||||
dir("script"){
|
||||
if (params.RUN_FULL_QA){
|
||||
// unstash perf files to master
|
||||
unstash "perf_gemm_${gpu_arch}.log"
|
||||
unstash "perf_resnet50_N256_${gpu_arch}.log"
|
||||
unstash "perf_resnet50_N4_${gpu_arch}.log"
|
||||
unstash "perf_bathced_gemm_${gpu_arch}.log"
|
||||
unstash "perf_grouped_gemm_${gpu_arch}.log"
|
||||
unstash "perf_fwd_conv_${gpu_arch}.log"
|
||||
unstash "perf_bwd_conv_${gpu_arch}.log"
|
||||
unstash "perf_fusion_${gpu_arch}.log"
|
||||
unstash "perf_reduction_${gpu_arch}.log"
|
||||
sh "./process_qa_data.sh ${gpu_arch}"
|
||||
}
|
||||
else{
|
||||
// unstash perf files to master
|
||||
unstash "perf_gemm_${gpu_arch}.log"
|
||||
unstash "perf_resnet50_N256_${gpu_arch}.log"
|
||||
unstash "perf_resnet50_N4_${gpu_arch}.log"
|
||||
sh "./process_perf_data.sh ${gpu_arch}"
|
||||
}
|
||||
}
|
||||
}
|
||||
catch(e){
|
||||
echo "throwing error exception while processing performance test results"
|
||||
echo 'Exception occurred: ' + e.toString()
|
||||
throw e
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//launch develop branch daily at 23:00 in FULL_QA mode
|
||||
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;USE_9110=true''' : ""
|
||||
|
||||
pipeline {
|
||||
agent none
|
||||
triggers {
|
||||
cron(CRON_SETTINGS)
|
||||
}
|
||||
options {
|
||||
parallelsAlwaysFailFast()
|
||||
}
|
||||
@@ -325,7 +403,11 @@ pipeline {
|
||||
booleanParam(
|
||||
name: "USE_9110",
|
||||
defaultValue: true,
|
||||
description: "")
|
||||
description: "Select compiler version: 9110 (default) or release")
|
||||
booleanParam(
|
||||
name: "RUN_FULL_QA",
|
||||
defaultValue: false,
|
||||
description: "Select whether to run small set of performance tests (default) or full QA")
|
||||
}
|
||||
environment{
|
||||
dbuser = "${dbuser}"
|
||||
@@ -438,6 +520,25 @@ pipeline {
|
||||
}
|
||||
}
|
||||
}
|
||||
stage("Process Performance Test Results")
|
||||
{
|
||||
parallel
|
||||
{
|
||||
stage("Process results for gfx908"){
|
||||
agent { label 'mici' }
|
||||
steps{
|
||||
process_results(gpu_arch: "gfx908")
|
||||
}
|
||||
}
|
||||
stage("Process results for gfx90a"){
|
||||
agent { label 'mici' }
|
||||
steps{
|
||||
process_results(gpu_arch: "gfx90a")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* enable after the cmake file supports packaging
|
||||
stage("Packages") {
|
||||
when {
|
||||
|
||||
@@ -381,52 +381,3 @@ HostTensorDescriptor::HostTensorDescriptor(const std::vector<X>& lens,
|
||||
: mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
|
||||
{
|
||||
}
|
||||
|
||||
#if 1
|
||||
// FIXME: remove
|
||||
template <typename T>
|
||||
float check_error(const Tensor<T>& ref, const Tensor<T>& result)
|
||||
{
|
||||
float l1_error = 0;
|
||||
float linf_error = -1;
|
||||
float linf_rel_error = -1;
|
||||
|
||||
float linf_ref_value = 0, linf_result_value = 0;
|
||||
float linf_rel_ref_value = 0, linf_rel_result_value = 0;
|
||||
|
||||
constexpr float eps = 1e-10;
|
||||
|
||||
for(std::size_t i = 0; i < ref.mData.size(); ++i)
|
||||
{
|
||||
float ref_v = ck::type_convert<float>(ref.mData[i]);
|
||||
float result_v = ck::type_convert<float>(result.mData[i]);
|
||||
|
||||
float diff = std::abs(ref_v - result_v);
|
||||
float rel_diff = diff / std::max(std::abs(ref_v), eps);
|
||||
|
||||
l1_error += diff;
|
||||
|
||||
if(linf_error < diff)
|
||||
{
|
||||
linf_error = diff;
|
||||
linf_ref_value = ref_v;
|
||||
linf_result_value = result_v;
|
||||
}
|
||||
|
||||
if(linf_rel_error < rel_diff)
|
||||
{
|
||||
linf_rel_error = rel_diff;
|
||||
linf_rel_ref_value = ref_v;
|
||||
linf_rel_result_value = result_v;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Absolute Error L1 Norm (sum of abs diff): " << l1_error << std::endl;
|
||||
std::cout << "Absolute Error L-inf Norm (max abs diff): " << linf_error << ", ref "
|
||||
<< linf_ref_value << ", result " << linf_result_value << std::endl;
|
||||
std::cout << "Relative Error L-inf Norm (max relative abs diff): " << linf_rel_error << ", ref "
|
||||
<< linf_rel_ref_value << ", result " << linf_rel_result_value << std::endl;
|
||||
|
||||
return linf_error;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -29,9 +29,8 @@ check_err(const std::vector<T>& out,
|
||||
{
|
||||
if(out.size() != ref.size())
|
||||
{
|
||||
std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
||||
<< std::endl
|
||||
<< msg << std::endl;
|
||||
std::cout << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -48,9 +47,8 @@ check_err(const std::vector<T>& out,
|
||||
err_count++;
|
||||
if(err_count < 5)
|
||||
{
|
||||
std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
|
||||
<< i << "]: " << out[i] << " != " << ref[i] << std::endl
|
||||
<< msg << std::endl;
|
||||
std::cout << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
||||
<< "] != ref[" << i << "]: " << out[i] << " != " << ref[i] << std::endl;
|
||||
}
|
||||
res = false;
|
||||
}
|
||||
@@ -72,9 +70,8 @@ check_err(const std::vector<T>& out,
|
||||
{
|
||||
if(out.size() != ref.size())
|
||||
{
|
||||
std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
||||
<< std::endl
|
||||
<< msg << std::endl;
|
||||
std::cout << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -94,9 +91,8 @@ check_err(const std::vector<T>& out,
|
||||
err_count++;
|
||||
if(err_count < 5)
|
||||
{
|
||||
std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
|
||||
<< i << "]: " << o << " != " << r << std::endl
|
||||
<< msg << std::endl;
|
||||
std::cout << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
||||
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
|
||||
}
|
||||
res = false;
|
||||
}
|
||||
@@ -118,9 +114,8 @@ check_err(const std::vector<T>& out,
|
||||
{
|
||||
if(out.size() != ref.size())
|
||||
{
|
||||
std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
||||
<< std::endl
|
||||
<< msg << std::endl;
|
||||
std::cout << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -139,9 +134,8 @@ check_err(const std::vector<T>& out,
|
||||
err_count++;
|
||||
if(err_count < 5)
|
||||
{
|
||||
std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
|
||||
<< i << "]: " << o << " != " << r << std::endl
|
||||
<< msg << std::endl;
|
||||
std::cout << msg << std::setw(12) << std::setprecision(7) << " out[" << i
|
||||
<< "] != ref[" << i << "]: " << o << " != " << r << std::endl;
|
||||
}
|
||||
res = false;
|
||||
}
|
||||
@@ -163,9 +157,8 @@ check_err(const std::vector<T>& out,
|
||||
{
|
||||
if(out.size() != ref.size())
|
||||
{
|
||||
std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
||||
<< std::endl
|
||||
<< msg << std::endl;
|
||||
std::cout << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -185,9 +178,9 @@ check_err(const std::vector<T>& out,
|
||||
err_count++;
|
||||
if(err_count < 5)
|
||||
{
|
||||
std::cout << "out[" << i << "] != ref[" << i << "]: " << static_cast<int>(out[i])
|
||||
<< " != " << static_cast<int>(ref[i]) << std::endl
|
||||
<< msg << std::endl;
|
||||
std::cout << msg << " out[" << i << "] != ref[" << i
|
||||
<< "]: " << static_cast<int>(out[i]) << " != " << static_cast<int>(ref[i])
|
||||
<< std::endl;
|
||||
}
|
||||
res = false;
|
||||
}
|
||||
|
||||
@@ -318,13 +318,16 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
|
||||
reduce0_device_buf.FromDevice(d0_g_m_device_result.mData.data());
|
||||
reduce1_device_buf.FromDevice(d1_g_m_device_result.mData.data());
|
||||
|
||||
float c_error = check_error(c_g_m_n_host_result, c_g_m_n_device_result);
|
||||
float d0_error = check_error(d0_g_m_host_result, d0_g_m_device_result);
|
||||
float d1_error = check_error(d1_g_m_host_result, d1_g_m_device_result);
|
||||
bool c_error =
|
||||
ck::utils::check_err(c_g_m_n_host_result.mData, c_g_m_n_device_result.mData);
|
||||
bool d0_error =
|
||||
ck::utils::check_err(d0_g_m_host_result.mData, d0_g_m_device_result.mData);
|
||||
bool d1_error =
|
||||
ck::utils::check_err(d1_g_m_host_result.mData, d1_g_m_device_result.mData);
|
||||
|
||||
pass = pass && (c_error < 1E-6);
|
||||
pass = pass && (d0_error < 1E-6);
|
||||
pass = pass && (d1_error < 1E-6);
|
||||
pass = pass && (c_error == true);
|
||||
pass = pass && (d0_error == true);
|
||||
pass = pass && (d1_error == true);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -250,11 +250,11 @@ bool profile_conv_bwd_weight_impl(int do_verification,
|
||||
{
|
||||
wei_device_buf.FromDevice(wei_k_c_y_x_device_result.mData.data());
|
||||
|
||||
float max_error = check_error(wei_k_c_y_x_host_result, wei_k_c_y_x_device_result);
|
||||
pass = ck::utils::check_err(wei_k_c_y_x_host_result.mData,
|
||||
wei_k_c_y_x_device_result.mData);
|
||||
|
||||
if(max_error > 8)
|
||||
if(pass == false)
|
||||
{
|
||||
pass = false;
|
||||
std::cout << "Fail info:" << conv_ptr->GetTypeString() << std::endl;
|
||||
}
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/conv_util.hpp"
|
||||
#include "ck/library/host_tensor/device_memory.hpp"
|
||||
#include "ck/library/host_tensor/host_tensor.hpp"
|
||||
@@ -452,7 +453,7 @@ bool profile_convnd_bwd_data_impl(int do_verification,
|
||||
std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl;
|
||||
}
|
||||
|
||||
check_error(input_host_result, input_device_result);
|
||||
success = ck::utils::check_err(input_host_result.mData, input_device_result.mData);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -433,21 +433,17 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
|
||||
{
|
||||
wei_device_buf.FromDevice(weights_device_result.mData.data());
|
||||
|
||||
float max_error = check_error(weights_host_result, weights_device_result);
|
||||
success = ck::utils::check_err(weights_host_result.mData, weights_device_result.mData);
|
||||
|
||||
if(max_error > 8)
|
||||
if(success == false)
|
||||
{
|
||||
std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl;
|
||||
|
||||
success = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl;
|
||||
}
|
||||
|
||||
check_error(weights_host_result, weights_device_result);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
std::cout << "in : ";
|
||||
|
||||
0
script/clang-format-overwrite.sh
Normal file → Executable file
0
script/clang-format-overwrite.sh
Normal file → Executable file
@@ -85,7 +85,6 @@ def parse_logfile(logfile):
|
||||
for line in open(logfile):
|
||||
if 'Best Perf' in line:
|
||||
lst=line.split()
|
||||
print("len(lst)=",len(lst),"lst:",lst)
|
||||
if len(lst)>=37: #the line is complete
|
||||
tests.append(glue.join(lst[5:30]))
|
||||
kernels.append(glue.join(lst[37:]))
|
||||
@@ -293,4 +292,4 @@ def main():
|
||||
return regression
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
main()
|
||||
|
||||
16
script/process_perf_data.sh
Executable file
16
script/process_perf_data.sh
Executable file
@@ -0,0 +1,16 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# in order to run this script you'd need the following python packages:
|
||||
|
||||
pip3 install --upgrade pip
|
||||
pip3 install sqlalchemy pymysql pandas sshtunnel
|
||||
|
||||
# you would also need to set up some environment variables in order to
|
||||
# post your new test results to the database and compare them to the baseline
|
||||
# please contact Illia.Silin@amd.com for more details
|
||||
|
||||
#process results
|
||||
gpu_arch=$1
|
||||
python3 process_perf_data.py perf_gemm_"$gpu_arch".log
|
||||
python3 process_perf_data.py perf_resnet50_N265_"$gpu_arch".log
|
||||
python3 process_perf_data.py perf_resnet50_N4_"$gpu_arch".log
|
||||
22
script/process_qa_data.sh
Executable file
22
script/process_qa_data.sh
Executable file
@@ -0,0 +1,22 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# in order to run this script you'd need the following python packages:
|
||||
|
||||
pip3 install --upgrade pip
|
||||
pip3 install sqlalchemy pymysql pandas sshtunnel
|
||||
|
||||
# you would also need to set up some environment variables in order to
|
||||
# post your new test results to the database and compare them to the baseline
|
||||
# please contact Illia.Silin@amd.com for more details
|
||||
|
||||
#process results
|
||||
gpu_arch=$1
|
||||
python3 process_perf_data.py perf_gemm_"$gpu_arch".log
|
||||
python3 process_perf_data.py perf_resnet50_N265_"$gpu_arch".log
|
||||
python3 process_perf_data.py perf_resnet50_N4_"$gpu_arch".log
|
||||
python3 process_perf_data.py perf_batched_gemm_"$gpu_arch".log
|
||||
python3 process_perf_data.py perf_grouped_gemm_"$gpu_arch".log
|
||||
python3 process_perf_data.py perf_fwd_conv_"$gpu_arch".log
|
||||
python3 process_perf_data.py perf_bwd_conv_"$gpu_arch".log
|
||||
python3 process_perf_data.py perf_fusion_"$gpu_arch".log
|
||||
python3 process_perf_data.py perf_reduction_"$gpu_arch".log
|
||||
@@ -11,26 +11,34 @@ INIT=$5
|
||||
LOG=$6
|
||||
REPEAT=$7
|
||||
|
||||
######## op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchCount
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 960 1024 1024 -1 -1 -1 8
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1920 2048 2048 -1 -1 -1 8
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3840 4096 4096 -1 -1 -1 4
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 7680 8192 8192 -1 -1 -1 2
|
||||
OP=$1
|
||||
DATATYPE=$2
|
||||
LAYOUT=$3
|
||||
VERIFY=$4
|
||||
INIT=$5
|
||||
LOG=$6
|
||||
REPEAT=$7
|
||||
|
||||
####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchCount
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1024 1024 1024 8
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2048 2048 2048 8
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4096 4096 4096 4
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8192 8192 8192 2
|
||||
######## op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 960 1024 1024 -1 -1 -1 -1 -1 -1 8
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1920 2048 2048 -1 -1 -1 -1 -1 -1 8
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3840 4096 4096 -1 -1 -1 -1 -1 -1 4
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 7680 8192 8192 -1 -1 -1 -1 -1 -1 2
|
||||
|
||||
####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchCount
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1056 1056 1056 8
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2080 2080 2080 8
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4128 4128 4128 4
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8224 8224 8224 2
|
||||
####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1024 1024 1024 -1 -1 -1 8
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2048 2048 2048 -1 -1 -1 8
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4096 4096 4096 -1 -1 -1 4
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8192 8192 8192 -1 -1 -1 2
|
||||
|
||||
####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchCount
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1088 1088 1088 8
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2112 2112 2112 8
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4160 4160 4160 4
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8256 8256 8256 2
|
||||
####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1056 1056 1056 -1 -1 -1 8
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2080 2080 2080 -1 -1 -1 8
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4128 4128 4128 -1 -1 -1 4
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8224 8224 8224 -1 -1 -1 2
|
||||
|
||||
####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1088 1088 1088 -1 -1 -1 8
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2112 2112 2112 -1 -1 -1 8
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4160 4160 4160 -1 -1 -1 4
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8256 8256 8256 -1 -1 -1 2
|
||||
41
script/profile_gemm_bilinear.sh
Executable file
41
script/profile_gemm_bilinear.sh
Executable file
@@ -0,0 +1,41 @@
|
||||
#!/bin/bash
|
||||
## GPU visibility
|
||||
export HIP_VISIBLE_DEVICES=0
|
||||
DRIVER="../build/bin/ckProfiler"
|
||||
OP=$1
|
||||
DATATYPE=$2
|
||||
LAYOUT=$3
|
||||
VERIFY=$4
|
||||
INIT=$5
|
||||
LOG=$6
|
||||
TIME=$7
|
||||
|
||||
######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideD StrideE Alpha Beta
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 960 1024 1024 -1 -1 -1 -1 1 1
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1920 2048 2048 -1 -1 -1 -1 1 1
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 3840 4096 4096 -1 -1 -1 -1 1 1
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 7680 8192 8192 -1 -1 -1 -1 1 1
|
||||
|
||||
######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideD StrideE Alpha Beta
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 960 1024 1024 -1 -1 0 -1 1 1
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1920 2048 2048 -1 -1 0 -1 1 1
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 3840 4096 4096 -1 -1 0 -1 1 1
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 7680 8192 8192 -1 -1 0 -1 1 1
|
||||
|
||||
######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideD StrideE Alpha Beta
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1000 1000 1000 -1 -1 0 -1 1 1
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2000 2000 2000 -1 -1 0 -1 1 1
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4000 4000 4000 -1 -1 0 -1 1 1
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8000 8000 8000 -1 -1 0 -1 1 1
|
||||
|
||||
######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideD StrideE Alpha Beta
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024 1024 1024 1056 1056 1056 1056 1 1
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2080 2080 2080 2080 1 1
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4096 4096 4096 4128 4128 4128 4128 1 1
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8192 8192 8192 8224 8224 8224 8224 1 1
|
||||
|
||||
######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideD StrideE Alpha Beta
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024 1024 1024 1088 1088 1088 1088 1 1
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2112 2112 2112 2112 1 1
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4096 4096 4096 4160 4160 4160 4160 1 1
|
||||
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8192 8192 8192 8256 8256 8256 8256 1 1
|
||||
@@ -1,124 +1,124 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
|
||||
# and make sure the following python packages are installed in your environment:
|
||||
|
||||
pip3 install --upgrade pip
|
||||
pip3 install sqlalchemy pymysql pandas sshtunnel
|
||||
|
||||
# you would also need to set up some environment variables in order to
|
||||
# post your new test results to the database and compare them to the baseline
|
||||
# please contact Illia.Silin@amd.com for more details
|
||||
#
|
||||
# run the script as "./run_full_performance_tests.sh <tag for your test environment>"
|
||||
|
||||
#get the test environment type:
|
||||
export env_type=$1
|
||||
echo 'Environment type ' $env_type
|
||||
# run the script as "./run_full_performance_tests.sh <verification> <tag for your test environment> <gpu_arch> <branch name> <node name>"
|
||||
# input arguments:
|
||||
# verification = 0 : do not verify result correctness on CPU
|
||||
# = 1 : verify correctness on CPU (may take a long time)
|
||||
# environment tag : a string describing the specifics of your test environment
|
||||
# gpu_arch : a string for GPU architecture, e.g. "gfx908" or "gfx90a".
|
||||
# branch name : name of the branch in git repo (git status | grep -e 'On branch')
|
||||
# node name : $hostname
|
||||
|
||||
#get the command line arguments:
|
||||
export verify=$1
|
||||
echo 'Verification: ' $verify
|
||||
export env_type=$2
|
||||
echo 'Environment type: ' $env_type
|
||||
export gpu_arch=$3
|
||||
echo 'GPU architecture: ' $gpu_arch
|
||||
export branch=$4
|
||||
echo 'Branch name: ' $branch
|
||||
export host_name=$5
|
||||
echo 'Host name: ' $host_name
|
||||
function print_log_header(){
|
||||
rm -f $1;
|
||||
git status | grep -e 'On branch' > $1;
|
||||
echo -n 'Node name: ' >>$1; hostname >> $1;
|
||||
echo 'On branch ' $3 &> $1;
|
||||
echo 'Node name: ' $4 >> $1;
|
||||
#get GPU_arch and number of compute units from rocminfo
|
||||
echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1;
|
||||
rocminfo | grep "Compute Unit:" >> $1;
|
||||
hipcc --version | grep -e 'HIP version' >> $1;
|
||||
echo 'Environment type: ' $2 >>$1;
|
||||
echo 'Environment type: ' $2 >> $1;
|
||||
/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1;
|
||||
}
|
||||
|
||||
#run gemm tests
|
||||
export gemm_log="perf_gemm.log"
|
||||
print_log_header $gemm_log $env_type
|
||||
./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a $gemm_log
|
||||
python3 process_perf_data.py $gemm_log
|
||||
export gemm_log="perf_gemm_${gpu_arch}.log"
|
||||
print_log_header $gemm_log $env_type $branch $host_name
|
||||
./profile_gemm.sh gemm 0 0 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 1 0 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 2 0 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 3 0 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 0 1 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 1 1 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 2 1 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 3 1 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 0 2 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 1 2 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 2 2 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 3 2 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 0 3 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 1 3 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 2 3 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 3 3 $verify 1 0 5 | tee -a $gemm_log
|
||||
|
||||
#run resnet50 tests
|
||||
export resnet256_log="perf_resnet50_N256.log"
|
||||
print_log_header $resnet256_log $env_type
|
||||
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a $resnet256_log
|
||||
python3 process_perf_data.py $resnet256_log
|
||||
export resnet4_log="perf_resnet50_N4.log"
|
||||
print_log_header $resnet4_log $env_type
|
||||
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a $resnet4_log
|
||||
python3 process_perf_data.py $resnet4_log
|
||||
export resnet256_log="perf_resnet50_N256_${gpu_arch}.log"
|
||||
print_log_header $resnet256_log $env_type $branch $host_name
|
||||
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 2 0 1 256 | tee -a $resnet256_log
|
||||
export resnet4_log="perf_resnet50_N4_${gpu_arch}.log"
|
||||
print_log_header $resnet4_log $env_type $branch $host_name
|
||||
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 2 0 1 4 | tee -a $resnet4_log
|
||||
|
||||
#run batched_gemm tests
|
||||
export batched_gemm_log="perf_batched_gemm.log"
|
||||
print_log_header $batched_gemm_log $env_type
|
||||
./profile_batched_gemm.sh batched_gemm 0 0 0 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 0 1 0 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 0 2 0 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 0 3 0 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 1 0 0 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 1 1 0 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 1 2 0 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 1 3 0 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 2 0 0 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 2 1 0 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 2 2 0 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 2 3 0 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 3 0 0 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 3 1 0 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 3 2 0 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 3 3 0 2 0 5 | tee -a $batched_gemm_log
|
||||
python3 process_perf_data.py $batched_gemm_log
|
||||
export batched_gemm_log="perf_batched_gemm_${gpu_arch}.log"
|
||||
print_log_header $batched_gemm_log $env_type $branch $host_name
|
||||
./profile_batched_gemm.sh batched_gemm 0 0 $verify 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 0 1 $verify 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 0 2 $verify 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 0 3 $verify 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 1 0 $verify 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 1 1 $verify 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 1 2 $verify 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 1 3 $verify 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 2 0 $verify 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 2 1 $verify 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 2 2 $verify 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 2 3 $verify 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 3 0 $verify 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 3 1 $verify 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 3 2 $verify 2 0 5 | tee -a $batched_gemm_log
|
||||
./profile_batched_gemm.sh batched_gemm 3 3 $verify 2 0 5 | tee -a $batched_gemm_log
|
||||
|
||||
#run grouped_gemm tests
|
||||
export grouped_gemm_log="perf_grouped_gemm.log"
|
||||
print_log_header $grouped_gemm_log $env_type
|
||||
./profile_grouped_gemm.sh grouped_gemm 1 0 0 2 0 5 | tee -a $grouped_gemm_log
|
||||
./profile_grouped_gemm.sh grouped_gemm 1 1 0 2 0 5 | tee -a $grouped_gemm_log
|
||||
./profile_grouped_gemm.sh grouped_gemm 1 2 0 2 0 5 | tee -a $grouped_gemm_log
|
||||
./profile_grouped_gemm.sh grouped_gemm 1 3 0 2 0 5 | tee -a $grouped_gemm_log
|
||||
python3 process_perf_data.py $grouped_gemm_log
|
||||
export grouped_gemm_log="perf_grouped_gemm_${gpu_arch}.log"
|
||||
print_log_header $grouped_gemm_log $env_type $branch $host_name
|
||||
./profile_grouped_gemm.sh grouped_gemm 1 0 $verify 2 0 5 | tee -a $grouped_gemm_log
|
||||
./profile_grouped_gemm.sh grouped_gemm 1 1 $verify 2 0 5 | tee -a $grouped_gemm_log
|
||||
./profile_grouped_gemm.sh grouped_gemm 1 2 $verify 2 0 5 | tee -a $grouped_gemm_log
|
||||
./profile_grouped_gemm.sh grouped_gemm 1 3 $verify 2 0 5 | tee -a $grouped_gemm_log
|
||||
|
||||
#run fwd_conv tests
|
||||
export fwd_conv_log="perf_fwd_conv.log"
|
||||
print_log_header $fwd_conv_log $env_type
|
||||
./profile_conv.sh conv_fwd 0 1 0 2 0 5 2 256 | tee -a $fwd_conv_log
|
||||
./profile_conv.sh conv_fwd 1 1 0 2 0 5 2 256 | tee -a $fwd_conv_log
|
||||
./profile_conv.sh conv_fwd 2 1 0 2 0 5 2 256 | tee -a $fwd_conv_log
|
||||
./profile_conv.sh conv_fwd 3 1 0 2 0 5 2 256 | tee -a $fwd_conv_log
|
||||
python3 process_perf_data.py $fwd_conv_log
|
||||
export fwd_conv_log="perf_fwd_conv_${gpu_arch}.log"
|
||||
print_log_header $fwd_conv_log $env_type $branch $host_name
|
||||
./profile_conv.sh conv_fwd 0 1 $verify 2 0 5 2 256 | tee -a $fwd_conv_log
|
||||
./profile_conv.sh conv_fwd 1 1 $verify 2 0 5 2 256 | tee -a $fwd_conv_log
|
||||
./profile_conv.sh conv_fwd 2 1 $verify 2 0 5 2 256 | tee -a $fwd_conv_log
|
||||
./profile_conv.sh conv_fwd 3 1 $verify 2 0 5 2 256 | tee -a $fwd_conv_log
|
||||
|
||||
#run bwd_conv tests
|
||||
export bwd_conv_log="perf_bwd_conv.log"
|
||||
print_log_header $bwd_conv_log $env_type
|
||||
./profile_conv.sh conv2d_bwd_data 0 1 1 1 0 2 0 5 128 | tee -a $bwd_conv_log
|
||||
./profile_conv.sh conv2d_bwd_data 1 1 1 1 0 2 0 5 128 | tee -a $bwd_conv_log
|
||||
./profile_conv.sh conv2d_bwd_data 2 1 1 1 0 2 0 5 128 | tee -a $bwd_conv_log
|
||||
./profile_conv.sh conv2d_bwd_data 3 1 1 1 0 2 0 5 128 | tee -a $bwd_conv_log
|
||||
python3 process_perf_data.py $bwd_conv_log
|
||||
export bwd_conv_log="perf_bwd_conv_${gpu_arch}.log"
|
||||
print_log_header $bwd_conv_log $env_type $branch $host_name
|
||||
./profile_conv.sh conv2d_bwd_data 0 1 1 1 $verify 2 0 5 128 | tee -a $bwd_conv_log
|
||||
./profile_conv.sh conv2d_bwd_data 1 1 1 1 $verify 2 0 5 128 | tee -a $bwd_conv_log
|
||||
./profile_conv.sh conv2d_bwd_data 2 1 1 1 $verify 2 0 5 128 | tee -a $bwd_conv_log
|
||||
./profile_conv.sh conv2d_bwd_data 3 1 1 1 $verify 2 0 5 128 | tee -a $bwd_conv_log
|
||||
|
||||
#run fusion tests
|
||||
export fusion_log="perf_fusion.log"
|
||||
print_log_header $fusion_log $env_type
|
||||
./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 0 0 2 0 5 | tee -a $fusion_log
|
||||
./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 1 0 2 0 5 | tee -a $fusion_log
|
||||
./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 2 0 2 0 5 | tee -a $fusion_log
|
||||
./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 3 0 2 0 5 | tee -a $fusion_log
|
||||
python3 process_perf_data.py $fusion_log
|
||||
export fusion_log="perf_fusion_${gpu_arch}.log"
|
||||
print_log_header $fusion_log $env_type $branch $host_name
|
||||
./profile_gemm_bilinear.sh gemm_bilinear 1 0 $verify 2 0 1 | tee -a $fusion_log
|
||||
./profile_gemm_bilinear.sh gemm_bilinear 1 1 $verify 2 0 1 | tee -a $fusion_log
|
||||
./profile_gemm_bilinear.sh gemm_bilinear 1 2 $verify 2 0 1 | tee -a $fusion_log
|
||||
./profile_gemm_bilinear.sh gemm_bilinear 1 3 $verify 2 0 1 | tee -a $fusion_log
|
||||
|
||||
#run reduction tests
|
||||
export reduction_log="perf_reduction.log"
|
||||
print_log_header $reduction_log $env_type
|
||||
./profile_reduce_with_index.sh 0 2 10 --half | tee -a $reduction_log
|
||||
./profile_reduce_no_index.sh 0 2 10 --half | tee -a $reduction_log
|
||||
python3 process_perf_data.py $reduction_log
|
||||
export reduction_log="perf_reduction_${gpu_arch}.log"
|
||||
print_log_header $reduction_log $env_type $branch $host_name
|
||||
./profile_reduce_with_index.sh $verify 2 10 --half | tee -a $reduction_log
|
||||
./profile_reduce_no_index.sh $verify 2 10 --half | tee -a $reduction_log
|
||||
|
||||
@@ -1,59 +1,62 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
|
||||
# and make sure the following python packages are installed in your environment:
|
||||
# run the script as "./run_performance_tests.sh <verification> <tag for your test environment> <gpu_arch> <branch name> <node name>"
|
||||
# input arguments:
|
||||
# verification = 0 : do not verify result correctness on CPU
|
||||
# = 1 : verify correctness on CPU (may take a long time)
|
||||
# environment tag : a string describing the specifics of your test environment
|
||||
# gpu_arch : a string for GPU architecture, e.g. "gfx908" or "gfx90a".
|
||||
# branch name : name of the branch in git repo (git status | grep -e 'On branch')
|
||||
# node name : $hostname
|
||||
|
||||
pip3 install --upgrade pip
|
||||
pip3 install sqlalchemy pymysql pandas sshtunnel
|
||||
|
||||
# you would also need to set up some environment variables in order to
|
||||
# post your new test results to the database and compare them to the baseline
|
||||
# please contact Illia.Silin@amd.com for more details
|
||||
#
|
||||
# run the script as "./run_performance_tests.sh <tag for your test environment>"
|
||||
|
||||
#get the test environment type:
|
||||
export env_type=$1
|
||||
echo 'Environment type ' $env_type
|
||||
#get the command line arguments:
|
||||
export verify=$1
|
||||
echo 'Verification: ' $verify
|
||||
export env_type=$2
|
||||
echo 'Environment type: ' $env_type
|
||||
export gpu_arch=$3
|
||||
echo 'GPU architecture: ' $gpu_arch
|
||||
export branch=$4
|
||||
echo 'Branch name: ' $branch
|
||||
export host_name=$5
|
||||
echo 'Host name: ' $host_name
|
||||
|
||||
function print_log_header(){
|
||||
rm -f $1;
|
||||
git status | grep -e 'On branch' > $1;
|
||||
echo -n 'Node name: ' >>$1; hostname >> $1;
|
||||
echo 'On branch ' $3 &> $1;
|
||||
echo 'Node name: ' $4 >> $1;
|
||||
#get GPU_arch and number of compute units from rocminfo
|
||||
echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1;
|
||||
rocminfo | grep "Compute Unit:" >> $1;
|
||||
hipcc --version | grep -e 'HIP version' >> $1;
|
||||
echo 'Environment type: ' $2 >>$1;
|
||||
echo 'Environment type: ' $2 >> $1;
|
||||
/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1;
|
||||
}
|
||||
#run gemm tests
|
||||
export gemm_log="perf_gemm.log"
|
||||
print_log_header $gemm_log $env_type
|
||||
./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a $gemm_log
|
||||
python3 process_perf_data.py $gemm_log
|
||||
export gemm_log="perf_gemm_${gpu_arch}.log"
|
||||
print_log_header $gemm_log $env_type $branch $host_name
|
||||
./profile_gemm.sh gemm 0 0 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 1 0 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 2 0 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 3 0 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 0 1 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 1 1 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 2 1 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 3 1 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 0 2 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 1 2 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 2 2 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 3 2 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 0 3 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 1 3 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 2 3 $verify 1 0 5 | tee -a $gemm_log
|
||||
./profile_gemm.sh gemm 3 3 $verify 1 0 5 | tee -a $gemm_log
|
||||
|
||||
#run resnet50 test
|
||||
export resnet256_log="perf_resnet50_N256.log"
|
||||
print_log_header $resnet256_log $env_type
|
||||
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a $resnet256_log
|
||||
python3 process_perf_data.py $resnet256_log
|
||||
export resnet4_log="perf_resnet50_N4.log"
|
||||
print_log_header $resnet4_log $env_type
|
||||
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a $resnet4_log
|
||||
python3 process_perf_data.py $resnet4_log
|
||||
export resnet256_log="perf_resnet50_N256_${gpu_arch}.log"
|
||||
print_log_header $resnet256_log $env_type $branch $host_name
|
||||
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 2 0 1 256 | tee -a $resnet256_log
|
||||
export resnet4_log="perf_resnet50_N4_${gpu_arch}.log"
|
||||
print_log_header $resnet4_log $env_type $branch $host_name
|
||||
./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 2 0 1 4 | tee -a $resnet4_log
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
add_test_executable(test_conv2d_bwd_weight conv2d_bwd_weight.cpp)
|
||||
target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor device_conv2d_bwd_weight_instance conv_util)
|
||||
#add_test_executable(test_conv2d_bwd_weight conv2d_bwd_weight.cpp)
|
||||
#target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor device_conv2d_bwd_weight_instance conv_util)
|
||||
|
||||
Reference in New Issue
Block a user