diff --git a/Jenkinsfile b/Jenkinsfile index 74b06cdba3..c8137d9328 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -11,6 +11,12 @@ def show_node_info() { """ } +def runShell(String command){ + def responseCode = sh returnStatus: true, script: "${command} &> tmp.txt" + def output = readFile(file: "tmp.txt") + return (output != "") +} + def cmake_build(Map conf=[:]){ def compiler = conf.get("compiler","/opt/rocm/bin/hipcc") @@ -60,7 +66,7 @@ def cmake_build(Map conf=[:]){ """ def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ") // reduce parallelism when compiling, clang uses too much memory - def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(( \$(nproc) / 1 )) ${config_targets}") + def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(( \$(nproc) / 2 )) ${config_targets}") def execute_cmd = conf.get("execute_cmd", "") def cmd = conf.get("cmd", """ @@ -113,7 +119,14 @@ def buildHipClangJob(Map conf=[:]){ retimage = docker.build("${image}", dockerArgs + '.') withDockerContainer(image: image, args: dockerOpts) { timeout(time: 5, unit: 'MINUTES'){ - sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo' + sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log' + if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){ + echo "GPU not found" + throw e + } + else{ + echo "GPU is OK" + } } } } @@ -125,7 +138,14 @@ def buildHipClangJob(Map conf=[:]){ retimage = docker.build("${image}", dockerArgs + " --no-cache .") withDockerContainer(image: image, args: dockerOpts) { timeout(time: 5, unit: 'MINUTES'){ - sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo' + sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo |tee clinfo.log' + if ( runShell('grep -n "Number of devices:.*. 
0" clinfo.log') ){ + echo "GPU not found" + throw e + } + else{ + echo "GPU is OK" + } } } } @@ -133,7 +153,14 @@ def buildHipClangJob(Map conf=[:]){ withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { timeout(time: 5, unit: 'HOURS') { - sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo' + sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log' + if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){ + echo "GPU not found" + throw e + } + else{ + echo "GPU is OK" + } cmake_build(conf) } } @@ -145,7 +172,6 @@ def reboot(){ build job: 'reboot-slaves', propagate: false , parameters: [string(name: 'server', value: "${env.NODE_NAME}"),] } - def buildHipClangJobAndReboot(Map conf=[:]){ try{ buildHipClangJob(conf) @@ -162,7 +188,6 @@ def buildHipClangJobAndReboot(Map conf=[:]){ } } - def runCKProfiler(Map conf=[:]){ show_node_info() @@ -189,7 +214,6 @@ def runCKProfiler(Map conf=[:]){ } def variant = env.STAGE_NAME - def retimage gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') { @@ -197,7 +221,14 @@ def runCKProfiler(Map conf=[:]){ retimage = docker.build("${image}", dockerArgs + '.') withDockerContainer(image: image, args: dockerOpts) { timeout(time: 5, unit: 'MINUTES'){ - sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo' + sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log' + if ( runShell('grep -n "Number of devices:.*. 
0" clinfo.log') ){ + echo "GPU not found" + throw e + } + else{ + echo "GPU is OK" + } } } } @@ -209,89 +240,69 @@ def runCKProfiler(Map conf=[:]){ retimage = docker.build("${image}", dockerArgs + " --no-cache .") withDockerContainer(image: image, args: dockerOpts) { timeout(time: 5, unit: 'MINUTES'){ - sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo' + sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log' + if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){ + echo "GPU not found" + throw e + } + else{ + echo "GPU is OK" + } } } } withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { - timeout(time: 5, unit: 'HOURS') + timeout(time: 24, unit: 'HOURS') { cmake_build(conf) dir("script"){ - //run gemm performance tests - def gemm_log = "perf_gemm_${gpu_arch}.log" - sh "rm -f ${gemm_log}" - sh "echo Branch name: ${env.BRANCH_NAME} > ${gemm_log}" - sh "echo Node name: ${NODE_NAME} >> ${gemm_log}" - sh "echo GPU_arch name: ${gpu_arch} >> ${gemm_log}" - sh "rocminfo | grep 'Compute Unit:' >> ${gemm_log} " - sh "hipcc --version | grep -e 'HIP version' >> ${gemm_log}" - if (params.USE_9110){ - sh "echo Environment type: CI_9110 >> ${gemm_log}" + if (params.RUN_FULL_QA){ + def qa_log = "qa_${gpu_arch}.log" + if (params.USE_9110){ + sh "./run_full_performance_tests.sh 1 QA_9110 ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}" + } + else{ + sh "./run_full_performance_tests.sh 1 QA_release ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}" + } + archiveArtifacts "perf_gemm_${gpu_arch}.log" + archiveArtifacts "perf_resnet50_N256_${gpu_arch}.log" + archiveArtifacts "perf_resnet50_N4_${gpu_arch}.log" + archiveArtifacts "perf_bathced_gemm_${gpu_arch}.log" + archiveArtifacts "perf_grouped_gemm_${gpu_arch}.log" + archiveArtifacts "perf_fwd_conv_${gpu_arch}.log" + archiveArtifacts "perf_bwd_conv_${gpu_arch}.log" + archiveArtifacts "perf_fusion_${gpu_arch}.log" + archiveArtifacts 
"perf_reduction_${gpu_arch}.log" + // stash perf files to master + stash name: "perf_gemm_${gpu_arch}.log" + stash name: "perf_resnet50_N256_${gpu_arch}.log" + stash name: "perf_resnet50_N4_${gpu_arch}.log" + stash name: "perf_bathced_gemm_${gpu_arch}.log" + stash name: "perf_grouped_gemm_${gpu_arch}.log" + stash name: "perf_fwd_conv_${gpu_arch}.log" + stash name: "perf_bwd_conv_${gpu_arch}.log" + stash name: "perf_fusion_${gpu_arch}.log" + stash name: "perf_reduction_${gpu_arch}.log" + //we will process results on the master node } else{ - sh "echo Environment type: CI_release >> ${gemm_log}" + if (params.USE_9110){ + sh "./run_performance_tests.sh 0 CI_9110 ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}" + } + else{ + sh "./run_performance_tests.sh 0 CI_release ${gpu_arch} ${env.BRANCH_NAME} ${NODE_NAME}" + } + archiveArtifacts "perf_gemm_${gpu_arch}.log" + archiveArtifacts "perf_resnet50_N256_${gpu_arch}.log" + archiveArtifacts "perf_resnet50_N4_${gpu_arch}.log" + // stash perf files to master + stash name: "perf_gemm_${gpu_arch}.log" + stash name: "perf_resnet50_N256_${gpu_arch}.log" + stash name: "perf_resnet50_N4_${gpu_arch}.log" + //we will process the results on the master node } - sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${gemm_log}" - sh "./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a ${gemm_log}" - sh "./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a ${gemm_log}" - sh "./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a ${gemm_log}" - sh "./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a ${gemm_log}" - sh "./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a ${gemm_log}" - sh "./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a ${gemm_log}" - sh "./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a ${gemm_log}" - sh "./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a ${gemm_log}" - sh "./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a ${gemm_log}" - sh "./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a ${gemm_log}" - sh "./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a ${gemm_log}" - sh 
"./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a ${gemm_log}" - sh "./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a ${gemm_log}" - sh "./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a ${gemm_log}" - sh "./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a ${gemm_log}" - sh "./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a ${gemm_log}" - //results will be parsed, stored, and analyzed within the python script - //the script will return 0 if the performance criteria are met - //or return 1 if the criteria are not met - archiveArtifacts "${gemm_log}" - sh "python3 process_perf_data.py ${gemm_log} " - //run resnet50 test - def resnet256_log = "perf_resnet50_N256_${gpu_arch}.log" - sh "rm -f ${resnet256_log}" - sh "echo Branch name: ${env.BRANCH_NAME} > ${resnet256_log}" - sh "echo Node name: ${NODE_NAME} >> ${resnet256_log}" - sh "echo GPU_arch name: ${gpu_arch} >> ${resnet256_log}" - sh "rocminfo | grep 'Compute Unit:' >> ${resnet256_log} " - sh "hipcc --version | grep -e 'HIP version' >> ${resnet256_log}" - if (params.USE_9110){ - sh "echo Environment type: CI_9110 >> ${resnet256_log}" - } - else{ - sh "echo Environment type: CI_release >> ${resnet256_log}" - } - sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet256_log}" - //first run tests with N=256 - sh "./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a ${resnet256_log}" - archiveArtifacts "${resnet256_log}" - sh "python3 process_perf_data.py ${resnet256_log} " - //then run with N=4 - def resnet4_log = "perf_resnet50_N4_${gpu_arch}.log" - sh "rm -f ${resnet4_log}" - sh "echo Branch name: ${env.BRANCH_NAME} > ${resnet4_log}" - sh "echo Node name: ${NODE_NAME} >> ${resnet4_log}" - sh "echo GPU_arch name: ${gpu_arch} >> ${resnet4_log}" - sh "rocminfo | grep 'Compute Unit:' >> ${resnet4_log} " - sh "hipcc --version | grep -e 'HIP version' >> ${resnet4_log}" - if (params.USE_9110){ - sh "echo Environment type: CI_9110 >> ${resnet4_log}" - } - else{ - sh "echo Environment type: CI_release >> 
${resnet4_log}" - } - sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet4_log}" - sh "./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a ${resnet4_log}" - archiveArtifacts "${resnet4_log}" - sh "python3 process_perf_data.py ${resnet4_log} " + } } } @@ -299,7 +310,6 @@ def runCKProfiler(Map conf=[:]){ return retimage } - def runPerfTest(Map conf=[:]){ try{ runCKProfiler(conf) @@ -316,8 +326,76 @@ def runPerfTest(Map conf=[:]){ } } +def process_results(Map conf=[:]){ + env.HSA_ENABLE_SDMA=0 + checkout scm + def image = "composable_kernels" + def prefixpath = "/opt/rocm" + def gpu_arch = conf.get("gpu_arch", "gfx908") + + // Jenkins is complaining about the render group + def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined" + if (conf.get("enforce_xnack_on", false)) { + dockerOpts = dockerOpts + " --env HSA_XNACK=1" + } + def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg compiler_version='release' " + + def variant = env.STAGE_NAME + def retimage + + gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') { + try { + retimage = docker.build("${image}", dockerArgs + '.') + } + catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){ + echo "The job was cancelled or aborted" + throw e + } + } + + withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { + timeout(time: 1, unit: 'HOURS'){ + try{ + dir("script"){ + if (params.RUN_FULL_QA){ + // unstash perf files to master + unstash "perf_gemm_${gpu_arch}.log" + unstash "perf_resnet50_N256_${gpu_arch}.log" + unstash "perf_resnet50_N4_${gpu_arch}.log" + unstash "perf_bathced_gemm_${gpu_arch}.log" + unstash "perf_grouped_gemm_${gpu_arch}.log" + unstash "perf_fwd_conv_${gpu_arch}.log" + unstash "perf_bwd_conv_${gpu_arch}.log" + unstash "perf_fusion_${gpu_arch}.log" + 
unstash "perf_reduction_${gpu_arch}.log" + sh "./process_qa_data.sh ${gpu_arch}" + } + else{ + // unstash perf files to master + unstash "perf_gemm_${gpu_arch}.log" + unstash "perf_resnet50_N256_${gpu_arch}.log" + unstash "perf_resnet50_N4_${gpu_arch}.log" + sh "./process_perf_data.sh ${gpu_arch}" + } + } + } + catch(e){ + echo "throwing error exception while processing performance test results" + echo 'Exception occurred: ' + e.toString() + throw e + } + } + } +} + +//launch develop branch daily at 23:00 in FULL_QA mode +CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;USE_9110=true''' : "" + pipeline { agent none + triggers { + cron(CRON_SETTINGS) + } options { parallelsAlwaysFailFast() } @@ -325,7 +403,11 @@ pipeline { booleanParam( name: "USE_9110", defaultValue: true, - description: "") + description: "Select compiler version: 9110 (default) or release") + booleanParam( + name: "RUN_FULL_QA", + defaultValue: false, + description: "Select whether to run small set of performance tests (default) or full QA") } environment{ dbuser = "${dbuser}" @@ -438,6 +520,25 @@ pipeline { } } } + stage("Process Performance Test Results") + { + parallel + { + stage("Process results for gfx908"){ + agent { label 'mici' } + steps{ + process_results(gpu_arch: "gfx908") + } + } + stage("Process results for gfx90a"){ + agent { label 'mici' } + steps{ + process_results(gpu_arch: "gfx90a") + } + } + } + } + /* enable after the cmake file supports packaging stage("Packages") { when { diff --git a/library/include/ck/library/host_tensor/host_tensor.hpp b/library/include/ck/library/host_tensor/host_tensor.hpp index 1bef9dace0..caa18e6dd1 100644 --- a/library/include/ck/library/host_tensor/host_tensor.hpp +++ b/library/include/ck/library/host_tensor/host_tensor.hpp @@ -381,52 +381,3 @@ HostTensorDescriptor::HostTensorDescriptor(const std::vector& lens, : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end()) { } - -#if 1 -// FIXME: remove 
-template -float check_error(const Tensor& ref, const Tensor& result) -{ - float l1_error = 0; - float linf_error = -1; - float linf_rel_error = -1; - - float linf_ref_value = 0, linf_result_value = 0; - float linf_rel_ref_value = 0, linf_rel_result_value = 0; - - constexpr float eps = 1e-10; - - for(std::size_t i = 0; i < ref.mData.size(); ++i) - { - float ref_v = ck::type_convert(ref.mData[i]); - float result_v = ck::type_convert(result.mData[i]); - - float diff = std::abs(ref_v - result_v); - float rel_diff = diff / std::max(std::abs(ref_v), eps); - - l1_error += diff; - - if(linf_error < diff) - { - linf_error = diff; - linf_ref_value = ref_v; - linf_result_value = result_v; - } - - if(linf_rel_error < rel_diff) - { - linf_rel_error = rel_diff; - linf_rel_ref_value = ref_v; - linf_rel_result_value = result_v; - } - } - - std::cout << "Absolute Error L1 Norm (sum of abs diff): " << l1_error << std::endl; - std::cout << "Absolute Error L-inf Norm (max abs diff): " << linf_error << ", ref " - << linf_ref_value << ", result " << linf_result_value << std::endl; - std::cout << "Relative Error L-inf Norm (max relative abs diff): " << linf_rel_error << ", ref " - << linf_rel_ref_value << ", result " << linf_rel_result_value << std::endl; - - return linf_error; -} -#endif diff --git a/library/include/ck/library/utility/check_err.hpp b/library/include/ck/library/utility/check_err.hpp index 0b82ba4357..fef0d8e033 100644 --- a/library/include/ck/library/utility/check_err.hpp +++ b/library/include/ck/library/utility/check_err.hpp @@ -29,9 +29,8 @@ check_err(const std::vector& out, { if(out.size() != ref.size()) { - std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size() - << std::endl - << msg << std::endl; + std::cout << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size() + << std::endl; return false; } @@ -48,9 +47,8 @@ check_err(const std::vector& out, err_count++; if(err_count < 5) { - std::cout << std::setw(12) << 
std::setprecision(7) << "out[" << i << "] != ref[" - << i << "]: " << out[i] << " != " << ref[i] << std::endl - << msg << std::endl; + std::cout << msg << std::setw(12) << std::setprecision(7) << " out[" << i + << "] != ref[" << i << "]: " << out[i] << " != " << ref[i] << std::endl; } res = false; } @@ -72,9 +70,8 @@ check_err(const std::vector& out, { if(out.size() != ref.size()) { - std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size() - << std::endl - << msg << std::endl; + std::cout << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size() + << std::endl; return false; } @@ -94,9 +91,8 @@ check_err(const std::vector& out, err_count++; if(err_count < 5) { - std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref[" - << i << "]: " << o << " != " << r << std::endl - << msg << std::endl; + std::cout << msg << std::setw(12) << std::setprecision(7) << " out[" << i + << "] != ref[" << i << "]: " << o << " != " << r << std::endl; } res = false; } @@ -118,9 +114,8 @@ check_err(const std::vector& out, { if(out.size() != ref.size()) { - std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size() - << std::endl - << msg << std::endl; + std::cout << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size() + << std::endl; return false; } @@ -139,9 +134,8 @@ check_err(const std::vector& out, err_count++; if(err_count < 5) { - std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref[" - << i << "]: " << o << " != " << r << std::endl - << msg << std::endl; + std::cout << msg << std::setw(12) << std::setprecision(7) << " out[" << i + << "] != ref[" << i << "]: " << o << " != " << r << std::endl; } res = false; } @@ -163,9 +157,8 @@ check_err(const std::vector& out, { if(out.size() != ref.size()) { - std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size() - << std::endl - << msg << std::endl; + std::cout << msg << " 
out.size() != ref.size(), :" << out.size() << " != " << ref.size() + << std::endl; return false; } @@ -185,9 +178,9 @@ check_err(const std::vector& out, err_count++; if(err_count < 5) { - std::cout << "out[" << i << "] != ref[" << i << "]: " << static_cast(out[i]) - << " != " << static_cast(ref[i]) << std::endl - << msg << std::endl; + std::cout << msg << " out[" << i << "] != ref[" << i + << "]: " << static_cast(out[i]) << " != " << static_cast(ref[i]) + << std::endl; } res = false; } diff --git a/profiler/include/profile_batched_gemm_reduce_impl.hpp b/profiler/include/profile_batched_gemm_reduce_impl.hpp index b7dc979577..d1a989348a 100644 --- a/profiler/include/profile_batched_gemm_reduce_impl.hpp +++ b/profiler/include/profile_batched_gemm_reduce_impl.hpp @@ -318,13 +318,16 @@ bool profile_batched_gemm_reduce_impl(int do_verification, reduce0_device_buf.FromDevice(d0_g_m_device_result.mData.data()); reduce1_device_buf.FromDevice(d1_g_m_device_result.mData.data()); - float c_error = check_error(c_g_m_n_host_result, c_g_m_n_device_result); - float d0_error = check_error(d0_g_m_host_result, d0_g_m_device_result); - float d1_error = check_error(d1_g_m_host_result, d1_g_m_device_result); + bool c_error = + ck::utils::check_err(c_g_m_n_host_result.mData, c_g_m_n_device_result.mData); + bool d0_error = + ck::utils::check_err(d0_g_m_host_result.mData, d0_g_m_device_result.mData); + bool d1_error = + ck::utils::check_err(d1_g_m_host_result.mData, d1_g_m_device_result.mData); - pass = pass && (c_error < 1E-6); - pass = pass && (d0_error < 1E-6); - pass = pass && (d1_error < 1E-6); + pass = pass && (c_error == true); + pass = pass && (d0_error == true); + pass = pass && (d1_error == true); if(do_log) { diff --git a/profiler/include/profile_conv_bwd_weight_impl.hpp b/profiler/include/profile_conv_bwd_weight_impl.hpp index 9820d978fd..c677eb3538 100644 --- a/profiler/include/profile_conv_bwd_weight_impl.hpp +++ b/profiler/include/profile_conv_bwd_weight_impl.hpp @@ -250,11 
+250,11 @@ bool profile_conv_bwd_weight_impl(int do_verification, { wei_device_buf.FromDevice(wei_k_c_y_x_device_result.mData.data()); - float max_error = check_error(wei_k_c_y_x_host_result, wei_k_c_y_x_device_result); + pass = ck::utils::check_err(wei_k_c_y_x_host_result.mData, + wei_k_c_y_x_device_result.mData); - if(max_error > 8) + if(pass == false) { - pass = false; std::cout << "Fail info:" << conv_ptr->GetTypeString() << std::endl; } diff --git a/profiler/include/profile_convnd_bwd_data_impl.hpp b/profiler/include/profile_convnd_bwd_data_impl.hpp index 676e619b49..cf9ae8dff1 100644 --- a/profiler/include/profile_convnd_bwd_data_impl.hpp +++ b/profiler/include/profile_convnd_bwd_data_impl.hpp @@ -8,6 +8,7 @@ #include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/conv_util.hpp" #include "ck/library/host_tensor/device_memory.hpp" #include "ck/library/host_tensor/host_tensor.hpp" @@ -452,7 +453,7 @@ bool profile_convnd_bwd_data_impl(int do_verification, std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl; } - check_error(input_host_result, input_device_result); + success = ck::utils::check_err(input_host_result.mData, input_device_result.mData); if(do_log) { diff --git a/profiler/include/profile_convnd_bwd_weight_impl.hpp b/profiler/include/profile_convnd_bwd_weight_impl.hpp index c32abd96b3..8a6897a994 100644 --- a/profiler/include/profile_convnd_bwd_weight_impl.hpp +++ b/profiler/include/profile_convnd_bwd_weight_impl.hpp @@ -433,21 +433,17 @@ bool profile_convnd_bwd_weight_impl(int do_verification, { wei_device_buf.FromDevice(weights_device_result.mData.data()); - float max_error = check_error(weights_host_result, weights_device_result); + success = ck::utils::check_err(weights_host_result.mData, weights_device_result.mData); - if(max_error > 8) + if(success == false) { std::cout << "Fail 
Info: " << conv_ptr->GetTypeString() << std::endl; - - success = false; } else { std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl; } - check_error(weights_host_result, weights_device_result); - if(do_log) { std::cout << "in : "; diff --git a/script/clang-format-overwrite.sh b/script/clang-format-overwrite.sh old mode 100644 new mode 100755 diff --git a/script/process_perf_data.py b/script/process_perf_data.py index fc01dd5934..822601e3a0 100644 --- a/script/process_perf_data.py +++ b/script/process_perf_data.py @@ -85,7 +85,6 @@ def parse_logfile(logfile): for line in open(logfile): if 'Best Perf' in line: lst=line.split() - print("len(lst)=",len(lst),"lst:",lst) if len(lst)>=37: #the line is complete tests.append(glue.join(lst[5:30])) kernels.append(glue.join(lst[37:])) @@ -293,4 +292,4 @@ def main(): return regression if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/script/process_perf_data.sh b/script/process_perf_data.sh new file mode 100755 index 0000000000..412f87d0e3 --- /dev/null +++ b/script/process_perf_data.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# +# in order to run this script you'd need the following python packages: + +pip3 install --upgrade pip +pip3 install sqlalchemy pymysql pandas sshtunnel + +# you would also need to set up some environment variables in order to +# post your new test results to the database and compare them to the baseline +# please contact Illia.Silin@amd.com for more details + +#process results +gpu_arch=$1 +python3 process_perf_data.py perf_gemm_"$gpu_arch".log +python3 process_perf_data.py perf_resnet50_N256_"$gpu_arch".log +python3 process_perf_data.py perf_resnet50_N4_"$gpu_arch".log \ No newline at end of file diff --git a/script/process_qa_data.sh b/script/process_qa_data.sh new file mode 100755 index 0000000000..e5947933d1 --- /dev/null +++ b/script/process_qa_data.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# +# in order to run this script you'd need the following python
packages: + +pip3 install --upgrade pip +pip3 install sqlalchemy pymysql pandas sshtunnel + +# you would also need to set up some environment variables in order to +# post your new test results to the database and compare them to the baseline +# please contact Illia.Silin@amd.com for more details + +#process results +gpu_arch=$1 +python3 process_perf_data.py perf_gemm_"$gpu_arch".log +python3 process_perf_data.py perf_resnet50_N256_"$gpu_arch".log +python3 process_perf_data.py perf_resnet50_N4_"$gpu_arch".log +python3 process_perf_data.py perf_batched_gemm_"$gpu_arch".log +python3 process_perf_data.py perf_grouped_gemm_"$gpu_arch".log +python3 process_perf_data.py perf_fwd_conv_"$gpu_arch".log +python3 process_perf_data.py perf_bwd_conv_"$gpu_arch".log +python3 process_perf_data.py perf_fusion_"$gpu_arch".log +python3 process_perf_data.py perf_reduction_"$gpu_arch".log \ No newline at end of file diff --git a/script/profile_batched_gemm.sh b/script/profile_batched_gemm.sh index eea4417dbf..ca34e03e14 100755 --- a/script/profile_batched_gemm.sh +++ b/script/profile_batched_gemm.sh @@ -11,26 +11,34 @@ INIT=$5 LOG=$6 REPEAT=$7 -######## op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchCount -$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 960 1024 1024 -1 -1 -1 8 -$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1920 2048 2048 -1 -1 -1 8 -$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3840 4096 4096 -1 -1 -1 4 -$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 7680 8192 8192 -1 -1 -1 2 +OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +REPEAT=$7 -####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchCount -$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1024 1024 1024 8 -$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2048 2048 2048 8 -$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096
4096 4096 4096 4096 4 -$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8192 8192 8192 2 +######## op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 960 1024 1024 -1 -1 -1 -1 -1 -1 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1920 2048 2048 -1 -1 -1 -1 -1 -1 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3840 4096 4096 -1 -1 -1 -1 -1 -1 4 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 7680 8192 8192 -1 -1 -1 -1 -1 -1 2 -####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchCount -$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1056 1056 1056 8 -$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2080 2080 2080 8 -$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4128 4128 4128 4 -$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8224 8224 8224 2 + ####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1024 1024 1024 -1 -1 -1 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2048 2048 2048 -1 -1 -1 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4096 4096 4096 -1 -1 -1 4 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8192 8192 8192 -1 -1 -1 2 -####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchCount -$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1088 1088 1088 8 -$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2112 2112 2112 8 -$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4160 4160 4160 
4 -$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8256 8256 8256 2 \ No newline at end of file + ####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1056 1056 1056 -1 -1 -1 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2080 2080 2080 -1 -1 -1 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4128 4128 4128 -1 -1 -1 4 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8224 8224 8224 -1 -1 -1 2 + + ####### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1088 1088 1088 -1 -1 -1 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2112 2112 2112 -1 -1 -1 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4160 4160 4160 -1 -1 -1 4 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8256 8256 8256 -1 -1 -1 2 \ No newline at end of file diff --git a/script/profile_gemm_bilinear.sh b/script/profile_gemm_bilinear.sh new file mode 100755 index 0000000000..e6edefae85 --- /dev/null +++ b/script/profile_gemm_bilinear.sh @@ -0,0 +1,41 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +TIME=$7 + +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideD StrideE Alpha Beta + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 960 1024 1024 -1 -1 -1 -1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1920 2048 2048 -1 -1 -1 -1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 3840 4096 4096 -1 -1 -1 -1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT 
$VERIFY $INIT $LOG $TIME 7680 8192 8192 -1 -1 -1 -1 1 1 + +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideD StrideE Alpha Beta + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 960 1024 1024 -1 -1 0 -1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1920 2048 2048 -1 -1 0 -1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 3840 4096 4096 -1 -1 0 -1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 7680 8192 8192 -1 -1 0 -1 1 1 + +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideD StrideE Alpha Beta + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1000 1000 1000 -1 -1 0 -1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2000 2000 2000 -1 -1 0 -1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4000 4000 4000 -1 -1 0 -1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8000 8000 8000 -1 -1 0 -1 1 1 + +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideD StrideE Alpha Beta + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024 1024 1024 1056 1056 1056 1056 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2080 2080 2080 2080 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4096 4096 4096 4128 4128 4128 4128 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8192 8192 8192 8224 8224 8224 8224 1 1 + +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideD StrideE Alpha Beta + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024 1024 1024 1088 1088 1088 1088 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2112 2112 2112 2112 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4096 4096 4096 4160 4160 4160 4160 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8192 8192 8192 8256 8256 8256 8256 1 1 \ No newline at end of file diff --git 
a/script/run_full_performance_tests.sh b/script/run_full_performance_tests.sh index e4cdab558e..bfb90b0a62 100755 --- a/script/run_full_performance_tests.sh +++ b/script/run_full_performance_tests.sh @@ -1,124 +1,124 @@ #!/bin/bash # # in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/ -# and make sure the following python packages are installed in your environment: - -pip3 install --upgrade pip -pip3 install sqlalchemy pymysql pandas sshtunnel - # you would also need to set up some environment variables in order to # post your new test results to the database and compare them to the baseline # please contact Illia.Silin@amd.com for more details # -# run the script as "./run_full_performance_tests.sh - -#get the test environment type: -export env_type=$1 -echo 'Environment type ' $env_type +# run the script as "./run_full_performance_tests.sh < node name> +# input arguments: +# verification = 0 : do not verify result correctness on CPU +# = 1 : verify correctness on CPU (may take a long time) +# environment tag : a string describing the specifics of your test environment +# gpu_arch : a string for GPU architecture, e.g. "gfx908" or "gfx90a".
+# branch name : name of the branch in git repo (git status | grep -e 'On branch') +# node name : $hostname +#get the command line arguments: +export verify=$1 +echo 'Verification: ' $verify +export env_type=$2 +echo 'Environment type: ' $env_type +export gpu_arch=$3 +echo 'GPU architecture: ' $gpu_arch +export branch=$4 +echo 'Branch name: ' $branch +export host_name=$5 +echo 'Host name: ' $host_name function print_log_header(){ rm -f $1; - git status | grep -e 'On branch' > $1; - echo -n 'Node name: ' >>$1; hostname >> $1; + echo 'On branch ' $3 &> $1; + echo 'Node name: ' $4 >> $1; #get GPU_arch and number of compute units from rocminfo echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1; rocminfo | grep "Compute Unit:" >> $1; hipcc --version | grep -e 'HIP version' >> $1; - echo 'Environment type: ' $2 >>$1; + echo 'Environment type: ' $2 >> $1; /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1; } #run gemm tests -export gemm_log="perf_gemm.log" -print_log_header $gemm_log $env_type -./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a $gemm_log -python3 process_perf_data.py $gemm_log +export 
gemm_log="perf_gemm_${gpu_arch}.log" +print_log_header $gemm_log $env_type $branch $host_name +./profile_gemm.sh gemm 0 0 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 1 0 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 2 0 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 3 0 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 0 1 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 1 1 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 2 1 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 3 1 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 0 2 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 1 2 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 2 2 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 3 2 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 0 3 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 1 3 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 2 3 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 3 3 $verify 1 0 5 | tee -a $gemm_log #run resnet50 tests -export resnet256_log="perf_resnet50_N256.log" -print_log_header $resnet256_log $env_type -./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a $resnet256_log -python3 process_perf_data.py $resnet256_log -export resnet4_log="perf_resnet50_N4.log" -print_log_header $resnet4_log $env_type -./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a $resnet4_log -python3 process_perf_data.py $resnet4_log +export resnet256_log="perf_resnet50_N256_${gpu_arch}.log" +print_log_header $resnet256_log $env_type $branch $host_name +./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 2 0 1 256 | tee -a $resnet256_log +export resnet4_log="perf_resnet50_N4_${gpu_arch}.log" +print_log_header $resnet4_log $env_type $branch $host_name +./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 2 0 1 4 | tee -a $resnet4_log #run batched_gemm tests -export 
batched_gemm_log="perf_batched_gemm.log" -print_log_header $batched_gemm_log $env_type -./profile_batched_gemm.sh batched_gemm 0 0 0 2 0 5 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 0 1 0 2 0 5 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 0 2 0 2 0 5 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 0 3 0 2 0 5 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 1 0 0 2 0 5 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 1 1 0 2 0 5 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 1 2 0 2 0 5 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 1 3 0 2 0 5 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 2 0 0 2 0 5 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 2 1 0 2 0 5 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 2 2 0 2 0 5 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 2 3 0 2 0 5 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 3 0 0 2 0 5 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 3 1 0 2 0 5 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 3 2 0 2 0 5 | tee -a $batched_gemm_log -./profile_batched_gemm.sh batched_gemm 3 3 0 2 0 5 | tee -a $batched_gemm_log -python3 process_perf_data.py $batched_gemm_log +export batched_gemm_log="perf_batched_gemm_${gpu_arch}.log" +print_log_header $batched_gemm_log $env_type $branch $host_name +./profile_batched_gemm.sh batched_gemm 0 0 $verify 2 0 5 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 0 1 $verify 2 0 5 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 0 2 $verify 2 0 5 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 0 3 $verify 2 0 5 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 1 0 $verify 2 0 5 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 1 1 
$verify 2 0 5 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 1 2 $verify 2 0 5 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 1 3 $verify 2 0 5 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 2 0 $verify 2 0 5 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 2 1 $verify 2 0 5 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 2 2 $verify 2 0 5 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 2 3 $verify 2 0 5 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 3 0 $verify 2 0 5 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 3 1 $verify 2 0 5 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 3 2 $verify 2 0 5 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 3 3 $verify 2 0 5 | tee -a $batched_gemm_log #run grouped_gemm tests -export grouped_gemm_log="perf_grouped_gemm.log" -print_log_header $grouped_gemm_log $env_type -./profile_grouped_gemm.sh grouped_gemm 1 0 0 2 0 5 | tee -a $grouped_gemm_log -./profile_grouped_gemm.sh grouped_gemm 1 1 0 2 0 5 | tee -a $grouped_gemm_log -./profile_grouped_gemm.sh grouped_gemm 1 2 0 2 0 5 | tee -a $grouped_gemm_log -./profile_grouped_gemm.sh grouped_gemm 1 3 0 2 0 5 | tee -a $grouped_gemm_log -python3 process_perf_data.py $grouped_gemm_log +export grouped_gemm_log="perf_grouped_gemm_${gpu_arch}.log" +print_log_header $grouped_gemm_log $env_type $branch $host_name +./profile_grouped_gemm.sh grouped_gemm 1 0 $verify 2 0 5 | tee -a $grouped_gemm_log +./profile_grouped_gemm.sh grouped_gemm 1 1 $verify 2 0 5 | tee -a $grouped_gemm_log +./profile_grouped_gemm.sh grouped_gemm 1 2 $verify 2 0 5 | tee -a $grouped_gemm_log +./profile_grouped_gemm.sh grouped_gemm 1 3 $verify 2 0 5 | tee -a $grouped_gemm_log #run fwd_conv tests -export fwd_conv_log="perf_fwd_conv.log" -print_log_header $fwd_conv_log $env_type -./profile_conv.sh conv_fwd 0 1 0 2 0 5 2 256 | tee -a 
$fwd_conv_log -./profile_conv.sh conv_fwd 1 1 0 2 0 5 2 256 | tee -a $fwd_conv_log -./profile_conv.sh conv_fwd 2 1 0 2 0 5 2 256 | tee -a $fwd_conv_log -./profile_conv.sh conv_fwd 3 1 0 2 0 5 2 256 | tee -a $fwd_conv_log -python3 process_perf_data.py $fwd_conv_log +export fwd_conv_log="perf_fwd_conv_${gpu_arch}.log" +print_log_header $fwd_conv_log $env_type $branch $host_name +./profile_conv.sh conv_fwd 0 1 $verify 2 0 5 2 256 | tee -a $fwd_conv_log +./profile_conv.sh conv_fwd 1 1 $verify 2 0 5 2 256 | tee -a $fwd_conv_log +./profile_conv.sh conv_fwd 2 1 $verify 2 0 5 2 256 | tee -a $fwd_conv_log +./profile_conv.sh conv_fwd 3 1 $verify 2 0 5 2 256 | tee -a $fwd_conv_log #run bwd_conv tests -export bwd_conv_log="perf_bwd_conv.log" -print_log_header $bwd_conv_log $env_type -./profile_conv.sh conv2d_bwd_data 0 1 1 1 0 2 0 5 128 | tee -a $bwd_conv_log -./profile_conv.sh conv2d_bwd_data 1 1 1 1 0 2 0 5 128 | tee -a $bwd_conv_log -./profile_conv.sh conv2d_bwd_data 2 1 1 1 0 2 0 5 128 | tee -a $bwd_conv_log -./profile_conv.sh conv2d_bwd_data 3 1 1 1 0 2 0 5 128 | tee -a $bwd_conv_log -python3 process_perf_data.py $bwd_conv_log +export bwd_conv_log="perf_bwd_conv_${gpu_arch}.log" +print_log_header $bwd_conv_log $env_type $branch $host_name +./profile_conv.sh conv2d_bwd_data 0 1 1 1 $verify 2 0 5 128 | tee -a $bwd_conv_log +./profile_conv.sh conv2d_bwd_data 1 1 1 1 $verify 2 0 5 128 | tee -a $bwd_conv_log +./profile_conv.sh conv2d_bwd_data 2 1 1 1 $verify 2 0 5 128 | tee -a $bwd_conv_log +./profile_conv.sh conv2d_bwd_data 3 1 1 1 $verify 2 0 5 128 | tee -a $bwd_conv_log #run fusion tests -export fusion_log="perf_fusion.log" -print_log_header $fusion_log $env_type -./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 0 0 2 0 5 | tee -a $fusion_log -./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 1 0 2 0 5 | tee -a $fusion_log -./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 2 0 2 0 5 | tee -a $fusion_log -./profile_gemm_bias_relu_add.sh gemm_bias_relu_add 1 3 0 2 0 
5 | tee -a $fusion_log -python3 process_perf_data.py $fusion_log +export fusion_log="perf_fusion_${gpu_arch}.log" +print_log_header $fusion_log $env_type $branch $host_name +./profile_gemm_bilinear.sh gemm_bilinear 1 0 $verify 2 0 1 | tee -a $fusion_log +./profile_gemm_bilinear.sh gemm_bilinear 1 1 $verify 2 0 1 | tee -a $fusion_log +./profile_gemm_bilinear.sh gemm_bilinear 1 2 $verify 2 0 1 | tee -a $fusion_log +./profile_gemm_bilinear.sh gemm_bilinear 1 3 $verify 2 0 1 | tee -a $fusion_log #run reduction tests -export reduction_log="perf_reduction.log" -print_log_header $reduction_log $env_type -./profile_reduce_with_index.sh 0 2 10 --half | tee -a $reduction_log -./profile_reduce_no_index.sh 0 2 10 --half | tee -a $reduction_log -python3 process_perf_data.py $reduction_log \ No newline at end of file +export reduction_log="perf_reduction_${gpu_arch}.log" +print_log_header $reduction_log $env_type $branch $host_name +./profile_reduce_with_index.sh $verify 2 10 --half | tee -a $reduction_log +./profile_reduce_no_index.sh $verify 2 10 --half | tee -a $reduction_log diff --git a/script/run_performance_tests.sh b/script/run_performance_tests.sh index 857b2ac9b4..2fbe0d8b31 100755 --- a/script/run_performance_tests.sh +++ b/script/run_performance_tests.sh @@ -1,59 +1,62 @@ #!/bin/bash # # in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/ -# and make sure the following python packages are installed in your environment: +# run the script as "./run_performance_tests.sh <verification> <environment tag> <gpu_arch> <branch name> <node name>" +# input arguments: +# verification = 0 : do not verify result correctness on CPU + = 1 : verify correctness on CPU (may take a long time) +# environment tag : a string describing the specifics of your test environment +# gpu_arch : a string for GPU architecture, e.g. "gfx908" or "gfx90a". 
+# branch name : name of the branch in git repo (git status | grep -e 'On branch') +# node name : $hostname -pip3 install --upgrade pip -pip3 install sqlalchemy pymysql pandas sshtunnel - -# you would also need to set up some environment variables in order to -# post your new test results to the database and compare them to the baseline -# please contact Illia.Silin@amd.com for more details -# -# run the script as "./run_performance_tests.sh - -#get the test environment type: -export env_type=$1 -echo 'Environment type ' $env_type +#get the command line arguments: +export verify=$1 +echo 'Verification: ' $verify +export env_type=$2 +echo 'Environment type: ' $env_type +export gpu_arch=$3 +echo 'GPU architecture: ' $gpu_arch +export branch=$4 +echo 'Branch name: ' $branch +export host_name=$5 +echo 'Host name: ' $host_name function print_log_header(){ rm -f $1; - git status | grep -e 'On branch' > $1; - echo -n 'Node name: ' >>$1; hostname >> $1; + echo 'On branch ' $3 &> $1; + echo 'Node name: ' $4 >> $1; #get GPU_arch and number of compute units from rocminfo echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1; rocminfo | grep "Compute Unit:" >> $1; hipcc --version | grep -e 'HIP version' >> $1; - echo 'Environment type: ' $2 >>$1; + echo 'Environment type: ' $2 >> $1; /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1; } #run gemm tests -export gemm_log="perf_gemm.log" -print_log_header $gemm_log $env_type -./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 1 2 0 1 0 5 | 
tee -a $gemm_log -./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a $gemm_log -./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a $gemm_log -python3 process_perf_data.py $gemm_log +export gemm_log="perf_gemm_${gpu_arch}.log" +print_log_header $gemm_log $env_type $branch $host_name +./profile_gemm.sh gemm 0 0 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 1 0 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 2 0 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 3 0 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 0 1 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 1 1 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 2 1 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 3 1 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 0 2 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 1 2 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 2 2 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 3 2 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 0 3 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 1 3 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 2 3 $verify 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 3 3 $verify 1 0 5 | tee -a $gemm_log #run resnet50 test -export resnet256_log="perf_resnet50_N256.log" -print_log_header $resnet256_log $env_type -./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a $resnet256_log -python3 process_perf_data.py $resnet256_log -export resnet4_log="perf_resnet50_N4.log" -print_log_header $resnet4_log $env_type -./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a $resnet4_log -python3 process_perf_data.py $resnet4_log +export resnet256_log="perf_resnet50_N256_${gpu_arch}.log" +print_log_header 
$resnet256_log $env_type $branch $host_name +./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 2 0 1 256 | tee -a $resnet256_log +export resnet4_log="perf_resnet50_N4_${gpu_arch}.log" +print_log_header $resnet4_log $env_type $branch $host_name +./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 2 0 1 4 | tee -a $resnet4_log diff --git a/test/conv2d_bwd_weight/CMakeLists.txt b/test/conv2d_bwd_weight/CMakeLists.txt index e61c9299c8..0acd546830 100644 --- a/test/conv2d_bwd_weight/CMakeLists.txt +++ b/test/conv2d_bwd_weight/CMakeLists.txt @@ -1,2 +1,2 @@ -add_test_executable(test_conv2d_bwd_weight conv2d_bwd_weight.cpp) -target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor device_conv2d_bwd_weight_instance conv_util) +#add_test_executable(test_conv2d_bwd_weight conv2d_bwd_weight.cpp) +#target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor device_conv2d_bwd_weight_instance conv_util)