diff --git a/Jenkinsfile b/Jenkinsfile index 58cd72c8ce..0a98cc5c6d 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -330,10 +330,8 @@ def cmake_build(Map conf=[:]){ try{ archiveArtifacts "perf_fmha_fwd_*.log" archiveArtifacts "perf_fmha_bwd_*.log" - stash name: "perf_fmha_fwd_gfx942.log" - stash name: "perf_fmha_bwd_gfx942.log" - stash name: "perf_fmha_fwd_gfx90a.log" - stash name: "perf_fmha_bwd_gfx90a.log" + stash includes: "perf_fmha_**_gfx942.log", name: "perf_fmha_log_gfx942" + stash includes: "perf_fmha_**_gfx90a.log", name: "perf_fmha_log_gfx90a" } catch(Exception err){ echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing." @@ -408,128 +406,6 @@ def buildHipClangJobAndReboot(Map conf=[:]){ } } -def runCKProfiler(Map conf=[:]){ - show_node_info() - - env.HSA_ENABLE_SDMA=0 - checkout scm - - def image = getDockerImageName() - def prefixpath = conf.get("prefixpath", "/opt/rocm") - - // Jenkins is complaining about the render group - def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" - if (conf.get("enforce_xnack_on", false)) { - dockerOpts = dockerOpts + " --env HSA_XNACK=1 " - } - def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3') - def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3') - dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} " - echo "Docker flags: ${dockerOpts}" - - def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " - - def variant = env.STAGE_NAME - def retimage - - gitStatusWrapper(credentialsId: "${env.ck_git_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCm', repo: 'composable_kernel') { - try { - (retimage, image) = getDockerImage(conf) - 
withDockerContainer(image: image, args: dockerOpts) { - timeout(time: 5, unit: 'MINUTES'){ - sh 'rocminfo | tee rocminfo.log' - if ( !runShell('grep -n "gfx" rocminfo.log') ){ - throw new Exception ("GPU not found") - } - else{ - echo "GPU is OK" - } - } - } - } - catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){ - echo "The job was cancelled or aborted" - throw e - } - - withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { - timeout(time: 24, unit: 'HOURS') - { - sh """ - rm -rf build - mkdir build - """ - dir("build"){ - unstash 'ckProfiler.tar.gz' - sh 'tar -xvf ckProfiler.tar.gz' - } - - dir("script"){ - if (params.RUN_FULL_QA){ - sh "./run_full_performance_tests.sh 0 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}" - archiveArtifacts "perf_gemm.log" - archiveArtifacts "perf_resnet50_N256.log" - archiveArtifacts "perf_resnet50_N4.log" - archiveArtifacts "perf_batched_gemm.log" - archiveArtifacts "perf_grouped_gemm.log" - archiveArtifacts "perf_grouped_conv_fwd.log" - archiveArtifacts "perf_grouped_conv_bwd_data.log" - archiveArtifacts "perf_grouped_conv_bwd_weight.log" - archiveArtifacts "perf_gemm_bilinear.log" - archiveArtifacts "perf_reduction.log" - archiveArtifacts "perf_splitK_gemm.log" - archiveArtifacts "perf_onnx_gemm.log" - archiveArtifacts "perf_mixed_gemm.log" - // stash perf files to master - stash name: "perf_gemm.log" - stash name: "perf_resnet50_N256.log" - stash name: "perf_resnet50_N4.log" - stash name: "perf_batched_gemm.log" - stash name: "perf_grouped_gemm.log" - stash name: "perf_grouped_conv_fwd.log" - stash name: "perf_grouped_conv_bwd_data.log" - stash name: "perf_grouped_conv_bwd_weight.log" - stash name: "perf_gemm_bilinear.log" - stash name: "perf_reduction.log" - stash name: "perf_splitK_gemm.log" - stash name: "perf_onnx_gemm.log" - stash name: "perf_mixed_gemm.log" - //we will process results on the master node - } - else{ - sh "./run_performance_tests.sh 0 
CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}" - archiveArtifacts "perf_gemm.log" - archiveArtifacts "perf_resnet50_N256.log" - archiveArtifacts "perf_resnet50_N4.log" - // stash perf files to master - stash name: "perf_gemm.log" - stash name: "perf_resnet50_N256.log" - stash name: "perf_resnet50_N4.log" - //we will process the results on the master node - } - } - } - } - } - return retimage -} - -def runPerfTest(Map conf=[:]){ - try{ - runCKProfiler(conf) - } - catch(e){ - echo "throwing error exception in performance tests" - echo 'Exception occurred: ' + e.toString() - throw e - } - finally{ - if (!conf.get("no_reboot", false)) { - reboot() - } - } -} - def Build_CK(Map conf=[:]){ show_node_info() @@ -589,36 +465,95 @@ def Build_CK(Map conf=[:]){ throw e } withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { - timeout(time: 24, unit: 'HOURS') + timeout(time: 12, unit: 'HOURS') { //check whether to run performance tests on this node - def do_perf_tests = 0 + def arch_type = 0 sh 'rocminfo | tee rocminfo.log' - if ( runShell('grep -n "gfx1030" rocminfo.log') || runShell('grep -n "gfx1101" rocminfo.log') || runShell('grep -n "gfx1201" rocminfo.log') || runShell('grep -n "gfx942" rocminfo.log') ){ - do_perf_tests = 1 - echo "Stash profiler and run performance tests" + if ( runShell('grep -n "gfx90a" rocminfo.log') ){ + arch_type = 1 + } + else if ( runShell('grep -n "gfx942" rocminfo.log') ) { + arch_type = 2 + } + else if ( runShell('grep -n "gfx1030" rocminfo.log') ) { + arch_type = 3 + } + else if ( runShell('grep -n "gfx1101" rocminfo.log') ) { + arch_type = 4 + } + else if ( runShell('grep -n "gfx1201" rocminfo.log') ) { + arch_type = 5 } cmake_build(conf) dir("build"){ - //run tests and examples - //sh 'make -j check' - if (params.RUN_PERFORMANCE_TESTS && do_perf_tests == 0 ){ - //we only need the ckProfiler to run the performance tests, so we pack and stash it - //do not stash profiler on nodes where we 
don't need to run performance tests - sh 'tar -zcvf ckProfiler.tar.gz bin/ckProfiler' - stash name: "ckProfiler.tar.gz" - } - if (params.RUN_FULL_QA && do_perf_tests == 0 ){ - // build deb packages for all gfx9 targets and prepare to export + if (params.RUN_FULL_QA && arch_type == 1 ){ + // build deb packages for all gfx9 targets on gfx90a system and prepare to export + echo "Build ckProfiler package" sh 'make -j package' archiveArtifacts artifacts: 'composablekernel-ckprofiler_*.deb' - archiveArtifacts artifacts: 'composablekernel-tests_*.deb' sh 'mv composablekernel-ckprofiler_*.deb ckprofiler_0.2.0_amd64.deb' - stash name: "ckprofiler_0.2.0_amd64.deb" + stash includes: "ckprofiler_0.2.0_amd64.deb", name: "ckprofiler_0.2.0_amd64.deb" } } - if (params.hipTensor_test && do_perf_tests == 0 ){ - //build and test hipTensor + // run performance tests, stash the logs, results will be processed on the master node + dir("script"){ + if (params.RUN_PERFORMANCE_TESTS){ + if (params.RUN_FULL_QA && arch_type == 1){ + // run full tests on gfx90a + echo "Run full performance tests" + sh "./run_full_performance_tests.sh 0 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}" + archiveArtifacts "perf_gemm.log" + archiveArtifacts "perf_resnet50_N256.log" + archiveArtifacts "perf_resnet50_N4.log" + archiveArtifacts "perf_batched_gemm.log" + archiveArtifacts "perf_grouped_gemm.log" + archiveArtifacts "perf_grouped_conv_fwd.log" + archiveArtifacts "perf_grouped_conv_bwd_data.log" + archiveArtifacts "perf_grouped_conv_bwd_weight.log" + archiveArtifacts "perf_gemm_bilinear.log" + archiveArtifacts "perf_reduction.log" + archiveArtifacts "perf_splitK_gemm.log" + archiveArtifacts "perf_onnx_gemm.log" + archiveArtifacts "perf_mixed_gemm.log" + stash includes: "perf_**.log", name: "perf_log" + } + else if ( arch_type == 1 ){ + // run standard tests on gfx90a + echo "Run performance tests" + sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} 
${NODE_NAME}" + archiveArtifacts "perf_gemm.log" + archiveArtifacts "perf_onnx_gemm.log" + archiveArtifacts "perf_resnet50_N256.log" + archiveArtifacts "perf_resnet50_N4.log" + stash includes: "perf_**.log", name: "perf_log" + } + // disable performance tests on gfx1030 for now. + //else if ( arch_type == 3){ + // run basic tests on gfx1030 + // echo "Run gemm performance tests" + // sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx10" + // archiveArtifacts "perf_onnx_gemm_gfx10.log" + // stash includes: "perf_onnx_gemm_gfx10.log", name: "perf_log_gfx10" + //} + else if ( arch_type == 4){ + // run basic tests on gfx11 + echo "Run gemm performance tests" + sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx11" + archiveArtifacts "perf_onnx_gemm_gfx11.log" + stash includes: "perf_onnx_gemm_gfx11.log", name: "perf_log_gfx11" + } + else if ( arch_type == 5 ){ + // run basic tests on gfx12 + echo "Run gemm performance tests" + sh "./run_gemm_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME} gfx12" + archiveArtifacts "perf_onnx_gemm_gfx12.log" + stash includes: "perf_onnx_gemm_gfx12.log", name: "perf_log_gfx12" + } + } + } + if (params.hipTensor_test && arch_type == 1 ){ + // build and test hipTensor on gfx90a node sh """#!/bin/bash rm -rf "${params.hipTensor_branch}".zip rm -rf hipTensor-"${params.hipTensor_branch}" @@ -690,10 +625,8 @@ def process_results(Map conf=[:]){ dir("script"){ if (params.RUN_CK_TILE_FMHA_TESTS){ try{ - unstash "perf_fmha_fwd_gfx942.log" - unstash "perf_fmha_bwd_gfx942.log" - unstash "perf_fmha_fwd_gfx90a.log" - unstash "perf_fmha_bwd_gfx90a.log" + unstash "perf_fmha_log_gfx942" + unstash "perf_fmha_log_gfx90a" } catch(Exception err){ echo "could not locate the FMHA performance logs: ${err.getMessage()}." 
@@ -703,26 +636,26 @@ def process_results(Map conf=[:]){ // unstash perf files to master unstash "ckprofiler_0.2.0_amd64.deb" sh "sshpass -p ${env.ck_deb_pw} scp -o StrictHostKeyChecking=no ckprofiler_0.2.0_amd64.deb ${env.ck_deb_user}@${env.ck_deb_ip}:/var/www/html/composable_kernel/" - unstash "perf_gemm.log" - unstash "perf_resnet50_N256.log" - unstash "perf_resnet50_N4.log" - unstash "perf_batched_gemm.log" - unstash "perf_grouped_gemm.log" - unstash "perf_grouped_conv_fwd.log" - unstash "perf_grouped_conv_bwd_data.log" - unstash "perf_grouped_conv_bwd_weight.log" - unstash "perf_gemm_bilinear.log" - unstash "perf_reduction.log" - unstash "perf_splitK_gemm.log" - unstash "perf_onnx_gemm.log" - unstash "perf_mixed_gemm.log" + unstash "perf_log" + try{ + unstash "perf_log_gfx11" + unstash "perf_log_gfx12" + } + catch(Exception err){ + echo "could not locate the GEMM gfx11/gfx12 performance logs: ${err.getMessage()}." + } sh "./process_qa_data.sh" } else{ // unstash perf files to master - unstash "perf_gemm.log" - unstash "perf_resnet50_N256.log" - unstash "perf_resnet50_N4.log" + unstash "perf_log" + try{ + unstash "perf_log_gfx11" + unstash "perf_log_gfx12" + } + catch(Exception err){ + echo "could not locate the GEMM gfx11/gfx12 performance logs: ${err.getMessage()}." 
+ } sh "./process_perf_data.sh" } } @@ -1241,29 +1174,6 @@ pipeline { } } } - - stage("Performance Tests") - { - parallel - { - stage("Run ckProfiler: gfx90a") - { - when { - beforeAgent true - expression { params.RUN_PERFORMANCE_TESTS.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() } - } - options { retry(1) } - agent{ label rocmnode("gfx90a")} - environment{ - setup_args = "NO_CK_BUILD" - } - steps{ - runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release') - cleanWs() - } - } - } - } stage("Process Performance Test Results") { parallel diff --git a/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt index a53fde1662..0457588ea6 100644 --- a/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt @@ -6,7 +6,7 @@ set(CK_TILE_SRC_FOLDER ${CMAKE_SOURCE_DIR}/include/ck_tile/) # CK Codegen requires dataclass which is added in Python 3.7 # Python version 3.8 is required for general good practice as it is default for Ubuntu 20.04 if(NOT CK_USE_ALTERNATIVE_PYTHON) - find_package(PythonInterp 3 REQUIRED) + find_package(Python3 COMPONENTS Interpreter Development) else() message("Using alternative python version") set(EXTRA_PYTHON_PATH) @@ -33,7 +33,7 @@ set(FMHA_KNOWN_APIS "fwd,fwd_splitkv,fwd_appendkv,bwd") # Note: The receipt 3 arg filters the generated backwards instances to reduce compilation time. # With receipt 3 set, we are generating instances for datatype == {fp16 || bfp16}, bias == {no || alibi}, deterministic == off, and dpad == dvpad. 
execute_process( - COMMAND ${PYTHON_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py + COMMAND ${Python3_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py --list_blobs ${FMHA_CPP_FOLDER}/blob_list.txt --api ${FMHA_KNOWN_APIS} --receipt 3 @@ -50,7 +50,7 @@ endif() # With receipt 3 set, we are generating instances for datatype == {fp16 || bfp16}, bias == {no || alibi}, deterministic == off, and dpad == dvpad. add_custom_command( OUTPUT ${FMHA_GEN_BLOBS} - COMMAND ${PYTHON_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py + COMMAND ${Python3_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py --output_dir ${FMHA_CPP_FOLDER} --api ${FMHA_KNOWN_APIS} --receipt 3 diff --git a/script/process_perf_data.py b/script/process_perf_data.py index 3892206e42..fbfec94eef 100644 --- a/script/process_perf_data.py +++ b/script/process_perf_data.py @@ -82,7 +82,7 @@ def parse_logfile(logfile): StrideA=[] StrideB=[] StrideC=[] - if 'perf_gemm.log' in logfile: + if 'perf_gemm' in logfile and 'gemm_bilinear' not in logfile: for line in open(logfile): if 'Best Perf' in line: lst=line.split() @@ -260,7 +260,7 @@ def main(): conn = sqlEngine.connect() #save gemm performance tests: - if 'perf_gemm.log' in filename: + if 'perf_gemm' in filename and 'gemm_bilinear' not in filename: #write the ck_gemm_test_params table only needed once the test set changes #post_test_params(test_list,conn) for i in range(1,len(results)+1): diff --git a/script/process_perf_data.sh b/script/process_perf_data.sh index af1e7e7a0d..ae93463204 100755 --- a/script/process_perf_data.sh +++ b/script/process_perf_data.sh @@ -11,9 +11,22 @@ #process results python3 process_perf_data.py perf_gemm.log +python3 process_perf_data.py perf_onnx_gemm.log python3 process_perf_data.py perf_resnet50_N256.log python3 process_perf_data.py perf_resnet50_N4.log +file=./perf_onnx_gemm_gfx10.log +if [ -e "$file" ]; then + python3 process_perf_data.py perf_onnx_gemm_gfx10.log +fi +file=./perf_onnx_gemm_gfx11.log +if [ -e "$file" ]; then + python3 process_perf_data.py 
perf_onnx_gemm_gfx11.log +fi +file=./perf_onnx_gemm_gfx12.log +if [ -e "$file" ]; then + python3 process_perf_data.py perf_onnx_gemm_gfx12.log +fi file=./perf_fmha_fwd_gfx942.log if [ -e "$file" ]; then python3 process_perf_data.py perf_fmha_fwd_gfx942.log diff --git a/script/process_qa_data.sh b/script/process_qa_data.sh index c9a1645f6e..fb8fe01c6e 100755 --- a/script/process_qa_data.sh +++ b/script/process_qa_data.sh @@ -24,6 +24,18 @@ python3 process_perf_data.py perf_splitK_gemm.log python3 process_perf_data.py perf_onnx_gemm.log python3 process_perf_data.py perf_mixed_gemm.log +file=./perf_onnx_gemm_gfx10.log +if [ -e "$file" ]; then + python3 process_perf_data.py perf_onnx_gemm_gfx10.log +fi +file=./perf_onnx_gemm_gfx11.log +if [ -e "$file" ]; then + python3 process_perf_data.py perf_onnx_gemm_gfx11.log +fi +file=./perf_onnx_gemm_gfx12.log +if [ -e "$file" ]; then + python3 process_perf_data.py perf_onnx_gemm_gfx12.log +fi file=./perf_fmha_fwd_gfx942.log if [ -e "$file" ]; then python3 process_perf_data.py perf_fmha_fwd_gfx942.log diff --git a/script/run_full_performance_tests.sh b/script/run_full_performance_tests.sh index e167ce012b..ddc5c270b8 100755 --- a/script/run_full_performance_tests.sh +++ b/script/run_full_performance_tests.sh @@ -5,7 +5,7 @@ # post your new test results to the database and compare them to the baseline # please contact Illia.Silin@amd.com for more details # -# run the script as "./run_full_performance_tests.sh < node name> +# run the script as "./run_full_performance_tests.sh <verification> <environment tag> <branch name> <node name>" # input arguments: # verification = 0 : do not verify result correctness on CPU # = 1 : verifuy correctness on CPU (may take a long time) diff --git a/script/run_gemm_performance_tests.sh b/script/run_gemm_performance_tests.sh new file mode 100755 index 0000000000..12adad30f8 --- /dev/null +++ b/script/run_gemm_performance_tests.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# +# in order to run this script you'd first need to build the ckProfiler executable in
../build/bin/ +# run the script as "./run_gemm_performance_tests.sh <verification> <environment tag> <branch name> <node name> <arch>" +# input arguments: +# verification = 0 : do not verify result correctness on CPU +# = 1 : verify correctness on CPU (may take a long time) +# environment tag : a string describing the specifics of your test environment +# branch name : name of the branch in git repo (git status | grep -e 'On branch') +# node name : $hostname +# arch : GPU architecture, e.g. "gfx9" or "gfx1100" + +#get the command line arguments: +export verify=$1 +echo 'Verification: ' $verify +export env_type=$2 +echo 'Environment type: ' $env_type +export branch=$3 +echo 'Branch name: ' $branch +export host_name=$4 +echo 'Host name: ' $host_name +export arch=$5 +echo 'GPU architecture: ' $arch + +function print_log_header(){ + rm -f $1; + echo 'On branch ' $3 &> $1; + echo 'Node name: ' $4 >> $1; + #get GPU_arch and number of compute units from rocminfo + echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1; + rocminfo | grep "Compute Unit:" >> $1; + hipcc --version | grep -e 'HIP version' >> $1; + echo 'Environment type: ' $2 >> $1; + /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1; +} + +#run ONNX gemm tests +export onnx_log="perf_onnx_gemm_$arch.log" +print_log_header $onnx_log $env_type $branch $host_name +./profile_onnx_gemm.sh gemm 0 0 $verify 1 0 1 2>&1 | tee -a $onnx_log +./profile_onnx_gemm.sh gemm 1 0 $verify 1 0 1 2>&1 | tee -a $onnx_log diff --git a/script/run_performance_tests.sh b/script/run_performance_tests.sh index 317d270983..c8a281dc07 100755 --- a/script/run_performance_tests.sh +++ b/script/run_performance_tests.sh @@ -1,7 +1,7 @@ #!/bin/bash # # in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/ -# run the script as "./run_performance_tests.sh < node name> +# run the script as "./run_performance_tests.sh <verification> <environment tag> <branch name> <node name>" # input arguments: # verification = 0 : do not verify result correctness on CPU # = 1 : verify correctness on CPU (may take
a long time) @@ -51,20 +51,11 @@ print_log_header $gemm_log $env_type $branch $host_name ./profile_gemm.sh gemm 2 3 $verify 1 0 1 | tee -a $gemm_log ./profile_gemm.sh gemm 3 3 $verify 1 0 1 | tee -a $gemm_log -#run grouped_fwd fp16 tests -export grouped_conv_fwd_log="perf_grouped_conv_fwd_fp16.log" -print_log_header $conv_fwd_log $env_type $branch $host_name -./profile_grouped_conv_fwd.sh grouped_conv_fwd 1 1 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_log - -#run grouped_bwd_data fp16 tests -export grouped_conv_bwd_data_log="perf_grouped_conv_bwd_data_fp16.log" -print_log_header $grouped_conv_bwd_data_log $env_type $branch $host_name -./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 1 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log - -#run grouped_bwd_weight fp16 tests -export grouped_conv_bwd_weight_log="perf_grouped_conv_bwd_weight_fp16.log" -print_log_header $grouped_conv_bwd_weight_log $env_type $branch $host_name -./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 1 1 $verify 1 0 1 256 1 2>&1 | tee -a $grouped_conv_bwd_weight_log +#run ONNX gemm tests +export onnx_log="perf_onnx_gemm.log" +print_log_header $onnx_log $env_type $branch $host_name +./profile_onnx_gemm.sh gemm 0 0 $verify 1 0 1 2>&1 | tee -a $onnx_log +./profile_onnx_gemm.sh gemm 1 0 $verify 1 0 1 2>&1 | tee -a $onnx_log #run resnet50 tests export resnet256_log="perf_resnet50_N256.log"