diff --git a/Jenkinsfile b/Jenkinsfile index 53b8d26636..beac2ea248 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -100,35 +100,44 @@ def buildHipClangJob(Map conf=[:]){ def variant = env.STAGE_NAME - def retimage gitStatusWrapper(credentialsId: '7126e5fe-eb51-4576-b52b-9aaf1de8f0fd', gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') { - try { - retimage = docker.build("${image}", dockerArgs + '.') - withDockerContainer(image: image, args: dockerOpts) { - timeout(time: 5, unit: 'MINUTES') - { - sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo' + if (params.USE_DOCKERFILE){ + try { + retimage = docker.build("${image}", dockerArgs + '.') + withDockerContainer(image: image, args: dockerOpts) { + timeout(time: 5, unit: 'MINUTES') + { + sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo' + } + } + } + catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){ + echo "The job was cancelled or aborted" + throw e + } + catch(Exception ex) { + retimage = docker.build("${image}", dockerArgs + "--no-cache .") + withDockerContainer(image: image, args: dockerOpts) { + timeout(time: 5, unit: 'MINUTES') + { + sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo' + } } } } - catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){ - echo "The job was cancelled or aborted" - throw e - } - catch(Exception ex) { - retimage = docker.build("${image}", dockerArgs + "--no-cache .") - withDockerContainer(image: image, args: dockerOpts) { - timeout(time: 5, unit: 'MINUTES') - { - sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo' - } + else{ + timeout(time: 3, unit: 'HOURS'){ + retimage = docker.image('compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-dkms-no-npi-hipclang:9110_ubuntu18.04_py3.6_pytorch_rocm5.0_internal_testing_7ff5b54').pull() + image="b56f8ac0d6ea" + sh "docker images" } } withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { timeout(time: 5, unit: 'HOURS') { + sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo' cmake_build(conf) } } @@ -181,29 +190,37 @@ def runCKProfiler(Map conf=[:]){ def variant = env.STAGE_NAME - def retimage gitStatusWrapper(credentialsId: '7126e5fe-eb51-4576-b52b-9aaf1de8f0fd', gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') { - try { - retimage = docker.build("${image}", dockerArgs + '.') - withDockerContainer(image: image, args: dockerOpts) { - timeout(time: 5, unit: 'MINUTES') - { - sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo' + if (params.USE_DOCKERFILE){ + try { + retimage = docker.build("${image}", dockerArgs + '.') + withDockerContainer(image: image, args: dockerOpts) { + timeout(time: 5, unit: 'MINUTES') + { + sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo' + } + } + } + catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){ + echo "The job was cancelled or aborted" + throw e + } + catch(Exception ex) { + retimage = docker.build("${image}", dockerArgs + "--no-cache .") + withDockerContainer(image: image, args: dockerOpts) { + timeout(time: 5, unit: 'MINUTES') + { + sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo' + } } } } - catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){ - echo "The job was cancelled or aborted" - throw e - } - catch(Exception ex) { - retimage = docker.build("${image}", dockerArgs + "--no-cache .") - withDockerContainer(image: image, args: dockerOpts) { - timeout(time: 5, unit: 'MINUTES') - { - sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo' - } + else{ + timeout(time: 3, unit: 'HOURS'){ + retimage = docker.image('compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-dkms-no-npi-hipclang:9110_ubuntu18.04_py3.6_pytorch_rocm5.0_internal_testing_7ff5b54').pull() + image="b56f8ac0d6ea" + sh "docker images" } } @@ -217,7 +234,8 @@ def runCKProfiler(Map conf=[:]){ sh "rm -f ${gemm_log}" sh "echo Branch name: ${env.BRANCH_NAME} > ${gemm_log}" sh "echo Node name: ${NODE_NAME} >> ${gemm_log}" - sh "echo GPU_arch: ${gpu_arch} >> ${gemm_log}" + sh "echo GPU_arch name: ${gpu_arch} >> ${gemm_log}" + sh "rocminfo | grep 'Compute Unit:' >> ${gemm_log} " sh "hipcc --version | grep -e 'HIP version' >> ${gemm_log}" sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${gemm_log}" sh "./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a ${gemm_log}" @@ -246,7 +264,8 @@ def runCKProfiler(Map conf=[:]){ sh "rm -f ${resnet_log}" sh "echo Branch name: ${env.BRANCH_NAME} > ${resnet_log}" sh "echo Node name: ${NODE_NAME} >> ${resnet_log}" - sh "echo GPU_arch: ${gpu_arch} >> ${resnet_log}" + sh "echo GPU_arch name: ${gpu_arch} >> ${resnet_log}" + sh "rocminfo | grep 'Compute Unit:' >> ${resnet_log} " sh "hipcc --version | grep -e 'HIP version' >> ${resnet_log}" sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet_log}" //first run tests with N=256 @@ -285,9 +304,20 @@ pipeline { options { parallelsAlwaysFailFast() } - // environment{ - // variable = value - // } + parameters { + booleanParam( + name: "USE_DOCKERFILE", + defaultValue: true, + description: "") + } + environment{ + dbuser = "${dbuser}" + dbpassword = "${dbpassword}" + dbsship = "${dbsship}" + dbsshport = "${dbsshport}" + dbsshuser = "${dbsshuser}" + dbsshpassword = "${dbsshpassword}" + } stages{ stage("Static checks") { parallel{ @@ -302,30 +332,6 @@ pipeline { // buildHipClangJobAndReboot(build_cmd: build_cmd, no_reboot:true, prefixpath: '/opt/rocm', build_type: 'debug') // } // } - // we will build and run ckProfiler release version later, during the performance test stage - //stage('Build Profiler: Release, gfx908') - //{ - // agent { label rocmnode("nogpu")} - // environment{ - // setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """ - // } - // steps{ - // buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release') - // } - //} - //stage('Build Profiler: Debug, gfx908') - //{ - // agent { label rocmnode("nogpu")} - // environment{ - // setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """ - // } - // steps{ - // // until we stabilize debug build due to compiler crashes - // catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') { - // buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Debug') - // } - // } - //} stage('Clang Format') { agent{ label rocmnode("nogpu") } environment{ @@ -353,12 +359,11 @@ pipeline { { agent{ label rocmnode("gfx908")} environment{ - setup_args = """ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx900 --offload-arch=gfx906 --offload-arch=gfx908 --offload-arch=gfx90a -O3 " -DBUILD_DEV=On """ + setup_args = """ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 -O3 " -DBUILD_DEV=On """ } steps{ - buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release') + buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release', gpu_arch: "gfx908") } - } stage("Run Tests: gfx90a") { @@ -367,11 +372,9 @@ pipeline { setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx90a -O3 " -DBUILD_DEV=On """ } steps{ - buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release') + buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release', gpu_arch: "gfx90a") } - } - } } stage("Client App") @@ -400,33 +403,37 @@ pipeline { agent{ label rocmnode("gfx908")} environment{ setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """ - dbuser = "${dbuser}" - dbpassword = "${dbpassword}" - dbsship = "${dbsship}" - dbsshport = "${dbsshport}" - dbsshuser = "${dbsshuser}" - dbsshpassword = "${dbsshpassword}" } steps{ - runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release') + runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release', gpu_arch: "gfx908") + } + } + stage("Run ckProfiler: gfx90a") + { + agent{ label rocmnode("gfx90a")} + environment{ + setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx90a -O3 " -DBUILD_DEV=On """ + } + steps{ + runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release', gpu_arch: "gfx90a") } } } } - - // enable after the cmake file supports packaging - // stage("Packages") { - // when { - // expression { params.BUILD_PACKAGES && params.TARGET_NOGPU && params.DATATYPE_NA } - // } - // parallel { - // stage("Package /opt/rocm") { - // agent{ label rocmnode("nogpu") } - // steps{ - // buildHipClangJobAndReboot( package_build: "true", prefixpath: '/opt/rocm', gpu_arch: "gfx906;gfx908;gfx90a") - // } - // } - // } - // } + /* enable after the cmake file supports packaging + stage("Packages") { + when { + expression { params.BUILD_PACKAGES && params.TARGET_NOGPU && params.DATATYPE_NA } + } + parallel { + stage("Package /opt/rocm") { + agent{ label rocmnode("nogpu") } + steps{ + buildHipClangJobAndReboot( package_build: "true", prefixpath: '/opt/rocm', gpu_arch: "gfx906;gfx908;gfx90a") + } + } + } + } + */ } } diff --git a/script/parse_perf_data.py b/script/parse_perf_data.py index 1ec7ae01a7..4cb13e6243 100644 --- a/script/parse_perf_data.py +++ b/script/parse_perf_data.py @@ -52,21 +52,28 @@ def main(): if 'Branch name' in line: lst=line.split() branch_name=lst[2] + if 'On branch' in line: + lst=line.split() + branch_name=lst[2] if 'Node name' in line: lst=line.split() node_id=lst[2] if 'GPU_arch' in line: lst=line.split() - gpu_arch=lst[1] + gpu_arch=lst[2] if 'HIP version' in line: lst=line.split() hip_vers=lst[2] + if 'Compute Unit' in line: + lst=line.split() + compute_units=lst[2] if 'InstalledDir' in line: lst=line.split() rocm_vers=lst[1][lst[1].find('/opt/rocm-')+len('/opt/rocm-'):lst[1].rfind('/llvm/bin')] print("Branch name:",branch_name) print("Node name:",node_id) print("GPU_arch:",gpu_arch) + print("Compute units:",compute_units) print("ROCM_version:",rocm_vers) print("HIP_version:",hip_vers) @@ -188,8 +195,8 @@ def main(): testlist=[] for i in range(1,len(tests)+1): testlist.append("Test%i"%i) - ck_gemm_tflops=[str(branch_name),str(node_id),str(gpu_arch),str(rocm_vers),str(hip_vers),str(datetime.datetime.now())] - flops=pd.DataFrame(data=[ck_gemm_tflops],columns=['Branch_ID','Node_ID','GPU_arch','ROCM_version','HIP_version','Datetime']) + ck_gemm_tflops=[str(branch_name),str(node_id),str(gpu_arch),compute_units,str(rocm_vers),str(hip_vers),str(datetime.datetime.now())] + flops=pd.DataFrame(data=[ck_gemm_tflops],columns=['Branch_ID','Node_ID','GPU_arch','Compute Units','ROCM_version','HIP_version','Datetime']) df_add=pd.DataFrame(data=[sorted_tflops],columns=testlist) flops=pd.concat([flops,df_add],axis=1) print("new tflops for gemm tests:",flops) @@ -207,8 +214,8 @@ def main(): testlist=[] for i in range(1,50): testlist.append("Layer%i"%i) - ck_resnet_tflops=[str(branch_name),str(node_id),str(gpu_arch),str(rocm_vers),str(hip_vers),str(datetime.datetime.now())] - flops0=pd.DataFrame(data=[ck_resnet_tflops],columns=['Branch_ID','Node_ID','GPU_arch','ROCM_version','HIP_version','Datetime']) + ck_resnet_tflops=[str(branch_name),str(node_id),str(gpu_arch),compute_units,str(rocm_vers),str(hip_vers),str(datetime.datetime.now())] + flops0=pd.DataFrame(data=[ck_resnet_tflops],columns=['Branch_ID','Node_ID','GPU_arch','Compute Units','ROCM_version','HIP_version','Datetime']) df_add=pd.DataFrame(data=[tflops[0:49]],columns=testlist) flops=pd.concat([flops0,df_add],axis=1) print("new tflops for N=256 resnet50 test:",flops) diff --git a/script/run_performance_tests.sh b/script/run_performance_tests.sh new file mode 100644 index 0000000000..6c96a9449d --- /dev/null +++ b/script/run_performance_tests.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# +# in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/ +# and make sure the following python packages are installed in your environment: +# pip3 install --upgrade pip +# pip3 install sqlalchemy +# pip3 install pymysql +# pip3 install pandas +# pip3 install sshtunnel +# you would also need to set up some environment variables in order to +# post your new test results to the database and compare them to the baseline +# please contact Illia.Silin@amd.com for more details +# + +export gemm_log="perf_gemm.log" +rm -f $gemm_log +git status | grep -e 'On branch' > ${gemm_log} +echo -n 'Node name: ' >>${gemm_log}; hostname >> ${gemm_log} +#get GPU_arch and number of compute units from rocminfo +echo -n "GPU_arch: " >> ${gemm_log}; rocminfo | grep "Name:" | grep "gfx" >> ${gemm_log} +rocminfo | grep "Compute Unit:" >> ${gemm_log} +hipcc --version | grep -e 'HIP version' >> ${gemm_log} +/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${gemm_log} +./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a ${gemm_log} +./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a $gemm_log +./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a $gemm_log + +python3 parse_perf_data.py ${gemm_log} + +#run resnet50 test +export resnet_log="perf_resnet50.log" +rm -f $resnet_log +git status | grep -e 'On branch' > ${resnet_log} +echo -n 'Node name: '>>${resnet_log}; hostname >>${resnet_log} +#get GPU_arch and number of compute units from rocminfo +echo -n "GPU_arch: " >> ${resnet_log}; rocminfo | grep "Name:" | grep "gfx" >> ${resnet_log} +rocminfo | grep "Compute Unit:" >> ${resnet_log} +hipcc --version | grep -e 'HIP version' >> ${resnet_log} +/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet_log} +#first run tests with N=256 +./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a ${resnet_log} +#then run with N=4 +./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a ${resnet_log} +#the script will put the results from N=256 and N=4 runs into separate tables +python3 parse_perf_data.py ${resnet_log}