mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
Add performance tests on MI200 in CI, reporting number of CUs, add stand-alone perf test. (#277)
* use pre-built docker instead of building a new one
* try docker.image.pull
* change syntax in docker.image()
* add 30 min timeout
* increase timeout to 3 hours
* move performance tests to first stage for testing
* set image variable to the new container name
* update image name
* check available images
* check available images in both places
* try different image name
* use image ID to refer to image
* run performance on gfx90a
* fix the gpu_arch labeling, add parameter
* move env vars out of stages
* add stand-alone performance script, MI200 tests, CU numbers
[ROCm/composable_kernel commit: 1ced00a577]
This commit is contained in:
193
Jenkinsfile
vendored
193
Jenkinsfile
vendored
@@ -100,35 +100,44 @@ def buildHipClangJob(Map conf=[:]){
|
||||
|
||||
def variant = env.STAGE_NAME
|
||||
|
||||
|
||||
def retimage
|
||||
gitStatusWrapper(credentialsId: '7126e5fe-eb51-4576-b52b-9aaf1de8f0fd', gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
|
||||
try {
|
||||
retimage = docker.build("${image}", dockerArgs + '.')
|
||||
withDockerContainer(image: image, args: dockerOpts) {
|
||||
timeout(time: 5, unit: 'MINUTES')
|
||||
{
|
||||
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
|
||||
if (params.USE_DOCKERFILE){
|
||||
try {
|
||||
retimage = docker.build("${image}", dockerArgs + '.')
|
||||
withDockerContainer(image: image, args: dockerOpts) {
|
||||
timeout(time: 5, unit: 'MINUTES')
|
||||
{
|
||||
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
|
||||
echo "The job was cancelled or aborted"
|
||||
throw e
|
||||
}
|
||||
catch(Exception ex) {
|
||||
retimage = docker.build("${image}", dockerArgs + "--no-cache .")
|
||||
withDockerContainer(image: image, args: dockerOpts) {
|
||||
timeout(time: 5, unit: 'MINUTES')
|
||||
{
|
||||
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
|
||||
echo "The job was cancelled or aborted"
|
||||
throw e
|
||||
}
|
||||
catch(Exception ex) {
|
||||
retimage = docker.build("${image}", dockerArgs + "--no-cache .")
|
||||
withDockerContainer(image: image, args: dockerOpts) {
|
||||
timeout(time: 5, unit: 'MINUTES')
|
||||
{
|
||||
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
|
||||
}
|
||||
else{
|
||||
timeout(time: 3, unit: 'HOURS'){
|
||||
retimage = docker.image('compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-dkms-no-npi-hipclang:9110_ubuntu18.04_py3.6_pytorch_rocm5.0_internal_testing_7ff5b54').pull()
|
||||
image="b56f8ac0d6ea"
|
||||
sh "docker images"
|
||||
}
|
||||
}
|
||||
|
||||
withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
|
||||
timeout(time: 5, unit: 'HOURS')
|
||||
{
|
||||
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
|
||||
cmake_build(conf)
|
||||
}
|
||||
}
|
||||
@@ -181,29 +190,37 @@ def runCKProfiler(Map conf=[:]){
|
||||
|
||||
def variant = env.STAGE_NAME
|
||||
|
||||
|
||||
def retimage
|
||||
gitStatusWrapper(credentialsId: '7126e5fe-eb51-4576-b52b-9aaf1de8f0fd', gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
|
||||
try {
|
||||
retimage = docker.build("${image}", dockerArgs + '.')
|
||||
withDockerContainer(image: image, args: dockerOpts) {
|
||||
timeout(time: 5, unit: 'MINUTES')
|
||||
{
|
||||
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
|
||||
if (params.USE_DOCKERFILE){
|
||||
try {
|
||||
retimage = docker.build("${image}", dockerArgs + '.')
|
||||
withDockerContainer(image: image, args: dockerOpts) {
|
||||
timeout(time: 5, unit: 'MINUTES')
|
||||
{
|
||||
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
|
||||
echo "The job was cancelled or aborted"
|
||||
throw e
|
||||
}
|
||||
catch(Exception ex) {
|
||||
retimage = docker.build("${image}", dockerArgs + "--no-cache .")
|
||||
withDockerContainer(image: image, args: dockerOpts) {
|
||||
timeout(time: 5, unit: 'MINUTES')
|
||||
{
|
||||
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
|
||||
echo "The job was cancelled or aborted"
|
||||
throw e
|
||||
}
|
||||
catch(Exception ex) {
|
||||
retimage = docker.build("${image}", dockerArgs + "--no-cache .")
|
||||
withDockerContainer(image: image, args: dockerOpts) {
|
||||
timeout(time: 5, unit: 'MINUTES')
|
||||
{
|
||||
sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo'
|
||||
}
|
||||
else{
|
||||
timeout(time: 3, unit: 'HOURS'){
|
||||
retimage = docker.image('compute-artifactory.amd.com:5000/rocm-plus-docker/framework/compute-rocm-dkms-no-npi-hipclang:9110_ubuntu18.04_py3.6_pytorch_rocm5.0_internal_testing_7ff5b54').pull()
|
||||
image="b56f8ac0d6ea"
|
||||
sh "docker images"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -217,7 +234,8 @@ def runCKProfiler(Map conf=[:]){
|
||||
sh "rm -f ${gemm_log}"
|
||||
sh "echo Branch name: ${env.BRANCH_NAME} > ${gemm_log}"
|
||||
sh "echo Node name: ${NODE_NAME} >> ${gemm_log}"
|
||||
sh "echo GPU_arch: ${gpu_arch} >> ${gemm_log}"
|
||||
sh "echo GPU_arch name: ${gpu_arch} >> ${gemm_log}"
|
||||
sh "rocminfo | grep 'Compute Unit:' >> ${gemm_log} "
|
||||
sh "hipcc --version | grep -e 'HIP version' >> ${gemm_log}"
|
||||
sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${gemm_log}"
|
||||
sh "./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a ${gemm_log}"
|
||||
@@ -246,7 +264,8 @@ def runCKProfiler(Map conf=[:]){
|
||||
sh "rm -f ${resnet_log}"
|
||||
sh "echo Branch name: ${env.BRANCH_NAME} > ${resnet_log}"
|
||||
sh "echo Node name: ${NODE_NAME} >> ${resnet_log}"
|
||||
sh "echo GPU_arch: ${gpu_arch} >> ${resnet_log}"
|
||||
sh "echo GPU_arch name: ${gpu_arch} >> ${resnet_log}"
|
||||
sh "rocminfo | grep 'Compute Unit:' >> ${resnet_log} "
|
||||
sh "hipcc --version | grep -e 'HIP version' >> ${resnet_log}"
|
||||
sh "/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet_log}"
|
||||
//first run tests with N=256
|
||||
@@ -285,9 +304,20 @@ pipeline {
|
||||
options {
|
||||
parallelsAlwaysFailFast()
|
||||
}
|
||||
// environment{
|
||||
// variable = value
|
||||
// }
|
||||
parameters {
|
||||
booleanParam(
|
||||
name: "USE_DOCKERFILE",
|
||||
defaultValue: true,
|
||||
description: "")
|
||||
}
|
||||
environment{
|
||||
dbuser = "${dbuser}"
|
||||
dbpassword = "${dbpassword}"
|
||||
dbsship = "${dbsship}"
|
||||
dbsshport = "${dbsshport}"
|
||||
dbsshuser = "${dbsshuser}"
|
||||
dbsshpassword = "${dbsshpassword}"
|
||||
}
|
||||
stages{
|
||||
stage("Static checks") {
|
||||
parallel{
|
||||
@@ -302,30 +332,6 @@ pipeline {
|
||||
// buildHipClangJobAndReboot(build_cmd: build_cmd, no_reboot:true, prefixpath: '/opt/rocm', build_type: 'debug')
|
||||
// }
|
||||
// }
|
||||
// we will build and run ckProfiler release version later, during the performance test stage
|
||||
//stage('Build Profiler: Release, gfx908')
|
||||
//{
|
||||
// agent { label rocmnode("nogpu")}
|
||||
// environment{
|
||||
// setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
|
||||
// }
|
||||
// steps{
|
||||
// buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release')
|
||||
// }
|
||||
//}
|
||||
//stage('Build Profiler: Debug, gfx908')
|
||||
//{
|
||||
// agent { label rocmnode("nogpu")}
|
||||
// environment{
|
||||
// setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
|
||||
// }
|
||||
// steps{
|
||||
// // until we stabilize debug build due to compiler crashes
|
||||
// catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') {
|
||||
// buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Debug')
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
stage('Clang Format') {
|
||||
agent{ label rocmnode("nogpu") }
|
||||
environment{
|
||||
@@ -353,12 +359,11 @@ pipeline {
|
||||
{
|
||||
agent{ label rocmnode("gfx908")}
|
||||
environment{
|
||||
setup_args = """ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx900 --offload-arch=gfx906 --offload-arch=gfx908 --offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
|
||||
setup_args = """ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
|
||||
}
|
||||
steps{
|
||||
buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release')
|
||||
buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release', gpu_arch: "gfx908")
|
||||
}
|
||||
|
||||
}
|
||||
stage("Run Tests: gfx90a")
|
||||
{
|
||||
@@ -367,11 +372,9 @@ pipeline {
|
||||
setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
|
||||
}
|
||||
steps{
|
||||
buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release')
|
||||
buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release', gpu_arch: "gfx90a")
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
stage("Client App")
|
||||
@@ -400,33 +403,37 @@ pipeline {
|
||||
agent{ label rocmnode("gfx908")}
|
||||
environment{
|
||||
setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
|
||||
dbuser = "${dbuser}"
|
||||
dbpassword = "${dbpassword}"
|
||||
dbsship = "${dbsship}"
|
||||
dbsshport = "${dbsshport}"
|
||||
dbsshuser = "${dbsshuser}"
|
||||
dbsshpassword = "${dbsshpassword}"
|
||||
}
|
||||
steps{
|
||||
runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release')
|
||||
runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release', gpu_arch: "gfx908")
|
||||
}
|
||||
}
|
||||
stage("Run ckProfiler: gfx90a")
|
||||
{
|
||||
agent{ label rocmnode("gfx90a")}
|
||||
environment{
|
||||
setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
|
||||
}
|
||||
steps{
|
||||
runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release', gpu_arch: "gfx90a")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// enable after the cmake file supports packaging
|
||||
// stage("Packages") {
|
||||
// when {
|
||||
// expression { params.BUILD_PACKAGES && params.TARGET_NOGPU && params.DATATYPE_NA }
|
||||
// }
|
||||
// parallel {
|
||||
// stage("Package /opt/rocm") {
|
||||
// agent{ label rocmnode("nogpu") }
|
||||
// steps{
|
||||
// buildHipClangJobAndReboot( package_build: "true", prefixpath: '/opt/rocm', gpu_arch: "gfx906;gfx908;gfx90a")
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
/* enable after the cmake file supports packaging
|
||||
stage("Packages") {
|
||||
when {
|
||||
expression { params.BUILD_PACKAGES && params.TARGET_NOGPU && params.DATATYPE_NA }
|
||||
}
|
||||
parallel {
|
||||
stage("Package /opt/rocm") {
|
||||
agent{ label rocmnode("nogpu") }
|
||||
steps{
|
||||
buildHipClangJobAndReboot( package_build: "true", prefixpath: '/opt/rocm', gpu_arch: "gfx906;gfx908;gfx90a")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
||||
}
|
||||
|
||||
@@ -52,21 +52,28 @@ def main():
|
||||
if 'Branch name' in line:
|
||||
lst=line.split()
|
||||
branch_name=lst[2]
|
||||
if 'On branch' in line:
|
||||
lst=line.split()
|
||||
branch_name=lst[2]
|
||||
if 'Node name' in line:
|
||||
lst=line.split()
|
||||
node_id=lst[2]
|
||||
if 'GPU_arch' in line:
|
||||
lst=line.split()
|
||||
gpu_arch=lst[1]
|
||||
gpu_arch=lst[2]
|
||||
if 'HIP version' in line:
|
||||
lst=line.split()
|
||||
hip_vers=lst[2]
|
||||
if 'Compute Unit' in line:
|
||||
lst=line.split()
|
||||
compute_units=lst[2]
|
||||
if 'InstalledDir' in line:
|
||||
lst=line.split()
|
||||
rocm_vers=lst[1][lst[1].find('/opt/rocm-')+len('/opt/rocm-'):lst[1].rfind('/llvm/bin')]
|
||||
print("Branch name:",branch_name)
|
||||
print("Node name:",node_id)
|
||||
print("GPU_arch:",gpu_arch)
|
||||
print("Compute units:",compute_units)
|
||||
print("ROCM_version:",rocm_vers)
|
||||
print("HIP_version:",hip_vers)
|
||||
|
||||
@@ -188,8 +195,8 @@ def main():
|
||||
testlist=[]
|
||||
for i in range(1,len(tests)+1):
|
||||
testlist.append("Test%i"%i)
|
||||
ck_gemm_tflops=[str(branch_name),str(node_id),str(gpu_arch),str(rocm_vers),str(hip_vers),str(datetime.datetime.now())]
|
||||
flops=pd.DataFrame(data=[ck_gemm_tflops],columns=['Branch_ID','Node_ID','GPU_arch','ROCM_version','HIP_version','Datetime'])
|
||||
ck_gemm_tflops=[str(branch_name),str(node_id),str(gpu_arch),compute_units,str(rocm_vers),str(hip_vers),str(datetime.datetime.now())]
|
||||
flops=pd.DataFrame(data=[ck_gemm_tflops],columns=['Branch_ID','Node_ID','GPU_arch','Compute Units','ROCM_version','HIP_version','Datetime'])
|
||||
df_add=pd.DataFrame(data=[sorted_tflops],columns=testlist)
|
||||
flops=pd.concat([flops,df_add],axis=1)
|
||||
print("new tflops for gemm tests:",flops)
|
||||
@@ -207,8 +214,8 @@ def main():
|
||||
testlist=[]
|
||||
for i in range(1,50):
|
||||
testlist.append("Layer%i"%i)
|
||||
ck_resnet_tflops=[str(branch_name),str(node_id),str(gpu_arch),str(rocm_vers),str(hip_vers),str(datetime.datetime.now())]
|
||||
flops0=pd.DataFrame(data=[ck_resnet_tflops],columns=['Branch_ID','Node_ID','GPU_arch','ROCM_version','HIP_version','Datetime'])
|
||||
ck_resnet_tflops=[str(branch_name),str(node_id),str(gpu_arch),compute_units,str(rocm_vers),str(hip_vers),str(datetime.datetime.now())]
|
||||
flops0=pd.DataFrame(data=[ck_resnet_tflops],columns=['Branch_ID','Node_ID','GPU_arch','Compute Units','ROCM_version','HIP_version','Datetime'])
|
||||
df_add=pd.DataFrame(data=[tflops[0:49]],columns=testlist)
|
||||
flops=pd.concat([flops0,df_add],axis=1)
|
||||
print("new tflops for N=256 resnet50 test:",flops)
|
||||
|
||||
58
script/run_performance_tests.sh
Normal file
58
script/run_performance_tests.sh
Normal file
@@ -0,0 +1,58 @@
|
||||
#!/bin/bash
#
# Stand-alone driver for the ckProfiler performance tests.
#
# Prerequisites:
#   * the ckProfiler executable must already be built in ../build/bin/
#   * the following python packages must be installed in your environment:
#       pip3 install --upgrade pip
#       pip3 install sqlalchemy
#       pip3 install pymysql
#       pip3 install pandas
#       pip3 install sshtunnel
#   * some environment variables have to be set up before the new test results
#     can be posted to the database and compared to the baseline;
#     please contact Illia.Silin@amd.com for more details
#

# Start a fresh log file (path given as $1) and record the platform details
# that parse_perf_data.py is later handed together with the profiler output.
write_log_header() {
    local log_file="$1"

    rm -f "$log_file"
    # The first write truncates the log; everything after it appends.
    git status | grep -e 'On branch' > "$log_file"
    echo -n 'Node name: ' >> "$log_file"
    hostname >> "$log_file"
    # GPU_arch and the number of compute units are both taken from rocminfo
    echo -n "GPU_arch: " >> "$log_file"
    rocminfo | grep "Name:" | grep "gfx" >> "$log_file"
    rocminfo | grep "Compute Unit:" >> "$log_file"
    hipcc --version | grep -e 'HIP version' >> "$log_file"
    /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> "$log_file"
}

# ---- gemm tests ----
export gemm_log="perf_gemm.log"
write_log_header "$gemm_log"

# Sweep the first two numeric profile_gemm.sh arguments over 0..3 each.
# The second argument is the outer loop, so the runs happen in the order
# "0 0", "1 0", "2 0", "3 0", "0 1", ... "3 3".
# NOTE(review): the meaning of these two arguments is defined by
# profile_gemm.sh, which is not visible here - confirm before renaming.
for second_arg in 0 1 2 3; do
    for first_arg in 0 1 2 3; do
        ./profile_gemm.sh gemm "$first_arg" "$second_arg" 0 1 0 5 | tee -a "$gemm_log"
    done
done

python3 parse_perf_data.py "$gemm_log"

# ---- resnet50 tests ----
export resnet_log="perf_resnet50.log"
write_log_header "$resnet_log"

# Run with N=256 first and then with N=4; both runs land in the same log.
for batch_size in 256 4; do
    ./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 "$batch_size" | tee -a "$resnet_log"
done

# the script will put the results from N=256 and N=4 runs into separate tables
python3 parse_perf_data.py "$resnet_log"
|
||||
Reference in New Issue
Block a user