diff --git a/Dockerfile b/Dockerfile index 9a443e01de..79c961144a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -35,7 +35,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- llvm-amdgpu \ pkg-config \ python \ - python3 \ + python3.8 \ python-dev \ python3-dev \ python-pip \ @@ -72,6 +72,13 @@ ARG PREFIX=/opt/rocm RUN cget install pfultz2/rocm-recipes # Install rbuild RUN pip3 install https://github.com/RadeonOpenCompute/rbuild/archive/6d78a0553babdaea8d2da5de15cbda7e869594b8.tar.gz +# Install packages for processing the performance results +RUN pip3 install --upgrade pip +RUN pip3 install sqlalchemy +RUN pip3 install pymysql +RUN pip3 install pandas +RUN pip3 install setuptools-rust +RUN pip3 install sshtunnel # Setup ubsan environment to printstacktrace ENV UBSAN_OPTIONS=print_stacktrace=1 diff --git a/Jenkinsfile b/Jenkinsfile index 77f4d9d8be..b912062e64 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -213,15 +213,29 @@ def runCKProfiler(Map conf=[:]){ cmake_build(conf) dir("script"){ def perf_log = "perf_gemm_${gpu_arch}.log" - def artifact = "profile_gemm_${gpu_arch}.txt" - sh "./profile_gemm.sh gemm 0 0 0 1 0 5 | tee ${perf_log} ||true" - sh "./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a ${perf_log} ||true" - sh "./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a ${perf_log} ||true" - sh "./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a ${perf_log} || true" + sh "rm -f ${perf_log}" + sh "echo Branch name: ${env.BRANCH_NAME} > ${perf_log}" + sh "./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a ${perf_log}" + sh "./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a ${perf_log}" + sh "./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a ${perf_log}" + sh "./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a ${perf_log}" + sh "./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a ${perf_log}" + sh "./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a ${perf_log}" + sh "./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a ${perf_log}" + sh "./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a ${perf_log}" + sh "./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a ${perf_log}" + sh "./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a ${perf_log}" + sh "./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a ${perf_log}" + sh "./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a ${perf_log}" + sh "./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a ${perf_log}" + sh "./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a ${perf_log}" + sh "./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a ${perf_log}" + sh "./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a ${perf_log}" //results will be parsed, stored, and analyzed within the python script //the script will return 0 if the performance criteria are met //or return 1 if the criteria are not met - sh "python3 parse_perf_data.py ${perf_log} | tee ${artifact}" + archiveArtifacts "${perf_log}" + sh "python3 parse_perf_data.py ${perf_log} " } } } @@ -246,7 +260,6 @@ def runPerfTest(Map conf=[:]){ } } - pipeline { agent none options { @@ -280,19 +293,19 @@ pipeline { // buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release') // } //} - stage('Build Profiler: Debug, gfx908') - { - agent { label rocmnode("nogpu")} - environment{ - setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """ - } - steps{ - // until we stabilize debug build due to compiler crashes - catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') { - buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Debug') - } - } - } + //stage('Build Profiler: Debug, gfx908') + //{ + // agent { label rocmnode("nogpu")} + // environment{ + // setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """ + // } + // steps{ + // // until we stabilize debug build due to compiler crashes + // catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') { + // buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Debug') + // } + // } + //} stage('Clang Format') { agent{ label rocmnode("nogpu") } environment{ @@ -312,7 +325,7 @@ pipeline { } } } - stage("Tests") + stage("Tests") { parallel { @@ -367,15 +380,20 @@ pipeline { agent{ label rocmnode("gfx908")} environment{ setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """ - } + dbuser = "${dbuser}" + dbpassword = "${dbpassword}" + dbsship = "${dbsship}" + dbsshport = "${dbsshport}" + dbsshuser = "${dbsshuser}" + dbsshpassword = "${dbsshpassword}" + } steps{ runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release') } - } - } } + // enable after the cmake file supports packaging // stage("Packages") { // when { diff --git a/profiler/include/profile_gemm_impl.hpp b/profiler/include/profile_gemm_impl.hpp index 45e6174260..958d8426c2 100644 --- a/profiler/include/profile_gemm_impl.hpp +++ b/profiler/include/profile_gemm_impl.hpp @@ -1,5 +1,7 @@ #pragma once #include +#include +#include #include "check_err.hpp" #include "config.hpp" @@ -527,8 +529,45 @@ void profile_gemm_impl(int do_verification, } } - std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " - << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl; + if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = f32"; + } + else if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = f16"; + } + else if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = bf16"; + } + else if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = int8"; + } + + if constexpr(is_same::value) + { + std::cout << " ALayout = RowMajor"; + } + else if constexpr(is_same::value) + { + std::cout << " ALayout = ColumnMajor"; + } + + if constexpr(is_same::value) + { + std::cout << " BLayout = RowMajor"; + } + else if constexpr(is_same::value) + { + std::cout << " BLayout = ColumnMajor"; + } + + std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA + << " StrideB = " << StrideB << " StrideC = " << StrideC << " : " << best_ave_time + << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, " + << best_gemm_name << std::endl; } } // namespace profiler diff --git a/profiler/src/profile_batched_gemm.cpp b/profiler/src/profile_batched_gemm.cpp index db5486e0ac..fbdc07c3da 100644 --- a/profiler/src/profile_batched_gemm.cpp +++ b/profiler/src/profile_batched_gemm.cpp @@ -396,5 +396,5 @@ int profile_batched_gemm(int argc, char* argv[]) throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented"); } - return 1; + return 0; } diff --git a/profiler/src/profile_batched_gemm_reduce.cpp b/profiler/src/profile_batched_gemm_reduce.cpp index f67e561865..594fc6bedb 100644 --- a/profiler/src/profile_batched_gemm_reduce.cpp +++ b/profiler/src/profile_batched_gemm_reduce.cpp @@ -149,5 +149,5 @@ int profile_batched_gemm_reduce(int argc, char* argv[]) throw std::runtime_error("wrong! this data_type & layout is not implemented"); } - return 1; + return 0; } diff --git a/profiler/src/profile_conv_bwd_weight.cpp b/profiler/src/profile_conv_bwd_weight.cpp index c022d19ee0..80413322b3 100644 --- a/profiler/src/profile_conv_bwd_weight.cpp +++ b/profiler/src/profile_conv_bwd_weight.cpp @@ -142,5 +142,5 @@ int profile_conv_bwd_weight(int argc, char* argv[]) throw std::runtime_error("wrong! this Conv data_type & layout is not implemented"); } - return 1; + return 0; } diff --git a/profiler/src/profile_conv_fwd_bias_relu.cpp b/profiler/src/profile_conv_fwd_bias_relu.cpp index 28aa49687f..ca7dc1935a 100644 --- a/profiler/src/profile_conv_fwd_bias_relu.cpp +++ b/profiler/src/profile_conv_fwd_bias_relu.cpp @@ -110,5 +110,5 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[]) throw std::runtime_error("wrong! data_type & layout for this operator is not implemented"); } - return 1; + return 0; } diff --git a/profiler/src/profile_conv_fwd_bias_relu_add.cpp b/profiler/src/profile_conv_fwd_bias_relu_add.cpp index 7e033a51e2..5d75f5a294 100644 --- a/profiler/src/profile_conv_fwd_bias_relu_add.cpp +++ b/profiler/src/profile_conv_fwd_bias_relu_add.cpp @@ -111,5 +111,5 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[]) throw std::runtime_error("wrong! data_type & layout for this operator is not implemented"); } - return 1; + return 0; } diff --git a/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp b/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp index 095536f701..96d3b10ddf 100644 --- a/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp +++ b/profiler/src/profile_conv_fwd_bias_relu_atomic_add.cpp @@ -112,5 +112,5 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[]) throw std::runtime_error("wrong! data_type & layout for this operator is not implemented"); } - return 1; + return 0; } diff --git a/profiler/src/profile_convnd_fwd.cpp b/profiler/src/profile_convnd_fwd.cpp index 722e86c2ea..87778a04a5 100644 --- a/profiler/src/profile_convnd_fwd.cpp +++ b/profiler/src/profile_convnd_fwd.cpp @@ -347,5 +347,5 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[]) std::to_string(num_dim_spatial)); } - return 1; + return 0; } diff --git a/profiler/src/profile_gemm.cpp b/profiler/src/profile_gemm.cpp index 4c6a3b0487..55bc98f4b1 100644 --- a/profiler/src/profile_gemm.cpp +++ b/profiler/src/profile_gemm.cpp @@ -388,5 +388,5 @@ int profile_gemm(int argc, char* argv[]) throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented"); } - return 1; + return 0; } diff --git a/profiler/src/profile_gemm_bias_2d.cpp b/profiler/src/profile_gemm_bias_2d.cpp index 46d4f90c17..51dba85f32 100644 --- a/profiler/src/profile_gemm_bias_2d.cpp +++ b/profiler/src/profile_gemm_bias_2d.cpp @@ -252,5 +252,5 @@ int profile_gemm_bias_2d(int argc, char* argv[]) throw std::runtime_error("wrong! this data_type & layout is not implemented"); } - return 1; + return 0; } diff --git a/profiler/src/profile_gemm_bias_relu.cpp b/profiler/src/profile_gemm_bias_relu.cpp index 4346650c9f..bf035d9ad9 100644 --- a/profiler/src/profile_gemm_bias_relu.cpp +++ b/profiler/src/profile_gemm_bias_relu.cpp @@ -139,5 +139,5 @@ int profile_gemm_bias_relu(int argc, char* argv[]) throw std::runtime_error("wrong! this data_type & layout is not implemented"); } - return 1; + return 0; } diff --git a/profiler/src/profile_gemm_bias_relu_add.cpp b/profiler/src/profile_gemm_bias_relu_add.cpp index 186f32cf6f..9c324f6cf9 100644 --- a/profiler/src/profile_gemm_bias_relu_add.cpp +++ b/profiler/src/profile_gemm_bias_relu_add.cpp @@ -144,5 +144,5 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) throw std::runtime_error("wrong! this data_type & layout is not implemented"); } - return 1; + return 0; } diff --git a/profiler/src/profile_gemm_reduce.cpp b/profiler/src/profile_gemm_reduce.cpp index 986acaf010..a23967acd7 100644 --- a/profiler/src/profile_gemm_reduce.cpp +++ b/profiler/src/profile_gemm_reduce.cpp @@ -142,5 +142,5 @@ int profile_gemm_reduce(int argc, char* argv[]) throw std::runtime_error("wrong! this data_type & layout is not implemented"); } - return 1; + return 0; } diff --git a/profiler/src/profile_grouped_gemm.cpp b/profiler/src/profile_grouped_gemm.cpp index d35484cfae..c3774962cc 100644 --- a/profiler/src/profile_grouped_gemm.cpp +++ b/profiler/src/profile_grouped_gemm.cpp @@ -153,5 +153,5 @@ int profile_grouped_gemm(int argc, char* argv[]) throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented"); } - return 1; + return 0; } diff --git a/profiler/src/profiler.cpp b/profiler/src/profiler.cpp index 2a8078ca5f..35b0f68628 100644 --- a/profiler/src/profiler.cpp +++ b/profiler/src/profiler.cpp @@ -25,7 +25,8 @@ int main(int argc, char* argv[]) { if(strcmp(argv[1], "gemm") == 0) { - return profile_gemm(argc, argv); + int stat = profile_gemm(argc, argv); + return stat; } else if(strcmp(argv[1], "gemm_bias_2d") == 0) { diff --git a/script/parse_perf_data.py b/script/parse_perf_data.py index 3e41f8c4cf..a023a19526 100644 --- a/script/parse_perf_data.py +++ b/script/parse_perf_data.py @@ -1,53 +1,193 @@ -#!/usr/bin/env python3 -import os, io -import argparse - -def print_to_string(*args, **kwargs): - output = io.StringIO() - print(*args, file=output, **kwargs) - contents = output.getvalue() - output.close() - return contents - -def parse_args(): - parser = argparse.ArgumentParser(description='Parse results from tf benchmark runs') - parser.add_argument('filename', type=str, help='Log file to prase or directory containing log files') - args = parser.parse_args() - files = [] - if os.path.isdir(args.filename): - all_files = os.listdir(args.filename) - for name in all_files: - if not 'log' in name: - continue - files.append(os.path.join(args.filename, name)) - else: - files = [args.filename] - args.files = files - return args - -def main(): - args = parse_args() - results = [] - #parse results - glue="" - for filename in args.files: - for line in open(filename): - if 'Best Perf' in line: - lst=line.split() - results.append(print_to_string(glue.join(lst[8:]),lst[4])) - - #sort results - - #read baseline results for the latest develop branch - - #write new results to the db - - #compare the results to the baseline - - #return 0 if performance criteria met, otherwise return 1 - - print(results) - return 0 - -if __name__ == '__main__': +#!/usr/bin/env python3 +import os, io, argparse, datetime +import numpy as np +import sqlalchemy +from sqlalchemy.types import NVARCHAR, Float, Integer +import pymysql +import pandas as pd +from sshtunnel import SSHTunnelForwarder + +def print_to_string(*args, **kwargs): + output = io.StringIO() + print(*args, file=output, **kwargs) + contents = output.getvalue() + output.close() + return contents + +def parse_args(): + parser = argparse.ArgumentParser(description='Parse results from tf benchmark runs') + parser.add_argument('filename', type=str, help='Log file to prase or directory containing log files') + args = parser.parse_args() + files = [] + if os.path.isdir(args.filename): + all_files = os.listdir(args.filename) + for name in all_files: + if not 'log' in name: + continue + files.append(os.path.join(args.filename, name)) + else: + files = [args.filename] + args.files = files + return args + +def main(): + args = parse_args() + tests = [] + kernels=[] + tflops=[] + dtype=[] + alayout=[] + blayout=[] + M=[] + N=[] + K=[] + StrideA=[] + StrideB=[] + StrideC=[] + #parse results, get the Tflops value for "Best Perf" kernels + glue="" + for filename in args.files: + for line in open(filename): + if 'Branch name' in line: + lst=line.split() + branch_name=lst[2] + for filename in args.files: + for line in open(filename): + if 'Best Perf' in line: + lst=line.split() + if len(lst)>=37: #the line is complete + tests.append(glue.join(lst[5:30])) + kernels.append(glue.join(lst[37:])) + tflops.append(lst[33]) + dtype.append(lst[5]) + alayout.append(lst[8]) + blayout.append(lst[11]) + M.append(lst[14]) + N.append(lst[17]) + K.append(lst[20]) + StrideA.append(lst[23]) + StrideB.append(lst[26]) + StrideC.append(lst[29]) + elif len(lst)<37 and len(lst)>=33: #the tflops are available + tests.append(glue.join(lst[5:30])) + kernels.append("N/A") + tflops.append(lst[33]) + dtype.append(lst[5]) + alayout.append(lst[8]) + blayout.append(lst[11]) + M.append(lst[14]) + N.append(lst[17]) + K.append(lst[20]) + StrideA.append(lst[23]) + StrideB.append(lst[26]) + StrideC.append(lst[29]) + print("warning: incomplete line:",lst) + elif len(lst)<33: #even the tflops are not available + print("Error in ckProfiler output!") + print("warning: incomplete line=",lst) + + #sort results + print("Number of tests:",len(tests)) + print("Branch name:",branch_name) + #sorted_tests = sorted(tests) + #print("sorted tests:",sorted_tests) + sorted_tflops = [x for _,x in sorted(zip(tests,tflops))] + #sorted_kernels = [x for _,x in sorted(zip(tests,kernels))] + test_list=list(range(1,len(tests)+1)) + + sql_hostname = '127.0.0.1' + sql_username = os.environ["dbuser"] + print("sql_username=",sql_username) + sql_password = os.environ["dbpassword"] + sql_main_database = 'miopen_perf' + sql_port = 3306 + ssh_host = os.environ["dbsship"] + print("ssh_host=",ssh_host) + ssh_user = os.environ["dbsshuser"] + print("ssh_user=",ssh_user) + ssh_port = int(os.environ["dbsshport"]) + ssh_pass = os.environ["dbsshpassword"] + + with SSHTunnelForwarder( + (ssh_host, ssh_port), + ssh_username=ssh_user, + ssh_password=ssh_pass, + remote_bind_address=(sql_hostname, sql_port)) as tunnel: + + sqlEngine = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}:{3}/{4}'. + format(sql_username, sql_password, sql_hostname, tunnel.local_bind_port, sql_main_database)) + conn = sqlEngine.connect() + + #write the ck_gemm_test_params table + #only needed once the test set changes + ''' + sorted_dtypes = [x for _,x in sorted(zip(tests,dtype))] + sorted_alayout = [x for _,x in sorted(zip(tests,alayout))] + sorted_blayout = [x for _,x in sorted(zip(tests,blayout))] + sorted_M = [x for _,x in sorted(zip(tests,M))] + sorted_N = [x for _,x in sorted(zip(tests,N))] + sorted_K = [x for _,x in sorted(zip(tests,K))] + sorted_StrideA = [x for _,x in sorted(zip(tests,StrideA))] + sorted_StrideB = [x for _,x in sorted(zip(tests,StrideB))] + sorted_StrideC = [x for _,x in sorted(zip(tests,StrideC))] + ck_gemm_params=[test_list,sorted_dtypes,sorted_alayout,sorted_blayout, + sorted_M,sorted_N,sorted_K,sorted_StrideA,sorted_StrideB, + sorted_StrideC] + df=pd.DataFrame(np.transpose(ck_gemm_params),columns=['Test_number','Data_type', + 'Alayout','BLayout','M','N','K', 'StrideA','StrideB','StrideC']) + print(df) + + dtypes = { + 'Test_number': Integer(), + 'Data_type': NVARCHAR(length=5), + 'Alayout': NVARCHAR(length=12), + 'Blayout': NVARCHAR(length=12), + 'M': Integer(), + 'N': Integer(), + 'K': Integer(), + 'StrideA': Integer(), + 'StrideB': Integer(), + 'StrideC': Integer() + } + df.to_sql("ck_gemm_test_params",conn,if_exists='replace',index=False, dtype=dtypes) + ''' + + #read baseline results for the latest develop branch + query = '''SELECT * from ck_gemm_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_gemm_tflops where Branch_ID='develop' );''' + tflops_base = pd.read_sql_query(query, conn) + + #write new results to the db + testlist=[] + for i in range(1,len(tests)+1): + testlist.append("Test%i"%i) + ck_gemm_tflops=[str(branch_name),str(datetime.datetime.now())] + flops=pd.DataFrame(data=[ck_gemm_tflops],columns=['Branch_ID','Datetime']) + df_add=pd.DataFrame(data=[sorted_tflops],columns=testlist) + flops=pd.concat([flops,df_add],axis=1) + print("new tflops results:",flops) + flops.to_sql("ck_gemm_tflops",conn,if_exists='append',index=False) + conn.close() + + #compare the results to the baseline + regression=0 + base=tflops_base[testlist].to_numpy(dtype='float') + base_list=base[0] + ave_perf=0 + for i in range(len(base_list)): + # success criterion: + if base_list[i]>1.01*float(sorted_tflops[i]): + print("test # ",i,"shows regression by {:.3f}%".format( + (float(sorted_tflops[i])-base_list[i])/base_list[i]*100)) + regression=1 + ave_perf=ave_perf+float(sorted_tflops[i])/base_list[i] + if regression==0: + print("no regressions found") + ave_perf=ave_perf/len(base_list) + print("average performance relative to baseline:",ave_perf) + + #return 0 if performance criteria met, otherwise return 1 + + return regression + +if __name__ == '__main__': main() \ No newline at end of file