From f273d0a6992d7e7031b3ded3a10128de218c198d Mon Sep 17 00:00:00 2001 From: Thomas Ning Date: Thu, 9 Jan 2025 17:41:49 -0800 Subject: [PATCH] Ck tile/gemm perf measure (#1750) * Finished adding the performance benchmark for ck tile gemm * Fix the executable rename problem * fix the executable name error * delete the unsupported layout combinations * Update run_full_test.sh * Update benchmark_mem_pipeline.sh * Update benchmark_basic.sh * change the executable of gemm_universal * change ck_tile_gemm script permissions * Addressed the comment * Addressed the comment * Fixed the comments * Fixed Comment * roll back the malfunctioned change * Fix the Typo * finalize the tile_gemm_fp16 performance monitoring * fix the stash names for ck_tile gemm logs * change the stashing logic * change stashing syntax --------- Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com> Co-authored-by: illsilin [ROCm/composable_kernel commit: 73a076eee1cdc035de176f6061f4f1f5bfc1bd02] --- Jenkinsfile | 47 ++++++++++++++++--- example/ck_tile/03_gemm/CMakeLists.txt | 2 +- example/ck_tile/03_gemm/README.md | 7 ++- .../ck_tile/03_gemm/script/benchmark_basic.sh | 13 +++++ .../03_gemm/script/benchmark_mem_pipeline.sh | 13 +++++ .../ck_tile/03_gemm/script/run_full_test.sh | 26 ++++++++-- .../{smoke_test.sh => smoke_test_basic.sh} | 2 +- .../03_gemm/script/smoke_test_mem_pipeline.sh | 35 ++++++++++++++ script/process_perf_data.py | 14 ++++++ script/process_perf_data.sh | 16 +++++++ script/process_qa_data.sh | 16 +++++++ 11 files changed, 178 insertions(+), 13 deletions(-) create mode 100755 example/ck_tile/03_gemm/script/benchmark_basic.sh create mode 100755 example/ck_tile/03_gemm/script/benchmark_mem_pipeline.sh rename example/ck_tile/03_gemm/script/{smoke_test.sh => smoke_test_basic.sh} (99%) create mode 100755 example/ck_tile/03_gemm/script/smoke_test_mem_pipeline.sh diff --git a/Jenkinsfile b/Jenkinsfile index 87c9457fcb..8dc5899d3b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile 
@@ -326,12 +326,38 @@ def cmake_build(Map conf=[:]){ if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "amd-master")) { archiveArtifacts artifacts: "build/*.deb", allowEmptyArchive: true, fingerprint: true } + //check the node gpu architecture + def arch_type = 0 + sh 'rocminfo | tee rocminfo.log' + if ( runShell('grep -n "gfx90a" rocminfo.log') ){ + arch_type = 1 + } + else if ( runShell('grep -n "gfx942" rocminfo.log') ) { + arch_type = 2 + } if (params.RUN_CK_TILE_FMHA_TESTS){ try{ - archiveArtifacts "perf_fmha_fwd_*.log" - archiveArtifacts "perf_fmha_bwd_*.log" - stash includes: "perf_fmha_**_gfx942.log", name: "perf_fmha_log_gfx942" - stash includes: "perf_fmha_**_gfx90a.log", name: "perf_fmha_log_gfx90a" + archiveArtifacts "perf_fmha_*.log" + if (arch_type == 1){ + stash includes: "perf_fmha_**_gfx90a.log", name: "perf_fmha_log_gfx90a" + } + else if (arch_type == 2){ + stash includes: "perf_fmha_**_gfx942.log", name: "perf_fmha_log_gfx942" + } + } + catch(Exception err){ + echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing." + } + } + if (params.RUN_CK_TILE_GEMM_TESTS){ + try{ + archiveArtifacts "perf_tile_gemm_*.log" + if (arch_type == 1){ + stash includes: "perf_tile_gemm_**_fp16_gfx90a.log", name: "perf_tile_gemm_log_gfx90a" + } + else if (arch_type == 2){ + stash includes: "perf_tile_gemm_**_fp16_gfx942.log", name: "perf_tile_gemm_log_gfx942" + } } catch(Exception err){ echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing." @@ -630,6 +656,15 @@ def process_results(Map conf=[:]){ echo "could not locate the FMHA performance logs: ${err.getMessage()}." } } + if (params.RUN_CK_TILE_GEMM_TESTS){ + try{ + unstash "perf_tile_gemm_log_gfx942" + unstash "perf_tile_gemm_log_gfx90a" + } + catch(Exception err){ + echo "could not locate the GEMM performance logs: ${err.getMessage()}." 
+ } + } if (params.RUN_FULL_QA){ // unstash perf files to master unstash "ckprofiler_0.2.0_amd64.deb" @@ -956,7 +991,7 @@ pipeline { environment{ setup_args = "NO_CK_BUILD" execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \ - make -j64 tile_example_gemm_basic && \ + make -j64 tile_example_gemm_basic tile_example_gemm_universal && \ cd ../ && example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """ } @@ -975,7 +1010,7 @@ pipeline { environment{ setup_args = "NO_CK_BUILD" execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \ - make -j64 tile_example_gemm_basic && \ + make -j64 tile_example_gemm_basic tile_example_gemm_universal && \ cd ../ && example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """ } diff --git a/example/ck_tile/03_gemm/CMakeLists.txt b/example/ck_tile/03_gemm/CMakeLists.txt index d166eed458..bc3799f015 100644 --- a/example/ck_tile/03_gemm/CMakeLists.txt +++ b/example/ck_tile/03_gemm/CMakeLists.txt @@ -1,2 +1,2 @@ add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp) -add_executable(tile_example_universal_gemm EXCLUDE_FROM_ALL universal_gemm.cpp) +add_executable(tile_example_gemm_universal EXCLUDE_FROM_ALL universal_gemm.cpp) diff --git a/example/ck_tile/03_gemm/README.md b/example/ck_tile/03_gemm/README.md index e9ffe72a91..4c16f13cef 100644 --- a/example/ck_tile/03_gemm/README.md +++ b/example/ck_tile/03_gemm/README.md @@ -11,9 +11,9 @@ sh ../script/cmake-ck-dev.sh ../ # The basic pipeline method on the gemm calculation make tile_example_gemm_basic -j # The memory bound pipeline on the gemm calculation -make tile_example_gemm_mem_pipeline -j +make tile_example_gemm_universal -j ``` -This will result in an executable `build/bin/tile_example_gemm_basic` +This will result in an executable `build/bin/tile_example_gemm_basic` & `build/bin/tile_example_gemm_universal` ## example ``` 
@@ -22,6 +22,9 @@ args: -m m dimension (default:1024) -n n dimension (default:2048) -k k dimension (default:64) + -a_layout Tensor A data layout (default: R) + -b_layout Tensor B data layout (default: R) + -c_layout Tensor C data layout (default: R) -stride_a Tensor A stride (default:0) -stride_b Tensor B stride (default:0) -stride_c Tensor C stride (default:0) diff --git a/example/ck_tile/03_gemm/script/benchmark_basic.sh b/example/ck_tile/03_gemm/script/benchmark_basic.sh new file mode 100755 index 0000000000..f5473e46f4 --- /dev/null +++ b/example/ck_tile/03_gemm/script/benchmark_basic.sh @@ -0,0 +1,13 @@ +#!/bin/sh +EXE="$(find . -name tile_example_gemm_basic -type f | head -n 1)" +VALID=0 + +for b_matrix_layout in "R" "C"; do + for m in "64" "512" "1024" "2048"; do + for n in "512" "1024" "2048"; do + for k in "64" "512" "1024" "2048"; do + $EXE -prec=fp16 -b=1 -m=$m -n=$n -k=$k -a_layout="R" -b_layout="$b_matrix_layout" -c_layout="R" -v=$VALID + done + done + done +done diff --git a/example/ck_tile/03_gemm/script/benchmark_mem_pipeline.sh b/example/ck_tile/03_gemm/script/benchmark_mem_pipeline.sh new file mode 100755 index 0000000000..a3029cbeb5 --- /dev/null +++ b/example/ck_tile/03_gemm/script/benchmark_mem_pipeline.sh @@ -0,0 +1,13 @@ +#!/bin/sh +EXE="$(find . 
-name tile_example_gemm_universal -type f | head -n 1)" +VALID=0 + +for b_matrix_layout in "R" "C"; do + for m in "64" "512" "1024" "2048"; do + for n in "512" "1024" "2048"; do + for k in "64" "512" "1024" "2048"; do + $EXE -prec=fp16 -b=1 -m=$m -n=$n -k=$k -a_layout="R" -b_layout="$b_matrix_layout" -c_layout="R" -v=$VALID + done + done + done +done diff --git a/example/ck_tile/03_gemm/script/run_full_test.sh b/example/ck_tile/03_gemm/script/run_full_test.sh index 2e2e7fdf90..45bd1bed61 100755 --- a/example/ck_tile/03_gemm/script/run_full_test.sh +++ b/example/ck_tile/03_gemm/script/run_full_test.sh @@ -19,7 +19,27 @@ echo 'Host name: ' $host_name export GPU_arch=$4 echo 'GPU_arch: ' $GPU_arch -# run verification tests -example/ck_tile/03_gemm/script/smoke_test.sh +function print_log_header(){ + rm -f $1; + echo 'On branch ' $3 &> $1; + echo 'Node name: ' $4 >> $1; + # get GPU architecture and compute units from rocminfo + echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1; + rocminfo | grep "Compute Unit:" >> $1; + hipcc --version | grep -e 'HIP version' >> $1; + echo 'Environment type: ' $2 >> $1; + /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1; +} -# We do not have a performance benchmark for gemm yet. Will add it in the future. 
\ No newline at end of file +# run verification tests +example/ck_tile/03_gemm/script/smoke_test_basic.sh +example/ck_tile/03_gemm/script/smoke_test_mem_pipeline.sh + +# run performance benchmarks +export gemm_basic_log="perf_tile_gemm_basic_fp16_$GPU_arch.log" +print_log_header $gemm_basic_log $env_type $branch $host_name +example/ck_tile/03_gemm/script/benchmark_basic.sh 2>&1 | tee -a $gemm_basic_log + +export gemm_mem_pipeline_log="perf_tile_gemm_mem_pipeline_fp16_$GPU_arch.log" +print_log_header $gemm_mem_pipeline_log $env_type $branch $host_name +example/ck_tile/03_gemm/script/benchmark_mem_pipeline.sh 2>&1 | tee -a $gemm_mem_pipeline_log diff --git a/example/ck_tile/03_gemm/script/smoke_test.sh b/example/ck_tile/03_gemm/script/smoke_test_basic.sh similarity index 99% rename from example/ck_tile/03_gemm/script/smoke_test.sh rename to example/ck_tile/03_gemm/script/smoke_test_basic.sh index 4d9a64bf40..8eb4e101a0 100755 --- a/example/ck_tile/03_gemm/script/smoke_test.sh +++ b/example/ck_tile/03_gemm/script/smoke_test_basic.sh @@ -32,4 +32,4 @@ set -x run_fp16_tests -set +x \ No newline at end of file +set +x diff --git a/example/ck_tile/03_gemm/script/smoke_test_mem_pipeline.sh b/example/ck_tile/03_gemm/script/smoke_test_mem_pipeline.sh new file mode 100755 index 0000000000..a9c7f48da0 --- /dev/null +++ b/example/ck_tile/03_gemm/script/smoke_test_mem_pipeline.sh @@ -0,0 +1,35 @@ +#!/bin/bash +EXE="$(find . -name tile_example_gemm_universal -type f | head -n 1)" +KNAME=1 + +export CK_WARMUP=0 +export CK_REPEAT=1 + +COMMON_ARGS='-v=2 -warmup=0 -repeat=1' + +run_fp16_tests() { + for batch in 1 2; do + for m in 128 1024; do + for n in 128 2048; do + for k in 32 64; do + + $EXE -b=$batch -m=$m -n=$n -k=$k -stride_a=0 -stride_b=0 -stride_c=0 -e=1e-5 -prec=fp16 $COMMON_ARGS + if [ $? -eq 0 ]; then + echo "Success: Test with batch=$batch, m=$m, n=$n, k=$k executed successfully." + else + echo "Error: Test with batch=$batch, m=$m, n=$n, k=$k failed to execute properly." 
+ # Optionally, exit or break if you need to halt further execution + # exit 1 + fi + + done + done + done + done +} + +set -x + +run_fp16_tests + +set +x diff --git a/script/process_perf_data.py b/script/process_perf_data.py index 32e2e15d7a..0d56c9baa2 100644 --- a/script/process_perf_data.py +++ b/script/process_perf_data.py @@ -149,6 +149,12 @@ def parse_logfile(logfile): lst=line.split() line_dict=dict(zip(lst[1:],lst)) res.append(line_dict['TFlops,']) + elif 'perf_tile_gemm_basic' in logfile or 'perf_tile_gemm_mem_pipeline' in logfile: + for line in open(logfile): + if 'TFlops' in line: + lst=line.split() + line_dict=dict(zip(lst[1:],lst)) + res.append(line_dict['TFlops,']) return res @@ -330,6 +336,14 @@ def main(): for i in range(1,len(results)+1): testlist.append("Test%i"%i) table_name="ck_fmha_bwd_tflops" + if 'gemm_basic_fp16' in filename: + for i in range(1, len(results)+1): + testlist.append("Test%i"%i) + table_name="ck_tile_gemm_basic_fp16_tflops" + if 'gemm_mem_pipeline_fp16' in filename: + for i in range(1, len(results)+1): + testlist.append("Test%i"%i) + table_name="ck_tile_gemm_mem_pipeline_fp16_tflops" tflops_base = get_baseline(table_name,conn) store_new_test_result(table_name, results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, sqlEngine) diff --git a/script/process_perf_data.sh b/script/process_perf_data.sh index ae93463204..815cf41e2d 100755 --- a/script/process_perf_data.sh +++ b/script/process_perf_data.sh @@ -43,3 +43,19 @@ file=./perf_fmha_bwd_gfx90a.log if [ -e "$file" ]; then python3 process_perf_data.py perf_fmha_bwd_gfx90a.log fi +file=./perf_tile_gemm_basic_fp16_gfx942.log +if [ -e "$file" ]; then + python3 process_perf_data.py perf_tile_gemm_basic_fp16_gfx942.log +fi +file=./perf_tile_gemm_basic_fp16_gfx90a.log +if [ -e "$file" ]; then + python3 process_perf_data.py perf_tile_gemm_basic_fp16_gfx90a.log +fi +file=./perf_tile_gemm_mem_pipeline_fp16_gfx942.log +if [ -e "$file" ]; then + 
python3 process_perf_data.py perf_tile_gemm_mem_pipeline_fp16_gfx942.log +fi +file=./perf_tile_gemm_mem_pipeline_fp16_gfx90a.log +if [ -e "$file" ]; then + python3 process_perf_data.py perf_tile_gemm_mem_pipeline_fp16_gfx90a.log +fi diff --git a/script/process_qa_data.sh b/script/process_qa_data.sh index fb8fe01c6e..c5bc1b9a1a 100755 --- a/script/process_qa_data.sh +++ b/script/process_qa_data.sh @@ -52,3 +52,19 @@ file=./perf_fmha_bwd_gfx90a.log if [ -e "$file" ]; then python3 process_perf_data.py perf_fmha_bwd_gfx90a.log fi +file=./perf_tile_gemm_basic_fp16_gfx942.log +if [ -e "$file" ]; then + python3 process_perf_data.py perf_tile_gemm_basic_fp16_gfx942.log +fi +file=./perf_tile_gemm_basic_fp16_gfx90a.log +if [ -e "$file" ]; then + python3 process_perf_data.py perf_tile_gemm_basic_fp16_gfx90a.log +fi +file=./perf_tile_gemm_mem_pipeline_fp16_gfx942.log +if [ -e "$file" ]; then + python3 process_perf_data.py perf_tile_gemm_mem_pipeline_fp16_gfx942.log +fi +file=./perf_tile_gemm_mem_pipeline_fp16_gfx90a.log +if [ -e "$file" ]; then + python3 process_perf_data.py perf_tile_gemm_mem_pipeline_fp16_gfx90a.log +fi