diff --git a/CMakeLists.txt b/CMakeLists.txt
index ae22691ab5..c00db26f3a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -189,7 +189,10 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090)
     message("Adding the enable-post-misched=0 compiler flag")
     add_compile_options("SHELL: -mllvm -enable-post-misched=0")
 endif()
-if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600241132 AND ${hip_VERSION_FLAT} LESS 600300000)
+# only add the coerce flag when the compiler in use actually accepts it
+include(CheckCXXCompilerFlag)
+check_cxx_compiler_flag(" -mllvm -amdgpu-coerce-illegal-types=1" CK_HAS_AMDGPU_COERCE_ILLEGAL_TYPES)
+if(NOT WIN32 AND CK_HAS_AMDGPU_COERCE_ILLEGAL_TYPES AND ${hip_VERSION_FLAT} GREATER 600241132 AND ${hip_VERSION_FLAT} LESS 600300000)
     message("Adding the amdgpu-coerce-illegal-types=1")
     add_compile_options("SHELL: -mllvm -amdgpu-coerce-illegal-types=1")
 endif()
diff --git a/Dockerfile b/Dockerfile
index 1c67ab3090..04d0357f51 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,6 @@
 FROM ubuntu:20.04
 ARG DEBIAN_FRONTEND=noninteractive
-ARG ROCMVERSION=6.1
+ARG ROCMVERSION=6.2
 ARG compiler_version=""
 ARG compiler_commit=""
 ARG CK_SCCACHE=""
@@ -17,17 +17,13 @@ RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
 ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
 RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
 
-RUN if [ "$ROCMVERSION" != "6.2" ]; then \
-        sh -c "wget https://repo.radeon.com/amdgpu-install/6.1/ubuntu/focal/amdgpu-install_6.1.60100-1_all.deb --no-check-certificate" && \
-        apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.1.60100-1_all.deb && \
+# the amdgpu-install package is pinned to 6.2.60200, so its download directory is pinned to 6.2 as well
+RUN if [ "$ROCMVERSION" != "6.3" ]; then \
+        sh -c "wget https://repo.radeon.com/amdgpu-install/6.2/ubuntu/focal/amdgpu-install_6.2.60200-1_all.deb --no-check-certificate" && \
+        apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.2.60200-1_all.deb && \
         wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
         sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
         sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
-    elif [ "$ROCMVERSION" = "6.2" ] && [ "$compiler_version" = "rc4" ]; then \
-        sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.2-20.04-1_all.deb --no-check-certificate" && \
-        apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog libpopt0 rsync && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.2-20.04-1_all.deb && \
-        sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.2 rel-63 > /etc/apt/sources.list.d/rocm-build.list' && \
-        amdgpu-repo --amdgpu-build=2009461; \
     fi
 
 RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
@@ -64,6 +60,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
     python3-dev \
     python3-pip \
     redis \
+    rocm-llvm-dev \
     sshpass \
     stunnel \
     software-properties-common \
diff --git a/Jenkinsfile b/Jenkinsfile
index b6df09bf8a..1cbb0f5313 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -38,7 +38,7 @@ def getDockerImageName(){
         img = "${params.USE_CUSTOM_DOCKER}"
     }
     else{
-    if (params.ROCMVERSION != "6.2"){
+    if (params.ROCMVERSION != "6.3"){
         if (params.COMPILER_VERSION == "") {
             img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
         }
@@ -285,6 +285,19 @@ def cmake_build(Map conf=[:]){
     if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "amd-master")) {
         archiveArtifacts artifacts: "build/*.deb", allowEmptyArchive: true, fingerprint: true
     }
+    if (params.RUN_CK_TILE_TESTS){
+        try{
+            archiveArtifacts "perf_fmha_fwd_*.log"
+            archiveArtifacts "perf_fmha_bwd_*.log"
+            // stash each log by itself, and only if this node produced it: a stash without "includes" packs the entire workspace
+            for (fmha_log in ["perf_fmha_fwd_gfx942.log", "perf_fmha_bwd_gfx942.log", "perf_fmha_fwd_gfx90a.log", "perf_fmha_bwd_gfx90a.log"]){
+                if (fileExists(fmha_log)){ stash(name: fmha_log, includes: fmha_log) }
+            }
+        }
+        catch(Exception err){
+            echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing."
+        }
+    }
 }
 
 def buildHipClangJob(Map conf=[:]){
@@ -612,6 +625,17 @@ def process_results(Map conf=[:]){
         timeout(time: 1, unit: 'HOURS'){
             try{
                 dir("script"){
+                    if (params.RUN_CK_TILE_TESTS){
+                        // unstash the logs one by one so that a missing architecture does not hide the others
+                        for (fmha_log in ["perf_fmha_fwd_gfx942.log", "perf_fmha_bwd_gfx942.log", "perf_fmha_fwd_gfx90a.log", "perf_fmha_bwd_gfx90a.log"]){
+                            try{
+                                unstash fmha_log
+                            }
+                            catch(Exception err){
+                                echo "could not locate the FMHA performance logs: ${err.getMessage()}."
+                            }
+                        }
+                    }
                     if (params.RUN_FULL_QA){
                         // unstash perf files to master
                         unstash "ckprofiler_0.2.0_amd64.deb"
@@ -652,8 +676,8 @@ def process_results(Map conf=[:]){
 }
 
 //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
-CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.1; RUN_CK_TILE_TESTS=true
-                                              0 21 * * * % ROCMVERSION=6.1;hipTensor_test=true
+CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2; RUN_CK_TILE_TESTS=true
+                                              0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true
                                               0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false
                                               0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false
                                               0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_CODEGEN_TESTS=false;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false''' : ""
@@ -677,8 +701,8 @@ pipeline {
             description: 'If you want to use a custom docker image, please specify it here (default: leave blank).')
         string(
             name: 'ROCMVERSION',
-            defaultValue: '6.1',
-            description: 'Specify which ROCM version to use: 6.1 (default).')
+            defaultValue: '6.2',
+            description: 'Specify which ROCM version to use: 6.2 (default).')
         string(
             name: 'COMPILER_VERSION',
             defaultValue: '',
@@ -852,8 +876,7 @@ pipeline {
                         execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \
                                            make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
                                            cd ../ &&
-                                           example/ck_tile/01_fmha/script/smoke_test_fwd.sh && \
-                                           example/ck_tile/01_fmha/script/smoke_test_bwd.sh"""
+                                           example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
                     }
                     steps{
                         buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
@@ -872,8 +895,7 @@ pipeline {
                         execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \
                                            make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
                                            cd ../ &&
-                                           example/ck_tile/01_fmha/script/smoke_test_fwd.sh && \
-                                           example/ck_tile/01_fmha/script/smoke_test_bwd.sh"""
+                                           example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
                     }
                     steps{
                         buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
diff --git 
a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in
index 374f64ce28..49ff317876 100644
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1,2 +1,2 @@
-rocm-docs-core==1.6.1
+rocm-docs-core==1.6.2
 sphinxcontrib-bibtex==2.6.2
diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt
index 0e0fe4dcc4..bc7d0f689e 100644
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -103,7 +103,7 @@ requests==2.32.3
     # via
     #   pygithub
     #   sphinx
-rocm-docs-core==1.6.1
+rocm-docs-core==1.6.2
     # via -r requirements.in
 six==1.16.0
     # via pybtex
diff --git a/example/ck_tile/01_fmha/script/run_full_test.sh b/example/ck_tile/01_fmha/script/run_full_test.sh
new file mode 100755
index 0000000000..b5e6778aa5
--- /dev/null
+++ b/example/ck_tile/01_fmha/script/run_full_test.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+#
+# in order to run this script you'd first need to build the tile_example_fmha_fwd and tile_example_fmha_bwd executables in ../build/bin/
+#
+# run the script as "./run_full_test.sh <environment tag> <branch name> <host name> <gpu architecture>"
+# input arguments:
+# environment tag : a string describing the specifics of your test environment
+# branch name : name of the branch in git repo (git status | grep -e 'On branch')
+# host name : $hostname
+# gpu architecture: e.g., gfx90a, or gfx942, etc.
+
+#get the command line arguments:
+export env_type="$1"
+echo 'Environment type: ' "$env_type"
+export branch="$2"
+echo 'Branch name: ' "$branch"
+export host_name="$3"
+echo 'Host name: ' "$host_name"
+export GPU_arch="$4"
+echo 'GPU_arch: ' "$GPU_arch"
+
+# print_log_header <log file> <environment tag> <branch name> <host name>
+# starts a fresh log file and records the branch, node, GPU and toolchain details in it.
+# the arguments are quoted so that an empty or multi-word value cannot shift the others.
+function print_log_header(){
+    rm -f "$1";
+    echo 'On branch ' "$3" &> "$1";
+    echo 'Node name: ' "$4" >> "$1";
+    #get GPU_arch and number of compute units from rocminfo
+    echo -n "GPU_arch: " >> "$1"; rocminfo | grep "Name:" | grep "gfx" >> "$1";
+    rocminfo | grep "Compute Unit:" >> "$1";
+    hipcc --version | grep -e 'HIP version' >> "$1";
+    echo 'Environment type: ' "$2" >> "$1";
+    /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> "$1";
+}
+
+#run verification tests
+#stop with a failing status as soon as one of them fails, so that the caller (e.g. a CI job) sees the failure
+example/ck_tile/01_fmha/script/smoke_test_fwd.sh || exit 1
+example/ck_tile/01_fmha/script/smoke_test_bwd.sh || exit 1
+
+#run performance benchmarks
+export fmha_fwd_log="perf_fmha_fwd_$GPU_arch.log"
+print_log_header "$fmha_fwd_log" "$env_type" "$branch" "$host_name"
+example/ck_tile/01_fmha/script/benchmark_fwd.sh 2>&1 | tee -a "$fmha_fwd_log"
+
+export fmha_bwd_log="perf_fmha_bwd_$GPU_arch.log"
+print_log_header "$fmha_bwd_log" "$env_type" "$branch" "$host_name"
+example/ck_tile/01_fmha/script/benchmark_bwd.sh 2>&1 | tee -a "$fmha_bwd_log"
+
diff --git a/include/ck/utility/f8_utils.hpp b/include/ck/utility/f8_utils.hpp
index 98e8092af5..2533073225 100644
--- a/include/ck/utility/f8_utils.hpp
+++ b/include/ck/utility/f8_utils.hpp
@@ -44,7 +44,7 @@ __host__ __device__ Y run_cast_to_f8(X x, uint32_t rng)
 
     // convert to bitwise
     using T_bitwise = typename NumericUtils::bitwise_type;
-    T_bitwise x_bitwise = *(reinterpret_cast(&x));
+    T_bitwise x_bitwise = bit_cast(x);
 
     // unpack the input, depends on datatype
     head = x_bitwise & NumericUtils::head_mask;
@@ -196,18 +196,17 @@ __host__ __device__ Y run_cast_from_f8(X x)
 
     // prepare the codes
     constexpr X nan_code = 0x80;
-    Y Inf, NegInf, NaN, Neg0;
-    using T_bitwise = typename NumericUtils::bitwise_type;
+    using T_bitwise = typename NumericUtils::bitwise_type;
 
     constexpr 
T_bitwise Inf_bitwise = NumericUtils::Inf;
     constexpr T_bitwise NegInf_bitwise = NumericUtils::NegInf;
     constexpr T_bitwise NaN_bitwise = NumericUtils::NaN;
     constexpr T_bitwise Neg0_bitwise = NumericUtils::Neg0;
 
-    Inf = *(reinterpret_cast(&Inf_bitwise));
-    NegInf = *(reinterpret_cast(&NegInf_bitwise));
-    NaN = *(reinterpret_cast(&NaN_bitwise));
-    Neg0 = *(reinterpret_cast(&Neg0_bitwise));
+    constexpr Y Inf = bit_cast(Inf_bitwise);
+    constexpr Y NegInf = bit_cast(NegInf_bitwise);
+    constexpr Y NaN = bit_cast(NaN_bitwise);
+    constexpr Y Neg0 = bit_cast(Neg0_bitwise);
 
     // check if x is 0.0
     if(x == 0)
@@ -240,7 +239,7 @@ __host__ __device__ Y run_cast_from_f8(X x)
     {
         retval = x;
         retval <<= 8;
-        return *(reinterpret_cast(&retval));
+        return bit_cast(retval);
     }
 
     // subnormal input
@@ -264,7 +263,7 @@ __host__ __device__ Y run_cast_from_f8(X x)
     }
 
     retval = (sign << (out_exp + out_mant)) | (exponent << out_mant) | mantissa;
-    return *(reinterpret_cast(&retval));
+    return bit_cast(retval);
 }
 
 } // namespace
diff --git a/script/process_perf_data.py b/script/process_perf_data.py
index 2c46da8fd2..c6cb6e05c7 100644
--- a/script/process_perf_data.py
+++ b/script/process_perf_data.py
@@ -143,6 +143,14 @@ def parse_logfile(logfile):
             if 'Best Perf' in line:
                 lst=line.split()
                 res.append(lst[36])
+    elif 'perf_fmha' in logfile:
+        with open(logfile) as fmha_log:
+            for line in fmha_log:
+                lst=line.split()
+                # map every token to the one printed right before it; the number in front of 'TFlops,' is the result
+                line_dict=dict(zip(lst[1:],lst))
+                if 'TFlops,' in line_dict:
+                    res.append(line_dict['TFlops,'])
     return res
 
 
@@ -304,6 +312,14 @@ def main():
             for i in range(1,len(results)+1):
                 testlist.append("Test%i"%i)
             table_name="ck_mixed_gemm_tflops"
+        if 'fmha_fwd' in filename:
+            for i in range(1,len(results)+1):
+                testlist.append("Test%i"%i)
+            table_name="ck_fmha_fwd_tflops"
+        if 'fmha_bwd' in filename:
+            for i in range(1,len(results)+1):
+                testlist.append("Test%i"%i)
+            table_name="ck_fmha_bwd_tflops"
 
         tflops_base = get_baseline(table_name,conn)
         store_new_test_result(table_name, results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, conn)
diff --git a/script/process_perf_data.sh b/script/process_perf_data.sh
index 15fc5cb15f..af1e7e7a0d 100755
--- a/script/process_perf_data.sh
+++ b/script/process_perf_data.sh
@@ -13,3 +13,10 @@
 python3 process_perf_data.py perf_gemm.log
 python3 process_perf_data.py perf_resnet50_N256.log
 python3 process_perf_data.py perf_resnet50_N4.log
+
+#process the FMHA logs of whichever architectures were tested; a missing log is skipped
+for file in perf_fmha_fwd_gfx942.log perf_fmha_bwd_gfx942.log perf_fmha_fwd_gfx90a.log perf_fmha_bwd_gfx90a.log; do
+    if [ -e "$file" ]; then
+        python3 process_perf_data.py "$file"
+    fi
+done
diff --git a/script/process_qa_data.sh b/script/process_qa_data.sh
index abf1e6234e..bf16f05cd0 100755
--- a/script/process_qa_data.sh
+++ b/script/process_qa_data.sh
@@ -21,3 +21,10 @@ python3 process_perf_data.py perf_gemm_bilinear.log
 python3 process_perf_data.py perf_reduction.log
 python3 process_perf_data.py perf_splitK_gemm.log
 python3 process_perf_data.py perf_onnx_gemm.log
+
+#process the FMHA logs of whichever architectures were tested; a missing log is skipped
+for file in perf_fmha_fwd_gfx942.log perf_fmha_bwd_gfx942.log perf_fmha_fwd_gfx90a.log perf_fmha_bwd_gfx90a.log; do
+    if [ -e "$file" ]; then
+        python3 process_perf_data.py "$file"
+    fi
+done