mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-06-07 00:04:37 +00:00
Merge branch 'develop' into feature/fmha-fwd-appendkv
This commit is contained in:
@@ -189,7 +189,9 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090)
|
||||
message("Adding the enable-post-misched=0 compiler flag")
|
||||
add_compile_options("SHELL: -mllvm -enable-post-misched=0")
|
||||
endif()
|
||||
if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600241132 AND ${hip_VERSION_FLAT} LESS 600300000)
|
||||
set(check-coerce)
|
||||
check_cxx_compiler_flag(" -mllvm -amdgpu-coerce-illegal-types=1" check-coerce)
|
||||
if(NOT WIN32 AND check-coerce AND ${hip_VERSION_FLAT} GREATER 600241132 AND ${hip_VERSION_FLAT} LESS 600300000)
|
||||
message("Adding the amdgpu-coerce-illegal-types=1")
|
||||
add_compile_options("SHELL: -mllvm -amdgpu-coerce-illegal-types=1")
|
||||
endif()
|
||||
|
||||
14
Dockerfile
14
Dockerfile
@@ -1,6 +1,6 @@
|
||||
FROM ubuntu:20.04
|
||||
ARG DEBIAN_FRONTEND=noninteractive
|
||||
ARG ROCMVERSION=6.1
|
||||
ARG ROCMVERSION=6.2
|
||||
ARG compiler_version=""
|
||||
ARG compiler_commit=""
|
||||
ARG CK_SCCACHE=""
|
||||
@@ -17,17 +17,12 @@ RUN apt-get install -y --allow-unauthenticated apt-utils wget gnupg2 curl
|
||||
ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
|
||||
RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
|
||||
|
||||
RUN if [ "$ROCMVERSION" != "6.2" ]; then \
|
||||
sh -c "wget https://repo.radeon.com/amdgpu-install/6.1/ubuntu/focal/amdgpu-install_6.1.60100-1_all.deb --no-check-certificate" && \
|
||||
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.1.60100-1_all.deb && \
|
||||
RUN if [ "$ROCMVERSION" != "6.3" ]; then \
|
||||
sh -c "wget https://repo.radeon.com/amdgpu-install/$ROCMVERSION/ubuntu/focal/amdgpu-install_6.2.60200-1_all.deb --no-check-certificate" && \
|
||||
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ./amdgpu-install_6.2.60200-1_all.deb && \
|
||||
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
|
||||
sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
|
||||
sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
|
||||
elif [ "$ROCMVERSION" = "6.2" ] && [ "$compiler_version" = "rc4" ]; then \
|
||||
sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.2-20.04-1_all.deb --no-check-certificate" && \
|
||||
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog libpopt0 rsync && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.2-20.04-1_all.deb && \
|
||||
sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.2 rel-63 > /etc/apt/sources.list.d/rocm-build.list' && \
|
||||
amdgpu-repo --amdgpu-build=2009461; \
|
||||
fi
|
||||
|
||||
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
|
||||
@@ -64,6 +59,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
|
||||
python3-dev \
|
||||
python3-pip \
|
||||
redis \
|
||||
rocm-llvm-dev \
|
||||
sshpass \
|
||||
stunnel \
|
||||
software-properties-common \
|
||||
|
||||
40
Jenkinsfile
vendored
40
Jenkinsfile
vendored
@@ -38,7 +38,7 @@ def getDockerImageName(){
|
||||
img = "${params.USE_CUSTOM_DOCKER}"
|
||||
}
|
||||
else{
|
||||
if (params.ROCMVERSION != "6.2"){
|
||||
if (params.ROCMVERSION != "6.3"){
|
||||
if (params.COMPILER_VERSION == "") {
|
||||
img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
|
||||
}
|
||||
@@ -285,6 +285,19 @@ def cmake_build(Map conf=[:]){
|
||||
if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "amd-master")) {
|
||||
archiveArtifacts artifacts: "build/*.deb", allowEmptyArchive: true, fingerprint: true
|
||||
}
|
||||
if (params.RUN_CK_TILE_TESTS){
|
||||
try{
|
||||
archiveArtifacts "perf_fmha_fwd_*.log"
|
||||
archiveArtifacts "perf_fmha_bwd_*.log"
|
||||
stash name: "perf_fmha_fwd_gfx942.log"
|
||||
stash name: "perf_fmha_bwd_gfx942.log"
|
||||
stash name: "perf_fmha_fwd_gfx90a.log"
|
||||
stash name: "perf_fmha_bwd_gfx90a.log"
|
||||
}
|
||||
catch(Exception err){
|
||||
echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing."
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
def buildHipClangJob(Map conf=[:]){
|
||||
@@ -612,6 +625,17 @@ def process_results(Map conf=[:]){
|
||||
timeout(time: 1, unit: 'HOURS'){
|
||||
try{
|
||||
dir("script"){
|
||||
if (params.RUN_CK_TILE_TESTS){
|
||||
try{
|
||||
unstash "perf_fmha_fwd_gfx942.log"
|
||||
unstash "perf_fmha_bwd_gfx942.log"
|
||||
unstash "perf_fmha_fwd_gfx90a.log"
|
||||
unstash "perf_fmha_bwd_gfx90a.log"
|
||||
}
|
||||
catch(Exception err){
|
||||
echo "could not locate the FMHA performance logs: ${err.getMessage()}."
|
||||
}
|
||||
}
|
||||
if (params.RUN_FULL_QA){
|
||||
// unstash perf files to master
|
||||
unstash "ckprofiler_0.2.0_amd64.deb"
|
||||
@@ -652,8 +676,8 @@ def process_results(Map conf=[:]){
|
||||
}
|
||||
|
||||
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
|
||||
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.1; RUN_CK_TILE_TESTS=true
|
||||
0 21 * * * % ROCMVERSION=6.1;hipTensor_test=true
|
||||
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2; RUN_CK_TILE_TESTS=true
|
||||
0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true
|
||||
0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false
|
||||
0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false
|
||||
0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_CODEGEN_TESTS=false;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false''' : ""
|
||||
@@ -677,8 +701,8 @@ pipeline {
|
||||
description: 'If you want to use a custom docker image, please specify it here (default: leave blank).')
|
||||
string(
|
||||
name: 'ROCMVERSION',
|
||||
defaultValue: '6.1',
|
||||
description: 'Specify which ROCM version to use: 6.1 (default).')
|
||||
defaultValue: '6.2',
|
||||
description: 'Specify which ROCM version to use: 6.2 (default).')
|
||||
string(
|
||||
name: 'COMPILER_VERSION',
|
||||
defaultValue: '',
|
||||
@@ -852,8 +876,7 @@ pipeline {
|
||||
execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \
|
||||
make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
|
||||
cd ../ &&
|
||||
example/ck_tile/01_fmha/script/smoke_test_fwd.sh && \
|
||||
example/ck_tile/01_fmha/script/smoke_test_bwd.sh"""
|
||||
example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
|
||||
}
|
||||
steps{
|
||||
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
|
||||
@@ -872,8 +895,7 @@ pipeline {
|
||||
execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \
|
||||
make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
|
||||
cd ../ &&
|
||||
example/ck_tile/01_fmha/script/smoke_test_fwd.sh && \
|
||||
example/ck_tile/01_fmha/script/smoke_test_bwd.sh"""
|
||||
example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
|
||||
}
|
||||
steps{
|
||||
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
rocm-docs-core==1.6.1
|
||||
rocm-docs-core==1.6.2
|
||||
sphinxcontrib-bibtex==2.6.2
|
||||
|
||||
@@ -103,7 +103,7 @@ requests==2.32.3
|
||||
# via
|
||||
# pygithub
|
||||
# sphinx
|
||||
rocm-docs-core==1.6.1
|
||||
rocm-docs-core==1.6.2
|
||||
# via -r requirements.in
|
||||
six==1.16.0
|
||||
# via pybtex
|
||||
|
||||
46
example/ck_tile/01_fmha/script/run_full_test.sh
Executable file
46
example/ck_tile/01_fmha/script/run_full_test.sh
Executable file
@@ -0,0 +1,46 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# in order to run this script you'd first need to build the tile_example_fmha_fwd and tile_eaxmple_fmha_bwd executables in ../build/bin/
|
||||
#
|
||||
# run the script as "./run_full_test.sh <tag for your test environment> <branch name> <host name> <gpu_arch>
|
||||
# input arguments:
|
||||
# environment tag : a string describing the specifics of your test environment
|
||||
# branch name : name of the branch in git repo (git status | grep -e 'On branch')
|
||||
# host name : $hostname
|
||||
# gpu architecture: e.g., gfx90a, or gfx942, etc.
|
||||
|
||||
#get the command line arguments:
|
||||
export env_type=$1
|
||||
echo 'Environment type: ' $env_type
|
||||
export branch=$2
|
||||
echo 'Branch name: ' $branch
|
||||
export host_name=$3
|
||||
echo 'Host name: ' $host_name
|
||||
export GPU_arch=$4
|
||||
echo 'GPU_arch: ' $GPU_arch
|
||||
|
||||
function print_log_header(){
|
||||
rm -f $1;
|
||||
echo 'On branch ' $3 &> $1;
|
||||
echo 'Node name: ' $4 >> $1;
|
||||
#get GPU_arch and number of compute units from rocminfo
|
||||
echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1;
|
||||
rocminfo | grep "Compute Unit:" >> $1;
|
||||
hipcc --version | grep -e 'HIP version' >> $1;
|
||||
echo 'Environment type: ' $2 >> $1;
|
||||
/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1;
|
||||
}
|
||||
|
||||
#run verification tests
|
||||
example/ck_tile/01_fmha/script/smoke_test_fwd.sh
|
||||
example/ck_tile/01_fmha/script/smoke_test_bwd.sh
|
||||
|
||||
#run performance benchmarks
|
||||
export fmha_fwd_log="perf_fmha_fwd_$GPU_arch.log"
|
||||
print_log_header $fmha_fwd_log $env_type $branch $host_name
|
||||
example/ck_tile/01_fmha/script/benchmark_fwd.sh 2>&1 | tee -a $fmha_fwd_log
|
||||
|
||||
export fmha_bwd_log="perf_fmha_bwd_$GPU_arch.log"
|
||||
print_log_header $fmha_bwd_log $env_type $branch $host_name
|
||||
example/ck_tile/01_fmha/script/benchmark_bwd.sh 2>&1 | tee -a $fmha_bwd_log
|
||||
|
||||
@@ -44,7 +44,7 @@ __host__ __device__ Y run_cast_to_f8(X x, uint32_t rng)
|
||||
|
||||
// convert to bitwise
|
||||
using T_bitwise = typename NumericUtils<X>::bitwise_type;
|
||||
T_bitwise x_bitwise = *(reinterpret_cast<T_bitwise*>(&x));
|
||||
T_bitwise x_bitwise = bit_cast<T_bitwise>(x);
|
||||
|
||||
// unpack the input, depends on datatype
|
||||
head = x_bitwise & NumericUtils<X>::head_mask;
|
||||
@@ -196,18 +196,17 @@ __host__ __device__ Y run_cast_from_f8(X x)
|
||||
|
||||
// prepare the codes
|
||||
constexpr X nan_code = 0x80;
|
||||
Y Inf, NegInf, NaN, Neg0;
|
||||
using T_bitwise = typename NumericUtils<Y>::bitwise_type;
|
||||
using T_bitwise = typename NumericUtils<Y>::bitwise_type;
|
||||
|
||||
constexpr T_bitwise Inf_bitwise = NumericUtils<Y>::Inf;
|
||||
constexpr T_bitwise NegInf_bitwise = NumericUtils<Y>::NegInf;
|
||||
constexpr T_bitwise NaN_bitwise = NumericUtils<Y>::NaN;
|
||||
constexpr T_bitwise Neg0_bitwise = NumericUtils<Y>::Neg0;
|
||||
|
||||
Inf = *(reinterpret_cast<const Y*>(&Inf_bitwise));
|
||||
NegInf = *(reinterpret_cast<const Y*>(&NegInf_bitwise));
|
||||
NaN = *(reinterpret_cast<const Y*>(&NaN_bitwise));
|
||||
Neg0 = *(reinterpret_cast<const Y*>(&Neg0_bitwise));
|
||||
constexpr Y Inf = bit_cast<Y>(Inf_bitwise);
|
||||
constexpr Y NegInf = bit_cast<Y>(NegInf_bitwise);
|
||||
constexpr Y NaN = bit_cast<Y>(NaN_bitwise);
|
||||
constexpr Y Neg0 = bit_cast<Y>(Neg0_bitwise);
|
||||
|
||||
// check if x is 0.0
|
||||
if(x == 0)
|
||||
@@ -240,7 +239,7 @@ __host__ __device__ Y run_cast_from_f8(X x)
|
||||
{
|
||||
retval = x;
|
||||
retval <<= 8;
|
||||
return *(reinterpret_cast<const Y*>(&retval));
|
||||
return bit_cast<Y>(retval);
|
||||
}
|
||||
|
||||
// subnormal input
|
||||
@@ -264,7 +263,7 @@ __host__ __device__ Y run_cast_from_f8(X x)
|
||||
}
|
||||
|
||||
retval = (sign << (out_exp + out_mant)) | (exponent << out_mant) | mantissa;
|
||||
return *(reinterpret_cast<const Y*>(&retval));
|
||||
return bit_cast<Y>(retval);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
@@ -143,6 +143,12 @@ def parse_logfile(logfile):
|
||||
if 'Best Perf' in line:
|
||||
lst=line.split()
|
||||
res.append(lst[36])
|
||||
elif 'perf_fmha' in logfile:
|
||||
for line in open(logfile):
|
||||
if 'TFlops' in line:
|
||||
lst=line.split()
|
||||
line_dict=dict(zip(lst[1:],lst))
|
||||
res.append(line_dict['TFlops,'])
|
||||
return res
|
||||
|
||||
|
||||
@@ -304,6 +310,14 @@ def main():
|
||||
for i in range(1,len(results)+1):
|
||||
testlist.append("Test%i"%i)
|
||||
table_name="ck_mixed_gemm_tflops"
|
||||
if 'fmha_fwd' in filename:
|
||||
for i in range(1,len(results)+1):
|
||||
testlist.append("Test%i"%i)
|
||||
table_name="ck_fmha_fwd_tflops"
|
||||
if 'fmha_bwd' in filename:
|
||||
for i in range(1,len(results)+1):
|
||||
testlist.append("Test%i"%i)
|
||||
table_name="ck_fmha_bwd_tflops"
|
||||
|
||||
tflops_base = get_baseline(table_name,conn)
|
||||
store_new_test_result(table_name, results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, conn)
|
||||
|
||||
@@ -13,3 +13,20 @@
|
||||
python3 process_perf_data.py perf_gemm.log
|
||||
python3 process_perf_data.py perf_resnet50_N256.log
|
||||
python3 process_perf_data.py perf_resnet50_N4.log
|
||||
|
||||
file=./perf_fmha_fwd_gfx942.log
|
||||
if [ -e "$file" ]; then
|
||||
python3 process_perf_data.py perf_fmha_fwd_gfx942.log
|
||||
fi
|
||||
file=./perf_fmha_bwd_gfx942.log
|
||||
if [ -e "$file" ]; then
|
||||
python3 process_perf_data.py perf_fmha_bwd_gfx942.log
|
||||
fi
|
||||
file=./perf_fmha_fwd_gfx90a.log
|
||||
if [ -e "$file" ]; then
|
||||
python3 process_perf_data.py perf_fmha_fwd_gfx90a.log
|
||||
fi
|
||||
file=./perf_fmha_bwd_gfx90a.log
|
||||
if [ -e "$file" ]; then
|
||||
python3 process_perf_data.py perf_fmha_bwd_gfx90a.log
|
||||
fi
|
||||
|
||||
@@ -21,3 +21,20 @@ python3 process_perf_data.py perf_gemm_bilinear.log
|
||||
python3 process_perf_data.py perf_reduction.log
|
||||
python3 process_perf_data.py perf_splitK_gemm.log
|
||||
python3 process_perf_data.py perf_onnx_gemm.log
|
||||
|
||||
file=./perf_fmha_fwd_gfx942.log
|
||||
if [ -e "$file" ]; then
|
||||
python3 process_perf_data.py perf_fmha_fwd_gfx942.log
|
||||
fi
|
||||
file=./perf_fmha_bwd_gfx942.log
|
||||
if [ -e "$file" ]; then
|
||||
python3 process_perf_data.py perf_fmha_bwd_gfx942.log
|
||||
fi
|
||||
file=./perf_fmha_fwd_gfx90a.log
|
||||
if [ -e "$file" ]; then
|
||||
python3 process_perf_data.py perf_fmha_fwd_gfx90a.log
|
||||
fi
|
||||
file=./perf_fmha_bwd_gfx90a.log
|
||||
if [ -e "$file" ]; then
|
||||
python3 process_perf_data.py perf_fmha_bwd_gfx90a.log
|
||||
fi
|
||||
|
||||
Reference in New Issue
Block a user