From 02db6094b99e041c340e5b5ee897bfee89d24e01 Mon Sep 17 00:00:00 2001 From: Yi DING Date: Wed, 24 Sep 2025 17:04:23 +0800 Subject: [PATCH] [CK_TILE] FMHA BWD Add D96 Instances (#2916) [ROCm/composable_kernel commit: fe0a47a011c2adcb54dfc94a3029feb7b9980deb] --- example/ck_tile/01_fmha/CMakeLists.txt | 2 +- .../ck_tile/01_fmha/codegen/ops/fmha_bwd.py | 1 + .../ck_tile/01_fmha/script/run_full_test.sh | 8 ++-- .../ck_tile/01_fmha/script/smoke_test_bwd.sh | 40 ++++++++++++------- ...block_fmha_bwd_pipeline_default_policy.hpp | 12 ++---- 5 files changed, 35 insertions(+), 28 deletions(-) diff --git a/example/ck_tile/01_fmha/CMakeLists.txt b/example/ck_tile/01_fmha/CMakeLists.txt index a2d7fe4aaf..b8ca26193d 100644 --- a/example/ck_tile/01_fmha/CMakeLists.txt +++ b/example/ck_tile/01_fmha/CMakeLists.txt @@ -47,7 +47,7 @@ set(FMHA_BWD_CODE_GEN_COMMON_ARGS ${CMAKE_CURRENT_LIST_DIR}/generate.py --api bwd --receipt 3 - --optdim 32,64,128,256 + --optdim 32,64,96,128,256 # --filter fmha_bwd_dot...@fmha_bwd_convert...@fmha_bwd... ) diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py index 4df99a9a10..36482e94c1 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py @@ -380,6 +380,7 @@ def get_dq_dk_dv_tiles(dtype : str, tr_load: str) -> List[FmhaBwdDQDKDVTileSize] return [ FmhaBwdDQDKDVTileSize( 32, 128, 32, 32, 32, 32, 64, 32, 32, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1), FmhaBwdDQDKDVTileSize( 32, 128, 64, 32, 64, 32, 32, 64, 64, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1), + FmhaBwdDQDKDVTileSize( 32, 128, 96, 32, 96, 32, 32, 96, 96, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1), FmhaBwdDQDKDVTileSize( 16, 128, 128, 16, 128, 16, 32, 128, 128, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1), # FmhaBwdDQDKDVTileSize( 32, 64, 160, 32, 160, 32, 32, 160, 160, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1), FmhaBwdDQDKDVTileSize( 16, 64, 256, 16, 256, 16, 32, 256, 256, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1), diff --git a/example/ck_tile/01_fmha/script/run_full_test.sh b/example/ck_tile/01_fmha/script/run_full_test.sh index e7babd2744..5c2a5a4b3d 100755 --- a/example/ck_tile/01_fmha/script/run_full_test.sh +++ b/example/ck_tile/01_fmha/script/run_full_test.sh @@ -34,15 +34,15 @@ function print_log_header(){ } #run verification tests -example/ck_tile/01_fmha/script/smoke_test_fwd.sh -example/ck_tile/01_fmha/script/smoke_test_bwd.sh +time example/ck_tile/01_fmha/script/smoke_test_fwd.sh +time example/ck_tile/01_fmha/script/smoke_test_bwd.sh #run performance benchmarks export fmha_fwd_log="perf_fmha_fwd_$GPU_arch.log" print_log_header $fmha_fwd_log $env_type $branch $host_name -example/ck_tile/01_fmha/script/benchmark_fwd.sh 2>&1 | tee -a $fmha_fwd_log +time example/ck_tile/01_fmha/script/benchmark_fwd.sh 2>&1 | tee -a $fmha_fwd_log export fmha_bwd_log="perf_fmha_bwd_$GPU_arch.log" print_log_header $fmha_bwd_log $env_type $branch $host_name -example/ck_tile/01_fmha/script/benchmark_bwd.sh 2>&1 | tee -a $fmha_bwd_log +time example/ck_tile/01_fmha/script/benchmark_bwd.sh 2>&1 | tee -a $fmha_bwd_log diff --git a/example/ck_tile/01_fmha/script/smoke_test_bwd.sh b/example/ck_tile/01_fmha/script/smoke_test_bwd.sh index 3b59505ff0..ec2a6a0ceb 100755 --- a/example/ck_tile/01_fmha/script/smoke_test_bwd.sh +++ b/example/ck_tile/01_fmha/script/smoke_test_bwd.sh @@ -6,7 +6,7 @@ SCRIPT_DIR=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd) EXE_NAME=tile_example_fmha_bwd EXE="$(find . -name $EXE_NAME -type f | head -n 1)" KNAME=1 -GPU_arch=$GPU_arch +GPU_arch=${GPU_arch:-""} if [ -z "$GPU_arch" ] ; then GPU_arch=$(rocminfo | grep -E 'Name:\s+gfx' | head -n1 | awk '{print $2}') fi @@ -31,7 +31,17 @@ run_exe() { set -ex } +test_h_s_mask() { + run_exe -b=1 -h=4 -h_k=2 -s=259 $@ + run_exe -b=2 -h=2 -s=516 -s_k=253 $@ + run_exe -b=1 -h=4 -h_k=1 -s=500 -s_k=251 -mask=1 $@ + run_exe -b=1 -h=2 -s=900 -s_k=258 -mask=2 $@ + run_exe -b=2 -h=1 -s=987 -s_k=219 -mask=t:128,30 $@ + run_exe -b=2 -h=3 -h_k=1 -s=244 -s_k=499 -mask=b:4,35 $@ +} + set -x +# main tests for prec in "fp16" "bf16" ; do for perm in 0 1 ; do for hdim in 32 64 128 256 ; do @@ -40,21 +50,21 @@ for bias in "n" "a" ; do for dbias in 0 ; do for p_drop in 0.0 0.2 ; do for deterministic in 0 ; do +test_h_s_mask -prec=$prec -d=$hdim -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS +done +done +done +done +done +done +done +done -run_exe -prec=$prec -b=1 -h=4 -h_k=2 -d=$hdim -s=259 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS -run_exe -prec=$prec -b=2 -h=2 -d=$hdim -s=516 -s_k=253 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS -run_exe -prec=$prec -b=1 -h=4 -h_k=1 -d=$hdim -s=500 -s_k=251 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=1 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS -run_exe -prec=$prec -b=1 -h=2 -d=$hdim -s=900 -s_k=258 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=2 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS -run_exe -prec=$prec -b=2 -h=1 -d=$hdim -s=987 -s_k=219 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=t:128,30 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS -run_exe -prec=$prec -b=2 -h=3 -h_k=1 -d=$hdim -s=244 -s_k=499 -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -mask=b:4,35 -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS - -done -done -done -done -done -done -done +# additional cases +for hdim in 72 96 ; do +test_h_s_mask -prec=fp16 -d=$hdim -bias=a -dbias=0 -p_drop=0.2 -iperm=0 -operm=0 -deterministic=0 -v=1 -mode=1 -kname=$KNAME $COMMON_ARGS +test_h_s_mask -prec=bf16 -d=$hdim -bias=n -dbias=0 -p_drop=0 -iperm=1 -operm=1 -deterministic=0 -v=1 -mode=1 -kname=$KNAME $COMMON_ARGS +test_h_s_mask -prec=bf16 -d=$hdim -bias=a -dbias=0 -p_drop=0.2 -iperm=1 -operm=1 -deterministic=0 -v=1 -mode=1 -kname=$KNAME $COMMON_ARGS done set +x diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp index ad9e2959f5..5eac387a66 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp @@ -408,8 +408,7 @@ struct BlockFmhaBwdPipelineDefaultPolicy sequence<1, 2>, sequence<2, 1>>{}); - if constexpr(container_reduce(dstr.get_lengths(), std::multiplies{}, 1) == - kNPerBlock * kKPerBlock) + if constexpr((kKPerBlock & (kKPerBlock - 1)) == 0) // kKPerBlock is power of 2 { return dstr; } @@ -457,8 +456,7 @@ struct BlockFmhaBwdPipelineDefaultPolicy tuple, sequence<2, 0>>, sequence<1, 2>, // N0 K1 sequence<0, 1>>{}); - if constexpr(container_reduce(dstr.get_lengths(), std::multiplies{}, 1) == - kNPerBlock * kKPerBlock) + if constexpr((kKPerBlock & (kKPerBlock - 1)) == 0) // kKPerBlock is power of 2 { return dstr; } @@ -507,8 +505,7 @@ struct BlockFmhaBwdPipelineDefaultPolicy sequence<1, 2>, sequence<2, 1>>{}); - if constexpr(container_reduce(dstr.get_lengths(), std::multiplies{}, 1) == - kMPerBlock * kKPerBlock) + if constexpr((kKPerBlock & (kKPerBlock - 1)) == 0) // kKPerBlock is power of 2 { return dstr; } @@ -558,8 +555,7 @@ struct BlockFmhaBwdPipelineDefaultPolicy sequence<1, 2>, sequence<2, 1>>{}); - if constexpr(container_reduce(dstr.get_lengths(), std::multiplies{}, 1) == - kMPerBlock * kKPerBlock) + if constexpr((kKPerBlock & (kKPerBlock - 1)) == 0) // kKPerBlock is power of 2 { return dstr; }