[CK_TILE] FMHA FAv3 scheduling fine-tuning for performance (#2833)

* Re-mapping thread block indices for causal=True kernels

* Use more intuitive remap_opt value

* Fallback to origin remapping if seqlen_q >= 64K

* Use GenericAttentionMask to reduce mask computation

* Avoid unnecessary boundary check for IsMasking=false case

* Fix wrong kernel entry specifier

* Add s_nop to prevent delay wave0-3

* Refine scheduling

* Remove unnecessary sched_group_barrier()

* Move sched_group_barrier() call to scheduler

* Replace inline asm s_setprio with intrinsics

* Rephrase comments

* Expend some o_acc rescaling insts to avoid SIMD idle

* Fix block idx special mapping logic

* Tune block index mapping for causal=False cases

* Tune block index mapping for causal=True cases

* Fix wrong vmcnt()

* Remove parameter name

* Use boolean option for turn on/off causal mask

* Update benchmark_fwd_v3.sh option usages

* Add option if compiler support it
This commit is contained in:
Po Yen Chen
2025-09-16 11:32:38 +08:00
committed by GitHub
parent 7d7ded62d3
commit 7fbc9d6c97
8 changed files with 250 additions and 112 deletions

View File

@@ -213,8 +213,20 @@ list(APPEND EXAMPLE_FMHA_FWD_V3_COMPILE_OPTIONS
-Wno-undefined-func-template
--save-temps
)
target_compile_options(${EXAMPLE_FMHA_FWD_V3} PRIVATE ${EXAMPLE_FMHA_FWD_V3_COMPILE_OPTIONS})
set(EXAMPLE_FMHA_FWD_V3_COMPILE_DEFINITIONS)
check_cxx_compiler_flag("-mllvm --amdgpu-disable-packed-fp32=1" HAS_DISABLE_PACKED_FP32)
if(HAS_DISABLE_PACKED_FP32)
list(APPEND EXAMPLE_FMHA_FWD_V3_COMPILE_OPTIONS
-mllvm --amdgpu-disable-packed-fp32=1
)
list(APPEND EXAMPLE_FMHA_FWD_V3_COMPILE_DEFINITIONS
-DCK_TILE_DISABLE_PACKED_FP32=1
)
endif()
target_compile_options(${EXAMPLE_FMHA_FWD_V3} PRIVATE ${EXAMPLE_FMHA_FWD_V3_COMPILE_OPTIONS})
target_compile_definitions(${EXAMPLE_FMHA_FWD_V3} PRIVATE ${EXAMPLE_FMHA_FWD_V3_COMPILE_DEFINITIONS})
# TODO: we have to turn off this global prop, otherwise the progress bar generated
# by cmake will print too many files, execvp: /bin/sh: Argument list too long
# however, this property may affect global

View File

@@ -45,18 +45,7 @@ auto parse_cmd_args(int argc, char* argv[]) -> std::pair<bool, ck_tile::ArgParse
"permute input\n"
"if true, will be b*h*s*d, else b*s*h*d")
.insert("operm", "0", "permute output")
.insert("mask",
"0",
"0: no mask, 1: top-left(same as 't'), 2:bottom-right(same as 'b')\n"
"'t', top-left causal mask, 'b', bottom-r causal mask\n"
"'t:l,r', top-left sliding window attn(swa) with FA style left right size\n"
"'b:l,r', bottom-r sliding window attn(swa) with FA style left right size\n"
"'xt:window_size', xformer style masking from top-left, window_size negative is "
"causal, positive is swa\n"
"'xb:window_size', xformer style masking from bottom-r, window_size negative is "
"causal, positive is swa\n"
"'g:y,x', generic attention mask coordinate with y/x size (only debug purpose for "
"now)")
.insert("causal", "0", "0: no mask, 1: causal mask")
.insert("v", "1", "0:no verify, 1:verify")
.insert("seed",
"11939",
@@ -109,7 +98,16 @@ struct Problem
softmax_scale = args.get_float("scale_s");
if(softmax_scale == .0f)
softmax_scale = 1.0 / ck_tile::sqrt(static_cast<float>(hdim));
mask = mask_info::decode(args.get_str("mask"), seqlen_q, seqlen_k);
const auto is_causal = args.get_bool("causal");
if(is_causal)
{
mask = mask_info::decode("b:-1,0", seqlen_q, seqlen_k);
}
else
{
mask = mask_info::decode("0", seqlen_q, seqlen_k);
}
input_layout = args.get_int("iperm") == 1 ? TensorLayout::bhsd : TensorLayout::bshd;
output_layout = args.get_int("operm") == 1 ? TensorLayout::bhsd : TensorLayout::bshd;

View File

@@ -34,7 +34,8 @@ struct fmha_fwd_v3_args
index_t window_size_left;
index_t window_size_right;
index_t mask_type;
index_t mask_type; // should be 0 for no mask; or 2 for causal mask (window_size_left < 0 and
// window_size_right == 0).
const void* q_ptr;
index_t stride_q;

View File

@@ -18,6 +18,7 @@
#include "ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp"
#include "fmha_fwd_v3.hpp"
#include "mask.hpp"
#define INST_FMHA_FWD_V3_DISPATCH(kernel_traits) \
template <> \
@@ -79,7 +80,7 @@ struct fmha_fwd_v3_kernel_traits
-1 // kBlockPerCu
>;
using fmha_mask = SimplifiedGenericAttentionMask<IsMasking>;
using fmha_mask = GenericAttentionMask<IsMasking, /*IsLocal=*/false>;
using fmha_pipeline_problem =
BlockFmhaFwdV3PipelineProblem<typename fmha_fwd_v3_problem_traits<date_type>::qkvp_dtype,
@@ -112,6 +113,22 @@ struct fmha_fwd_v3_kernel_traits
template <typename Kernel>
float fmha_fwd_v3_kernel_launch(const fmha_fwd_v3_args& args, const stream_config& config)
{
/// NOTICE: This was borrowed from Aiter. Make sure the selected remap_opt setting truly
/// maximizes the kernel's performance.
int remap_opt = 2;
if(args.mask_type != static_cast<int>(mask_enum::no_mask) &&
((args.nhead_q % 8 != 0) || (16384 < args.seqlen_q)))
{
if(65536 <= args.seqlen_q)
{
remap_opt = 0;
}
else
{
remap_opt = 1;
}
}
auto kargs = Kernel::MakeKargs(args.q_ptr,
args.k_ptr,
args.v_ptr,
@@ -140,7 +157,8 @@ float fmha_fwd_v3_kernel_launch(const fmha_fwd_v3_args& args, const stream_confi
args.batch_stride_o,
args.window_size_left,
args.window_size_right,
args.mask_type);
args.mask_type,
remap_opt);
dim3 grids = Kernel::GridSize(args.batch, args.nhead_q, args.seqlen_q, args.hdim_v);
constexpr dim3 blocks = Kernel::BlockSize();

View File

@@ -8,22 +8,16 @@ for prec in "fp16" "bf16" ; do
for hdim in 128 ; do
for perm in 0 ; do
if [ $causal -eq 0 ]; then
mask=0
else
mask=b:-1,0
fi
$EXE -prec=$prec -b=32 -h=16 -s=512 -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
$EXE -prec=$prec -b=16 -h=16 -s=1024 -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
$EXE -prec=$prec -b=8 -h=16 -s=2048 -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
$EXE -prec=$prec -b=4 -h=16 -s=4096 -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
$EXE -prec=$prec -b=2 -h=16 -s=8192 -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
$EXE -prec=$prec -b=1 -h=16 -s=16384 -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
$EXE -prec=$prec -b=32 -h=16 -s=512 -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
$EXE -prec=$prec -b=16 -h=16 -s=1024 -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
$EXE -prec=$prec -b=8 -h=16 -s=2048 -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
$EXE -prec=$prec -b=4 -h=16 -s=4096 -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
$EXE -prec=$prec -b=2 -h=16 -s=8192 -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
$EXE -prec=$prec -b=1 -h=16 -s=16384 -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
$EXE -prec=$prec -b=1 -h=64 -s=16384 -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
$EXE -prec=$prec -b=1 -h=16 -h_k=1 -s=65536 -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
$EXE -prec=$prec -b=1 -h=40 -s=37200 -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
$EXE -prec=$prec -b=1 -h=64 -s=16384 -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
$EXE -prec=$prec -b=1 -h=16 -h_k=1 -s=65536 -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
$EXE -prec=$prec -b=1 -h=40 -s=37200 -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
done
done