[CK_TILE] FMHA FAv3 scheduling fine-tuning for performance (#2833)

* Re-mapping thread block indices for causal=True kernels * Use more intuitive remap_opt value * Fallback to origin remapping if seqlen_q >= 64K * Use GenericAttentionMask to reduce mask computation * Avoid unnecessary boundary check for IsMasking=false case * Fix wrong kernel entry specifier * Add s_nop to prevent delay wave0-3 * Refine scheduling * Remove unnecessary sched_group_barrier() * Move sched_group_barrier() call to scheduler * Replace inline asm s_setprio with intrinsics * Rephrase comments * Expend some o_acc rescaling insts to avoid SIMD idle * Fix block idx special mapping logic * Tune block index mapping for causal=False cases * Tune block index mapping for causal=True cases * Fix wrong vmcnt() * Remove parameter name * Use boolean option for turn on/off causal mask * Update benchmark_fwd_v3.sh option usages * Add option if compiler support it
2026-04-20 06:49:15 +00:00 · 2025-09-16 11:32:38 +08:00
parent 7d7ded62d3
commit 7fbc9d6c97
8 changed files with 250 additions and 112 deletions
--- a/example/ck_tile/01_fmha/CMakeLists.txt
+++ b/example/ck_tile/01_fmha/CMakeLists.txt
@@ -213,8 +213,20 @@ list(APPEND EXAMPLE_FMHA_FWD_V3_COMPILE_OPTIONS
  -Wno-undefined-func-template
  --save-temps
 )
-target_compile_options(${EXAMPLE_FMHA_FWD_V3} PRIVATE ${EXAMPLE_FMHA_FWD_V3_COMPILE_OPTIONS})
+set(EXAMPLE_FMHA_FWD_V3_COMPILE_DEFINITIONS)

+check_cxx_compiler_flag("-mllvm --amdgpu-disable-packed-fp32=1" HAS_DISABLE_PACKED_FP32)
+if(HAS_DISABLE_PACKED_FP32)
+  list(APPEND EXAMPLE_FMHA_FWD_V3_COMPILE_OPTIONS
+    -mllvm --amdgpu-disable-packed-fp32=1
+  )
+  list(APPEND EXAMPLE_FMHA_FWD_V3_COMPILE_DEFINITIONS
+    -DCK_TILE_DISABLE_PACKED_FP32=1
+  )
+endif()
+
+target_compile_options(${EXAMPLE_FMHA_FWD_V3} PRIVATE ${EXAMPLE_FMHA_FWD_V3_COMPILE_OPTIONS})
+target_compile_definitions(${EXAMPLE_FMHA_FWD_V3} PRIVATE ${EXAMPLE_FMHA_FWD_V3_COMPILE_DEFINITIONS})
 # TODO: we have to turn off this global prop, otherwise the progress bar generated
 # by cmake will print too many files, execvp: /bin/sh: Argument list too long
 # however, this property may affect global
--- a/example/ck_tile/01_fmha/example_fmha_fwd_v3.cpp
+++ b/example/ck_tile/01_fmha/example_fmha_fwd_v3.cpp
@@ -45,18 +45,7 @@ auto parse_cmd_args(int argc, char* argv[]) -> std::pair<bool, ck_tile::ArgParse
                "permute input\n"
                "if true, will be b*h*s*d, else b*s*h*d")
        .insert("operm", "0", "permute output")
-        .insert("mask",
-                "0",
-                "0: no mask, 1: top-left(same as 't'), 2:bottom-right(same as 'b')\n"
-                "'t', top-left causal mask, 'b', bottom-r causal mask\n"
-                "'t:l,r', top-left sliding window attn(swa) with FA style left right size\n"
-                "'b:l,r', bottom-r sliding window attn(swa) with FA style left right size\n"
-                "'xt:window_size', xformer style masking from top-left, window_size negative is "
-                "causal, positive is swa\n"
-                "'xb:window_size', xformer style masking from bottom-r, window_size negative is "
-                "causal, positive is swa\n"
-                "'g:y,x', generic attention mask coordinate with y/x size (only debug purpose for "
-                "now)")
+        .insert("causal", "0", "0: no mask, 1: causal mask")
        .insert("v", "1", "0:no verify, 1:verify")
        .insert("seed",
                "11939",
@@ -109,7 +98,16 @@ struct Problem
        softmax_scale = args.get_float("scale_s");
        if(softmax_scale == .0f)
            softmax_scale = 1.0 / ck_tile::sqrt(static_cast<float>(hdim));
-        mask = mask_info::decode(args.get_str("mask"), seqlen_q, seqlen_k);
+
+        const auto is_causal = args.get_bool("causal");
+        if(is_causal)
+        {
+            mask = mask_info::decode("b:-1,0", seqlen_q, seqlen_k);
+        }
+        else
+        {
+            mask = mask_info::decode("0", seqlen_q, seqlen_k);
+        }

        input_layout  = args.get_int("iperm") == 1 ? TensorLayout::bhsd : TensorLayout::bshd;
        output_layout = args.get_int("operm") == 1 ? TensorLayout::bhsd : TensorLayout::bshd;
--- a/example/ck_tile/01_fmha/fmha_fwd_v3.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd_v3.hpp
@@ -34,7 +34,8 @@ struct fmha_fwd_v3_args

    index_t window_size_left;
    index_t window_size_right;
-    index_t mask_type;
+    index_t mask_type; // should be 0 for no mask; or 2 for causal mask (window_size_left < 0 and
+                       // window_size_right == 0).

    const void* q_ptr;
    index_t stride_q;
--- a/example/ck_tile/01_fmha/fmha_fwd_v3_impl.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd_v3_impl.hpp
@@ -18,6 +18,7 @@
 #include "ck_tile/ops/fmha/pipeline/tile_fmha_traits.hpp"

 #include "fmha_fwd_v3.hpp"
+#include "mask.hpp"

 #define INST_FMHA_FWD_V3_DISPATCH(kernel_traits)                                               \
    template <>                                                                                \
@@ -79,7 +80,7 @@ struct fmha_fwd_v3_kernel_traits
                                            -1     // kBlockPerCu
                                            >;

-    using fmha_mask = SimplifiedGenericAttentionMask<IsMasking>;
+    using fmha_mask = GenericAttentionMask<IsMasking, /*IsLocal=*/false>;

    using fmha_pipeline_problem =
        BlockFmhaFwdV3PipelineProblem<typename fmha_fwd_v3_problem_traits<date_type>::qkvp_dtype,
@@ -112,6 +113,22 @@ struct fmha_fwd_v3_kernel_traits
 template <typename Kernel>
 float fmha_fwd_v3_kernel_launch(const fmha_fwd_v3_args& args, const stream_config& config)
 {
+    /// NOTICE: This was borrowed from Aiter. Make sure the selected remap_opt setting truly
+    /// maximizes the kernel's performance.
+    int remap_opt = 2;
+    if(args.mask_type != static_cast<int>(mask_enum::no_mask) &&
+       ((args.nhead_q % 8 != 0) || (16384 < args.seqlen_q)))
+    {
+        if(65536 <= args.seqlen_q)
+        {
+            remap_opt = 0;
+        }
+        else
+        {
+            remap_opt = 1;
+        }
+    }
+
    auto kargs = Kernel::MakeKargs(args.q_ptr,
                                   args.k_ptr,
                                   args.v_ptr,
@@ -140,7 +157,8 @@ float fmha_fwd_v3_kernel_launch(const fmha_fwd_v3_args& args, const stream_confi
                                   args.batch_stride_o,
                                   args.window_size_left,
                                   args.window_size_right,
-                                   args.mask_type);
+                                   args.mask_type,
+                                   remap_opt);

    dim3 grids            = Kernel::GridSize(args.batch, args.nhead_q, args.seqlen_q, args.hdim_v);
    constexpr dim3 blocks = Kernel::BlockSize();
--- a/example/ck_tile/01_fmha/script/benchmark_fwd_v3.sh
+++ b/example/ck_tile/01_fmha/script/benchmark_fwd_v3.sh
@@ -8,22 +8,16 @@ for prec in "fp16" "bf16" ; do
 for hdim in 128 ; do
 for perm in 0 ; do

-if [ $causal -eq 0 ]; then
-    mask=0
-else
-    mask=b:-1,0
-fi
-
-$EXE -prec=$prec -b=32 -h=16        -s=512   -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
-$EXE -prec=$prec -b=16 -h=16        -s=1024  -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
-$EXE -prec=$prec -b=8  -h=16        -s=2048  -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
-$EXE -prec=$prec -b=4  -h=16        -s=4096  -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
-$EXE -prec=$prec -b=2  -h=16        -s=8192  -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
-$EXE -prec=$prec -b=1  -h=16        -s=16384 -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
+$EXE -prec=$prec -b=32 -h=16        -s=512   -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
+$EXE -prec=$prec -b=16 -h=16        -s=1024  -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
+$EXE -prec=$prec -b=8  -h=16        -s=2048  -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
+$EXE -prec=$prec -b=4  -h=16        -s=4096  -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
+$EXE -prec=$prec -b=2  -h=16        -s=8192  -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
+$EXE -prec=$prec -b=1  -h=16        -s=16384 -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
                                          
-$EXE -prec=$prec -b=1  -h=64        -s=16384 -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
-$EXE -prec=$prec -b=1  -h=16 -h_k=1 -s=65536 -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
-$EXE -prec=$prec -b=1  -h=40        -s=37200 -d=$hdim -mask=$mask -iperm=$perm -operm=$perm -v=$VALID
+$EXE -prec=$prec -b=1  -h=64        -s=16384 -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
+$EXE -prec=$prec -b=1  -h=16 -h_k=1 -s=65536 -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
+$EXE -prec=$prec -b=1  -h=40        -s=37200 -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID

 done
 done