[CK_TILE] Add sequence padding and variable length support in fmha (a… (#2851)

* [CK_TILE] Add sequence padding and variable length support in fmha (and v3) - Group Mode Padding: Introduces the `-s_qpad` argument to support physically padded layouts. Kernels now use padded start pointers (`seqstart_padded_*_ptr`) for memory addressing. - Batch Mode Variable Length: Adds `-q_eff_lens` and `-kv_eff_lens` arguments for efficient processing of variable-length sequences by passing cumulative effective lengths (`cu_seqlen_*_ptr`) to the kernel. - FMHA examples: Support padding and variable length both in group and batch mode. Dispatcher is updated as well (dispatch to kPadSeqLenK enabled pipeline). - New padding test cases: Add padding test cases to `smoke_test_fwd.sh`, and add benchmarks to `benchmark_fwd.sh` and `benchmark_fwd_v3.sh` as well. These test cases and benchmarks that specifically validate/benchmark the new padding and variable-length functionalities in both group and batch modes. * [CK_TILE] Fix build error in fmha unit tests --------- Co-authored-by: Po Yen Chen <PoYen.Chen@amd.com> Co-authored-by: Yi DING <yi.ding@amd.com>
2026-05-02 12:41:26 +00:00 · 2025-09-19 17:36:49 +08:00
parent 2aec38f9ec
commit 86dd59cd01
13 changed files with 1032 additions and 60 deletions
--- a/example/ck_tile/01_fmha/example_fmha_fwd.cpp
+++ b/example/ck_tile/01_fmha/example_fmha_fwd.cpp
@@ -33,6 +33,10 @@ auto create_args(int argc, char* argv[])
                "0",
                "seqlen_k for new key/value, 0 means not to use this at all; "
                "-1 to choose s_knew in [1, s] randomly.")
+        .insert("s_qpad",
+                "-1",
+                "seqlen_q stride between 2 batches (group-mode optional).\n"
+                "Provide positive strides per-batch to simulate physical padding on Q.")
        .insert("s_kpad",
                "-1",
                "seqlen_k stride between 2 batches, currently used in group-mode only\n"
@@ -107,7 +111,15 @@ auto create_args(int argc, char* argv[])
        .insert("warmup", "5", "number of iterations before benchmark the kernel")
        .insert("repeat", "20", "number of iterations to benchmark the kernel")
        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
-        .insert("jsonfile", "fmha_fwd.json", "json file name to dump results");
+        .insert("jsonfile", "fmha_fwd.json", "json file name to dump results")
+        .insert("q_eff_lens",
+                "",
+                "Batch-mode only: per-batch effective seqlen for Q (exclude PAD).\n"
+                "Comma-separated list of length 'b'. If empty, no override.")
+        .insert("kv_eff_lens",
+                "",
+                "Batch-mode only: per-batch effective seqlen for KV (exclude PAD).\n"
+                "Comma-separated list of length 'b'. If empty, no override.");

    bool result = arg_parser.parse(argc, argv);
    return std::make_tuple(result, arg_parser);
@@ -127,6 +139,9 @@ auto run(const ck_tile::ArgParser& arg_parser)
    ck_tile::index_t hdim_v          = arg_parser.get_int("d_v");
    ck_tile::index_t seqlen_knew     = arg_parser.get_int("s_knew");
    auto seqlen_kpads                = arg_parser.get_int_vec("s_kpad");
+    auto seqlen_qpads                = arg_parser.get_int_vec("s_qpad");
+    auto q_eff_lens_per_batch        = arg_parser.get_int_vec("q_eff_lens");
+    auto kv_eff_lens_per_batch       = arg_parser.get_int_vec("kv_eff_lens");
    ck_tile::index_t rotary_dim      = arg_parser.get_int("rotary_dim");
    bool i_perm                      = arg_parser.get_bool("iperm");
    bool o_perm                      = arg_parser.get_bool("operm");
@@ -174,7 +189,10 @@ auto run(const ck_tile::ArgParser& arg_parser)
                                        hdim_q,
                                        hdim_v,
                                        seqlen_knew,
+                                        seqlen_qpads,
                                        seqlen_kpads,
+                                        q_eff_lens_per_batch,
+                                        kv_eff_lens_per_batch,
                                        rotary_dim,
                                        i_perm,
                                        o_perm,