add fmha fwd splitkv receipt for aiter c++ api (#2068)

* add s_randval for c++ api * Fix bug of bias in splitkv --------- Co-authored-by: rocking <ChunYu.Lai@amd.com> [ROCm/composable_kernel commit: 5f885d2b7a]
2026-05-14 10:09:41 +00:00 · 2025-04-10 23:21:13 +08:00
parent c4e0659e51
commit cca9cca699
5 changed files with 30 additions and 10 deletions
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
@@ -545,10 +545,9 @@ def get_bwd_dq_dk_dv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                    cond &= dpad == dvpad
                    if not cond:
                        continue
+            # aiter::mha_bwd C++ api integration
            elif receipt == 600:
                    cond = dtype in ['fp16', 'bf16']
-                    cond &= mode in ["batch", "group"]
-                    cond &= dropout in ['no', 'dropout_wg32',  'dropout_wg16']
                    cond &= dpad == dvpad
                    if not cond:
                        continue
@@ -689,6 +688,11 @@ def get_bwd_dot_do_o_blobs(kernel_filter : Optional[str], receipt) -> List[FmhaB
                    cond &= mode == "group"
                    if not cond:
                        continue
+            # aiter::mha_bwd C++ api integration
+            elif receipt == 600:
+                    cond = dtype in ['fp16', 'bf16']
+                    if not cond:
+                        continue
            gen.append(k)

    return gen
@@ -841,6 +845,11 @@ def get_bwd_convert_dq_blobs(kernel_filter : Optional[str], receipt) -> List[Fmh
                    cond &= mode == "group"
                    if not cond:
                        continue
+            # aiter::mha_bwd C++ api integration
+            elif receipt == 600:
+                    cond = dtype in ['fp16', 'bf16']
+                    if not cond:
+                        continue
            gen.append(k)

    return gen
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
@@ -536,10 +536,9 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm
                    cond &= pipeline.F_squant == 'f'
                    if not cond:
                        continue
-                # Aiter aiter::mha_fwd integration
-                elif receipt == 500:
+                # aiter::mha_fwd C++ api integration
+                elif receipt == 600:
                    cond = dtype in ['fp16', 'bf16']
-                    cond &= mode in ['batch', 'group']
                    cond &= pipeline.F_vlayout == 'row'
                    cond &= pipeline.F_squant == 'f'
                    if not cond:
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -738,6 +738,13 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                    cond &= pipeline.F_squant == 'f'
                    if not cond:
                        continue
+                # aiter::mha_fwd_splikv C++ api integration
+                elif receipt == 600:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= pipeline.F_vlayout == 'row'
+                    cond &= pipeline.F_squant == 'f'
+                    if not cond:
+                        continue
                api_pool.register_traits(k.api_trait())
                gen.append(k)

@@ -796,6 +803,11 @@ def get_fwd_splitkv_combine_blobs(kernel_filter : Optional[str], receipt) -> Lis
                    cond &= mode == "group"
                    if not cond:
                        continue
+                # aiter::mha_fwd_splikv C++ api integration
+                elif receipt == 600:
+                    cond = dtype in ['fp16', 'bf16']
+                    if not cond:
+                        continue
                gen.append(k)

    return gen
--- a/example/ck_tile/01_fmha/generate.py
+++ b/example/ck_tile/01_fmha/generate.py
@@ -109,8 +109,8 @@ if __name__ == "__main__":
             "  100-199: Only generate instance for Aiter(mha_fwd) integration\n" + \
             "  200-299: Only generate instance for Aiter(mha_varlen_fwd) integration\n" + \
             "  300-399: Only generate instance for Aiter(mha_bwd) integration\n" + \
-             "  400-499: Only generate instance for Aiter(mha_varlen_bwd) integration"
-
+             "  400-499: Only generate instance for Aiter(mha_varlen_bwd) integration\n" + \
+             "  600-699: Only generate instance for aiter::mha_fwd && aiter::mha_fwd_splitkv && aiter::mha_bwd C++ api integration"
    )

    args = parser.parse_args()
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
@@ -95,8 +95,8 @@ struct FmhaFwdSplitKVKernel
            "w" + _TS_(g1wt::at(ck_tile::number<0>{})) + "x" + _TS_(g1wt::at(ck_tile::number<1>{})) + "x" + _TS_(g1wt::at(ck_tile::number<2>{})) + "_" +
            (kBlockPerCuInput == -1 ? "" : ("o" + _TS_(kBlockPerCu) + "_")) + _SS_(FmhaPipeline::name) + "_" +
            "v" + (std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor> ? "r" : "c") + (pn.empty() ? "_npad" : "_" + pn) +
-            (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("_nbias") : (_SS_("_") + BlockAttentionBiasEnumToStr<BiasEnum>::name)) + 
-            (kHasMask ? "_" + _SS_(FmhaMask::name) : "_nmask") + (kStoreLSE ? "_lse" : "_nlse" ) + 
+            (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("_nbias") : (_SS_("_") + BlockAttentionBiasEnumToStr<BiasEnum>::name)) +
+            (kHasMask ? "_" + _SS_(FmhaMask::name) : "_nmask") + (kStoreLSE ? "_lse" : "_nlse" ) +
            (kDoFp8StaticQuant ? "_squant" : "_nsquant") + (kIsPagedKV ? "_pagedkv" : "_npagedkv" );
        #undef _SS_
        #undef _TS_
@@ -563,7 +563,7 @@ struct FmhaFwdSplitKVKernel
            }
            if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
            {
-                batch_offset_bias = query_start * kargs.stride_bias + key_start;
+                batch_offset_bias = query_start * kargs.stride_bias;
            }

            batch_offset_lse_acc = query_start;