Re-enable optimization for gfx950 fmha fwd (#2671)

* Fix for fwd/bwd kernel build filter * fix bwd code * save an example for __bf16 type * temp save, waiting for debug * tempsave, fmha_decode * temp save, change all instance to 1wave * fix async copytest bug * Add block_sync_lds_direct_load utility * fix the s_waitcnt_imm calculation * Improve s_waitcnt_imm calculation * fix vmcnt shift * add input validation and bug fix * remove unnecessary output * move test_copy into test * temp save * tempsave * compile pass * tempsave, trload+asyncload done * tempsave. asynccopy+trload sanity checked * remove unnecessary features * fix the lds alignment caused performance regression * enable prefill overload operator(). * remove all lds bankconflict with xor layouts * enable larger tile size; upgrade xor pattern * upgrade prefill pipeline; simple iglp; consistent data produce and consume order * small refactor * Load Q through lds, implement xor; * add vmcnt guard before load ktile * Add v_permlaneb32 for block_reduce. Disable it as it will cause un-coexecutable packed math in FA * Add XOR fold strategy for hdim<128, but perf dropped; disable it by default; wait further perf debug * add __restrict__ to tr load * merge fa_decode pipeline into fmha_fwd api * remove unnecessary files; rename some files * Remove unnecessary changes * bug fix, clang format; * remove non-necessary change * fix clangformat with 18.1.3 * fix bugs * fix bug * fix bug on non-gfx950 * fix bugs in gemm * fix bug in pki4 * tempsave, update the blocksync functions * change the warp setting for hdim32 fmha fwd * clang format * fix conflict. disable all v-col instance for fmha fwd * Fix the bug * clang format * refactor blockgemm change, isolate to v2; --------- Co-authored-by: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Co-authored-by: asleepzzz <hanwen.chang@amd.com>
2026-04-20 14:59:17 +00:00 · 2025-08-13 14:57:43 +08:00
parent 452791a3ba
commit 05a6e92705
27 changed files with 3767 additions and 534 deletions
--- a/include/ck_tile/ops/reduce/block/block_reduce.hpp
+++ b/include/ck_tile/ops/reduce/block/block_reduce.hpp
@@ -14,10 +14,14 @@ namespace ck_tile {
 * Y dim must have at least one dim not been reduced
 */
 // synchronize reduce result (cross lane reduction and broadcast on replicated dimension)
-template <typename AccDistributedTensor_, typename ReduceFunc, bool WithBroadcast = true>
+template <typename AccDistributedTensor_,
+          typename ReduceFunc,
+          bool WithBroadcast = true,
+          bool CrossWarp     = true>
 CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor,
                                           const ReduceFunc& reduce_func,
-                                           bool_constant<WithBroadcast> = {})
+                                           bool_constant<WithBroadcast> = {},
+                                           bool_constant<CrossWarp>     = {})
 {
    using Dstr             = typename AccDistributedTensor_::StaticTileDistribution;
    using DstrEncode       = typename Dstr::DstrEncode;
@@ -56,14 +60,24 @@ CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor,

                // reduction sweep forward
                static_for<0, nstage, 1>{}([&](auto istage) {
-                    constexpr index_t lid_delta =
-                        lid_over_rid_derivative * (1 << (nstage - istage - 1));
+                    if constexpr(CrossWarp)
+                    {
+                        constexpr index_t lid_delta =
+                            lid_over_rid_derivative * (1 << (nstage - istage - 1));

-                    // pull data from remote lane
-                    const auto v_remote = warp_shuffle_down(v_local, lid_delta);
+                        // pull data from remote lane
+                        const auto v_remote = warp_shuffle_down(v_local, lid_delta);

-                    // reduce
-                    v_local = reduce_func(v_local, v_remote);
+                        // reduce
+                        v_local = reduce_func(v_local, v_remote);
+                    }
+                    else
+                    {
+                        // pull data from remote lane
+                        const auto v_swapped_regs = warp_shuffle_down_pair(v_local);
+                        // reduce
+                        v_local = reduce_func(v_swapped_regs.at(0), v_swapped_regs.at(1));
+                    }
                });
            }
        });