[CK_TILE] FMHA BWD Decode Pipeline (#2643)

* Fix distr

* Duplicate block_fmha_bwd_dq_dk_dv_pipeline_trload_kr_ktr_vr

* decode 16x16 o2
This commit is contained in:
Yi DING
2025-08-12 17:02:52 +08:00
committed by GitHub
parent 352f87e684
commit 8e1eb0c1ee
11 changed files with 1051 additions and 165 deletions

View File

@@ -73,7 +73,7 @@ struct Default2DEpilogue
// how do we fix this ?
template <typename ODramWindowTmp, typename OAccTile>
CK_TILE_DEVICE auto
-operator()(ODramWindowTmp& o_dram_window_tmp, const OAccTile& o_acc_tile, void* = nullptr)
+operator()(ODramWindowTmp& o_dram_window_tmp, const OAccTile& o_acc_tile, void* = nullptr) const
{
// TODO: this is ugly
if constexpr(UseRawStore && (kPadM || kPadN))
@@ -105,7 +105,7 @@ struct Default2DEpilogue
CK_TILE_DEVICE auto operator()(ODramWindowTmp& o_dram_window_tmp,
const OAccTile& o_acc_tile,
const DsDramWindows& /* unused */,
-void* = nullptr)
+void* = nullptr) const
{
return operator()<ODramWindowTmp, OAccTile>(o_dram_window_tmp, o_acc_tile);
}