[CK_TILE] FMHA bwd Support hdim as a Multiple of 32 (#2130)

* Fix shuffle_tile

* Add fmha bwd d160

* CHANGELOG

* Use static_cast

* Update

---------

Co-authored-by: asleepzzz <hanwen.chang@amd.com>
This commit is contained in:
Yi DING
2025-07-29 09:31:14 +08:00
committed by GitHub
parent 7fe50dc3da
commit 1926cd0cb8
5 changed files with 446 additions and 141 deletions

View File

@@ -357,6 +357,8 @@ def get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype : str) -> Optional[dict
"kr_ktr_vr_iglp", "kr_ktr_vr"],
'128' : [FmhaBwdDQDKDVTileSize( 16, 128, 128, 16, 128, 16, 32, 128, 128, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
"kr_ktr_vr_iglp", "kr_ktr_vr"],
# '160' : [FmhaBwdDQDKDVTileSize( 32, 64, 160, 32, 160, 32, 32, 160, 160, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1),
# "kr_ktr_vr_iglp", "kr_ktr_vr"],
'256' : [FmhaBwdDQDKDVTileSize( 16, 64, 256, 16, 256, 16, 32, 256, 256, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
"kr_ktr_vr_iglp", "kr_ktr_vr"]
}