FlashAttention fwd: add (80, 96) instance (#3415)

* add hdim (96,96) instance

* change to (80,96)

* format py

* remove 96 in optdim

* when N=6, switch to llvm_amdgcn_raw_buffer_load_i32x3
This commit is contained in:
ltqin
2025-12-18 01:16:11 +08:00
committed by GitHub
parent fe3d52d9b0
commit 92653168c2
6 changed files with 127 additions and 6 deletions

View File

@@ -12,6 +12,8 @@ static CK_TILE_HOST_DEVICE constexpr index_t ceil_to_qualified_tile_length()
{
if constexpr(Headdim == 48)
return 48;
else if constexpr(Headdim == 80)
return 96;
else if constexpr(Headdim == 96)
return 128;
else if constexpr(Headdim == 160)