flashattention fwd add (80, 96) instance (#3415)

* add hdim (96,96) instance

* change to (80,96)

* format py

* remove 96 in optdim

* when N=6 change to llvm_amdgcn_raw_buffer_load_i32x3
This commit is contained in:
ltqin
2025-12-18 01:16:11 +08:00
committed by GitHub
parent fe3d52d9b0
commit 92653168c2
6 changed files with 127 additions and 6 deletions

View File

@@ -47,7 +47,7 @@ set(FMHA_FWD_CODE_GEN_COMMON_ARGS
${CMAKE_CURRENT_LIST_DIR}/generate.py
--targets ${FMHA_TARGETS_ARG}
--api ${FMHA_FWD_APIS}
-    --optdim 32,64,128,256
+    --optdim 32,64,80,128,256
# --filter fmha_fwd...
)
set(FMHA_BWD_CODE_GEN_COMMON_ARGS