flashattention fwd add (80, 96) instance (#3415)

* add hdim (96,96) instance

* change to (80,96)

* format py

* remove 96 in optdim

* when N=6, change to llvm_amdgcn_raw_buffer_load_i32x3
This commit is contained in:
ltqin
2025-12-18 01:16:11 +08:00
committed by GitHub
parent fe3d52d9b0
commit 92653168c2
6 changed files with 127 additions and 6 deletions

View File

@@ -152,6 +152,7 @@ using bf16x64_t = bfloat16_t __attribute__((ext_vector_type(64)));
// i32
// Fixed-width int32 SIMD vector aliases built on Clang's ext_vector_type
// attribute. The element count in the attribute is the vector width.
// using int32_t = ...
using int32x2_t = int32_t __attribute__((ext_vector_type(2)));
// 3-lane (96-bit) variant — added to back the i32x3 raw buffer load path
// (per commit: used when N=6, i.e. 6 x 16-bit = 3 x i32); NOTE(review):
// intended consumer is llvm_amdgcn_raw_buffer_load_i32x3 — confirm at call site.
using int32x3_t = int32_t __attribute__((ext_vector_type(3)));
using int32x4_t = int32_t __attribute__((ext_vector_type(4)));
using int32x8_t = int32_t __attribute__((ext_vector_type(8)));
using int32x16_t = int32_t __attribute__((ext_vector_type(16)));