flashattention fwd add (80, 96) instance (#3415)

* add hdim (96,96) instance

* change to (80,96)

* format py

* remove 96 in optdim

* when N=6, change to llvm_amdgcn_raw_buffer_load_i32x3
This commit is contained in:
ltqin
2025-12-18 01:16:11 +08:00
committed by GitHub
parent fe3d52d9b0
commit 92653168c2
6 changed files with 127 additions and 6 deletions

View File

@@ -152,6 +152,7 @@ using bf16x64_t = bfloat16_t __attribute__((ext_vector_type(64)));
// i32
// Fixed-width int32 SIMD vector aliases built on Clang's ext_vector_type
// attribute. The element count in the attribute is the vector width.
// using int32_t = ...
using int32x2_t = int32_t __attribute__((ext_vector_type(2)));
// 3-lane (96-bit) variant — added to back the i32x3 raw buffer load path
// (per commit: used when N=6, i.e. 6 x 16-bit = 3 x i32); NOTE(review):
// intended consumer is llvm_amdgcn_raw_buffer_load_i32x3 — confirm at call site.
using int32x3_t = int32_t __attribute__((ext_vector_type(3)));
using int32x4_t = int32_t __attribute__((ext_vector_type(4)));
using int32x8_t = int32_t __attribute__((ext_vector_type(8)));
using int32x16_t = int32_t __attribute__((ext_vector_type(16)));