Add batched attention special kernel instances (#424)

* sanity check

* add attribution

* add irregular K tile size for batched attention

* format
This commit is contained in:
Anthony Chang
2022-09-20 08:20:54 +08:00
committed by GitHub
parent c6b8b472a7
commit 7c788e10ce
5 changed files with 51 additions and 10 deletions

View File

@@ -649,6 +649,9 @@ struct BlockwiseGemmXdlops_v2
static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL);
static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL);
static_assert(KPerThread % KPack == 0,
"Wrong KPack setting; try increasing KPerThread or decreasing KPack");
StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
FloatAcc,
MRepeat * NRepeat,

View File

@@ -881,9 +881,10 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle
FloatGemmAcc c_new =
(running_sum[iM] * math::exp(running_max[iM] - running_max_new[iM]) * c +
math::exp(max[iM] - running_max_new[iM]) * acc1) /
running_sum_new[iM]; // O_new
running_sum_new[iM]; // Formula by Dao et al.,
// https://arxiv.org/pdf/2205.14135v2.pdf section 3.1
c_thread_buf(I) = c_new;
c_thread_buf(I) = c_new; // O_new
});
});