Input/output permutation for fused attention (#460)

* reopen masking att instance due to CI is upgraded * re-enable instances previously failed on 9110 * enable ksize-kpadding pair validity test * add non-masked attention+permute test; expose masking boolean to attention kernel handles * disable bench * fix test * move files * bulk rename batched_gemm_masking_scale_softmax_gemm_permute to batched_gemm_softmax_gemm_permute * format * amend rename * disable bench in test * add mask/no-mask test for non-permute attention kernels * disable broken kernel instance * example working add non-permuted problem statement evaluating whether overhead comes from permutation or the extra kernel arg * interface for bias addition without implementing it * test and profiler running * tidy * mask type determined by enum class * unify example code * move masking specialization to its own header * align formats * extract helper functions * experiment merging dims for attn w/ permute; shows perf parity with attn wo/ permute * add tensor specialization to template args since tensor spec packed shows perf parity when permutation isn't needed remove redundant template args comment on 'packed' tensor specialization * grouped attention with input/output permute example * format * clean up * refactor acc0 tile visitor Co-authored-by: shaojiewang <wsjmessi@163.com> Co-authored-by: Chao Liu <chao.liu2@amd.com>
2026-04-20 14:59:17 +00:00 · 2022-10-28 04:58:20 +08:00
parent cd51732690
commit de37550f72
42 changed files with 2654 additions and 2196 deletions
--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -593,7 +593,8 @@ struct XdlopsGemm
    static constexpr auto I4 = Number<4>{};
    static constexpr auto I5 = Number<5>{};

-    using CIndex = MultiIndex<2>;
+    using CIndex   = MultiIndex<2>;
+    using CIndex4D = MultiIndex<4>;

    __device__ static constexpr index_t GetNumBlks() { return mfma_instr.num_output_blks; }

@@ -822,6 +823,16 @@ struct XdlopsGemm
        return TransposeC ? CIndex{n_offset, m_offset} : CIndex{m_offset, n_offset};
    }

+    __device__ static CIndex4D GetBeginOfThreadBlk4D(index_t /* xdlops_i */, index_t /* blk_i */)
+    {
+        const auto blk_idx = GetBlkIdx();
+
+        const auto blk_id = blk_idx[I0];
+        const auto blk_td = blk_idx[I1];
+
+        return TransposeC ? CIndex4D{blk_td, I0, blk_id, I0} : CIndex4D{I0, blk_id, I0, blk_td};
+    }
+
    static constexpr auto mfma = MfmaSelector<base_type, MPerXdlops, NPerXdlops>{};

    static constexpr auto mfma_instr = mfma.selected_mfma;