[CK_TILE] Add permuteN optimization to remove lds operation in c_shuffle (#2764)

* permuteN optimization to remove lds operation in c_shuffle

* add the change log

---------

Co-authored-by: ThomasNing <thomas.ning@amd.com>
This commit is contained in:
lalala-sh
2025-09-09 13:02:48 +08:00
committed by GitHub
parent 92b07380d3
commit 75570d0fa8
5 changed files with 189 additions and 4 deletions

View File

@@ -276,6 +276,8 @@ struct GemmConfigPreshuffleDecode : public GemmConfigBase
static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_PRESHUFFLE_V2;
static constexpr bool Preshuffle = true;
static constexpr bool DoubleSmemBuffer = true;
static constexpr int N_Repeat = N_Tile / N_Warp_Tile / N_Warp;
static constexpr bool TiledMMAPermuteN = N_Repeat % 2 == 0;
};
template <typename PrecType>
@@ -298,6 +300,8 @@ struct GemmConfigPreshufflePrefill : public GemmConfigBase
static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_PRESHUFFLE_V2;
static constexpr bool Preshuffle = true;
static constexpr bool DoubleSmemBuffer = true;
static constexpr int N_Repeat = N_Tile / N_Warp_Tile / N_Warp;
static constexpr bool TiledMMAPermuteN = N_Repeat % 2 == 0;
};
template <typename ADataType, typename BDataType = ADataType, typename CDataType = ADataType>