[CK_TILE] Add fmha fwd N-Warp S-Shuffle pipeline (fmha fwd splitkv pipeline variant) (#1705)
* Add check for zero values
* Add static assertions
* Remove invalid option '-e' in smoke_test.sh
* Use correct path of smoke_test.sh
* Avoid zero-sized shared memory array
* Add warning comment
* Replace expr by integer_divide_ceil() call
* Use more readable constant names
* Write down assumption as static assertion
* Add more diagnostic error messages
* Fix wrong BlockWarps when using default pipeline policy
* Add more static assertions for A LDS desc
* Allow using vector size < 8 for data type fp16/bf16
* Align vector size between DRAM dist & LDS desc
* Remove no-longer-used func decl
* Fix wrongly displayed pipeline name
* Undo policy template changes for tile_example_gemm_basic
* Add missing space and make error message stand out
* Unify print precision
* Add missing include directive <iomanip>
* Replace constant 64 by get_warp_size() call
* Replace constant 128 by named variable: BankLength
* Add kAMBlock/kBNBlock attributes
* Allow using different A/B warp dist for multiple blocks
* Add helper function to get warp dist encodings
* Add 4x64x4 fp16 warp gemm attribute impl
* Complete the A/B warp dist encoding logic
* Fix wrong thread mapping for C matrix
* Use smaller vector size for small tile
* Add static assert to block unsupported warp gemm impl
* Extract common code out as helper method
* Add 4x64x16 fp16 warp gemm type alias
* Add comment to warn developers
* Undo WarpGemmAtrributeMfma<> changes
* Use clearer static assertion error message
* Add trivial wrapper to get warp dstr encodings
* Only transpose warp gemm result if it's square
* Fix compilation error
* Support multi-block warp gemm (on N direction)
* Remove duplicated code
* Fix output encoding of warp gemm
* Fix wrong shape of WarpGemmAtrributeMfmaIterateK<>
* Remove unused code
* Fix wrong shape of WarpGemmAttributeMfmaImplF16F16F32M4N64K4
* Add type config for bf16_t
* Add 4x64x16 bf16 warp gemm
* Update WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution
* Add 64x4x4 fp16/bf16 warp gemm impl
* Add 64x4x16 fp16/bf16 warp gemm
* Add static assertion for better error diagnostics
* Get Q dram dstr directly from block gemm
* Add missing header: fused_moe.hpp
* Allow specifying different warp-gemm for gemm0 & gemm1
* Store P matrix into LDS before gemm1
* Fix inconsistent kernel name
* Remove constraint on gemm0 & gemm1 block warps
* Remove unsupported vector size from checking list
* Allow using 4x64x16 warp gemm for gemm0
* Finish policy customization
* Finish pipeline modification
* Use block warps in codegen
* Fix wrong rank of m_lds_window origin
* Use better distributed tensor
* Make P-store earlier
* Remove duplicated expressions
* Remove unnecessary tile window
* Create new files for new splitkv pipeline
* Separate old/new pipeline codegen logic
* Sync changes from develop
* Undo gemm kernel/pipeline changes
* Undo gemm example changes
* Remove blank lines
* Fix typo
* Use new warp gemm interface
* Fix link error
* Fix wrong pipeline tag
* Fix more link errors
* Avoid unnecessary padding
* Always use vector load for K
* Pad on fastest dimension when necessary
* Force padding Q on hdim_q
* Set high dimension padding flag to false
* Re-format headers
* Use warps=<1, 4, 1> for both gemm0 & gemm1
* Fix compilation errors
* Remove m/l shuffle logic
* Ignore duplicate data when writing lse_acc
* Use gemm0 block warps as lds tile width
* Remove hard-coded numbers
* Fix wrong distribution width
* Remove unnecessary code
* Add s_barrier before writing to LDS
* Store Q into LDS before gemm0
* Fix wrong Q tile size
* Use simple Q lds descriptor for debugging
* Use more realistic Q lds descriptor
* Add comment & use better variable name
* Make Q lds space not overlap with others
* Remove unnecessary block_tile_reduce_sync() call
* Move Q load statements
* Move block_sync_lds() right before use
* Re-order instructions
* Remove unnecessary lambda expression
* Use 8 threads on kMaxSplits direction while doing reduction
* Tiny correction for using 8 threads on kMaxSplits direction for combine kernel
* Pad num_split direction of o_acc tile window to 4x
* Update splitkv combine pipeline design
* Add kN1 back to splitkv combine pipeline problem
* Fix compilation errors
* Add missing template parameter
* Fix wrong splitkv combine kernel name
* Fix wrong origin
* Fix wrong LDS descriptor shape
* Fix sync & reduction logic
* Remove unnecessary static assertions
* Extract tile size computation logic
* Make sure we can reuse padding flags in combine kernels
* Rename variables
* Use OaccDataType in BlockFmhaSplitKVCombinePipelineTileSizes<>
* Remove unnecessary static assertion
* Fix function name typo
* Add constraint on kN1 template parameter
* Hide K tile loading latency in earlier iteration
* Fix wrong splitkv kernel name
* Use s_shuffling to replace p_shuffling, which removes the need for cross-warp reduction
* Rename pipeline
* Fix wrong pipeline name attribute
* Add GetAlignmentQ() for NWarpSShuffle pipeline
* Separate Q tile into dram tile & register tile concepts
* Remove non-square warp gemm transpose c type alias
* Fallback tile size changes for fmha fwd splitkv
* Remove redundant change
* Refine naming for the S tile
* Use better naming of the S tile dstr (read from lds)
* Share Q lds with K lds
* Tiny change
* Fix by using static_for to pass CI checking

--------

Co-authored-by: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
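For context on the math the combine kernel in this PR implements: each KV split produces a partial output o_acc and a partial log-sum-exp lse_acc, and FmhaFwdSplitKVCombineKernel reduces them across splits. Below is a minimal scalar sketch of that reduction, assuming the standard flash-attention split-K formulation; it mirrors the math only (not the kernel's warp/LDS tiling), and the function and variable names are illustrative, not taken from the PR.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Scalar reference for combining per-split partial attention results.
// lse_acc[s] is split s's log-sum-exp, o_acc[s] its partial output row.
float combine_splits(const std::vector<float>& lse_acc,
                     const std::vector<std::vector<float>>& o_acc,
                     std::vector<float>& o_out)
{
    // 1. global log-sum-exp over all splits (max-subtracted for stability)
    float lse_max = -INFINITY;
    for(float lse : lse_acc)
        lse_max = std::max(lse_max, lse);
    float sum = 0.f;
    for(float lse : lse_acc)
        sum += std::exp(lse - lse_max);
    const float lse = lse_max + std::log(sum);

    // 2. rescale each split's partial output and accumulate
    o_out.assign(o_acc[0].size(), 0.f);
    for(std::size_t s = 0; s < lse_acc.size(); ++s)
    {
        const float scale = std::exp(lse_acc[s] - lse);
        for(std::size_t d = 0; d < o_out.size(); ++d)
            o_out[d] += scale * o_acc[s][d];
    }
    return lse; // the combined LSE that kStoreLSE kernels write out
}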
@@ -71,7 +71,8 @@ struct FmhaFwdKernel
         using bfs  = typename FmhaPipeline::BlockFmhaShape;
         using g0br = typename bfs::Gemm0BlockWarps;
         using g1br = typename bfs::Gemm1BlockWarps;
-        using gwt  = typename bfs::Gemm0WarpTile;
+        using g0wt = typename bfs::Gemm0WarpTile;
+        using g1wt = typename bfs::Gemm1WarpTile;
 #define _SS_ std::string
 #define _TS_ std::to_string
         auto pn = [&] () {
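Splitting the old gwt alias into g0wt/g1wt mirrors the pipeline change in this PR: gemm0 and gemm1 may now use different warp tiles (the commit log mentions, for example, enabling a 4x64x16 warp gemm for gemm0), so the generated kernel name has to encode both warp-tile shapes, as the hunk below does.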
@@ -88,7 +89,8 @@ struct FmhaFwdKernel
                 _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kQKHeaddim) + "_" +
                 "r" + _TS_(g0br::at(ck_tile::number<0>{})) + "x" + _TS_(g0br::at(ck_tile::number<1>{})) + "x" + _TS_(g0br::at(ck_tile::number<2>{})) + "_" +
                 "r" + _TS_(g1br::at(ck_tile::number<0>{})) + "x" + _TS_(g1br::at(ck_tile::number<1>{})) + "x" + _TS_(g1br::at(ck_tile::number<2>{})) + "_" +
-                "w" + _TS_(gwt::at(ck_tile::number<0>{})) + "x" + _TS_(gwt::at(ck_tile::number<1>{})) + "x" + _TS_(gwt::at(ck_tile::number<2>{})) + "_" +
+                "w" + _TS_(g0wt::at(ck_tile::number<0>{})) + "x" + _TS_(g0wt::at(ck_tile::number<1>{})) + "x" + _TS_(g0wt::at(ck_tile::number<2>{})) + "_" +
+                "w" + _TS_(g1wt::at(ck_tile::number<0>{})) + "x" + _TS_(g1wt::at(ck_tile::number<1>{})) + "x" + _TS_(g1wt::at(ck_tile::number<2>{})) + "_" +
                 (kBlockPerCuInput == -1 ? "" : ("o" + _TS_(kBlockPerCu) + "_")) + _SS_(FmhaPipeline::name) + "_" +
                 "v" + (std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor> ? "r" : "c") + (pn.empty() ? "" : "_" + pn) +
                 (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("") : (_SS_("_") + BlockAttentionBiasEnumToStr<BiasEnum>::name)) +
@@ -8,9 +8,11 @@ namespace ck_tile {
 template <typename TilePartitioner_, typename FmhaPipeline_, typename EpiloguePipeline_>
 struct FmhaFwdSplitKVCombineKernel
 {
     using TilePartitioner  = remove_cvref_t<TilePartitioner_>;
     using FmhaPipeline     = remove_cvref_t<FmhaPipeline_>;
     using EpiloguePipeline = remove_cvref_t<EpiloguePipeline_>;
 
+    static constexpr index_t kNumWarps   = FmhaPipeline::kNumWarps;
     static constexpr index_t kBlockSize  = FmhaPipeline::kBlockSize;
     static constexpr index_t kBlockPerCu = FmhaPipeline::kBlockPerCu;
+    static_assert(kBlockPerCu > 0);
@@ -50,8 +52,7 @@ struct FmhaFwdSplitKVCombineKernel
         return
             _SS_("fmha_fwd_splitkv_combine_d") + _TS_(FmhaPipeline::kHeadDimV) + "_" + _SS_(t2s<ODataType>::name) +
             "_" + (kIsGroupMode ? "group" : "batch") + "_"
-            "b" + _TS_(FmhaPipeline::kM0) + "x" +
-            _TS_(FmhaPipeline::kN1) + "_" +
+            "b" + _TS_(FmhaPipeline::kN1) + "_" +
             (kBlockPerCuInput == -1 ? "" : ("o" + _TS_(kBlockPerCu) + "_")) +
             _SS_(FmhaPipeline::name) +
             (pn.empty() ? "" : "_" + pn) +
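For illustration only, with hypothetical values kHeadDimV=128, fp16 output, batch mode, and kN1=32, this scheme would produce a name along the lines of fmha_fwd_splitkv_combine_d128_fp16_batch_b32_<pipeline name>...; note the block-tile field now encodes only kN1 instead of kM0xkN1.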
@@ -339,37 +340,56 @@ struct FmhaFwdSplitKVCombineKernel
                 number<FmhaPipeline::kAlignmentOacc>{},
                 number<1>{});
 
+            // read 4 * (kM0, kN1) o_acc tiles simultaneously by 4 warps
             const auto o_acc_dram_view = pad_tensor_view(
                 o_acc_dram_naive,
-                make_tuple(number<1>{}, number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kN1>{}),
-                sequence<false, kPadSeqLenQ, kPadHeadDimV>{});
+                make_tuple(
+                    number<kNumWarps>{}, number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kN1>{}),
+                sequence<true, kPadSeqLenQ, kPadHeadDimV>{});
 
+            const index_t padded_num_splits =
+                o_acc_dram_view.get_tensor_descriptor().get_lengths()[number<0>{}];
             const index_t padded_seqlen_q =
                 o_acc_dram_view.get_tensor_descriptor().get_lengths()[number<1>{}];
             const index_t padded_hdim_v =
                 o_acc_dram_view.get_tensor_descriptor().get_lengths()[number<2>{}];
 
-            return transform_tensor_view(
+            const index_t num_m_tiles = integer_divide_floor(padded_seqlen_q, FmhaPipeline::kM0);
+
+            // transform tensor view by following steps, given shape: (padded_num_splits,
+            // padded_seqlen_q, padded_hdim_v)
+            // 1. unmerge to (padded_num_splits, num_m_tiles, kM0, padded_hdim_v)
+            // 2. transpose to (num_m_tiles, padded_num_splits, kM0, padded_hdim_v)
+            // 3. merge to (num_m_tiles * padded_num_splits * kM0, padded_hdim_v)
+            auto transposed = transform_tensor_view(
                 o_acc_dram_view,
-                make_tuple(make_merge_transform(make_tuple(kargs.num_splits, padded_seqlen_q)),
+                make_tuple(make_pass_through_transform(padded_num_splits),
+                           make_unmerge_transform(make_tuple(num_m_tiles, FmhaPipeline::kM0)),
                            make_pass_through_transform(padded_hdim_v)),
-                make_tuple(sequence<0, 1>{}, sequence<2>{}),
-                make_tuple(sequence<0>{}, sequence<1>{}));
+                make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
+                make_tuple(sequence<1>{}, sequence<0, 2>{}, sequence<3>{}));
+
+            return transform_tensor_view(
+                transposed,
+                make_tuple(make_merge_transform(
+                               make_tuple(num_m_tiles, padded_num_splits, FmhaPipeline::kM0)),
+                           make_pass_through_transform(padded_hdim_v)),
+                make_tuple(sequence<0, 1, 2>{}, sequence<3>{}),
+                make_tuple(sequence<0>{}, sequence<1>{}));
         }();
 
         auto lse_acc_dram_window = make_tile_window(
             lse_acc_dram,
-            [&]() {
-                return make_tuple(number<FmhaPipeline::kMaxSplits>{}, number<FmhaPipeline::kM0>{});
-            }(),
+            make_tuple(number<FmhaPipeline::kMaxSplits>{}, number<FmhaPipeline::kM0>{}),
             {0, i_m0});
 
+        const index_t padded_num_splits =
+            integer_divide_ceil(kargs.num_splits, kNumWarps) * kNumWarps;
+
         auto o_acc_dram_window = make_tile_window(
             o_acc_dram,
-            [&]() {
-                return make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kN1>{});
-            }(),
-            {i_m0, i_n1});
+            make_tuple(number<kNumWarps * FmhaPipeline::kM0>{}, number<FmhaPipeline::kN1>{}),
+            {i_tile_m * padded_num_splits * FmhaPipeline::kM0, i_n1});
 
         // LSE DRAM window
         auto lse_dram_window = [&, i_nhead_ = i_nhead]() {
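To sanity-check the unmerge/transpose/merge comment above, it helps to trace one row through the index math. The standalone sketch below uses hypothetical tile sizes (kM0, kNumWarps, num_splits are made-up values, not taken from the PR) and reproduces both the padded_num_splits rounding and the flattened-row formula behind the o_acc window origin i_tile_m * padded_num_splits * FmhaPipeline::kM0.

#include <cstdio>

int main()
{
    const int kM0        = 32; // hypothetical block-tile rows
    const int kNumWarps  = 4;  // warps reading o_acc tiles in parallel
    const int num_splits = 5;  // hypothetical runtime split count

    // pad num_splits up to a multiple of kNumWarps: 5 -> 8
    const int padded_num_splits = (num_splits + kNumWarps - 1) / kNumWarps * kNumWarps;
    const int padded_seqlen_q   = 96; // assume already a multiple of kM0
    const int num_m_tiles       = padded_seqlen_q / kM0; // -> 3

    // row (split, m) of the padded (padded_num_splits, padded_seqlen_q) view
    // lands on flattened row i_tile * padded_num_splits * kM0 + split * kM0 + m % kM0,
    // matching steps 1-3 in the diff comment above.
    const int split  = 6, m = 70;
    const int i_tile = m / kM0; // which kM0 tile the row belongs to
    const int row    = i_tile * padded_num_splits * kM0 + split * kM0 + m % kM0;
    std::printf("padded_num_splits=%d num_m_tiles=%d (split=%d, m=%d) -> row %d\n",
                padded_num_splits, num_m_tiles, split, m, row);
    return 0;
}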
@@ -410,7 +430,6 @@ struct FmhaFwdSplitKVCombineKernel
                 identity{},                                          // lse_element_func
                 composes(saturates<fp8_t>{}, scales{kargs.scale_o}), // o_acc_element_func
                 kargs.num_splits,
-                kargs.seqlen_q,
                 smem_ptr);
         }
         else
@@ -419,7 +438,6 @@ struct FmhaFwdSplitKVCombineKernel
                 o_acc_dram_window,
                 lse_dram_window,
                 kargs.num_splits,
-                kargs.seqlen_q,
                 smem_ptr);
         }
     }();
@@ -45,6 +45,7 @@ struct FmhaFwdSplitKVKernel
     static constexpr bool kPadHeadDimQ = FmhaPipeline::kPadHeadDimQ;
     static constexpr bool kPadHeadDimV = FmhaPipeline::kPadHeadDimV;
     static constexpr auto BiasEnum     = FmhaPipeline::BiasEnum;
+    static constexpr bool kStoreLSE    = FmhaPipeline::kStoreLSE;
     static constexpr bool kDoFp8StaticQuant = FmhaPipeline::Problem::kDoFp8StaticQuant;
     static constexpr bool kIsPagedKV        = FmhaPipeline::Problem::kIsPagedKV;
 
@@ -67,7 +68,8 @@ struct FmhaFwdSplitKVKernel
         using bfs  = typename FmhaPipeline::BlockFmhaShape;
         using g0br = typename bfs::Gemm0BlockWarps;
         using g1br = typename bfs::Gemm1BlockWarps;
-        using gwt  = typename bfs::Gemm0WarpTile;
+        using g0wt = typename bfs::Gemm0WarpTile;
+        using g1wt = typename bfs::Gemm1WarpTile;
 #define _SS_ std::string
 #define _TS_ std::to_string
         auto pn = [&] () {
@@ -84,11 +86,12 @@ struct FmhaFwdSplitKVKernel
                 _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kQKHeaddim) + "_" +
                 "r" + _TS_(g0br::at(ck_tile::number<0>{})) + "x" + _TS_(g0br::at(ck_tile::number<1>{})) + "x" + _TS_(g0br::at(ck_tile::number<2>{})) + "_" +
                 "r" + _TS_(g1br::at(ck_tile::number<0>{})) + "x" + _TS_(g1br::at(ck_tile::number<1>{})) + "x" + _TS_(g1br::at(ck_tile::number<2>{})) + "_" +
-                "w" + _TS_(gwt::at(ck_tile::number<0>{})) + "x" + _TS_(gwt::at(ck_tile::number<1>{})) + "x" + _TS_(gwt::at(ck_tile::number<2>{})) + "_" +
+                "w" + _TS_(g0wt::at(ck_tile::number<0>{})) + "x" + _TS_(g0wt::at(ck_tile::number<1>{})) + "x" + _TS_(g0wt::at(ck_tile::number<2>{})) + "_" +
+                "w" + _TS_(g1wt::at(ck_tile::number<0>{})) + "x" + _TS_(g1wt::at(ck_tile::number<1>{})) + "x" + _TS_(g1wt::at(ck_tile::number<2>{})) + "_" +
                 (kBlockPerCuInput == -1 ? "" : ("o" + _TS_(kBlockPerCu) + "_")) + _SS_(FmhaPipeline::name) + "_" +
                 "v" + (std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor> ? "r" : "c") + (pn.empty() ? "" : "_" + pn) +
                 (BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("") : (_SS_("_") + BlockAttentionBiasEnumToStr<BiasEnum>::name)) +
-                (kHasMask ? "_" + _SS_(FmhaMask::name) : "") + (kDoFp8StaticQuant ? "_squant" : "") + (kIsPagedKV ? "_pagedkv" : "");
+                (kHasMask ? "_" + _SS_(FmhaMask::name) : "") + (kStoreLSE ? "_lse" : "") + (kDoFp8StaticQuant ? "_squant" : "") + (kIsPagedKV ? "_pagedkv" : "");
 #undef _SS_
 #undef _TS_
         // clang-format on
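Taken together with the kStoreLSE flag added above, the suffix order this string produces is mask, then _lse, then _squant, then _pagedkv; for example, a kernel built with LSE storing and fp8 static quantization (a hypothetical combination) would end in ..._lse_squant.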