mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-05 14:11:29 +00:00
[CK_TILE] More fmha splitkv optimizations (#1588)
* Use pre-defined constants for readability * Use vector write for o_acc tensor * Remove no-longer used policy method * Deprecate no-longer used policy/pipeline * Specify gemm0/gemm1 block warps separately in codegen * Fix wrong ps_idx creation logic * Add single-warp block gemm * Supoprt single-warp gemm0 * Make MakeCBlockTile() as static method * Use MakeCBlockTile() to get underlying tile distribution * Use kNumGemm1Warps to compute # threads for gemm1 * Put normal case in the if clause * Refine fmha splitkv block mapping * Refine & fix the lse_acc/o_acc layout * Fix wrong LDS size for K tile * Use kK0=64 for hdim=128,256 fmha splitkv kernels * Use kK1=64 for hdim=32,64,128 fmha splitkv kernels * Undo kK0/kK1 changes * Use more reasonable GetAlignmentV() computation * Using store_tile() in fmha splitkv kernel epilogue
This commit is contained in:
@@ -557,33 +557,16 @@ bool run(const ck_tile::ArgParser& arg_parser)
|
||||
}
|
||||
#endif
|
||||
|
||||
struct
|
||||
{
|
||||
auto operator()(bool permute,
|
||||
ck_tile::index_t b /*batch*/,
|
||||
ck_tile::index_t h /*nhead*/,
|
||||
ck_tile::index_t s /*seqlen*/,
|
||||
ck_tile::index_t d /*hdim*/)
|
||||
{
|
||||
if(permute)
|
||||
return std::array<ck_tile::index_t, 4>{b, h, s, d};
|
||||
else
|
||||
return std::array<ck_tile::index_t, 4>{b, s, h, d};
|
||||
}
|
||||
|
||||
auto operator()(bool permute,
|
||||
ck_tile::index_t ns /*num_splits*/,
|
||||
ck_tile::index_t b /*batch*/,
|
||||
ck_tile::index_t h /*nhead*/,
|
||||
ck_tile::index_t s /*seqlen*/,
|
||||
ck_tile::index_t d /*hdim*/)
|
||||
{
|
||||
if(permute)
|
||||
return std::array<ck_tile::index_t, 5>{ns, b, h, s, d};
|
||||
else
|
||||
return std::array<ck_tile::index_t, 5>{ns, b, s, h, d};
|
||||
}
|
||||
} get_lengths;
|
||||
static const auto get_lengths = [](bool permute,
|
||||
ck_tile::index_t b /*batch*/,
|
||||
ck_tile::index_t h /*nhead*/,
|
||||
ck_tile::index_t s /*seqlen*/,
|
||||
ck_tile::index_t d /*hdim*/) {
|
||||
if(permute)
|
||||
return std::array<ck_tile::index_t, 4>{b, h, s, d};
|
||||
else
|
||||
return std::array<ck_tile::index_t, 4>{b, s, h, d};
|
||||
};
|
||||
|
||||
bool is_v_rowmajor = vlayout == std::string("r");
|
||||
|
||||
@@ -635,12 +618,15 @@ bool run(const ck_tile::ArgParser& arg_parser)
|
||||
|
||||
ck_tile::HostTensor<LSEDataType> lse_acc_host(
|
||||
1 < num_splits || use_kvcache
|
||||
? std::array<ck_tile::index_t, 4>{num_splits, shape_batch, nhead, shape_seqlen_q}
|
||||
? std::array<ck_tile::index_t, 4>{shape_batch, nhead, num_splits, shape_seqlen_q}
|
||||
: std::array<ck_tile::index_t, 4>{1, 1, 1, 1});
|
||||
ck_tile::HostTensor<OaccDataType> o_acc_host(
|
||||
1 < num_splits || use_kvcache
|
||||
? get_lengths(o_perm, num_splits, shape_batch, nhead, shape_seqlen_q, hdim_v)
|
||||
: std::array<ck_tile::index_t, 5>{1, 1, 1, 1, 1});
|
||||
1 < num_splits || use_kvcache ? std::array<ck_tile::index_t, 5>{shape_batch,
|
||||
nhead,
|
||||
num_splits,
|
||||
shape_seqlen_q,
|
||||
hdim_v}
|
||||
: std::array<ck_tile::index_t, 5>{1, 1, 1, 1, 1});
|
||||
|
||||
// batch mode of lse data layout is [batch, nhead, seqlen_q]
|
||||
// group mode of lse data layout is [nhead, total_seqlen_q]
|
||||
@@ -880,7 +866,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
|
||||
}();
|
||||
const ck_tile::index_t stride_bias = (i_perm ? shape_seqlen_k : 1 * shape_seqlen_k);
|
||||
const ck_tile::index_t stride_randval = (max_seqlen_k);
|
||||
const ck_tile::index_t stride_o_acc = (o_perm ? hdim_v : nhead * hdim_v);
|
||||
const ck_tile::index_t stride_o_acc = (hdim_v);
|
||||
const ck_tile::index_t stride_o = (o_perm ? hdim_v : nhead * hdim_v);
|
||||
// setup nhead_stride_* arguments
|
||||
const ck_tile::index_t nhead_stride_q = (i_perm ? shape_seqlen_q * hdim_q : hdim_q);
|
||||
@@ -906,8 +892,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
|
||||
(i_perm ? 0 * shape_seqlen_q * shape_seqlen_k : 0 * shape_seqlen_k);
|
||||
const ck_tile::index_t nhead_stride_randval = (shape_seqlen_q * max_seqlen_k);
|
||||
const ck_tile::index_t nhead_stride_lse = shape_seqlen_q;
|
||||
const ck_tile::index_t nhead_stride_lse_acc = shape_seqlen_q;
|
||||
const ck_tile::index_t nhead_stride_o_acc = (o_perm ? shape_seqlen_q * hdim_v : hdim_v);
|
||||
const ck_tile::index_t nhead_stride_lse_acc = (num_splits * shape_seqlen_q);
|
||||
const ck_tile::index_t nhead_stride_o_acc = (num_splits * shape_seqlen_q * hdim_v);
|
||||
const ck_tile::index_t nhead_stride_o = (o_perm ? shape_seqlen_q * hdim_v : hdim_v);
|
||||
// setup batch_stride_* arguments
|
||||
const ck_tile::index_t batch_stride_q = (nhead * shape_seqlen_q * hdim_q);
|
||||
@@ -922,13 +908,13 @@ bool run(const ck_tile::ArgParser& arg_parser)
|
||||
const ck_tile::index_t batch_stride_bias = (0 * nhead * shape_seqlen_q * shape_seqlen_k);
|
||||
const ck_tile::index_t batch_stride_randval = (nhead * shape_seqlen_q * max_seqlen_k);
|
||||
const ck_tile::index_t batch_stride_lse = (nhead * shape_seqlen_q);
|
||||
const ck_tile::index_t batch_stride_lse_acc = (nhead * shape_seqlen_q);
|
||||
const ck_tile::index_t batch_stride_o_acc = (nhead * shape_seqlen_q * hdim_v);
|
||||
const ck_tile::index_t batch_stride_o = (nhead * shape_seqlen_q * hdim_v);
|
||||
const ck_tile::index_t batch_stride_lse_acc = (nhead * num_splits * shape_seqlen_q);
|
||||
const ck_tile::index_t batch_stride_o_acc = (nhead * num_splits * shape_seqlen_q * hdim_v);
|
||||
const ck_tile::index_t batch_stride_o = (nhead * shape_seqlen_q * hdim_v);
|
||||
const ck_tile::index_t batch_stride_block_table = (max_num_page_blocks / batch);
|
||||
// setup split_stride_* arguments (only used in split-kv kernel)
|
||||
const ck_tile::index_t split_stride_lse_acc = (shape_batch * nhead * shape_seqlen_q);
|
||||
const ck_tile::index_t split_stride_o_acc = (shape_batch * nhead * shape_seqlen_q * hdim_v);
|
||||
const ck_tile::index_t split_stride_lse_acc = (shape_seqlen_q);
|
||||
const ck_tile::index_t split_stride_o_acc = (shape_seqlen_q * hdim_v);
|
||||
|
||||
args.q_ptr = q_buf.GetDeviceBuffer();
|
||||
args.k_ptr = k_buf.GetDeviceBuffer();
|
||||
|
||||
Reference in New Issue
Block a user