mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-03 13:11:25 +00:00
[CK_TILE] Add PagedAttention kernels (#1387)
* Use dictionary to config all the functions * Add init codegen logic for fmha fwd appendkv * Call HIP_CHECK_ERROR() macro to get real source info * Setup meaningfull arguments * Sync kernel name with the codegen * Add knew/vnew tensors to the kernel argument * Fix wrong K values after appending * Fix vnew append errro * Extract common logics * Fix Vnew tile dstr for row major case * Conditionally add fwd_splitkv API in fmha_fwd example * Conditionally add call to fmha_fwd_splitkv() * Remove "EXAMPLE_" prefix of cmake variables * Regsiter API handlers automatically * Early return if 0 < s_k_new is not supported * Show message if we are ignoring option * Unify CMakeLists.txt coding style * Set num_splits=1 if split-kv is not supported * Add length/stride getters for HostTensor * Add RoPE example utilities * Add reference_rotary_position_embedding() (not implemented) * Finish reference_rotary_position_embedding() impl * Fix typo of HostTensor<>::get_length() * Fix compilation errors * Fix wrong answer when interleaved=false * Fix wrong answer when interleaved=true * Append K/V in the host verification code * Simplify K appending logics * Simplify v_host_ref definition * Reduce input/output dimensions * Rename function: add "batched" prefix * Apply RoPE on host side * Rename RoPE utility function * Fix wrong tensor size * Avoid invoking deprecated method 'find_module' * Pass RoPE kernel args * Create Rotary Cos/Sin tile windows in kernel * Add compute data type alias for RoPE * Randomly generate seqlen_knew if needed * Fix seqlen_knew enabling check logic * Add minimum seqlen_k to generate compliance kvcache * Fix compilation error in debug mode * Fix wrong boundaries * Fix wrong seqlen_k for kvcache * Rename variables used in distributio encoding * Fix rotary cos/sin tensor/tile size * Add constraint to the rotary_dim option * Remove unused inner namespace * Add dram distribution for rotary_cos/rotary_sin (interleaved) * Only apply interleaved RoPE on Knew for now * 
Fix wrong thread starting offset * Instantiate multiple kernels for RoPE approaches * Clean-up pipeline * Fix error in RoPE host reference * Handle RoPE half-rotated logics * Support 8x rotary_dim under half-rotated RoPE * Add comment * Apply elementwise function to the loaded tiles * Unify parameter/variable naming style * Remove constness from q_ptr * Add code blocks for q_tile * Apply RoPE to q_tile * Remove debug print code in kernel * Fix wrong knew/vnew appending positions * Use better naming for tile indices * Add make_tile_window() for adding distribution only * Skip code if # of block is more than needed * Move thread locating logics into policy * Remove always true static_assert() * Rename header * Rename RotaryEmbeddingEnum * Extract rotary embedding logic out * Re-order parameters * Align naming of some tile size constants * Rename more tile size constants * Fix wrong grid size * Fix wrong shape of knew_host/vnew_host * Fix wrong index into knew_host/vnew_host * Fix wrong rotary_cos/rotary_sin memory size for Q * Extract Q/Knew vector size to helper methods * Use different rotary_cos/rotary_sin distr for Q/Knew * Update host/device specifiers * Fix wrong data type for Q rotary_cos/rotary_sin * Remove RoPEComputeDataType type alias * Shift rotary_cos/rotary_sin by cache_seqlen_k * Add comment for why I just 't' for all padding flags * Align commit message to the real comment * Fix wrong pipeline * Rename utility function * Disable host verification if API not exist * Fix wrong rope key for fp8 pipeline * Allow only apply RoPE on Q (without append KV) * Add append-kv smoke tests * Remove debug statements * Remove more debug statements * Re-arrange the 'set +x' command * Remove no-longer used method in pipeline * Add missing init code * Refine pipeline padding settings * Enlarge rotary_dim limit (8 -> 16) * Enlarge KPerThread for rotary_interleaved=false * Update rotary_dim range in smoke_test_fwd.sh * Add template argument 'kIsPagedKV' for splitkv kernels 
* Launch splitkv kernel if given page_block_size * Fix wrong kernel name * Fix seqlen_k_min for pre-fill case (1 -> 0) * Add copy_const<> type trait * Add another make_tile_window() * Introduce 'TileWindowNavigator' types * Simplify TileWindowNavigator interfaces * Fix tile window navigation bugs * Disable calling fmha_fwd() * Remove ununnecessary data members * Simplify more make_tile_window() overloads * Move V tile through TileWindowNavigator * Fix uneven split checking logic * Move code after decide seqlen_q/seqlen_k * Make sure we always start reading complete tile * Use 128 as minimus page_block_size * Fix wrong origin for bias * Add batch_stride_k/batch_stride_v in group mode * Unify origin * Add missing kernel arguments for group mode * Add paged-kv codegen logic for appendkv kernels * Add block_table kernel args for appendkv kernel * Add tile navigators to the appendkv kernel * Fix wrong tensor descriptor lengths * Pass re-created tile window to pipeline * Fix wrong strides for appendkv kernel * Allow transit tile_window to another page-block * Handle cross-page-block write * Donot perform write again if already in last page-block * Always add fmha_fwd() api * Add missing group mode argument * Remove debug macro usages * Rename option s_k_new to s_knew * Separate splitkv/non-splitkv args/traits * Remove fmha_fwd_dispatch() * Fix compilation errors * Remove dropout code in splitkv kernel * Allow problem types without define kHasDropout attr * Use generic lambda to init traits objects * Separate more non-splitkv & splitkv traits/args * Display more info for specific kernels * Show more detailed warning message * Rename 'max_num_blocks' to 'max_num_page_blocks' * Remove no-longer used pipeline files * Wrap code by #if directives * Move functors to the begining of validation code * Use generic lambda to init all the api traits/args * Fix wrong seqlen for kvcache * Add missing comment * Rename TileWindowNavigator to PageBlockNavigator * Only expose necessary 
methods (not attributes) * Re-order pipeline paremeters * Refine smoke_test_fwd.sh * Fix wrong arugment count * Make tile window directly via PageBlockNavigator * Remove unused template paremeter * Remove group mode from appendkv kernel * Fix skcheck logic * Fix wrong syntax in skcheck expr * Use meaningful options in smoke test * Remove options * Fix formatting * Fix more format * Re-organize bash functions * Pass cache_batch_idx to kernels * Support cache_batch_idx in example * Fix compilation error * Add more appendkv test * Add more case for appendkv * Fix unexisted attribute * Remove 0 < seqlen_knew constraint * Clarify the case in warning message * Remove macro checking * Force batch mode when invoking appendkv & splitkv apis * Fix mode overriding logics * Fix wrong parameter name * Randomize seqlen_k if use kvcache * Use randomized seqlen_k for kvcache * Avoid using too small rotary_cos & rotary_sin * Rename parameter * Add seqlen_q & seqlen_k rules * Add comment * Add more comments * Fix compilation errors * Fix typo in comment * Remove type argument * Avoid seqlen_k=0 for kvcache * Revert "Avoid seqlen_k=0 for kvcache" This reverts commit21c4df89e4. * Fix wrong uneven split checking logics * Only randomize kvcache seqlen_k if 1 < batch * Return earlier if split is empty * Revert "Only randomize kvcache seqlen_k if 1 < batch" This reverts commitb9a4ab0d7e. * Re-order seqlen_k_start adjustment logics * Fix compilation errors * Re-format script * Find executable from folder automatically * Fix kvcache seqlen_k generating logic * Make comment more clear * Fix wrong knew/vew appending logic on host * Add s_barrier to sync threads * Revert "Add s_barrier to sync threads" This reverts commitd3f550f30c. 
* Support only using 1 row of rotary_cos/rotary_sin * Rotate Q in different way * Unify tensor view creation logics * Fix wrong argument * Add mask to switch how we use the rotary_cos/sin * Move attr from traits to problem * Move has_mask to fmha_fwd_appendkv_args * Support use uint32_t as SAD operand in Alibi<> * Use sad_u32() in splitkv kernels * Store tensor views in PageBlockNavigator * Use stored tensor view to update tile windows * Enlarge tensor view size * Remove debug code * Fix wrong tensor view size * Wrap tensor view into PageBlockNavigator * Add DataType member to PageBlockNavigator * Remove unnecessary member functions * Refind macro use * Fix typo * Add blank line between directives and actual code * Re-format files * Remove type in comment --------- Co-authored-by: carlushuang <carlus.huang@amd.com> Co-authored-by: rocking <ChunYu.Lai@amd.com>
This commit is contained in:
679
include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp
Normal file
679
include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp
Normal file
@@ -0,0 +1,679 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/ops/common.hpp"
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
// Forward "append-KV" kernel for FMHA: copies the new K/V tokens (knew/vnew)
// into the existing K/V cache tensors (optionally a paged KV cache addressed
// through a block table) and, when enabled, applies rotary position embedding
// (RoPE) to the Q tile and to the appended Knew tile on the way.
// The heavy lifting is delegated to FmhaPipeline; this struct only builds the
// kernel arguments, tensor views and tile windows, then invokes the pipeline.
template <typename TilePartitioner_, typename FmhaPipeline_>
struct FmhaFwdAppendKVKernel
{
    using TilePartitioner = ck_tile::remove_cvref_t<TilePartitioner_>;
    using FmhaPipeline    = ck_tile::remove_cvref_t<FmhaPipeline_>;
    // workgroup size / occupancy settings taken from the pipeline configuration
    static constexpr ck_tile::index_t kBlockSize  = FmhaPipeline::kBlockSize;
    static constexpr ck_tile::index_t kBlockPerCu = FmhaPipeline::kBlockPerCu;
    static_assert(kBlockPerCu > 0);
    // raw user-requested occupancy; -1 means "unspecified" (affects GetName() only)
    static constexpr ck_tile::index_t kBlockPerCuInput = FmhaPipeline::Problem::kBlockPerCu;

    using QDataType = ck_tile::remove_cvref_t<typename FmhaPipeline::QDataType>;
    using KDataType = ck_tile::remove_cvref_t<typename FmhaPipeline::KDataType>;
    using VDataType = ck_tile::remove_cvref_t<typename FmhaPipeline::VDataType>;

    using VLayout = ck_tile::remove_cvref_t<typename FmhaPipeline::VLayout>;
    // compile-time feature switches derived from the pipeline problem
    static constexpr bool kApplyRoPE = FmhaPipeline::RotaryEnum != RotaryEmbeddingEnum::NONE;
    static constexpr bool kIsPagedKV = FmhaPipeline::kIsPagedKV;

    // whether each dimension may be a non-multiple of the tile size (requires padding)
    static constexpr bool kPadSeqLenQ  = FmhaPipeline::kPadSeqLenQ;
    static constexpr bool kPadSeqLenK  = FmhaPipeline::kPadSeqLenK;
    static constexpr bool kPadHeadDimQ = FmhaPipeline::kPadHeadDimQ;
    static constexpr bool kPadHeadDimV = FmhaPipeline::kPadHeadDimV;

    // type -> short string used when composing the kernel name
    // clang-format off
    template <typename T> struct t2s;
    template <> struct t2s<float> { static constexpr const char * name = "fp32"; };
    template <> struct t2s<ck_tile::fp16_t> { static constexpr const char * name = "fp16"; };
    template <> struct t2s<ck_tile::bf16_t> { static constexpr const char * name = "bf16"; };
    template <> struct t2s<ck_tile::fp8_t> { static constexpr const char * name = "fp8"; };
    template <> struct t2s<ck_tile::bf8_t> { static constexpr const char * name = "bf8"; };
    // clang-format on

    // Compose a human-readable kernel name encoding head dim, data type, tile
    // sizes, occupancy, V layout, padding flags, RoPE mode and paged-KV flag.
    __host__ static std::string GetName()
    {
        // sync with generate.py
        // clang-format off

        #define _SS_ std::string
        #define _TS_ std::to_string
        // padding suffix, e.g. "pssk" when seqlen_q and seqlen_k are padded
        auto pn = [&] () {
            std::string n;
            if (kPadSeqLenQ) n += "s";
            if (kPadSeqLenK) n += "sk";
            if (kPadHeadDimQ) n += "d";
            if (kPadHeadDimV) n += "dv";
            return n.empty() ? n : std::string("p") + n; }();
        return
            _SS_("fmha_fwd_appendkv_d") + _TS_(FmhaPipeline::kK0) + "_" + _SS_(t2s<QDataType>::name) + "_"
            "b" + _TS_(FmhaPipeline::kM0) + "x" + _TS_(FmhaPipeline::kN0) + "x" + _TS_(FmhaPipeline::kK0) + "x" +
            _TS_(FmhaPipeline::kN1) + "_" + (kBlockPerCuInput == -1 ? "" : ("o" + _TS_(kBlockPerCu) + "_")) +
            "v" + (std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor> ? "r" : "c") + (pn.empty() ? "" : "_" + pn)
            + (!kApplyRoPE ? _SS_("") : (_SS_("_") + RotaryEmbeddingEnumToStr<FmhaPipeline::RotaryEnum>::name))
            + (kIsPagedKV ? "_pagedkv" : "" );
        #undef _SS_
        #undef _TS_
        // clang-format on
    }

    // to avoid the duplicated-base-class problem, introduce a template arg so
    // each EmptyKargs<I> is a distinct (empty) base type
    template <ck_tile::index_t I>
    struct EmptyKargs
    {
    };

    // kargs use aggregate initializer, so no constructor will be provided
    // use inheritance to minimize karg size
    // user need to use MakeKargs() function to create kargs.
    struct BasicKargs
    {
        void* q_ptr;             // Q is non-const: RoPE may be applied to it in place
        void* k_ptr;             // K cache, written to when appending knew
        const void* knew_ptr;    // new K tokens to append
        void* v_ptr;             // V cache, written to when appending vnew
        const void* vnew_ptr;    // new V tokens to append

        // per-batch current cache length; read on device into seqlen_k
        const int32_t* seqlen_k_ptr;

        ck_tile::index_t seqlen_q;
        ck_tile::index_t seqlen_k;    // filled per-batch in operator() from seqlen_k_ptr
        ck_tile::index_t seqlen_knew; // number of tokens being appended
        ck_tile::index_t hdim_q;
        ck_tile::index_t hdim_v;

        ck_tile::index_t num_head_q;
        // for MQA/GQA, nhead could be different. This parameter is nhead_q / nhead_k
        // if this param is larger than 1, indicate MQA/GQA case
        ck_tile::index_t nhead_ratio_qk;

        // row strides
        ck_tile::index_t stride_q;
        ck_tile::index_t stride_k;
        ck_tile::index_t stride_knew;
        ck_tile::index_t stride_v;
        ck_tile::index_t stride_vnew;

        // per-head strides
        ck_tile::index_t nhead_stride_q;
        ck_tile::index_t nhead_stride_k;
        ck_tile::index_t nhead_stride_knew;
        ck_tile::index_t nhead_stride_v;
        ck_tile::index_t nhead_stride_vnew;

        // per-batch strides
        ck_tile::index_t batch_stride_q;
        ck_tile::index_t batch_stride_k;
        ck_tile::index_t batch_stride_knew;
        ck_tile::index_t batch_stride_v;
        ck_tile::index_t batch_stride_vnew;
    };

    // extra args present only when RoPE is enabled (kApplyRoPE)
    struct RoPEKargs
    {
        const void* rotary_cos_ptr;
        const void* rotary_sin_ptr;
        ck_tile::index_t rotary_dim;
        // selects how rotary rows are indexed for Q (row stride is has_mask *
        // rotary_dim/2 below, i.e. all Q rows share row 0 when has_mask=false)
        bool has_mask;
    };

    // extra args present only for paged KV cache (kIsPagedKV)
    struct PageBlockTableKargs
    {
        const int32_t* block_table_ptr;             // [batch, max_num_page_blocks] physical block indices
        ck_tile::index_t batch_stride_block_table;
        ck_tile::index_t page_block_size;           // tokens per page block
    };

    // extra args for the non-paged path: optional indirection batch -> cache slot
    struct CacheBatchIdxKargs
    {
        const int32_t* cache_batch_idx;
    };

    struct Kargs : BasicKargs,
                   std::conditional_t<kApplyRoPE, RoPEKargs, EmptyKargs<0>>,
                   std::conditional_t<kIsPagedKV, PageBlockTableKargs, CacheBatchIdxKargs>
    {
    };

    // Build a Kargs aggregate; RoPE/paged-KV arguments are only consumed when
    // the corresponding feature is compiled in (otherwise they are ignored).
    __host__ static constexpr Kargs MakeKargs(void* q_ptr,
                                              void* k_ptr,
                                              const void* knew_ptr,
                                              void* v_ptr,
                                              const void* vnew_ptr,
                                              ck_tile::index_t seqlen_q,
                                              const void* seqlen_k_ptr,
                                              ck_tile::index_t seqlen_knew,
                                              ck_tile::index_t hdim_q,
                                              ck_tile::index_t hdim_v,
                                              ck_tile::index_t num_head_q,
                                              ck_tile::index_t nhead_ratio_qk,
                                              const void* rotary_cos_ptr,
                                              const void* rotary_sin_ptr,
                                              ck_tile::index_t rotary_dim,
                                              bool has_mask,
                                              const void* block_table_ptr,
                                              ck_tile::index_t batch_stride_block_table,
                                              ck_tile::index_t page_block_size,
                                              const void* cache_batch_idx,
                                              ck_tile::index_t stride_q,
                                              ck_tile::index_t stride_k,
                                              ck_tile::index_t stride_knew,
                                              ck_tile::index_t stride_v,
                                              ck_tile::index_t stride_vnew,
                                              ck_tile::index_t nhead_stride_q,
                                              ck_tile::index_t nhead_stride_k,
                                              ck_tile::index_t nhead_stride_knew,
                                              ck_tile::index_t nhead_stride_v,
                                              ck_tile::index_t nhead_stride_vnew,
                                              ck_tile::index_t batch_stride_q,
                                              ck_tile::index_t batch_stride_k,
                                              ck_tile::index_t batch_stride_knew,
                                              ck_tile::index_t batch_stride_v,
                                              ck_tile::index_t batch_stride_vnew)
    {
        Kargs kargs{
            {q_ptr,
             k_ptr,
             knew_ptr,
             v_ptr,
             vnew_ptr,
             reinterpret_cast<const int32_t*>(seqlen_k_ptr),
             seqlen_q,
             -1, // seqlen_k will be updated by content of seqlen_k_ptr
             seqlen_knew,
             hdim_q,
             hdim_v,
             num_head_q,
             nhead_ratio_qk,
             stride_q,
             stride_k,
             stride_knew,
             stride_v,
             stride_vnew,
             nhead_stride_q,
             nhead_stride_k,
             nhead_stride_knew,
             nhead_stride_v,
             nhead_stride_vnew,
             batch_stride_q,
             batch_stride_k,
             batch_stride_knew,
             batch_stride_v,
             batch_stride_vnew}, // args for common karg
            {}, // placeholder for rope
            {}  // placeholder for paged-block table or cache_batch_idx
        };

        if constexpr(kApplyRoPE)
        {
            kargs.rotary_cos_ptr = rotary_cos_ptr;
            kargs.rotary_sin_ptr = rotary_sin_ptr;
            kargs.rotary_dim     = rotary_dim;
            kargs.has_mask       = has_mask;
        }

        if constexpr(kIsPagedKV)
        {
            kargs.block_table_ptr          = reinterpret_cast<const int32_t*>(block_table_ptr);
            kargs.batch_stride_block_table = batch_stride_block_table;
            kargs.page_block_size          = page_block_size;
        }
        else
        {
            kargs.cache_batch_idx = reinterpret_cast<const int32_t*>(cache_batch_idx);
        }

        return kargs;
    }

    // Grid layout is delegated to the tile partitioner (tiles cover both the
    // seqlen_q range for RoPE-on-Q and the seqlen_knew range for appending).
    __host__ static constexpr auto GridSize(ck_tile::index_t batch_size,
                                            ck_tile::index_t nhead,
                                            ck_tile::index_t seqlen_q,
                                            ck_tile::index_t seqlen_knew)
    {
        return TilePartitioner::GridSize(batch_size, nhead, seqlen_q, seqlen_knew);
    }

    __host__ static constexpr auto BlockSize() { return dim3(kBlockSize); }

    CK_TILE_DEVICE void operator()(Kargs kargs) const
    {
        // divide problem
        const auto [i_tile, i_nhead, i_batch] = TilePartitioner{}();

        // tile origins along seqlen_q (for Q/RoPE) and seqlen_knew (for append);
        // readfirstlane keeps these values in scalar registers
        const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile * FmhaPipeline::kM0);
        const index_t i_n0 = __builtin_amdgcn_readfirstlane(i_tile * FmhaPipeline::kN0);

        // batch index used to address the K/V cache: with paged KV the block
        // table already does the indirection; otherwise cache_batch_idx (if
        // given) remaps the batch to a cache slot
        const index_t i_cache_batch = [&, i_batch_ = i_batch] {
            if constexpr(kIsPagedKV)
            {
                return i_batch_;
            }
            else
            {
                return (kargs.cache_batch_idx != nullptr ? kargs.cache_batch_idx[i_batch_]
                                                         : i_batch_);
            }
        }();

        const long_index_t batch_offset_q =
            static_cast<long_index_t>(i_batch) * kargs.batch_stride_q;
        const long_index_t batch_offset_k =
            static_cast<long_index_t>(i_cache_batch) * kargs.batch_stride_k;
        const long_index_t batch_offset_knew =
            static_cast<long_index_t>(i_batch) * kargs.batch_stride_knew;
        const long_index_t batch_offset_v =
            static_cast<long_index_t>(i_cache_batch) * kargs.batch_stride_v;
        const long_index_t batch_offset_vnew =
            static_cast<long_index_t>(i_batch) * kargs.batch_stride_vnew;

        // current cache length for this batch (MakeKargs stored -1 as placeholder)
        kargs.seqlen_k = kargs.seqlen_k_ptr[i_batch];

        // for simplicity, batch stride we just modify the pointer
        QDataType* q_ptr = reinterpret_cast<QDataType*>(kargs.q_ptr) +
                           static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_q +
                           batch_offset_q;
        // K/V heads are divided by nhead_ratio_qk to share heads in MQA/GQA
        KDataType* k_ptr =
            reinterpret_cast<KDataType*>(kargs.k_ptr) +
            static_cast<long_index_t>(i_nhead / kargs.nhead_ratio_qk) * kargs.nhead_stride_k +
            batch_offset_k;
        const KDataType* knew_ptr =
            reinterpret_cast<const KDataType*>(kargs.knew_ptr) +
            static_cast<long_index_t>(i_nhead / kargs.nhead_ratio_qk) * kargs.nhead_stride_knew +
            batch_offset_knew;
        VDataType* v_ptr =
            reinterpret_cast<VDataType*>(kargs.v_ptr) +
            static_cast<long_index_t>(i_nhead / kargs.nhead_ratio_qk) * kargs.nhead_stride_v +
            batch_offset_v;
        const VDataType* vnew_ptr =
            reinterpret_cast<const VDataType*>(kargs.vnew_ptr) +
            static_cast<long_index_t>(i_nhead / kargs.nhead_ratio_qk) * kargs.nhead_stride_vnew +
            batch_offset_vnew;

        // Q/K/V DRAM and DRAM window
        const auto q_dram = [&]() {
            const auto q_dram_naive = make_naive_tensor_view<address_space_enum::global>(
                q_ptr,
                make_tuple(kargs.seqlen_q, kargs.hdim_q),
                make_tuple(kargs.stride_q, 1),
                number<FmhaPipeline::kAlignmentQ>{},
                number<1>{});

            return pad_tensor_view(
                q_dram_naive,
                make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kK0>{}),
                sequence<kPadSeqLenQ, kPadHeadDimQ>{});
        }();

        // K-cache view factory; for paged KV each page block gets its own view
        // (height = page_block_size), so the data pointer is filled in later
        const auto make_k_dram = [&](KDataType* data, index_t height) {
            const auto k_dram_naive = make_naive_tensor_view<address_space_enum::global>(
                data, // will update this pointer if using paged-kvcache
                make_tuple(height, kargs.hdim_q),
                make_tuple(kargs.stride_k, 1),
                number<FmhaPipeline::kAlignmentK>{},
                number<1>{});

            return pad_tensor_view(
                k_dram_naive,
                make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kK0>{}),
                sequence<kPadSeqLenK, kPadHeadDimQ>{});
        };
        const auto k_dram = [&]() {
            if constexpr(kIsPagedKV)
            {
                return make_k_dram(nullptr, kargs.page_block_size);
            }
            else
            {
                // non-paged: the cache view spans old + appended tokens
                return make_k_dram(k_ptr, kargs.seqlen_k + kargs.seqlen_knew);
            }
        }();

        const auto knew_dram = [&]() {
            const auto knew_dram_naive = make_naive_tensor_view<address_space_enum::global>(
                knew_ptr,
                make_tuple(kargs.seqlen_knew, kargs.hdim_q),
                make_tuple(kargs.stride_knew, 1),
                number<FmhaPipeline::kAlignmentK>{},
                number<1>{});

            return pad_tensor_view(
                knew_dram_naive,
                make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kK0>{}),
                sequence<kPadSeqLenK, kPadHeadDimQ>{});
        }();

        // V-cache view factory; V views are always exposed as (hdim_v, seqlen):
        // a row-major V is transposed via pass-through transforms, a col-major
        // V is already laid out that way
        const auto make_v_dram = [&](VDataType* data, index_t length) {
            if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
            {
                const auto v_dram_naive = make_naive_tensor_view<address_space_enum::global>(
                    data, // will update this pointer if using paged-kvcache
                    make_tuple(length, kargs.hdim_v),
                    make_tuple(kargs.stride_v, 1),
                    number<FmhaPipeline::kAlignmentV>{},
                    number<1>{});

                const auto v_dram_transposed =
                    transform_tensor_view(v_dram_naive,
                                          make_tuple(make_pass_through_transform(kargs.hdim_v),
                                                     make_pass_through_transform(length)),
                                          make_tuple(sequence<1>{}, sequence<0>{}),
                                          make_tuple(sequence<0>{}, sequence<1>{}));

                return pad_tensor_view(
                    v_dram_transposed,
                    make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kN0>{}),
                    sequence<kPadHeadDimV, kPadSeqLenK>{});
            }
            else
            {
                const auto v_dram_naive = make_naive_tensor_view<address_space_enum::global>(
                    data, // will update this pointer if using paged-kvcache
                    make_tuple(kargs.hdim_v, length),
                    make_tuple(kargs.stride_v, 1),
                    number<FmhaPipeline::kAlignmentV>{},
                    number<1>{});

                return pad_tensor_view(
                    v_dram_naive,
                    make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kN0>{}),
                    sequence<kPadHeadDimV, kPadSeqLenK>{});
            }
        };
        const auto v_dram = [&]() {
            if constexpr(kIsPagedKV)
            {
                return make_v_dram(nullptr, kargs.page_block_size);
            }
            else
            {
                return make_v_dram(v_ptr, kargs.seqlen_k + kargs.seqlen_knew);
            }
        }();

        // Vnew view, exposed as (hdim_v, seqlen_knew) like the V cache views
        const auto vnew_dram = [&]() {
            if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
            {
                const auto vnew_dram_naive = make_naive_tensor_view<address_space_enum::global>(
                    vnew_ptr,
                    make_tuple(kargs.seqlen_knew, kargs.hdim_v),
                    make_tuple(kargs.stride_vnew, 1),
                    number<FmhaPipeline::kAlignmentV>{},
                    number<1>{});

                const auto vnew_dram_transposed = transform_tensor_view(
                    vnew_dram_naive,
                    make_tuple(make_pass_through_transform(kargs.hdim_v),
                               make_pass_through_transform(kargs.seqlen_knew)),
                    make_tuple(sequence<1>{}, sequence<0>{}),
                    make_tuple(sequence<0>{}, sequence<1>{}));

                return pad_tensor_view(
                    vnew_dram_transposed,
                    make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kN0>{}),
                    sequence<kPadHeadDimV, kPadSeqLenK>{});
            }
            else
            {
                const auto vnew_dram_naive = make_naive_tensor_view<address_space_enum::global>(
                    vnew_ptr,
                    make_tuple(kargs.hdim_v, kargs.seqlen_knew),
                    make_tuple(kargs.stride_vnew, 1),
                    number<FmhaPipeline::kAlignmentV>{},
                    number<1>{});

                return pad_tensor_view(
                    vnew_dram_naive,
                    make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kN0>{}),
                    sequence<kPadHeadDimV, kPadSeqLenK>{});
            }
        }();

        // rotary cos/sin windows for Q; tables store rotary_dim/2 values per row
        // and are shifted by seqlen_k rows so Q tokens use their absolute positions
        constexpr auto q_rotary_cos_sin_dram_window_lengths =
            make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kK0 / 2>{});
        const auto q_rotary_cos_dram_window = [&]() {
            if constexpr(kApplyRoPE)
            {
                const auto rotary_cos_dram_native =
                    make_naive_tensor_view<address_space_enum::global>(
                        reinterpret_cast<const QDataType*>(kargs.rotary_cos_ptr) +
                            kargs.seqlen_k * (kargs.rotary_dim / 2),
                        make_tuple(kargs.seqlen_q, kargs.rotary_dim / 2),
                        // row stride 0 when !has_mask -> every Q row reads row 0
                        make_tuple(kargs.has_mask * (kargs.rotary_dim / 2), 1),
                        number<8>{},
                        number<1>{});

                const auto rotary_cos_dram = [&]() {
                    return pad_tensor_view(rotary_cos_dram_native,
                                           q_rotary_cos_sin_dram_window_lengths,
                                           sequence<kPadSeqLenQ, kPadHeadDimQ>{});
                }();

                return make_tile_window(
                    rotary_cos_dram, q_rotary_cos_sin_dram_window_lengths, {i_m0, 0});
            }
            else
            {
                return make_null_tile_window(q_rotary_cos_sin_dram_window_lengths);
            }
        }();
        const auto q_rotary_sin_dram_window = [&]() {
            if constexpr(kApplyRoPE)
            {
                const auto rotary_sin_dram_native =
                    make_naive_tensor_view<address_space_enum::global>(
                        reinterpret_cast<const QDataType*>(kargs.rotary_sin_ptr) +
                            kargs.seqlen_k * (kargs.rotary_dim / 2),
                        make_tuple(kargs.seqlen_q, kargs.rotary_dim / 2),
                        make_tuple(kargs.has_mask * (kargs.rotary_dim / 2), 1),
                        number<8>{},
                        number<1>{});

                const auto rotary_sin_dram = [&]() {
                    return pad_tensor_view(rotary_sin_dram_native,
                                           q_rotary_cos_sin_dram_window_lengths,
                                           sequence<kPadSeqLenQ, kPadHeadDimQ>{});
                }();

                return make_tile_window(
                    rotary_sin_dram, q_rotary_cos_sin_dram_window_lengths, {i_m0, 0});
            }
            else
            {
                return make_null_tile_window(q_rotary_cos_sin_dram_window_lengths);
            }
        }();

        // rotary cos/sin windows for Knew; unlike Q, the row stride is always
        // rotary_dim/2 (each appended token uses its own rotary row)
        constexpr auto knew_rotary_cos_sin_dram_window_lengths =
            make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kK0 / 2>{});
        const auto knew_rotary_cos_dram_window = [&]() {
            if constexpr(kApplyRoPE)
            {
                const auto rotary_cos_dram_native =
                    make_naive_tensor_view<address_space_enum::global>(
                        reinterpret_cast<const KDataType*>(kargs.rotary_cos_ptr) +
                            kargs.seqlen_k * (kargs.rotary_dim / 2),
                        make_tuple(kargs.seqlen_knew, kargs.rotary_dim / 2),
                        make_tuple(kargs.rotary_dim / 2, 1),
                        number<8>{},
                        number<1>{});

                const auto rotary_cos_dram = [&]() {
                    return pad_tensor_view(rotary_cos_dram_native,
                                           knew_rotary_cos_sin_dram_window_lengths,
                                           sequence<kPadSeqLenK, kPadHeadDimQ>{});
                }();

                return make_tile_window(
                    rotary_cos_dram, knew_rotary_cos_sin_dram_window_lengths, {i_n0, 0});
            }
            else
            {
                return make_null_tile_window(knew_rotary_cos_sin_dram_window_lengths);
            }
        }();
        const auto knew_rotary_sin_dram_window = [&]() {
            if constexpr(kApplyRoPE)
            {
                const auto rotary_sin_dram_native =
                    make_naive_tensor_view<address_space_enum::global>(
                        reinterpret_cast<const KDataType*>(kargs.rotary_sin_ptr) +
                            kargs.seqlen_k * (kargs.rotary_dim / 2),
                        make_tuple(kargs.seqlen_knew, kargs.rotary_dim / 2),
                        make_tuple(kargs.rotary_dim / 2, 1),
                        number<8>{},
                        number<1>{});

                const auto rotary_sin_dram = [&]() {
                    return pad_tensor_view(rotary_sin_dram_native,
                                           knew_rotary_cos_sin_dram_window_lengths,
                                           sequence<kPadSeqLenK, kPadHeadDimQ>{});
                }();

                return make_tile_window(
                    rotary_sin_dram, knew_rotary_cos_sin_dram_window_lengths, {i_n0, 0});
            }
            else
            {
                return make_null_tile_window(knew_rotary_cos_sin_dram_window_lengths);
            }
        }();

        // navigator translating logical K positions into physical page blocks
        // (or a trivial pass-through navigator for the non-paged cache)
        auto k_page_block_navigator = [&, i_batch_ = i_batch, i_nhead_ = i_nhead]() {
            if constexpr(kIsPagedKV)
            {
                const auto* block_indices =
                    reinterpret_cast<const int32_t*>(kargs.block_table_ptr) +
                    i_batch_ * kargs.batch_stride_block_table;
                const index_t num_blocks =
                    integer_divide_ceil(kargs.seqlen_k + kargs.seqlen_knew, kargs.page_block_size);

                const long_index_t fixed_offset =
                    static_cast<long_index_t>(i_nhead_ / kargs.nhead_ratio_qk) *
                    kargs.nhead_stride_k;

                return make_page_block_navigator<KDataType, 0>(
                    kargs.k_ptr,
                    kargs.batch_stride_k,
                    fixed_offset,
                    block_indices,
                    num_blocks,
                    kargs.page_block_size,
                    k_dram,
                    // last page block may be partially filled; give it a view
                    // sized to the remaining tokens
                    make_k_dram(nullptr,
                                (kargs.seqlen_k + kargs.seqlen_knew) -
                                    (num_blocks - 1) * kargs.page_block_size));
            }
            else
            {
                return make_page_block_navigator(k_dram);
            }
        }();

        auto v_page_block_navigator = [&, i_batch_ = i_batch, i_nhead_ = i_nhead]() {
            if constexpr(kIsPagedKV)
            {
                const auto* block_indices =
                    reinterpret_cast<const int32_t*>(kargs.block_table_ptr) +
                    i_batch_ * kargs.batch_stride_block_table;
                const index_t num_blocks =
                    integer_divide_ceil(kargs.seqlen_k + kargs.seqlen_knew, kargs.page_block_size);

                const long_index_t fixed_offset =
                    static_cast<long_index_t>(i_nhead_ / kargs.nhead_ratio_qk) *
                    kargs.nhead_stride_v;

                return make_page_block_navigator<VDataType, 1>(
                    kargs.v_ptr,
                    kargs.batch_stride_v,
                    fixed_offset,
                    block_indices,
                    num_blocks,
                    kargs.page_block_size,
                    v_dram,
                    make_v_dram(nullptr,
                                (kargs.seqlen_k + kargs.seqlen_knew) -
                                    (num_blocks - 1) * kargs.page_block_size));
            }
            else
            {
                return make_page_block_navigator(v_dram);
            }
        }();

        auto q_dram_window =
            make_tile_window(q_dram,
                             make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kK0>{}),
                             {i_m0, 0});

        // this tile is beyond the tokens being appended -> nothing to write
        const bool skip_append_kv = kargs.seqlen_knew <= i_n0;
        // window origin = (0, 0) if no work to do for current block
        auto [i_page_block_k, k_dram_window] = k_page_block_navigator.make_tile_window(
            make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kK0>{}),
            {!skip_append_kv * (kargs.seqlen_k + i_n0), 0});

        auto knew_dram_window =
            make_tile_window(knew_dram,
                             make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kK0>{}),
                             {i_n0, 0});

        // window origin = (0, 0) if no work to do for current block
        auto [i_page_block_v, v_dram_window] = v_page_block_navigator.make_tile_window(
            make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kN0>{}),
            {0, !skip_append_kv * (kargs.seqlen_k + i_n0)});

        auto vnew_dram_window =
            make_tile_window(vnew_dram,
                             make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kN0>{}),
                             {0, i_n0});

        // run the pipeline; last two flags tell it to skip the Q-RoPE part
        // (tile beyond seqlen_q) and/or the append part (tile beyond seqlen_knew)
        if constexpr(kApplyRoPE)
        {
            FmhaPipeline{}(q_dram_window,
                           k_dram_window,
                           i_page_block_k,
                           k_page_block_navigator,
                           knew_dram_window,
                           v_dram_window,
                           i_page_block_v,
                           v_page_block_navigator,
                           vnew_dram_window,
                           q_rotary_cos_dram_window,
                           q_rotary_sin_dram_window,
                           knew_rotary_cos_dram_window,
                           knew_rotary_sin_dram_window,
                           kargs.rotary_dim,
                           kargs.seqlen_q <= i_m0,
                           skip_append_kv);
        }
        else
        {
            FmhaPipeline{}(q_dram_window,
                           k_dram_window,
                           i_page_block_k,
                           k_page_block_navigator,
                           knew_dram_window,
                           v_dram_window,
                           i_page_block_v,
                           v_page_block_navigator,
                           vnew_dram_window,
                           q_rotary_cos_dram_window,
                           q_rotary_sin_dram_window,
                           knew_rotary_cos_dram_window,
                           knew_rotary_sin_dram_window,
                           0, // rotary_dim not used
                           kargs.seqlen_q <= i_m0,
                           skip_append_kv);
        }
    }
};
|
||||
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,42 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"

#include <algorithm> // std::max used in GridSize()
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <index_t kM0_, index_t kN0_, index_t kK0_, index_t kN1_>
|
||||
struct FmhaFwdAppendKVTilePartitioner
|
||||
{
|
||||
static constexpr ck_tile::index_t kM0 = kM0_;
|
||||
static constexpr ck_tile::index_t kN0 = kN0_;
|
||||
static constexpr ck_tile::index_t kK0 = kK0_;
|
||||
static constexpr ck_tile::index_t kN1 = kN1_;
|
||||
|
||||
static_assert(kK0 == kN1);
|
||||
|
||||
CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size,
|
||||
ck_tile::index_t nhead,
|
||||
ck_tile::index_t seqlen_q,
|
||||
ck_tile::index_t seqlen_knew)
|
||||
{
|
||||
// TODO: this may need tuning
|
||||
return dim3(std::max(ck_tile::integer_divide_ceil(seqlen_q, kM0),
|
||||
ck_tile::integer_divide_ceil(seqlen_knew, kN0)),
|
||||
nhead,
|
||||
batch_size);
|
||||
}
|
||||
|
||||
CK_TILE_DEVICE auto operator()()
|
||||
{
|
||||
const index_t i_tile = blockIdx.x;
|
||||
const index_t i_nhead = blockIdx.y;
|
||||
const index_t i_batch = blockIdx.z;
|
||||
|
||||
return ck_tile::make_tuple(i_tile, i_nhead, i_batch);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace ck_tile
|
||||
@@ -32,8 +32,6 @@ struct FmhaFwdSplitKVKernel
|
||||
using KDataType = ck_tile::remove_cvref_t<typename FmhaPipeline::KDataType>;
|
||||
using VDataType = ck_tile::remove_cvref_t<typename FmhaPipeline::VDataType>;
|
||||
using BiasDataType = ck_tile::remove_cvref_t<typename FmhaPipeline::BiasDataType>;
|
||||
using RandValOutputDataType =
|
||||
ck_tile::remove_cvref_t<typename FmhaPipeline::RandValOutputDataType>;
|
||||
using LSEDataType = ck_tile::remove_cvref_t<typename FmhaPipeline::LSEDataType>;
|
||||
using SaccDataType = ck_tile::remove_cvref_t<typename FmhaPipeline::SaccDataType>;
|
||||
using OaccDataType = remove_cvref_t<typename FmhaPipeline::OaccDataType>;
|
||||
@@ -46,8 +44,10 @@ struct FmhaFwdSplitKVKernel
|
||||
static constexpr bool kPadHeadDimQ = FmhaPipeline::kPadHeadDimQ;
|
||||
static constexpr bool kPadHeadDimV = FmhaPipeline::kPadHeadDimV;
|
||||
static constexpr auto BiasEnum = FmhaPipeline::BiasEnum;
|
||||
static constexpr bool kHasDropout = FmhaPipeline::kHasDropout;
|
||||
static constexpr bool kDoFp8StaticQuant = FmhaPipeline::Problem::kDoFp8StaticQuant;
|
||||
static constexpr bool kIsPagedKV = FmhaPipeline::Problem::kIsPagedKV;
|
||||
static_assert(!kIsGroupMode || (kIsGroupMode && !kIsPagedKV),
|
||||
"paged-kvcache only supported by batch mode kernels");
|
||||
using FmhaMask = ck_tile::remove_cvref_t<typename FmhaPipeline::FmhaMask>;
|
||||
static constexpr bool kHasMask = FmhaMask::IsMasking;
|
||||
|
||||
@@ -85,8 +85,8 @@ struct FmhaFwdSplitKVKernel
|
||||
"w" + _TS_(gwt::at(ck_tile::number<0>{})) + "x" + _TS_(gwt::at(ck_tile::number<1>{})) + "x" + _TS_(gwt::at(ck_tile::number<2>{})) + "_" +
|
||||
(kBlockPerCuInput == -1 ? "" : ("o" + _TS_(kBlockPerCu) + "_")) + _SS_(FmhaPipeline::name) + "_" +
|
||||
"v" + (std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor> ? "r" : "c") + (pn.empty() ? "" : "_" + pn) +
|
||||
(BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("") : (_SS_("_") + BlockAttentionBiasEnumToStr<BiasEnum>::name)) +
|
||||
(kHasMask ? "_" + _SS_(FmhaMask::name) : "") + (kHasDropout ? "_dropout" : "" ) + (kDoFp8StaticQuant ? "_squant" : "" );
|
||||
(BiasEnum == BlockAttentionBiasEnum::NO_BIAS ? _SS_("") : (_SS_("_") + BlockAttentionBiasEnumToStr<BiasEnum>::name)) +
|
||||
(kHasMask ? "_" + _SS_(FmhaMask::name) : "") + (kDoFp8StaticQuant ? "_squant" : "") + (kIsPagedKV ? "_pagedkv" : "" );
|
||||
#undef _SS_
|
||||
#undef _TS_
|
||||
// clang-format on
|
||||
@@ -110,7 +110,6 @@ struct FmhaFwdSplitKVKernel
|
||||
void* o_acc_ptr;
|
||||
|
||||
ck_tile::index_t batch;
|
||||
ck_tile::index_t max_seqlen_q;
|
||||
|
||||
ck_tile::index_t seqlen_q;
|
||||
ck_tile::index_t seqlen_k;
|
||||
@@ -136,6 +135,7 @@ struct FmhaFwdSplitKVKernel
|
||||
ck_tile::index_t nhead_stride_lse_acc;
|
||||
ck_tile::index_t nhead_stride_o_acc;
|
||||
|
||||
ck_tile::index_t batch_stride_lse_acc;
|
||||
ck_tile::index_t batch_stride_o_acc;
|
||||
|
||||
ck_tile::index_t split_stride_lse_acc;
|
||||
@@ -173,32 +173,16 @@ struct FmhaFwdSplitKVKernel
|
||||
float scale_p;
|
||||
};
|
||||
|
||||
struct CommonDropoutKargs
|
||||
struct PageBlockTableKargs
|
||||
{
|
||||
void init_dropout(const float p_drop,
|
||||
const std::tuple<uint64_t, uint64_t>& drop_seed_offset)
|
||||
{
|
||||
float p_undrop = 1.0 - p_drop;
|
||||
p_undrop_in_uint8_t =
|
||||
uint8_t(std::floor(p_undrop * std::numeric_limits<uint8_t>::max()));
|
||||
rp_undrop = 1.0 / p_undrop;
|
||||
|
||||
drop_seed = std::get<0>(drop_seed_offset);
|
||||
drop_offset = std::get<1>(drop_seed_offset);
|
||||
}
|
||||
float rp_undrop = 1;
|
||||
uint8_t p_undrop_in_uint8_t = std::numeric_limits<uint8_t>::max();
|
||||
bool is_store_randval = false;
|
||||
uint64_t drop_seed = 1;
|
||||
uint64_t drop_offset = 0;
|
||||
void* rand_val_ptr = nullptr;
|
||||
|
||||
ck_tile::index_t stride_randval = 0;
|
||||
ck_tile::index_t nhead_stride_randval = 0;
|
||||
const int32_t* block_table_ptr;
|
||||
ck_tile::index_t batch_stride_block_table;
|
||||
ck_tile::index_t page_block_size;
|
||||
};
|
||||
struct BatchModeDropoutKargs : CommonDropoutKargs
|
||||
|
||||
struct CacheBatchIdxKargs
|
||||
{
|
||||
ck_tile::index_t batch_stride_randval = 0;
|
||||
const int32_t* cache_batch_idx;
|
||||
};
|
||||
|
||||
struct BatchModeKargs
|
||||
@@ -210,12 +194,13 @@ struct FmhaFwdSplitKVKernel
|
||||
EmptyKargs<0>>>,
|
||||
std::conditional_t<kHasMask, MaskKargs, EmptyKargs<1>>,
|
||||
std::conditional_t<kDoFp8StaticQuant, Fp8StaticQuantKargs, EmptyKargs<2>>,
|
||||
std::conditional_t<kHasDropout, BatchModeDropoutKargs, EmptyKargs<3>>
|
||||
std::conditional_t<kIsPagedKV, PageBlockTableKargs, CacheBatchIdxKargs>
|
||||
{
|
||||
const int32_t* seqlen_k_ptr;
|
||||
|
||||
ck_tile::index_t batch_stride_q;
|
||||
ck_tile::index_t batch_stride_k;
|
||||
ck_tile::index_t batch_stride_v;
|
||||
ck_tile::index_t batch_stride_lse_acc;
|
||||
};
|
||||
|
||||
struct GroupModeKargs
|
||||
@@ -226,12 +211,14 @@ struct FmhaFwdSplitKVKernel
|
||||
AlibiKargs,
|
||||
EmptyKargs<0>>>,
|
||||
std::conditional_t<kHasMask, MaskKargs, EmptyKargs<1>>,
|
||||
std::conditional_t<kDoFp8StaticQuant, Fp8StaticQuantKargs, EmptyKargs<2>>,
|
||||
std::conditional_t<kHasDropout, CommonDropoutKargs, EmptyKargs<3>>
|
||||
std::conditional_t<kDoFp8StaticQuant, Fp8StaticQuantKargs, EmptyKargs<2>>
|
||||
{
|
||||
const int32_t* seqstart_q_ptr;
|
||||
const int32_t* seqstart_k_ptr;
|
||||
const int32_t* seqlen_k_ptr;
|
||||
|
||||
ck_tile::index_t batch_stride_k;
|
||||
ck_tile::index_t batch_stride_v;
|
||||
};
|
||||
|
||||
using Kargs = std::conditional_t<kIsGroupMode, GroupModeKargs, BatchModeKargs>;
|
||||
@@ -242,48 +229,45 @@ struct FmhaFwdSplitKVKernel
|
||||
const void* k_ptr,
|
||||
const void* v_ptr,
|
||||
const void* bias_ptr,
|
||||
void* rand_val_ptr,
|
||||
void* lse_acc_ptr,
|
||||
void* o_acc_ptr,
|
||||
ck_tile::index_t batch,
|
||||
ck_tile::index_t max_seqlen_q,
|
||||
ck_tile::index_t seqlen_q,
|
||||
ck_tile::index_t seqlen_k,
|
||||
ck_tile::index_t seqlen_k, // only used if 'seqlen_k_ptr' is not specified
|
||||
const void* seqlen_k_ptr, // only used for (paged-) kvcache
|
||||
ck_tile::index_t hdim_q,
|
||||
ck_tile::index_t hdim_v,
|
||||
ck_tile::index_t num_head_q,
|
||||
ck_tile::index_t nhead_ratio_qk,
|
||||
ck_tile::index_t num_splits,
|
||||
const void* block_table_ptr,
|
||||
ck_tile::index_t batch_stride_block_table,
|
||||
ck_tile::index_t page_block_size,
|
||||
const void* cache_batch_idx,
|
||||
float scale_s,
|
||||
float scale_p,
|
||||
ck_tile::index_t stride_q,
|
||||
ck_tile::index_t stride_k,
|
||||
ck_tile::index_t stride_v,
|
||||
ck_tile::index_t stride_bias,
|
||||
ck_tile::index_t stride_randval,
|
||||
ck_tile::index_t stride_o_acc,
|
||||
ck_tile::index_t nhead_stride_q,
|
||||
ck_tile::index_t nhead_stride_k,
|
||||
ck_tile::index_t nhead_stride_v,
|
||||
ck_tile::index_t nhead_stride_bias,
|
||||
ck_tile::index_t nhead_stride_randval,
|
||||
ck_tile::index_t nhead_stride_lse_acc,
|
||||
ck_tile::index_t nhead_stride_o_acc,
|
||||
ck_tile::index_t batch_stride_q,
|
||||
ck_tile::index_t batch_stride_k,
|
||||
ck_tile::index_t batch_stride_v,
|
||||
ck_tile::index_t batch_stride_bias,
|
||||
ck_tile::index_t batch_stride_randval,
|
||||
ck_tile::index_t batch_stride_lse_acc,
|
||||
ck_tile::index_t batch_stride_o_acc,
|
||||
ck_tile::index_t split_stride_lse_acc,
|
||||
ck_tile::index_t split_stride_o_acc,
|
||||
ck_tile::index_t window_size_left,
|
||||
ck_tile::index_t window_size_right,
|
||||
ck_tile::index_t mask_type,
|
||||
float p_drop,
|
||||
bool s_randval,
|
||||
const std::tuple<uint64_t, uint64_t>& drop_seed_offset)
|
||||
ck_tile::index_t mask_type)
|
||||
{
|
||||
Kargs kargs{{q_ptr,
|
||||
k_ptr,
|
||||
@@ -291,7 +275,6 @@ struct FmhaFwdSplitKVKernel
|
||||
lse_acc_ptr,
|
||||
o_acc_ptr,
|
||||
batch,
|
||||
max_seqlen_q,
|
||||
seqlen_q,
|
||||
seqlen_k,
|
||||
hdim_q,
|
||||
@@ -313,17 +296,18 @@ struct FmhaFwdSplitKVKernel
|
||||
nhead_stride_v,
|
||||
nhead_stride_lse_acc,
|
||||
nhead_stride_o_acc,
|
||||
batch_stride_lse_acc,
|
||||
batch_stride_o_acc,
|
||||
split_stride_lse_acc,
|
||||
split_stride_o_acc}, // args for common karg
|
||||
{}, // placeholder for bias
|
||||
{}, // placeholder for mask
|
||||
{}, // placeholder for fp8_static_quant args
|
||||
{}, // placeholder for dropout
|
||||
{}, // placeholder for paged-block table or cache_batch_idx
|
||||
reinterpret_cast<const int32_t*>(seqlen_k_ptr),
|
||||
batch_stride_q,
|
||||
batch_stride_k,
|
||||
batch_stride_v,
|
||||
batch_stride_lse_acc};
|
||||
batch_stride_v};
|
||||
|
||||
if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
|
||||
{
|
||||
@@ -347,14 +331,15 @@ struct FmhaFwdSplitKVKernel
|
||||
{
|
||||
kargs.scale_p = scale_p;
|
||||
}
|
||||
if constexpr(kHasDropout)
|
||||
if constexpr(kIsPagedKV)
|
||||
{
|
||||
kargs.init_dropout(p_drop, drop_seed_offset);
|
||||
kargs.rand_val_ptr = rand_val_ptr;
|
||||
kargs.stride_randval = stride_randval;
|
||||
kargs.nhead_stride_randval = nhead_stride_randval;
|
||||
kargs.batch_stride_randval = batch_stride_randval;
|
||||
kargs.is_store_randval = s_randval;
|
||||
kargs.block_table_ptr = reinterpret_cast<const int32_t*>(block_table_ptr);
|
||||
kargs.batch_stride_block_table = batch_stride_block_table;
|
||||
kargs.page_block_size = page_block_size;
|
||||
}
|
||||
else
|
||||
{
|
||||
kargs.cache_batch_idx = reinterpret_cast<const int32_t*>(cache_batch_idx);
|
||||
}
|
||||
|
||||
return kargs;
|
||||
@@ -366,11 +351,9 @@ struct FmhaFwdSplitKVKernel
|
||||
const void* k_ptr,
|
||||
const void* v_ptr,
|
||||
const void* bias_ptr,
|
||||
void* rand_val_ptr,
|
||||
void* lse_acc_ptr,
|
||||
void* o_acc_ptr,
|
||||
ck_tile::index_t batch,
|
||||
ck_tile::index_t max_seqlen_q,
|
||||
const void* seqstart_q_ptr,
|
||||
const void* seqstart_k_ptr,
|
||||
const void* seqlen_k_ptr,
|
||||
@@ -385,24 +368,22 @@ struct FmhaFwdSplitKVKernel
|
||||
ck_tile::index_t stride_k,
|
||||
ck_tile::index_t stride_v,
|
||||
ck_tile::index_t stride_bias,
|
||||
ck_tile::index_t stride_randval,
|
||||
ck_tile::index_t stride_o_acc,
|
||||
ck_tile::index_t nhead_stride_q,
|
||||
ck_tile::index_t nhead_stride_k,
|
||||
ck_tile::index_t nhead_stride_v,
|
||||
ck_tile::index_t nhead_stride_bias,
|
||||
ck_tile::index_t nhead_stride_randval,
|
||||
ck_tile::index_t nhead_stride_lse_acc,
|
||||
ck_tile::index_t nhead_stride_o_acc,
|
||||
ck_tile::index_t batch_stride_k,
|
||||
ck_tile::index_t batch_stride_v,
|
||||
ck_tile::index_t batch_stride_lse_acc,
|
||||
ck_tile::index_t batch_stride_o_acc,
|
||||
ck_tile::index_t split_stride_lse_acc,
|
||||
ck_tile::index_t split_stride_o_acc,
|
||||
ck_tile::index_t window_size_left,
|
||||
ck_tile::index_t window_size_right,
|
||||
ck_tile::index_t mask_type,
|
||||
float p_drop,
|
||||
bool s_randval,
|
||||
const std::tuple<uint64_t, uint64_t>& drop_seed_offset)
|
||||
ck_tile::index_t mask_type)
|
||||
{
|
||||
Kargs kargs{{q_ptr,
|
||||
k_ptr,
|
||||
@@ -410,9 +391,8 @@ struct FmhaFwdSplitKVKernel
|
||||
lse_acc_ptr,
|
||||
o_acc_ptr,
|
||||
batch,
|
||||
max_seqlen_q,
|
||||
-1, // seqlen will be updated by another pointer
|
||||
-1, //
|
||||
-1, // seqlen_q will be updated by another pointer
|
||||
-1, // seqlen_k will be updated by another pointer
|
||||
hdim_q,
|
||||
hdim_v,
|
||||
num_head_q,
|
||||
@@ -432,16 +412,18 @@ struct FmhaFwdSplitKVKernel
|
||||
nhead_stride_v,
|
||||
nhead_stride_lse_acc,
|
||||
nhead_stride_o_acc,
|
||||
batch_stride_lse_acc,
|
||||
batch_stride_o_acc,
|
||||
split_stride_lse_acc,
|
||||
split_stride_o_acc}, // args for common karg
|
||||
{}, // placeholder for bias
|
||||
{}, // placeholder for mask
|
||||
{}, // placeholder for fp8_static_quant args
|
||||
{}, // placeholder for dropout
|
||||
reinterpret_cast<const int32_t*>(seqstart_q_ptr),
|
||||
reinterpret_cast<const int32_t*>(seqstart_k_ptr),
|
||||
reinterpret_cast<const int32_t*>(seqlen_k_ptr)};
|
||||
reinterpret_cast<const int32_t*>(seqlen_k_ptr),
|
||||
batch_stride_k,
|
||||
batch_stride_v};
|
||||
|
||||
if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
|
||||
{
|
||||
@@ -464,14 +446,6 @@ struct FmhaFwdSplitKVKernel
|
||||
{
|
||||
kargs.scale_p = scale_p;
|
||||
}
|
||||
if constexpr(kHasDropout)
|
||||
{
|
||||
kargs.init_dropout(p_drop, drop_seed_offset);
|
||||
kargs.rand_val_ptr = rand_val_ptr;
|
||||
kargs.stride_randval = stride_randval;
|
||||
kargs.nhead_stride_randval = nhead_stride_randval;
|
||||
kargs.is_store_randval = s_randval;
|
||||
}
|
||||
|
||||
return kargs;
|
||||
}
|
||||
@@ -508,7 +482,6 @@ struct FmhaFwdSplitKVKernel
|
||||
long_index_t batch_offset_k = 0;
|
||||
long_index_t batch_offset_v = 0;
|
||||
long_index_t batch_offset_bias = 0;
|
||||
long_index_t batch_offset_randval = 0;
|
||||
long_index_t batch_offset_lse_acc = 0;
|
||||
const long_index_t batch_offset_o_acc =
|
||||
static_cast<long_index_t>(i_batch) * kargs.batch_stride_o_acc;
|
||||
@@ -534,14 +507,9 @@ struct FmhaFwdSplitKVKernel
|
||||
{
|
||||
batch_offset_bias = query_start * kargs.stride_bias + key_start;
|
||||
}
|
||||
if constexpr(kHasDropout)
|
||||
{
|
||||
batch_offset_randval = query_start * kargs.stride_randval;
|
||||
}
|
||||
|
||||
// get real # queries & # keys under group mode
|
||||
const auto adjusted_seqstart_q_ptr = kargs.seqstart_q_ptr + i_batch;
|
||||
kargs.seqlen_q = adjusted_seqstart_q_ptr[1] - adjusted_seqstart_q_ptr[0];
|
||||
kargs.seqlen_q = kargs.seqstart_q_ptr[i_batch + 1] - kargs.seqstart_q_ptr[i_batch];
|
||||
|
||||
// # of required blocks is different in each groups, terminate unnecessary blocks
|
||||
// earlier
|
||||
@@ -556,24 +524,36 @@ struct FmhaFwdSplitKVKernel
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto adjusted_seqstart_k_ptr = kargs.seqstart_k_ptr + i_batch;
|
||||
kargs.seqlen_k = adjusted_seqstart_k_ptr[1] - adjusted_seqstart_k_ptr[0];
|
||||
kargs.seqlen_k = kargs.seqstart_k_ptr[i_batch + 1] - kargs.seqstart_k_ptr[i_batch];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const index_t i_cache_batch = [&, i_batch_ = i_batch] {
|
||||
if constexpr(kIsPagedKV)
|
||||
{
|
||||
return i_batch_;
|
||||
}
|
||||
else
|
||||
{
|
||||
return (kargs.cache_batch_idx != nullptr ? kargs.cache_batch_idx[i_batch_]
|
||||
: i_batch_);
|
||||
}
|
||||
}();
|
||||
|
||||
batch_offset_q = static_cast<long_index_t>(i_batch) * kargs.batch_stride_q;
|
||||
batch_offset_k = static_cast<long_index_t>(i_batch) * kargs.batch_stride_k;
|
||||
batch_offset_v = static_cast<long_index_t>(i_batch) * kargs.batch_stride_v;
|
||||
batch_offset_k = static_cast<long_index_t>(i_cache_batch) * kargs.batch_stride_k;
|
||||
batch_offset_v = static_cast<long_index_t>(i_cache_batch) * kargs.batch_stride_v;
|
||||
batch_offset_lse_acc = static_cast<long_index_t>(i_batch) * kargs.batch_stride_lse_acc;
|
||||
|
||||
if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
|
||||
{
|
||||
batch_offset_bias = static_cast<long_index_t>(i_batch) * kargs.batch_stride_bias;
|
||||
}
|
||||
if constexpr(kHasDropout)
|
||||
|
||||
if(kargs.seqlen_k_ptr != nullptr)
|
||||
{
|
||||
batch_offset_randval =
|
||||
static_cast<long_index_t>(i_batch) * kargs.batch_stride_randval;
|
||||
kargs.seqlen_k = kargs.seqlen_k_ptr[i_batch];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -589,6 +569,7 @@ struct FmhaFwdSplitKVKernel
|
||||
reinterpret_cast<const VDataType*>(kargs.v_ptr) +
|
||||
static_cast<long_index_t>(i_nhead / kargs.nhead_ratio_qk) * kargs.nhead_stride_v +
|
||||
batch_offset_v;
|
||||
|
||||
OaccDataType* o_acc_ptr = reinterpret_cast<OaccDataType*>(kargs.o_acc_ptr) +
|
||||
static_cast<long_index_t>(i_nhead) * kargs.nhead_stride_o_acc +
|
||||
batch_offset_o_acc + i_split * kargs.split_stride_o_acc;
|
||||
@@ -616,10 +597,11 @@ struct FmhaFwdSplitKVKernel
|
||||
sequence<kPadSeqLenQ, kPadHeadDimQ>{});
|
||||
}
|
||||
}();
|
||||
const auto k_dram = [&]() {
|
||||
|
||||
const auto make_k_dram = [&](const KDataType* data, index_t height) {
|
||||
const auto k_dram_naive = make_naive_tensor_view<address_space_enum::global>(
|
||||
k_ptr,
|
||||
make_tuple(kargs.seqlen_k, kargs.hdim_q),
|
||||
data, // will update this pointer if using paged-kvcache
|
||||
make_tuple(height, kargs.hdim_q),
|
||||
make_tuple(kargs.stride_k, 1),
|
||||
number<FmhaPipeline::kAlignmentK>{},
|
||||
number<1>{});
|
||||
@@ -628,13 +610,24 @@ struct FmhaFwdSplitKVKernel
|
||||
k_dram_naive,
|
||||
make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kK0>{}),
|
||||
sequence<kPadSeqLenK, kPadHeadDimQ>{});
|
||||
};
|
||||
const auto k_dram = [&]() {
|
||||
if constexpr(kIsPagedKV)
|
||||
{
|
||||
return make_k_dram(nullptr, kargs.page_block_size);
|
||||
}
|
||||
else
|
||||
{
|
||||
return make_k_dram(k_ptr, kargs.seqlen_k);
|
||||
}
|
||||
}();
|
||||
const auto v_dram = [&]() {
|
||||
|
||||
const auto make_v_dram = [&](const VDataType* data, index_t length) {
|
||||
if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
|
||||
{
|
||||
const auto v_dram_naive = make_naive_tensor_view<address_space_enum::global>(
|
||||
v_ptr,
|
||||
make_tuple(kargs.seqlen_k, kargs.hdim_v),
|
||||
data, // will update this pointer if using paged-kvcache
|
||||
make_tuple(length, kargs.hdim_v),
|
||||
make_tuple(kargs.stride_v, 1),
|
||||
number<FmhaPipeline::kAlignmentV>{},
|
||||
number<1>{});
|
||||
@@ -642,7 +635,7 @@ struct FmhaFwdSplitKVKernel
|
||||
const auto v_dram_transposed =
|
||||
transform_tensor_view(v_dram_naive,
|
||||
make_tuple(make_pass_through_transform(kargs.hdim_v),
|
||||
make_pass_through_transform(kargs.seqlen_k)),
|
||||
make_pass_through_transform(length)),
|
||||
make_tuple(sequence<1>{}, sequence<0>{}),
|
||||
make_tuple(sequence<0>{}, sequence<1>{}));
|
||||
|
||||
@@ -654,8 +647,8 @@ struct FmhaFwdSplitKVKernel
|
||||
else
|
||||
{
|
||||
const auto v_dram_naive = make_naive_tensor_view<address_space_enum::global>(
|
||||
v_ptr,
|
||||
make_tuple(kargs.hdim_v, kargs.seqlen_k),
|
||||
data, // will update this pointer if using paged-kvcache
|
||||
make_tuple(kargs.hdim_v, length),
|
||||
make_tuple(kargs.stride_v, 1),
|
||||
number<FmhaPipeline::kAlignmentV>{},
|
||||
number<1>{});
|
||||
@@ -665,6 +658,76 @@ struct FmhaFwdSplitKVKernel
|
||||
make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
|
||||
sequence<kPadHeadDimV, kPadSeqLenK>{});
|
||||
}
|
||||
};
|
||||
const auto v_dram = [&]() {
|
||||
if constexpr(kIsPagedKV)
|
||||
{
|
||||
return make_v_dram(nullptr, kargs.page_block_size);
|
||||
}
|
||||
else
|
||||
{
|
||||
return make_v_dram(v_ptr, kargs.seqlen_k);
|
||||
}
|
||||
}();
|
||||
|
||||
auto k_page_block_navigator = [&, i_batch_ = i_batch, i_nhead_ = i_nhead]() {
|
||||
if constexpr(kIsPagedKV)
|
||||
{
|
||||
const auto* block_indices =
|
||||
reinterpret_cast<const int32_t*>(kargs.block_table_ptr) +
|
||||
i_batch_ * kargs.batch_stride_block_table;
|
||||
const index_t num_blocks =
|
||||
integer_divide_ceil(kargs.seqlen_k, kargs.page_block_size);
|
||||
|
||||
const long_index_t fixed_offset =
|
||||
static_cast<long_index_t>(i_nhead_ / kargs.nhead_ratio_qk) *
|
||||
kargs.nhead_stride_k;
|
||||
|
||||
return make_page_block_navigator<const KDataType, 0>(
|
||||
kargs.k_ptr,
|
||||
kargs.batch_stride_k,
|
||||
fixed_offset,
|
||||
block_indices,
|
||||
num_blocks,
|
||||
kargs.page_block_size,
|
||||
k_dram,
|
||||
make_k_dram(nullptr,
|
||||
kargs.seqlen_k - (num_blocks - 1) * kargs.page_block_size));
|
||||
}
|
||||
else
|
||||
{
|
||||
return make_page_block_navigator(k_dram);
|
||||
}
|
||||
}();
|
||||
|
||||
auto v_page_block_navigator = [&, i_batch_ = i_batch, i_nhead_ = i_nhead]() {
|
||||
if constexpr(kIsPagedKV)
|
||||
{
|
||||
const auto* block_indices =
|
||||
reinterpret_cast<const int32_t*>(kargs.block_table_ptr) +
|
||||
i_batch_ * kargs.batch_stride_block_table;
|
||||
const index_t num_blocks =
|
||||
integer_divide_ceil(kargs.seqlen_k, kargs.page_block_size);
|
||||
|
||||
const long_index_t fixed_offset =
|
||||
static_cast<long_index_t>(i_nhead_ / kargs.nhead_ratio_qk) *
|
||||
kargs.nhead_stride_v;
|
||||
|
||||
return make_page_block_navigator<const VDataType, 1>(
|
||||
kargs.v_ptr,
|
||||
kargs.batch_stride_v,
|
||||
fixed_offset,
|
||||
block_indices,
|
||||
num_blocks,
|
||||
kargs.page_block_size,
|
||||
v_dram,
|
||||
make_v_dram(nullptr,
|
||||
kargs.seqlen_k - (num_blocks - 1) * kargs.page_block_size));
|
||||
}
|
||||
else
|
||||
{
|
||||
return make_page_block_navigator(v_dram);
|
||||
}
|
||||
}();
|
||||
|
||||
auto q_dram_window = make_tile_window(
|
||||
@@ -678,13 +741,11 @@ struct FmhaFwdSplitKVKernel
|
||||
}(),
|
||||
{i_m0, 0});
|
||||
|
||||
auto k_dram_window = make_tile_window(
|
||||
k_dram, make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kK0>{}), {0, 0});
|
||||
auto k_dram_window_lengths =
|
||||
make_tuple(number<FmhaPipeline::kN0>{}, number<FmhaPipeline::kK0>{});
|
||||
auto v_dram_window_lengths =
|
||||
make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{});
|
||||
|
||||
auto v_dram_window =
|
||||
make_tile_window(v_dram,
|
||||
make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
|
||||
{i_n1, 0});
|
||||
/// FIXME: Before C++20, capturing structured binding variables are not supported. Remove
|
||||
/// following copy capture of the 'i_nhead' if in C++20
|
||||
const auto bias_dram_window = [&, i_nhead_ = i_nhead]() {
|
||||
@@ -741,62 +802,6 @@ struct FmhaFwdSplitKVKernel
|
||||
return make_tile_window(lse_acc_dram, lse_acc_dram_window_lengths, {i_m0});
|
||||
}();
|
||||
|
||||
// dropout
|
||||
float rp_undrop = 1;
|
||||
uint8_t p_undrop_in_uint8_t = std::numeric_limits<uint8_t>::max();
|
||||
uint64_t drop_seed = 0;
|
||||
uint64_t drop_offset = 0;
|
||||
bool is_store_randval = false;
|
||||
|
||||
if constexpr(kHasDropout)
|
||||
{
|
||||
rp_undrop = kargs.rp_undrop;
|
||||
p_undrop_in_uint8_t = kargs.p_undrop_in_uint8_t;
|
||||
drop_seed = kargs.drop_seed;
|
||||
drop_offset = kargs.drop_offset;
|
||||
is_store_randval = kargs.is_store_randval;
|
||||
}
|
||||
BlockDropout dropout(i_batch,
|
||||
i_nhead,
|
||||
kargs.num_head_q,
|
||||
drop_seed,
|
||||
drop_offset,
|
||||
rp_undrop,
|
||||
p_undrop_in_uint8_t,
|
||||
is_store_randval);
|
||||
|
||||
auto randval_dram_window = [&, i_nhead_ = i_nhead]() {
|
||||
constexpr auto randval_dram_window_lengths =
|
||||
make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kN0>{});
|
||||
if constexpr(kHasDropout)
|
||||
{
|
||||
RandValOutputDataType* rand_val_ptr =
|
||||
reinterpret_cast<RandValOutputDataType*>(kargs.rand_val_ptr) +
|
||||
static_cast<long_index_t>(i_nhead_) * kargs.nhead_stride_randval +
|
||||
batch_offset_randval;
|
||||
|
||||
const auto randval_dram = [&]() {
|
||||
const auto randval_dram_naive =
|
||||
make_naive_tensor_view<address_space_enum::global>(
|
||||
rand_val_ptr,
|
||||
make_tuple(kargs.seqlen_q, kargs.seqlen_k),
|
||||
make_tuple(kargs.stride_randval, 1),
|
||||
number<1>{},
|
||||
number<1>{});
|
||||
|
||||
return pad_tensor_view(randval_dram_naive,
|
||||
randval_dram_window_lengths,
|
||||
sequence<kPadSeqLenQ, kPadSeqLenK>{});
|
||||
}();
|
||||
|
||||
return make_tile_window(randval_dram, randval_dram_window_lengths, {i_m0, 0});
|
||||
}
|
||||
else
|
||||
{
|
||||
return make_null_tile_window(randval_dram_window_lengths);
|
||||
}
|
||||
}();
|
||||
|
||||
FmhaMask mask = [&]() {
|
||||
if constexpr(kHasMask)
|
||||
return ck_tile::make_generic_attention_mask_from_lr_window<FmhaMask>(
|
||||
@@ -823,16 +828,16 @@ struct FmhaFwdSplitKVKernel
|
||||
#endif
|
||||
if constexpr(kHasMask)
|
||||
{
|
||||
return make_alibi_from_lr_mask<SaccDataType, true>(slope,
|
||||
kargs.window_size_left,
|
||||
kargs.window_size_right,
|
||||
kargs.seqlen_q,
|
||||
kargs.seqlen_k,
|
||||
kargs.mask_type);
|
||||
return make_alibi_from_lr_mask<SaccDataType, true, 32>(slope,
|
||||
kargs.window_size_left,
|
||||
kargs.window_size_right,
|
||||
kargs.seqlen_q,
|
||||
kargs.seqlen_k,
|
||||
kargs.mask_type);
|
||||
}
|
||||
else
|
||||
{
|
||||
return Alibi<SaccDataType, true>{
|
||||
return Alibi<SaccDataType, true, 32>{
|
||||
slope, kargs.seqlen_q, kargs.seqlen_k, AlibiMode::FROM_BOTTOM_RIGHT};
|
||||
}
|
||||
}
|
||||
@@ -847,13 +852,14 @@ struct FmhaFwdSplitKVKernel
|
||||
{
|
||||
return FmhaPipeline{}(q_dram_window,
|
||||
identity{}, // q_element_func
|
||||
k_dram_window,
|
||||
k_dram_window_lengths,
|
||||
k_page_block_navigator,
|
||||
identity{}, // k_element_func
|
||||
v_dram_window,
|
||||
v_dram_window_lengths,
|
||||
v_page_block_navigator,
|
||||
identity{}, // v_element_func
|
||||
bias_dram_window,
|
||||
identity{}, // bias_element_func
|
||||
randval_dram_window,
|
||||
lse_acc_dram_window,
|
||||
identity{}, // lse_element_func
|
||||
identity{}, // s_acc_element_func
|
||||
@@ -864,24 +870,23 @@ struct FmhaFwdSplitKVKernel
|
||||
mask,
|
||||
position_encoding,
|
||||
kargs.scale_s,
|
||||
smem_ptr,
|
||||
dropout);
|
||||
smem_ptr);
|
||||
}
|
||||
else
|
||||
{
|
||||
return FmhaPipeline{}(q_dram_window,
|
||||
k_dram_window,
|
||||
v_dram_window,
|
||||
k_dram_window_lengths,
|
||||
k_page_block_navigator,
|
||||
v_dram_window_lengths,
|
||||
v_page_block_navigator,
|
||||
bias_dram_window,
|
||||
randval_dram_window,
|
||||
lse_acc_dram_window,
|
||||
kargs.num_splits,
|
||||
i_split_,
|
||||
mask,
|
||||
position_encoding,
|
||||
kargs.scale_s,
|
||||
smem_ptr,
|
||||
dropout);
|
||||
smem_ptr);
|
||||
}
|
||||
}();
|
||||
|
||||
|
||||
Reference in New Issue
Block a user