mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-05 06:01:23 +00:00
* Use dictionary to config all the functions * Add init codegen logic for fmha fwd appendkv * Call HIP_CHECK_ERROR() macro to get real source info * Setup meaningfull arguments * Sync kernel name with the codegen * Add knew/vnew tensors to the kernel argument * Fix wrong K values after appending * Fix vnew append errro * Extract common logics * Fix Vnew tile dstr for row major case * Conditionally add fwd_splitkv API in fmha_fwd example * Conditionally add call to fmha_fwd_splitkv() * Remove "EXAMPLE_" prefix of cmake variables * Regsiter API handlers automatically * Early return if 0 < s_k_new is not supported * Show message if we are ignoring option * Unify CMakeLists.txt coding style * Set num_splits=1 if split-kv is not supported * Add length/stride getters for HostTensor * Add RoPE example utilities * Add reference_rotary_position_embedding() (not implemented) * Finish reference_rotary_position_embedding() impl * Fix typo of HostTensor<>::get_length() * Fix compilation errors * Fix wrong answer when interleaved=false * Fix wrong answer when interleaved=true * Append K/V in the host verification code * Simplify K appending logics * Simplify v_host_ref definition * Reduce input/output dimensions * Rename function: add "batched" prefix * Apply RoPE on host side * Rename RoPE utility function * Fix wrong tensor size * Avoid invoking deprecated method 'find_module' * Pass RoPE kernel args * Create Rotary Cos/Sin tile windows in kernel * Add compute data type alias for RoPE * Randomly generate seqlen_knew if needed * Fix seqlen_knew enabling check logic * Add minimum seqlen_k to generate compliance kvcache * Fix compilation error in debug mode * Fix wrong boundaries * Fix wrong seqlen_k for kvcache * Rename variables used in distributio encoding * Fix rotary cos/sin tensor/tile size * Add constraint to the rotary_dim option * Remove unused inner namespace * Add dram distribution for rotary_cos/rotary_sin (interleaved) * Only apply interleaved RoPE on Knew for now * 
Fix wrong thread starting offset * Instantiate multiple kernels for RoPE approaches * Clean-up pipeline * Fix error in RoPE host reference * Handle RoPE half-rotated logics * Support 8x rotary_dim under half-rotated RoPE * Add comment * Apply elementwise function to the loaded tiles * Unify parameter/variable naming style * Remove constness from q_ptr * Add code blocks for q_tile * Apply RoPE to q_tile * Remove debug print code in kernel * Fix wrong knew/vnew appending positions * Use better naming for tile indices * Add make_tile_window() for adding distribution only * Skip code if # of block is more than needed * Move thread locating logics into policy * Remove always true static_assert() * Rename header * Rename RotaryEmbeddingEnum * Extract rotary embedding logic out * Re-order parameters * Align naming of some tile size constants * Rename more tile size constants * Fix wrong grid size * Fix wrong shape of knew_host/vnew_host * Fix wrong index into knew_host/vnew_host * Fix wrong rotary_cos/rotary_sin memory size for Q * Extract Q/Knew vector size to helper methods * Use different rotary_cos/rotary_sin distr for Q/Knew * Update host/device specifiers * Fix wrong data type for Q rotary_cos/rotary_sin * Remove RoPEComputeDataType type alias * Shift rotary_cos/rotary_sin by cache_seqlen_k * Add comment for why I just 't' for all padding flags * Align commit message to the real comment * Fix wrong pipeline * Rename utility function * Disable host verification if API not exist * Fix wrong rope key for fp8 pipeline * Allow only apply RoPE on Q (without append KV) * Add append-kv smoke tests * Remove debug statements * Remove more debug statements * Re-arrange the 'set +x' command * Remove no-longer used method in pipeline * Add missing init code * Refine pipeline padding settings * Enlarge rotary_dim limit (8 -> 16) * Enlarge KPerThread for rotary_interleaved=false * Update rotary_dim range in smoke_test_fwd.sh * Add template argument 'kIsPagedKV' for splitkv kernels 
* Launch splitkv kernel if given page_block_size * Fix wrong kernel name * Fix seqlen_k_min for pre-fill case (1 -> 0) * Add copy_const<> type trait * Add another make_tile_window() * Introduce 'TileWindowNavigator' types * Simplify TileWindowNavigator interfaces * Fix tile window navigation bugs * Disable calling fmha_fwd() * Remove ununnecessary data members * Simplify more make_tile_window() overloads * Move V tile through TileWindowNavigator * Fix uneven split checking logic * Move code after decide seqlen_q/seqlen_k * Make sure we always start reading complete tile * Use 128 as minimus page_block_size * Fix wrong origin for bias * Add batch_stride_k/batch_stride_v in group mode * Unify origin * Add missing kernel arguments for group mode * Add paged-kv codegen logic for appendkv kernels * Add block_table kernel args for appendkv kernel * Add tile navigators to the appendkv kernel * Fix wrong tensor descriptor lengths * Pass re-created tile window to pipeline * Fix wrong strides for appendkv kernel * Allow transit tile_window to another page-block * Handle cross-page-block write * Donot perform write again if already in last page-block * Always add fmha_fwd() api * Add missing group mode argument * Remove debug macro usages * Rename option s_k_new to s_knew * Separate splitkv/non-splitkv args/traits * Remove fmha_fwd_dispatch() * Fix compilation errors * Remove dropout code in splitkv kernel * Allow problem types without define kHasDropout attr * Use generic lambda to init traits objects * Separate more non-splitkv & splitkv traits/args * Display more info for specific kernels * Show more detailed warning message * Rename 'max_num_blocks' to 'max_num_page_blocks' * Remove no-longer used pipeline files * Wrap code by #if directives * Move functors to the begining of validation code * Use generic lambda to init all the api traits/args * Fix wrong seqlen for kvcache * Add missing comment * Rename TileWindowNavigator to PageBlockNavigator * Only expose necessary 
methods (not attributes) * Re-order pipeline paremeters * Refine smoke_test_fwd.sh * Fix wrong arugment count * Make tile window directly via PageBlockNavigator * Remove unused template paremeter * Remove group mode from appendkv kernel * Fix skcheck logic * Fix wrong syntax in skcheck expr * Use meaningful options in smoke test * Remove options * Fix formatting * Fix more format * Re-organize bash functions * Pass cache_batch_idx to kernels * Support cache_batch_idx in example * Fix compilation error * Add more appendkv test * Add more case for appendkv * Fix unexisted attribute * Remove 0 < seqlen_knew constraint * Clarify the case in warning message * Remove macro checking * Force batch mode when invoking appendkv & splitkv apis * Fix mode overriding logics * Fix wrong parameter name * Randomize seqlen_k if use kvcache * Use randomized seqlen_k for kvcache * Avoid using too small rotary_cos & rotary_sin * Rename parameter * Add seqlen_q & seqlen_k rules * Add comment * Add more comments * Fix compilation errors * Fix typo in comment * Remove type argument * Avoid seqlen_k=0 for kvcache * Revert "Avoid seqlen_k=0 for kvcache" This reverts commit21c4df89e4. * Fix wrong uneven split checking logics * Only randomize kvcache seqlen_k if 1 < batch * Return earlier if split is empty * Revert "Only randomize kvcache seqlen_k if 1 < batch" This reverts commitb9a4ab0d7e. * Re-order seqlen_k_start adjustment logics * Fix compilation errors * Re-format script * Find executable from folder automatically * Fix kvcache seqlen_k generating logic * Make comment more clear * Fix wrong knew/vew appending logic on host * Add s_barrier to sync threads * Revert "Add s_barrier to sync threads" This reverts commitd3f550f30c. 
* Support only using 1 row of rotary_cos/rotary_sin * Rotate Q in different way * Unify tensor view creation logics * Fix wrong argument * Add mask to switch how we use the rotary_cos/sin * Move attr from traits to problem * Move has_mask to fmha_fwd_appendkv_args * Support use uint32_t as SAD operand in Alibi<> * Use sad_u32() in splitkv kernels * Store tensor views in PageBlockNavigator * Use stored tensor view to update tile windows * Enlarge tensor view size * Remove debug code * Fix wrong tensor view size * Wrap tensor view into PageBlockNavigator * Add DataType member to PageBlockNavigator * Remove unnecessary member functions * Refind macro use * Fix typo * Add blank line between directives and actual code * Re-format files * Remove type in comment --------- Co-authored-by: carlushuang <carlus.huang@amd.com> Co-authored-by: rocking <ChunYu.Lai@amd.com>
256 lines
8.7 KiB
C++
256 lines
8.7 KiB
C++
// SPDX-License-Identifier: MIT
|
|
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
#pragma once
|
|
|
|
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <limits>
#include <numeric>
#include <optional>
#include <ostream>
#include <random>
#include <sstream>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "ck_tile/core/container/span.hpp"
|
|
|
|
// Problem layout mode for the FMHA example drivers:
// - batch: every sequence in the batch has the same (padded) length
// - group: per-sequence lengths are packed together (varlen / grouped mode)
enum class mode_enum
{
    batch = 0,
    group
};
|
|
|
|
// Print a mode_enum as its human-readable name ("batch" or "group").
std::ostream& operator<<(std::ostream& stream, mode_enum mode)
{
    if(mode == mode_enum::batch)
    {
        return stream << "batch";
    }
    return stream << "group";
}
|
|
|
|
std::vector<int32_t> to_seqstarts(ck_tile::span<const int32_t> seqlens)
|
|
{
|
|
std::vector<int32_t> seqstarts = {0};
|
|
for(int32_t seqlen : seqlens)
|
|
{
|
|
seqstarts.push_back(seqstarts.back() + seqlen);
|
|
}
|
|
assert(seqstarts.size() == seqlens.size() + 1);
|
|
return seqstarts;
|
|
}
|
|
|
|
// Generate `count` sequence lengths whose values start at the clamped average
// and are then randomly jittered by moving one unit at a time from one element
// to another. The total sum (count * clamp(seqlen_avg, min, max)) is preserved
// and every element stays inside [seqlen_min, seqlen_max].
//
// Negative seqlen_min/seqlen_max mean "no explicit bound" and fall back to
// [1, INT32_MAX]. Passing a seed makes the result reproducible.
std::vector<int32_t> generate_seqlens(unsigned count,
                                      int32_t seqlen_avg,
                                      int32_t seqlen_min = -1, // if not negative, clamp min
                                      int32_t seqlen_max = -1, // if not negative, clamp max
                                      std::optional<unsigned> seed = std::nullopt)
{
    assert(0 < count);

    // Resolve the "-1 = unbounded" defaults.
    seqlen_min = (0 < seqlen_min ? seqlen_min : 1);
    seqlen_max = (0 < seqlen_max ? seqlen_max : std::numeric_limits<int32_t>::max());
    assert(seqlen_min <= seqlen_max);

    // Every element starts at the clamped average; with count == 1 we are done.
    std::vector<int32_t> seqlens(count, std::clamp(seqlen_avg, seqlen_min, seqlen_max));

    if(1 < count)
    {
        using size_type = std::vector<int32_t>::size_type;

        std::mt19937 random_engine(seed.has_value() ? *seed : std::random_device{}());

        std::uniform_int_distribution<size_type> idx_dist(0, count - 1);
        auto next_idx = [&] { return idx_dist(random_engine); };

        // A step in [1, count - 1] guarantees receiver != donor after the modulo.
        std::uniform_int_distribution<size_type> step_dist(1, count - 1);
        auto next_step = [&] { return step_dist(random_engine); };

        for(unsigned remaining = seqlen_avg * (count / 2); remaining != 0; --remaining)
        {
            const size_type donor = next_idx();
            // never push an element below seqlen_min
            if(seqlens[donor] == seqlen_min)
            {
                continue;
            }

            const size_type receiver = (donor + next_step()) % count;
            // never push an element above seqlen_max
            if(seqlens[receiver] >= seqlen_max)
            {
                continue;
            }

            --seqlens[donor];
            ++seqlens[receiver];
        }
    }

    return seqlens;
}
|
|
|
|
// Return one integer drawn uniformly from the closed range [low, high].
// A seed makes the draw reproducible; otherwise std::random_device is used.
template <typename Int = int>
auto randint(Int low, Int high, std::optional<unsigned> seed = std::nullopt)
    -> std::enable_if_t<std::is_integral_v<Int>, Int>
{
    const unsigned seed_value = seed ? *seed : std::random_device{}();
    std::mt19937 generator(seed_value);

    return std::uniform_int_distribution<Int>{low, high}(generator);
}
|
|
|
|
// Fill [first, last) with integers drawn uniformly from the closed range
// [low, high]. A seed makes the sequence reproducible; otherwise
// std::random_device is used.
template <typename Int, typename ForwardIterator>
auto randints(ForwardIterator first,
              ForwardIterator last,
              Int low,
              Int high,
              std::optional<unsigned> seed = std::nullopt)
    -> std::enable_if_t<std::is_integral_v<Int>>
{
    std::mt19937 generator(seed ? *seed : std::random_device{}());
    std::uniform_int_distribution<Int> dist(low, high);

    for(; first != last; ++first)
    {
        *first = dist(generator);
    }
}
|
|
|
|
/*
 * Decode the seqlen strings given on the command line.
 * Example (assume batch=3):
 *   q_val=1,2,3 k_val=4,5,6 -> OK
 *   q_val=1,2,3             -> OK, k same as q
 *   q_val=1,2               -> OK, q will randomize the remaining 1 element, k same as q
 *   q_val=1,2 k_val=4,5     -> OK, q/k will randomize the remaining 1 element
 *   q_val=1,2,3,4           -> OK, but the exceeding element is ignored
 *
 *   q_val=1,2 k_val=4,5,6   -> not OK, k must have the same number of splits as q
 *   q_val=1,2 k_val=4       -> not OK, k must have the same number of splits as q
 */
// Returns (seqlen_q, seqlen_k, seqlen_kpad), each of size `batch`.
// Throws std::runtime_error if any decoded seqlen_k is below seqlen_k_min.
std::tuple<std::vector<ck_tile::index_t>,
           std::vector<ck_tile::index_t>,
           std::vector<ck_tile::index_t>>
decode_seqlen(mode_enum mode,
              ck_tile::index_t batch,
              std::string q_val,
              std::string k_val,
              std::string k_pad_val,
              ck_tile::index_t seqlen_k_min = 0,
              bool use_kvcache = false,
              std::optional<unsigned> seed = std::nullopt)
{
// shorthand: parse a std::string into a ck_tile::index_t (atoi -> 0 on garbage)
#define _S2I_(str_) static_cast<ck_tile::index_t>(std::atoi((str_).c_str()))
    if(mode == mode_enum::batch)
    {
        // Batch mode: only the first value of each comma list is used.
        ck_tile::index_t q = _S2I_(q_val);
        ck_tile::index_t k = _S2I_(k_val);

        auto s_q = std::vector<ck_tile::index_t>(batch, q);
        auto s_k = [&] {
            // Negative/absent k means "same as q".
            const ck_tile::index_t seqlen_k_max = (k < 0 ? q : k);
            std::vector<ck_tile::index_t> seqlen_ks(batch, seqlen_k_max);

            if(1 < batch && use_kvcache)
            {
                // to keep the original s_k value, we always use seqlen_k_max in first batch
                // NOTE(review): if seqlen_k_max < seqlen_k_min, this calls the
                // distribution with low > high — confirm callers guarantee
                // seqlen_k_min <= seqlen_k_max before relying on this path.
                randints(std::next(seqlen_ks.begin()),
                         seqlen_ks.end(),
                         seqlen_k_min,
                         seqlen_k_max,
                         seed);
                return seqlen_ks;
            }

            return seqlen_ks;
        }();
        auto s_kpad = std::vector<ck_tile::index_t>(batch, -1); // TODO: batch not support k_padding

        // s_k should be greater than or equal to seqlen_k_min if provided
        // NOTE(review): only s_k.back() is checked; randomized entries are
        // >= seqlen_k_min by construction, so this catches the uniform case.
        if(s_k.back() < seqlen_k_min)
        {
            std::ostringstream msg;
            msg << __FILE__ << ":" << __LINE__ << ": seqlen_k (=" << s_k.back()
                << ") is less than minimum seqlen_k (=" << seqlen_k_min << ")";
            throw std::runtime_error(msg.str());
        }

        return std::make_tuple(s_q, s_k, s_kpad);
    }
    else
    {
        // Group mode: walk the three comma-separated lists in lock-step,
        // keeping an independent cursor into each string.
        ck_tile::index_t idx = 0;
        std::string::size_type pos_q = 0;
        std::string::size_type pos_k = 0;
        std::string::size_type pos_kp = 0;
        std::vector<ck_tile::index_t> s_q;
        std::vector<ck_tile::index_t> s_k;
        std::vector<ck_tile::index_t> s_kpad;
        while(true)
        {
            auto found_q = q_val.find(',', pos_q);
            auto found_k = k_val.find(',', pos_k);
            auto found_kp = k_pad_val.find(',', pos_kp);

            // When no ',' is found, found_* == npos and substr() takes the
            // rest of the string (npos as the count means "to the end").
            ck_tile::index_t q = _S2I_(
                q_val.substr(pos_q, found_q == std::string::npos ? found_q : found_q - pos_q));
            ck_tile::index_t k = _S2I_(
                k_val.substr(pos_k, found_k == std::string::npos ? found_k : found_k - pos_k));
            ck_tile::index_t kp = _S2I_(k_pad_val.substr(
                pos_kp, found_kp == std::string::npos ? found_kp : found_kp - pos_kp));

            s_q.push_back(q);
            s_k.push_back(k < 0 ? q : k); // negative/absent k means "same as q"
            s_kpad.push_back(kp);

            // s_k should be greater than or equal to seqlen_k_min
            if(s_k.back() < seqlen_k_min)
            {
                std::ostringstream msg;
                msg << __FILE__ << ":" << __LINE__ << ": seqlen_k (=" << s_k.back()
                    << ") is less than minimum seqlen_k (=" << seqlen_k_min << ")";
                throw std::runtime_error(msg.str());
            }

            idx++;
            // Stop when q runs out of values or we already have `batch` entries
            // (extra values in the lists are ignored).
            if(found_q == std::string::npos || idx >= batch)
            {
                break;
            }
            pos_q = found_q + 1;
            // If k / k_pad ran out of values early, keep re-reading the last one.
            pos_k = found_k == std::string::npos ? pos_k : found_k + 1;
            pos_kp = found_kp == std::string::npos ? pos_kp : found_kp + 1;
        }
        if(idx < batch)
        {
            // Fewer values than batch: randomize the remaining entries around
            // the last explicit value, bounded by the last k-pad value.
            auto rem_q = generate_seqlens(batch - idx, s_q.back(), 1, s_kpad.back(), seed);
            auto rem_k =
                generate_seqlens(batch - idx, s_k.back(), seqlen_k_min, s_kpad.back(), seed);

            s_q.insert(s_q.end(), rem_q.begin(), rem_q.end());
            s_k.insert(s_k.end(), rem_k.begin(), rem_k.end());
            s_kpad.insert(s_kpad.end(), batch - idx, s_kpad.back());
        }
        return std::make_tuple(s_q, s_k, s_kpad);
    }
#undef _S2I_
}
|
|
|
|
// Read an integer from environment variable `var_name`; return `default_int`
// when the variable is unset. (A set-but-non-numeric value parses to 0 via atoi.)
int env_get_int(const char* var_name, int default_int)
{
    const char* value = std::getenv(var_name);
    return value ? std::atoi(value) : default_int;
}
|
|
|
|
// Fill [first, last) with value, value+1, value+2, ... and then shuffle the
// range into a uniformly random permutation. A seed makes the permutation
// reproducible; otherwise std::random_device is used.
template <typename RandomAccessIterator, typename Int>
std::enable_if_t<std::is_integral_v<Int>> iota_shuffle(RandomAccessIterator first,
                                                       RandomAccessIterator last,
                                                       Int value,
                                                       std::optional<unsigned> seed = std::nullopt)
{
    Int next_value = value;
    for(auto it = first; it != last; ++it, ++next_value)
    {
        *it = next_value;
    }

    std::mt19937 engine(seed ? *seed : std::random_device{}());
    std::shuffle(first, last, engine);
}
|