mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-20 14:59:17 +00:00
Fix a bug for qr_ks_vs_async_trload pipeline (#3271)
This commit is contained in:
@@ -1641,7 +1641,7 @@ struct FmhaFwdKernel
|
||||
// 2. use more LDS, as we want better memory latency hiding
|
||||
// If SplitKV off, we don't expect Q data reused by different ThreadGroups, bypass the
|
||||
// cache
|
||||
- constexpr bool PrefillCase = FmhaPipeline::kM0 >= 128;
|
||||
+ constexpr bool PrefillCase = FmhaPipeline::kM0 > 64;
|
||||
// divide problem
|
||||
const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ struct BlockFmhaPipelineQRKSVSAsyncTrload
|
||||
using VLayout = remove_cvref_t<typename BlockFmhaShape::VLayout>;
|
||||
static constexpr bool kQLoadOnce = true; // if q_tile load whole block length (hdim) at once
|
||||
static_assert(kQLoadOnce == Policy::QLoadOnce);
|
||||
- static constexpr bool kKLoadOnce = BlockFmhaShape::kM0 >= 64;
|
||||
+ static constexpr bool kKLoadOnce = BlockFmhaShape::kM0 > 64;
|
||||
|
||||
static constexpr index_t kBlockSize = Problem::kBlockSize;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user