mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-01 12:11:19 +00:00
This reverts commit 2cbbf5dcb3.
This commit is contained in:
@@ -707,8 +707,8 @@ struct FmhaBatchPrefillWithPagedKVCacheKernel
|
||||
// divide problem
|
||||
const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
|
||||
|
||||
-const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
|
||||
-const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
|
||||
+const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
|
||||
+const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
|
||||
|
||||
long_index_t batch_offset_q = 0;
|
||||
long_index_t batch_offset_bias = 0;
|
||||
|
||||
@@ -690,7 +690,7 @@ struct FmhaBwdDQDKDVKernel
|
||||
// divide problem
|
||||
const auto [i_tile_n, i_nhead, i_batch] = GetTileIndex();
|
||||
|
||||
-const index_t i_n0 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN0);
|
||||
+const index_t i_n0 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN0);
|
||||
|
||||
long_index_t batch_offset_q = 0;
|
||||
long_index_t batch_offset_k = 0;
|
||||
@@ -1338,7 +1338,7 @@ struct FmhaBwdOGradDotOKernel
|
||||
// divide problem
|
||||
const auto [i_tile_m, i_nhead, i_batch] = GetTileIndex();
|
||||
|
||||
-const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * kM0);
|
||||
+const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * kM0);
|
||||
|
||||
long_index_t batch_offset_o = 0;
|
||||
long_index_t batch_offset_do = 0;
|
||||
@@ -1618,7 +1618,7 @@ struct FmhaBwdConvertQGradKernel
|
||||
// divide problem
|
||||
const auto [i_tile_m, i_nhead, i_batch] = GetTileIndex();
|
||||
|
||||
-const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * kM0);
|
||||
+const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * kM0);
|
||||
|
||||
long_index_t batch_offset_dq = 0;
|
||||
long_index_t batch_offset_dq_acc = 0;
|
||||
|
||||
@@ -262,8 +262,8 @@ struct FmhaFwdAppendKVKernel
|
||||
// divide problem
|
||||
const auto [i_tile, i_nhead, i_batch] = GetTileIndex(kargs);
|
||||
|
||||
-const index_t i_m0 = amd_wave_read_first_lane(i_tile * FmhaPipeline::kM0);
|
||||
-const index_t i_n0 = amd_wave_read_first_lane(i_tile * FmhaPipeline::kN0);
|
||||
+const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile * FmhaPipeline::kM0);
|
||||
+const index_t i_n0 = __builtin_amdgcn_readfirstlane(i_tile * FmhaPipeline::kN0);
|
||||
|
||||
const index_t i_cache_batch = [&, i_batch_ = i_batch] {
|
||||
if constexpr(kIsPagedKV)
|
||||
|
||||
@@ -1060,8 +1060,8 @@ struct FmhaFwdKernel
|
||||
// divide problem
|
||||
const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
|
||||
|
||||
-const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
|
||||
-const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
|
||||
+const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
|
||||
+const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
|
||||
|
||||
long_index_t batch_offset_q = 0;
|
||||
long_index_t batch_offset_k = 0;
|
||||
|
||||
@@ -880,8 +880,8 @@ struct FmhaFwdPagedKVKernel
|
||||
// divide problem
|
||||
const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
|
||||
|
||||
-const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
|
||||
-const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
|
||||
+const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
|
||||
+const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
|
||||
|
||||
long_index_t batch_offset_q = 0;
|
||||
long_index_t batch_offset_k = 0;
|
||||
|
||||
@@ -281,8 +281,8 @@ struct FmhaFwdSplitKVCombineKernel
|
||||
// divide problem
|
||||
const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
|
||||
|
||||
-const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
|
||||
-const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
|
||||
+const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
|
||||
+const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
|
||||
|
||||
long_index_t batch_offset_lse_acc = 0;
|
||||
long_index_t batch_offset_o_acc = 0;
|
||||
|
||||
@@ -589,8 +589,8 @@ struct FmhaFwdSplitKVKernel
|
||||
// divide problem
|
||||
const auto [i_tile_m, i_tile_n, i_split, i_nhead, i_batch] = GetTileIndex(kargs);
|
||||
|
||||
-const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
|
||||
-const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
|
||||
+const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
|
||||
+const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
|
||||
|
||||
long_index_t batch_offset_q = 0;
|
||||
long_index_t batch_offset_k = 0; // unused for paged-kvcache
|
||||
|
||||
@@ -361,8 +361,8 @@ struct FmhaFwdV3Kernel
|
||||
// divide problem
|
||||
const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
|
||||
|
||||
-const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
|
||||
-const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
|
||||
+const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
|
||||
+const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
|
||||
|
||||
long_index_t batch_offset_q = 0;
|
||||
long_index_t batch_offset_k = 0;
|
||||
|
||||
@@ -320,9 +320,9 @@ struct BlockFmhaFwdPagedKVPipelineQRKSVS
|
||||
k_block_tile = load_tile(k_dram_window);
|
||||
}
|
||||
auto physical_next_block_id_k =
|
||||
-amd_wave_read_first_lane(k_page_block_navigator.prefetch_table_id(
|
||||
+__builtin_amdgcn_readfirstlane(k_page_block_navigator.prefetch_table_id(
|
||||
i_page_block_k, k_dram_block_window, {kN0, 0}));
|
||||
-auto physical_next_block_id_v = amd_wave_read_first_lane(
|
||||
+auto physical_next_block_id_v = __builtin_amdgcn_readfirstlane(
|
||||
v_page_block_navigator.prefetch_table_id(i_page_block_v, v_dram_window, {0, kK1}));
|
||||
|
||||
if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
|
||||
|
||||
@@ -321,9 +321,9 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
|
||||
k_block_tile = load_tile(k_dram_window);
|
||||
}
|
||||
auto physical_next_block_id_k =
|
||||
-amd_wave_read_first_lane(k_page_block_navigator.prefetch_table_id(
|
||||
+__builtin_amdgcn_readfirstlane(k_page_block_navigator.prefetch_table_id(
|
||||
i_page_block_k, k_dram_block_window, {kN0, 0}));
|
||||
-auto physical_next_block_id_v = amd_wave_read_first_lane(
|
||||
+auto physical_next_block_id_v = __builtin_amdgcn_readfirstlane(
|
||||
v_page_block_navigator.prefetch_table_id(i_page_block_v, v_dram_window, {0, kK1}));
|
||||
|
||||
if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
|
||||
@@ -618,7 +618,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
|
||||
&i_page_block_v_ = i_page_block_v,
|
||||
&v_dram_window_ = v_dram_window](auto i_k1) {
|
||||
auto physical_next_block_id_v_ =
|
||||
-amd_wave_read_first_lane(v_page_block_navigator.prefetch_table_id(
|
||||
+__builtin_amdgcn_readfirstlane(v_page_block_navigator.prefetch_table_id(
|
||||
i_page_block_v_, v_dram_window_, {0, kK1}));
|
||||
const auto v = load_tile(v_dram_window_); // load next v
|
||||
block_sync_lds();
|
||||
|
||||
Reference in New Issue
Block a user