mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
This reverts commit 2cbbf5dcb3.
This commit is contained in:
@@ -169,27 +169,27 @@ struct BatchedGemmKernel
|
||||
CK_TILE_DEVICE void operator()(BatchedGemmKernelArgs kargs) const
|
||||
{
|
||||
const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockIdx.x);
|
||||
const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
|
||||
const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
|
||||
const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
|
||||
const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
|
||||
|
||||
const auto i_batch = amd_wave_read_first_lane(blockIdx.y);
|
||||
const auto i_splitk = amd_wave_read_first_lane(blockIdx.z);
|
||||
const auto i_batch = __builtin_amdgcn_readfirstlane(blockIdx.y);
|
||||
const auto i_splitk = __builtin_amdgcn_readfirstlane(blockIdx.z);
|
||||
|
||||
const typename UniversalGemmKernel::SplitKBatchOffset splitk_batch_offset(kargs, i_splitk);
|
||||
|
||||
// options
|
||||
const auto batch_stride_A = amd_wave_read_first_lane(kargs.batch_stride_A);
|
||||
const auto batch_offset_A = amd_wave_read_first_lane(i_batch * batch_stride_A);
|
||||
const auto batch_stride_A = __builtin_amdgcn_readfirstlane(kargs.batch_stride_A);
|
||||
const auto batch_offset_A = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_A);
|
||||
const ADataType* a_ptr = static_cast<const ADataType*>(kargs.as_ptr[0]) + batch_offset_A +
|
||||
splitk_batch_offset.as_k_split_offset[0];
|
||||
|
||||
const auto batch_stride_B = amd_wave_read_first_lane(kargs.batch_stride_B);
|
||||
const auto batch_offset_B = amd_wave_read_first_lane(i_batch * batch_stride_B);
|
||||
const auto batch_stride_B = __builtin_amdgcn_readfirstlane(kargs.batch_stride_B);
|
||||
const auto batch_offset_B = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_B);
|
||||
const BDataType* b_ptr = static_cast<const BDataType*>(kargs.bs_ptr[0]) + batch_offset_B +
|
||||
splitk_batch_offset.bs_k_split_offset[0];
|
||||
|
||||
const auto batch_stride_E = amd_wave_read_first_lane(kargs.batch_stride_E);
|
||||
const auto batch_offset_C = amd_wave_read_first_lane(i_batch * batch_stride_E);
|
||||
const auto batch_stride_E = __builtin_amdgcn_readfirstlane(kargs.batch_stride_E);
|
||||
const auto batch_offset_C = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_E);
|
||||
CDataType* c_ptr = static_cast<CDataType*>(kargs.e_ptr) + batch_offset_C;
|
||||
|
||||
// allocate LDS
|
||||
|
||||
@@ -73,8 +73,8 @@ struct GemmTile2DPartitioner
|
||||
CK_TILE_DEVICE static auto
|
||||
GetOutputTileIndex(index_t blockIdx, index_t blockIdy) noexcept -> const tuple<index_t, index_t>
|
||||
{
|
||||
const index_t iM = amd_wave_read_first_lane(blockIdx);
|
||||
const index_t iN = amd_wave_read_first_lane(blockIdy);
|
||||
const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx);
|
||||
const index_t iN = __builtin_amdgcn_readfirstlane(blockIdy);
|
||||
return make_tuple(iM, iN);
|
||||
}
|
||||
};
|
||||
@@ -143,8 +143,8 @@ struct GemmTile1DPartitioner
|
||||
{
|
||||
const index_t NBlocks = integer_divide_ceil(N_, NPerBlock);
|
||||
|
||||
const index_t iM = amd_wave_read_first_lane(blockIdx / NBlocks);
|
||||
const index_t iN = amd_wave_read_first_lane(blockIdx - iM * NBlocks);
|
||||
const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx / NBlocks);
|
||||
const index_t iN = __builtin_amdgcn_readfirstlane(blockIdx - iM * NBlocks);
|
||||
return make_tuple(iM, iN);
|
||||
}
|
||||
|
||||
|
||||
@@ -272,8 +272,8 @@ struct GroupedGemmKernel
|
||||
|
||||
const auto [iM, iN] = block_idx_2d;
|
||||
|
||||
const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
|
||||
const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
|
||||
const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
|
||||
const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
|
||||
|
||||
const typename Base::SplitKBatchOffset splitk_batch_offset(kargs, block_idx_z);
|
||||
|
||||
@@ -358,8 +358,8 @@ struct GroupedGemmKernel
|
||||
const auto& d_block_window = gemm_tile_windows.at(Base::I2);
|
||||
|
||||
// Get hot-loop and tail configuration
|
||||
const index_t num_loop =
|
||||
amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
|
||||
const index_t num_loop = __builtin_amdgcn_readfirstlane(
|
||||
TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
|
||||
const bool has_hot_loop = GemmPipeline::BlockHasHotloop(num_loop);
|
||||
const TailNumber tail_num = GemmPipeline::GetBlockLoopTailNum(num_loop);
|
||||
|
||||
@@ -416,8 +416,8 @@ struct GroupedGemmKernel
|
||||
const auto& d_block_window = gemm_tile_windows.at(Base::I2);
|
||||
|
||||
// Get hot-loop and tail configuration
|
||||
const index_t num_loop =
|
||||
amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
|
||||
const index_t num_loop = __builtin_amdgcn_readfirstlane(
|
||||
TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
|
||||
const TailNumber tail_num = GemmPipeline::GetBlockLoopTailNum(num_loop);
|
||||
|
||||
// Run GEMM pipeline with compile-time branching
|
||||
|
||||
@@ -271,8 +271,8 @@ struct StreamKKernel
|
||||
uint32_t block_idx = ck_tile::get_block_1d_id();
|
||||
|
||||
bool is_padding_block =
|
||||
amd_wave_read_first_lane(block_idx >= kargs.tile_partitioner.sk_num_blocks &&
|
||||
block_idx < kargs.tile_partitioner.dp_start_block_idx);
|
||||
__builtin_amdgcn_readfirstlane(block_idx >= kargs.tile_partitioner.sk_num_blocks &&
|
||||
block_idx < kargs.tile_partitioner.dp_start_block_idx);
|
||||
|
||||
// Padding blocks make it such that the DP blocks are aligned with the number of CUs; they
|
||||
// should not partake in the GEMM
|
||||
@@ -289,7 +289,7 @@ struct StreamKKernel
|
||||
{
|
||||
// Determine the number of macro tiles in A and B this WG is responsible for in the
|
||||
// current C macro tile.
|
||||
uint32_t current_iter_length = amd_wave_read_first_lane(
|
||||
uint32_t current_iter_length = __builtin_amdgcn_readfirstlane(
|
||||
kargs.tile_partitioner.GetCurrentIterLength(iter_start, iter_end));
|
||||
|
||||
// Determine the 1D tile_idx and the iter_offset for this WG.
|
||||
|
||||
@@ -326,19 +326,19 @@ struct UniversalGemmKernel
|
||||
__device__ SplitKBatchOffset(const KernelArgs& kargs, const std::size_t k_id = blockIdx.z)
|
||||
{
|
||||
constexpr auto K1 = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});
|
||||
const index_t K_t = amd_wave_read_first_lane(kargs.k_batch * K1);
|
||||
const index_t KRead = amd_wave_read_first_lane((kargs.K + K_t - 1) / K_t * K1);
|
||||
const index_t K_t = __builtin_amdgcn_readfirstlane(kargs.k_batch * K1);
|
||||
const index_t KRead = __builtin_amdgcn_readfirstlane((kargs.K + K_t - 1) / K_t * K1);
|
||||
|
||||
static_for<0, NumATensor, 1>{}([&](auto index) {
|
||||
using AiLayout = remove_cvref_t<std::tuple_element_t<index.value, AsLayout>>;
|
||||
if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, AiLayout>)
|
||||
{
|
||||
as_k_split_offset[index] = amd_wave_read_first_lane(k_id * KRead);
|
||||
as_k_split_offset[index] = __builtin_amdgcn_readfirstlane(k_id * KRead);
|
||||
}
|
||||
else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, AiLayout>)
|
||||
{
|
||||
as_k_split_offset[index] =
|
||||
amd_wave_read_first_lane(k_id * KRead * kargs.stride_As[index]);
|
||||
__builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_As[index]);
|
||||
}
|
||||
});
|
||||
|
||||
@@ -347,21 +347,21 @@ struct UniversalGemmKernel
|
||||
if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, BiLayout>)
|
||||
{
|
||||
bs_k_split_offset[index] =
|
||||
amd_wave_read_first_lane(k_id * KRead * kargs.stride_Bs[index]);
|
||||
__builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_Bs[index]);
|
||||
}
|
||||
else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, BiLayout>)
|
||||
{
|
||||
bs_k_split_offset[index] = amd_wave_read_first_lane(k_id * KRead);
|
||||
bs_k_split_offset[index] = __builtin_amdgcn_readfirstlane(k_id * KRead);
|
||||
}
|
||||
});
|
||||
|
||||
if(k_id < static_cast<uint32_t>(kargs.k_batch - 1))
|
||||
{
|
||||
splitted_k = amd_wave_read_first_lane(KRead);
|
||||
splitted_k = __builtin_amdgcn_readfirstlane(KRead);
|
||||
}
|
||||
else
|
||||
{
|
||||
splitted_k = amd_wave_read_first_lane(kargs.K - KRead * (kargs.k_batch - 1));
|
||||
splitted_k = __builtin_amdgcn_readfirstlane(kargs.K - KRead * (kargs.k_batch - 1));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -970,8 +970,8 @@ struct UniversalGemmKernel
|
||||
const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
|
||||
auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
|
||||
|
||||
const index_t num_loop =
|
||||
amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
|
||||
const index_t num_loop = __builtin_amdgcn_readfirstlane(
|
||||
TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
|
||||
|
||||
// Run GEMM cooperatively by whole workgroup.
|
||||
const auto& as_block_window = gemm_tile_windows.at(I0);
|
||||
@@ -1026,8 +1026,8 @@ struct UniversalGemmKernel
|
||||
const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
|
||||
auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
|
||||
|
||||
const index_t num_loop =
|
||||
amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
|
||||
const index_t num_loop = __builtin_amdgcn_readfirstlane(
|
||||
TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
|
||||
|
||||
// Run GEMM cooperatively by whole workgroup.
|
||||
const auto& as_block_window = gemm_tile_windows.at(I0);
|
||||
@@ -1052,10 +1052,10 @@ struct UniversalGemmKernel
|
||||
template <bool U = !PersistentKernel, typename = std::enable_if_t<U>>
|
||||
CK_TILE_DEVICE void operator()(KernelArgs kargs) const
|
||||
{
|
||||
const auto blockId = amd_wave_read_first_lane(blockIdx.x);
|
||||
const auto blockId = __builtin_amdgcn_readfirstlane(blockIdx.x);
|
||||
const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockId);
|
||||
const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
|
||||
const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
|
||||
const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
|
||||
const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
|
||||
|
||||
const SplitKBatchOffset splitk_batch_offset(kargs);
|
||||
|
||||
@@ -1126,22 +1126,22 @@ struct UniversalGemmKernel
|
||||
template <bool U = PersistentKernel, typename = std::enable_if_t<U>, typename = void>
|
||||
CK_TILE_DEVICE void operator()(KernelArgs kargs) const
|
||||
{
|
||||
const auto grid_size = amd_wave_read_first_lane(get_grid_size());
|
||||
const auto grid_size = __builtin_amdgcn_readfirstlane(get_grid_size());
|
||||
const auto num_tiles =
|
||||
amd_wave_read_first_lane(TilePartitioner::GridSize(kargs.M, kargs.N));
|
||||
const auto num_work = amd_wave_read_first_lane(num_tiles * kargs.k_batch);
|
||||
auto block_id = amd_wave_read_first_lane(get_block_id());
|
||||
__builtin_amdgcn_readfirstlane(TilePartitioner::GridSize(kargs.M, kargs.N));
|
||||
const auto num_work = __builtin_amdgcn_readfirstlane(num_tiles * kargs.k_batch);
|
||||
auto block_id = __builtin_amdgcn_readfirstlane(get_block_id());
|
||||
|
||||
while(block_id < num_work)
|
||||
{
|
||||
// Get the tile index for this block
|
||||
const auto tile_idx = amd_wave_read_first_lane(block_id % num_tiles);
|
||||
const auto tile_idx = __builtin_amdgcn_readfirstlane(block_id % num_tiles);
|
||||
const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(tile_idx);
|
||||
const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
|
||||
const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
|
||||
const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
|
||||
const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
|
||||
|
||||
// Get the SplitK offset for this block
|
||||
const auto k_batch = amd_wave_read_first_lane(block_id / num_tiles);
|
||||
const auto k_batch = __builtin_amdgcn_readfirstlane(block_id / num_tiles);
|
||||
const SplitKBatchOffset splitk_batch_offset(kargs, k_batch);
|
||||
|
||||
std::array<const ADataType*, NumATensor> as_ptr;
|
||||
|
||||
Reference in New Issue
Block a user