Mirror of https://github.com/ROCm/composable_kernel.git
This reverts commit 2cbbf5dcb3.
@@ -2829,60 +2829,6 @@ __device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr)
}
#endif

// amd_wave_read_first_lane broadcasts one value, or a series of 4-byte chunks of an object,
// from the first active lane of the wave into SGPR registers.
__device__ inline uint32_t amd_wave_read_first_lane(uint16_t v)
{
return __builtin_amdgcn_readfirstlane(static_cast<uint32_t>(v));
}

__device__ inline uint32_t amd_wave_read_first_lane(uint8_t v)
{
return __builtin_amdgcn_readfirstlane(static_cast<uint32_t>(v));
}

__device__ inline uint32_t amd_wave_read_first_lane(uint32_t value)
{
return __builtin_amdgcn_readfirstlane(value);
}

__device__ inline int32_t amd_wave_read_first_lane(int32_t value)
{
return __builtin_amdgcn_readfirstlane(value);
}

template <typename Object, std::enable_if_t<std::is_trivially_copyable_v<Object>, int> = 0>
__device__ inline auto amd_wave_read_first_lane(const Object& obj)
{
constexpr size_t ObjectSize = sizeof(Object);
constexpr size_t SGPR_size = 4;
constexpr size_t NumFull = ObjectSize / SGPR_size;
constexpr size_t Tail = ObjectSize % SGPR_size;

const unsigned char* src = reinterpret_cast<const unsigned char*>(&obj);
alignas(Object) unsigned char dst[ObjectSize];

static_for<0, NumFull, 1>{}([&](auto Ic) {
constexpr size_t offset = Ic * SGPR_size;
uint32_t read_src;
__builtin_memcpy(&read_src, src + offset, SGPR_size);
read_src = __builtin_amdgcn_readfirstlane(read_src);
__builtin_memcpy(dst + offset, &read_src, SGPR_size);
});

if constexpr(Tail != 0)
{
constexpr size_t offset = NumFull * SGPR_size;
uint32_t tail_loc = 0;
__builtin_memcpy(&tail_loc, src + offset, Tail);
tail_loc = __builtin_amdgcn_readfirstlane(tail_loc);
__builtin_memcpy(dst + offset, &tail_loc, Tail);
}
Object out;
__builtin_memcpy(&out, dst, ObjectSize);
return out;
}

} // namespace ck_tile

#endif // !CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN

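For reference, a minimal usage sketch of the generic overload removed above (the struct and helper named here are illustrative, not from the repository): each 4-byte chunk of a trivially copyable object is routed through __builtin_amdgcn_readfirstlane, so the returned copy is wave-uniform and can feed SGPR-based address arithmetic.

struct TileOffsets
{
    int32_t row_offset;
    int32_t col_offset;
    uint16_t flags; // padded struct; the helper copies it to SGPRs in 4-byte chunks
};

__device__ void consume_offsets(const TileOffsets& per_lane_offsets)
{
    // Broadcast lane 0's copy of the struct to the whole wave.
    const TileOffsets uniform = amd_wave_read_first_lane(per_lane_offsets);
    // uniform.row_offset / uniform.col_offset are now wave-uniform values.
    (void)uniform;
}
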
@@ -2585,8 +2585,9 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000;

#if CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
T* lds_ptr = lds_base_ptr + lds_offset;
auto const lds_ptr_sgpr = amd_wave_read_first_lane((reinterpret_cast<uintptr_t>(lds_ptr)));
T* lds_ptr = lds_base_ptr + lds_offset;
auto const lds_ptr_sgpr =
__builtin_amdgcn_readfirstlane((reinterpret_cast<uintptr_t>(lds_ptr)));
asm volatile("s_mov_b32 m0, %0; \n\t"
"buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr),
"v"(global_offset_bytes),

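A hedged sketch of the constraint the inline-asm path above relies on (helper name and parameter are hypothetical): the value written into m0 via s_mov_b32 is bound with the "s" constraint, so it must live in a scalar register, which is why the LDS address is pinned with readfirstlane first.

__device__ inline void set_m0(uint32_t lds_byte_addr)
{
    // "s" operands must be wave-uniform; readfirstlane guarantees a scalar value.
    const uint32_t m0_value = __builtin_amdgcn_readfirstlane(lds_byte_addr);
    asm volatile("s_mov_b32 m0, %0" ::"s"(m0_value));
}
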
@@ -2659,60 +2660,6 @@ __device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr)
}
#endif

// amd_wave_read_first_lane broadcasts one value, or a series of 4-byte chunks of an object,
// from the first active lane of the wave into SGPR registers.
__device__ inline uint32_t amd_wave_read_first_lane(uint16_t v)
{
return __builtin_amdgcn_readfirstlane(static_cast<uint32_t>(v));
}

__device__ inline uint32_t amd_wave_read_first_lane(uint8_t v)
{
return __builtin_amdgcn_readfirstlane(static_cast<uint32_t>(v));
}

__device__ inline uint32_t amd_wave_read_first_lane(uint32_t value)
{
return __builtin_amdgcn_readfirstlane(value);
}

__device__ inline int32_t amd_wave_read_first_lane(int32_t value)
{
return __builtin_amdgcn_readfirstlane(value);
}

template <typename Object, std::enable_if_t<std::is_trivially_copyable_v<Object>, int> = 0>
__device__ inline auto amd_wave_read_first_lane(const Object& obj)
{
constexpr size_t ObjectSize = sizeof(Object);
constexpr size_t SGPR_size = 4;
constexpr size_t NumFull = ObjectSize / SGPR_size;
constexpr size_t Tail = ObjectSize % SGPR_size;

const unsigned char* src = reinterpret_cast<const unsigned char*>(&obj);
alignas(Object) unsigned char dst[ObjectSize];

static_for<0, NumFull, 1>{}([&](auto Ic) {
constexpr size_t offset = Ic * SGPR_size;
uint32_t read_src;
__builtin_memcpy(&read_src, src + offset, SGPR_size);
read_src = __builtin_amdgcn_readfirstlane(read_src);
__builtin_memcpy(dst + offset, &read_src, SGPR_size);
});

if constexpr(Tail != 0)
{
constexpr size_t offset = NumFull * SGPR_size;
uint32_t tail_loc = 0;
__builtin_memcpy(&tail_loc, src + offset, Tail);
tail_loc = __builtin_amdgcn_readfirstlane(tail_loc);
__builtin_memcpy(dst + offset, &tail_loc, Tail);
}
Object out;
__builtin_memcpy(&out, dst, ObjectSize);
return out;
}

} // namespace ck_tile

#endif // CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN

@@ -9,8 +9,6 @@
#include "ck_tile/core/config.hpp"
#include "ck_tile/core/numeric/integer.hpp"
#include "ck_tile/core/numeric/integral_constant.hpp"
#include "ck_tile/core/arch/amd_buffer_addressing_builtins.hpp"
#include "ck_tile/core/arch/amd_buffer_addressing.hpp"
#include "ck_tile/core/utility/ignore.hpp"

#define CK_TILE_S_CNT_MAX 0b1100'1111'0111'1111

@@ -106,7 +104,7 @@ CK_TILE_DEVICE index_t get_warp_id(bool_constant<ReturnSgpr> = {})
const index_t warp_id = threadIdx.x / get_warp_size();
if constexpr(ReturnSgpr)
{
return amd_wave_read_first_lane(warp_id);
return __builtin_amdgcn_readfirstlane(warp_id);
}
else
{

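A hedged usage sketch of the two paths above (callers shown here are illustrative): passing bool_constant<true> asks for the warp id broadcast into an SGPR, while bool_constant<false> returns the plain computed value.

// Warp id pinned to a scalar register, e.g. for wave-uniform address arithmetic.
const index_t sgpr_warp_id = get_warp_id(bool_constant<true>{});
// Plain warp id, as used by the m0 initialization further down in this change.
const index_t plain_warp_id = get_warp_id(bool_constant<false>{});
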
@@ -402,7 +402,7 @@ struct tile_window_with_static_distribution
const index_t m0_init_value =
size_per_buf + size_per_wave * get_warp_id(/*ReturnSgpr=*/bool_constant<false>{});
m0_set_with_memory(
amd_wave_read_first_lane(m0_init_value)); // This should be wave independent
__builtin_amdgcn_readfirstlane(m0_init_value)); // This should be wave independent

using Traits = typename Base::Traits;

@@ -92,13 +92,13 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass

static constexpr index_t Block_N = Problem::BlockShape::Block_N;
index_t num_n_tile_iteration =
amd_wave_read_first_lane(integer_divide_ceil(row_size, Block_N));
__builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));

using XTensorType = decltype(cast_tile<ComputeDataType>(load_tile(a_window)));
auto square_sum = block_reduce2d.template MakeYBlockTile<XTensorType>();
set_tile(square_sum, reduce_square_sum_func.GetIdentityValue<ComputeDataType>());

for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
{
const auto a = load_tile(a_window);
const auto b = load_tile(b_window);

@@ -149,7 +149,7 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass
if constexpr(kSaveX)
__syncthreads();

for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
{
auto x = [&]() {
if constexpr(kSaveX)

@@ -226,7 +226,7 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass
}
move_tile_window(gamma_window, {Block_N});

for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
{
auto x = [&]() {
if constexpr(kSaveX)

@@ -84,9 +84,9 @@ struct BatchedTransposeKernel
static constexpr ck_tile::index_t VectorSizeOutput = Problem::VectorSizeOutput;
static constexpr ck_tile::index_t VectorStrideOutput = 1;

const auto iM = amd_wave_read_first_lane(blockIdx.x * kMPerBlock);
const auto iN = amd_wave_read_first_lane(blockIdx.y * kNPerBlock);
const auto offset = amd_wave_read_first_lane(blockIdx.z * kargs.height * kargs.width);
const auto iM = __builtin_amdgcn_readfirstlane(blockIdx.x * kMPerBlock);
const auto iN = __builtin_amdgcn_readfirstlane(blockIdx.y * kNPerBlock);
const auto offset = __builtin_amdgcn_readfirstlane(blockIdx.z * kargs.height * kargs.width);

const auto x_m_n = [&]() {
const auto x_dram_naive = make_naive_tensor_view<address_space_enum::global>(

@@ -598,8 +598,8 @@ struct FlatmmKernel
CK_TILE_DEVICE void operator()(KernelArgs kargs) const
{
const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockIdx.x);
const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);

const SplitKBatchOffset splitk_batch_offset(kargs);
// options

@@ -707,8 +707,8 @@ struct FmhaBatchPrefillWithPagedKVCacheKernel
// divide problem
const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);

const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);

long_index_t batch_offset_q = 0;
long_index_t batch_offset_bias = 0;

@@ -690,7 +690,7 @@ struct FmhaBwdDQDKDVKernel
// divide problem
const auto [i_tile_n, i_nhead, i_batch] = GetTileIndex();

const index_t i_n0 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN0);
const index_t i_n0 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN0);

long_index_t batch_offset_q = 0;
long_index_t batch_offset_k = 0;

@@ -1338,7 +1338,7 @@ struct FmhaBwdOGradDotOKernel
// divide problem
const auto [i_tile_m, i_nhead, i_batch] = GetTileIndex();

const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * kM0);
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * kM0);

long_index_t batch_offset_o = 0;
long_index_t batch_offset_do = 0;

@@ -1618,7 +1618,7 @@ struct FmhaBwdConvertQGradKernel
// divide problem
const auto [i_tile_m, i_nhead, i_batch] = GetTileIndex();

const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * kM0);
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * kM0);

long_index_t batch_offset_dq = 0;
long_index_t batch_offset_dq_acc = 0;

@@ -262,8 +262,8 @@ struct FmhaFwdAppendKVKernel
// divide problem
const auto [i_tile, i_nhead, i_batch] = GetTileIndex(kargs);

const index_t i_m0 = amd_wave_read_first_lane(i_tile * FmhaPipeline::kM0);
const index_t i_n0 = amd_wave_read_first_lane(i_tile * FmhaPipeline::kN0);
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile * FmhaPipeline::kM0);
const index_t i_n0 = __builtin_amdgcn_readfirstlane(i_tile * FmhaPipeline::kN0);

const index_t i_cache_batch = [&, i_batch_ = i_batch] {
if constexpr(kIsPagedKV)

@@ -1060,8 +1060,8 @@ struct FmhaFwdKernel
// divide problem
const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);

const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);

long_index_t batch_offset_q = 0;
long_index_t batch_offset_k = 0;

@@ -880,8 +880,8 @@ struct FmhaFwdPagedKVKernel
// divide problem
const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);

const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);

long_index_t batch_offset_q = 0;
long_index_t batch_offset_k = 0;

@@ -281,8 +281,8 @@ struct FmhaFwdSplitKVCombineKernel
// divide problem
const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);

const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);

long_index_t batch_offset_lse_acc = 0;
long_index_t batch_offset_o_acc = 0;

@@ -589,8 +589,8 @@ struct FmhaFwdSplitKVKernel
// divide problem
const auto [i_tile_m, i_tile_n, i_split, i_nhead, i_batch] = GetTileIndex(kargs);

const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);

long_index_t batch_offset_q = 0;
long_index_t batch_offset_k = 0; // unused for paged-kvcache

@@ -361,8 +361,8 @@ struct FmhaFwdV3Kernel
// divide problem
const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);

const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);

long_index_t batch_offset_q = 0;
long_index_t batch_offset_k = 0;

@@ -320,9 +320,9 @@ struct BlockFmhaFwdPagedKVPipelineQRKSVS
k_block_tile = load_tile(k_dram_window);
}
auto physical_next_block_id_k =
amd_wave_read_first_lane(k_page_block_navigator.prefetch_table_id(
__builtin_amdgcn_readfirstlane(k_page_block_navigator.prefetch_table_id(
i_page_block_k, k_dram_block_window, {kN0, 0}));
auto physical_next_block_id_v = amd_wave_read_first_lane(
auto physical_next_block_id_v = __builtin_amdgcn_readfirstlane(
v_page_block_navigator.prefetch_table_id(i_page_block_v, v_dram_window, {0, kK1}));

if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)

@@ -321,9 +321,9 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
k_block_tile = load_tile(k_dram_window);
}
auto physical_next_block_id_k =
amd_wave_read_first_lane(k_page_block_navigator.prefetch_table_id(
__builtin_amdgcn_readfirstlane(k_page_block_navigator.prefetch_table_id(
i_page_block_k, k_dram_block_window, {kN0, 0}));
auto physical_next_block_id_v = amd_wave_read_first_lane(
auto physical_next_block_id_v = __builtin_amdgcn_readfirstlane(
v_page_block_navigator.prefetch_table_id(i_page_block_v, v_dram_window, {0, kK1}));

if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)

@@ -618,7 +618,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
&i_page_block_v_ = i_page_block_v,
&v_dram_window_ = v_dram_window](auto i_k1) {
auto physical_next_block_id_v_ =
amd_wave_read_first_lane(v_page_block_navigator.prefetch_table_id(
__builtin_amdgcn_readfirstlane(v_page_block_navigator.prefetch_table_id(
i_page_block_v_, v_dram_window_, {0, kK1}));
const auto v = load_tile(v_dram_window_); // load next v
block_sync_lds();

@@ -240,7 +240,7 @@ struct FusedMoeGemmKernel
if constexpr(UseUK)
{
__shared__ CK_TILE_LDS_ADDR char smem[GetSmemSize()];
IndexDataType num_sorted_tiles = amd_wave_read_first_lane(
IndexDataType num_sorted_tiles = __builtin_amdgcn_readfirstlane(
*reinterpret_cast<const IndexDataType*>(kargs.num_sorted_tiles_ptr));

num_sorted_tiles = num_sorted_tiles / BlockShape::Block_M0;

@@ -261,7 +261,7 @@ struct FusedMoeGemmKernel
{
// allocate LDS
// __shared__ char smem_ptr[GetSmemSize()];
IndexDataType num_sorted_tiles = amd_wave_read_first_lane(
IndexDataType num_sorted_tiles = __builtin_amdgcn_readfirstlane(
*reinterpret_cast<const IndexDataType*>(kargs.num_sorted_tiles_ptr));
constexpr index_t hidden_radio_0 = IsGateOnly ? 1 : 2;

@@ -283,14 +283,14 @@ struct FusedMoeGemmKernel
return;

const IndexDataType expert_id =
amd_wave_read_first_lane(reinterpret_cast<const IndexDataType*>(
__builtin_amdgcn_readfirstlane(reinterpret_cast<const IndexDataType*>(
kargs.sorted_expert_ids_ptr)[sorted_tile_id]);

// index along intermediate_size
// index_t hidden_idx = __builtin_amdgcn_readfirstlane(intermediate_tile_id *
// BlockShape::Block_N0);
index_t interm_idx_nr =
amd_wave_read_first_lane(intermediate_tile_id * BlockShape::Block_Nr0);
__builtin_amdgcn_readfirstlane(intermediate_tile_id * BlockShape::Block_Nr0);

const auto a_coord = Pipeline::GetACoord(); // 2d thread offset, [i_row, i_col]
const auto sorted_token_id =

@@ -756,7 +756,7 @@ struct MoeSortingKernel
void* smem) const
{
const index_t tid = static_cast<index_t>(threadIdx.x);
const index_t wid = amd_wave_read_first_lane(tid / get_warp_size());
const index_t wid = __builtin_amdgcn_readfirstlane(tid / get_warp_size());
const index_t lid = __lane_id();
constexpr index_t block_size = 256; // blockDim.x;
const index_t sub_tokens = smem_rows - 2; // sub_tokens_mdiv.divisor;

@@ -184,17 +184,17 @@ struct FusedMoeGemmPipeline_FlatmmUk
index_t nr_1 = kargs.hidden_size / BlockShape::Warp_N1;
index_t kr_1 = shared_intermediate_size_1 / BlockShape::Warp_K1;

const IndexDataType expert_id = amd_wave_read_first_lane(
const IndexDataType expert_id = __builtin_amdgcn_readfirstlane(
reinterpret_cast<const IndexDataType*>(kargs.sorted_expert_ids_ptr)[sorted_tile_id]);
index_t expert_stride_0 = shared_intermediate_size_0 * kargs.hidden_size;
index_t expert_stride_1 = shared_intermediate_size_1 * kargs.hidden_size;

// nr*kr*w
index_t interm_idx_nr0 = amd_wave_read_first_lane(
index_t interm_idx_nr0 = __builtin_amdgcn_readfirstlane(
intermediate_tile_id *
BlockShape::Block_Nr0); // intermediate_tile_id * Block_N / (N in W)

index_t interm_idx_kr1 = amd_wave_read_first_lane(
index_t interm_idx_kr1 = __builtin_amdgcn_readfirstlane(
intermediate_tile_id *
BlockShape::Block_Kr1); // intermediate_tile_id * Block_N / (N in W)

@@ -169,27 +169,27 @@ struct BatchedGemmKernel
CK_TILE_DEVICE void operator()(BatchedGemmKernelArgs kargs) const
{
const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockIdx.x);
const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);

const auto i_batch = amd_wave_read_first_lane(blockIdx.y);
const auto i_splitk = amd_wave_read_first_lane(blockIdx.z);
const auto i_batch = __builtin_amdgcn_readfirstlane(blockIdx.y);
const auto i_splitk = __builtin_amdgcn_readfirstlane(blockIdx.z);

const typename UniversalGemmKernel::SplitKBatchOffset splitk_batch_offset(kargs, i_splitk);

// options
const auto batch_stride_A = amd_wave_read_first_lane(kargs.batch_stride_A);
const auto batch_offset_A = amd_wave_read_first_lane(i_batch * batch_stride_A);
const auto batch_stride_A = __builtin_amdgcn_readfirstlane(kargs.batch_stride_A);
const auto batch_offset_A = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_A);
const ADataType* a_ptr = static_cast<const ADataType*>(kargs.as_ptr[0]) + batch_offset_A +
splitk_batch_offset.as_k_split_offset[0];

const auto batch_stride_B = amd_wave_read_first_lane(kargs.batch_stride_B);
const auto batch_offset_B = amd_wave_read_first_lane(i_batch * batch_stride_B);
const auto batch_stride_B = __builtin_amdgcn_readfirstlane(kargs.batch_stride_B);
const auto batch_offset_B = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_B);
const BDataType* b_ptr = static_cast<const BDataType*>(kargs.bs_ptr[0]) + batch_offset_B +
splitk_batch_offset.bs_k_split_offset[0];

const auto batch_stride_E = amd_wave_read_first_lane(kargs.batch_stride_E);
const auto batch_offset_C = amd_wave_read_first_lane(i_batch * batch_stride_E);
const auto batch_stride_E = __builtin_amdgcn_readfirstlane(kargs.batch_stride_E);
const auto batch_offset_C = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_E);
CDataType* c_ptr = static_cast<CDataType*>(kargs.e_ptr) + batch_offset_C;

// allocate LDS

@@ -73,8 +73,8 @@ struct GemmTile2DPartitioner
CK_TILE_DEVICE static auto
GetOutputTileIndex(index_t blockIdx, index_t blockIdy) noexcept -> const tuple<index_t, index_t>
{
const index_t iM = amd_wave_read_first_lane(blockIdx);
const index_t iN = amd_wave_read_first_lane(blockIdy);
const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx);
const index_t iN = __builtin_amdgcn_readfirstlane(blockIdy);
return make_tuple(iM, iN);
}
};

@@ -143,8 +143,8 @@ struct GemmTile1DPartitioner
{
const index_t NBlocks = integer_divide_ceil(N_, NPerBlock);

const index_t iM = amd_wave_read_first_lane(blockIdx / NBlocks);
const index_t iN = amd_wave_read_first_lane(blockIdx - iM * NBlocks);
const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx / NBlocks);
const index_t iN = __builtin_amdgcn_readfirstlane(blockIdx - iM * NBlocks);
return make_tuple(iM, iN);
}

@@ -272,8 +272,8 @@ struct GroupedGemmKernel

const auto [iM, iN] = block_idx_2d;

const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);

const typename Base::SplitKBatchOffset splitk_batch_offset(kargs, block_idx_z);

@@ -358,8 +358,8 @@ struct GroupedGemmKernel
const auto& d_block_window = gemm_tile_windows.at(Base::I2);

// Get hot-loop and tail configuration
const index_t num_loop =
amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
const index_t num_loop = __builtin_amdgcn_readfirstlane(
TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
const bool has_hot_loop = GemmPipeline::BlockHasHotloop(num_loop);
const TailNumber tail_num = GemmPipeline::GetBlockLoopTailNum(num_loop);

@@ -416,8 +416,8 @@ struct GroupedGemmKernel
const auto& d_block_window = gemm_tile_windows.at(Base::I2);

// Get hot-loop and tail configuration
const index_t num_loop =
amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
const index_t num_loop = __builtin_amdgcn_readfirstlane(
TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
const TailNumber tail_num = GemmPipeline::GetBlockLoopTailNum(num_loop);

// Run GEMM pipeline with compile-time branching

@@ -271,8 +271,8 @@ struct StreamKKernel
uint32_t block_idx = ck_tile::get_block_1d_id();

bool is_padding_block =
amd_wave_read_first_lane(block_idx >= kargs.tile_partitioner.sk_num_blocks &&
block_idx < kargs.tile_partitioner.dp_start_block_idx);
__builtin_amdgcn_readfirstlane(block_idx >= kargs.tile_partitioner.sk_num_blocks &&
block_idx < kargs.tile_partitioner.dp_start_block_idx);

// Padding blocks make it such that the DP blocks are aligned with the number of CUs; they
// should not partake in the GEMM

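An illustrative reading of the predicate above, with made-up numbers rather than values from the source:

// Suppose sk_num_blocks = 5 and dp_start_block_idx = 8:
//   block_idx 0..4 -> stream-K blocks (is_padding_block == false)
//   block_idx 5..7 -> padding blocks (is_padding_block == true); they only align
//                     the data-parallel blocks with the CU count and do no GEMM work
//   block_idx 8..  -> data-parallel blocks
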
@@ -289,7 +289,7 @@ struct StreamKKernel
{
// Determine the number of macro tiles in A and B this WG is responsible for in the
// current C macro tile.
uint32_t current_iter_length = amd_wave_read_first_lane(
uint32_t current_iter_length = __builtin_amdgcn_readfirstlane(
kargs.tile_partitioner.GetCurrentIterLength(iter_start, iter_end));

// Determine the 1D tile_idx and the iter_offset for this WG.

@@ -326,19 +326,19 @@ struct UniversalGemmKernel
__device__ SplitKBatchOffset(const KernelArgs& kargs, const std::size_t k_id = blockIdx.z)
{
constexpr auto K1 = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});
const index_t K_t = amd_wave_read_first_lane(kargs.k_batch * K1);
const index_t KRead = amd_wave_read_first_lane((kargs.K + K_t - 1) / K_t * K1);
const index_t K_t = __builtin_amdgcn_readfirstlane(kargs.k_batch * K1);
const index_t KRead = __builtin_amdgcn_readfirstlane((kargs.K + K_t - 1) / K_t * K1);

static_for<0, NumATensor, 1>{}([&](auto index) {
using AiLayout = remove_cvref_t<std::tuple_element_t<index.value, AsLayout>>;
if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, AiLayout>)
{
as_k_split_offset[index] = amd_wave_read_first_lane(k_id * KRead);
as_k_split_offset[index] = __builtin_amdgcn_readfirstlane(k_id * KRead);
}
else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, AiLayout>)
{
as_k_split_offset[index] =
amd_wave_read_first_lane(k_id * KRead * kargs.stride_As[index]);
__builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_As[index]);
}
});

@@ -347,21 +347,21 @@ struct UniversalGemmKernel
if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, BiLayout>)
{
bs_k_split_offset[index] =
amd_wave_read_first_lane(k_id * KRead * kargs.stride_Bs[index]);
__builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_Bs[index]);
}
else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, BiLayout>)
{
bs_k_split_offset[index] = amd_wave_read_first_lane(k_id * KRead);
bs_k_split_offset[index] = __builtin_amdgcn_readfirstlane(k_id * KRead);
}
});

if(k_id < static_cast<uint32_t>(kargs.k_batch - 1))
{
splitted_k = amd_wave_read_first_lane(KRead);
splitted_k = __builtin_amdgcn_readfirstlane(KRead);
}
else
{
splitted_k = amd_wave_read_first_lane(kargs.K - KRead * (kargs.k_batch - 1));
splitted_k = __builtin_amdgcn_readfirstlane(kargs.K - KRead * (kargs.k_batch - 1));
}
}

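A worked example of the split-K sizing above, using illustrative numbers (K, k_batch, and the warp-tile K1 are not taken from any particular configuration):

// K = 1000, k_batch = 4, K1 = 32
//   K_t   = k_batch * K1             = 128
//   KRead = (K + K_t - 1) / K_t * K1 = (1127 / 128) * 32 = 8 * 32 = 256
// k_id 0..2 therefore get splitted_k = KRead = 256, while the last batch gets
// the remainder: K - KRead * (k_batch - 1) = 1000 - 768 = 232.
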
@@ -970,8 +970,8 @@ struct UniversalGemmKernel
const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);

const index_t num_loop =
amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
const index_t num_loop = __builtin_amdgcn_readfirstlane(
TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));

// Run GEMM cooperatively by whole workgroup.
const auto& as_block_window = gemm_tile_windows.at(I0);

@@ -1026,8 +1026,8 @@ struct UniversalGemmKernel
const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);

const index_t num_loop =
amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
const index_t num_loop = __builtin_amdgcn_readfirstlane(
TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));

// Run GEMM cooperatively by whole workgroup.
const auto& as_block_window = gemm_tile_windows.at(I0);

@@ -1052,10 +1052,10 @@ struct UniversalGemmKernel
template <bool U = !PersistentKernel, typename = std::enable_if_t<U>>
CK_TILE_DEVICE void operator()(KernelArgs kargs) const
{
const auto blockId = amd_wave_read_first_lane(blockIdx.x);
const auto blockId = __builtin_amdgcn_readfirstlane(blockIdx.x);
const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockId);
const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);

const SplitKBatchOffset splitk_batch_offset(kargs);

@@ -1126,22 +1126,22 @@ struct UniversalGemmKernel
template <bool U = PersistentKernel, typename = std::enable_if_t<U>, typename = void>
CK_TILE_DEVICE void operator()(KernelArgs kargs) const
{
const auto grid_size = amd_wave_read_first_lane(get_grid_size());
const auto grid_size = __builtin_amdgcn_readfirstlane(get_grid_size());
const auto num_tiles =
amd_wave_read_first_lane(TilePartitioner::GridSize(kargs.M, kargs.N));
const auto num_work = amd_wave_read_first_lane(num_tiles * kargs.k_batch);
auto block_id = amd_wave_read_first_lane(get_block_id());
__builtin_amdgcn_readfirstlane(TilePartitioner::GridSize(kargs.M, kargs.N));
const auto num_work = __builtin_amdgcn_readfirstlane(num_tiles * kargs.k_batch);
auto block_id = __builtin_amdgcn_readfirstlane(get_block_id());

while(block_id < num_work)
{
// Get the tile index for this block
const auto tile_idx = amd_wave_read_first_lane(block_id % num_tiles);
const auto tile_idx = __builtin_amdgcn_readfirstlane(block_id % num_tiles);
const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(tile_idx);
const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);

// Get the SplitK offset for this block
const auto k_batch = amd_wave_read_first_lane(block_id / num_tiles);
const auto k_batch = __builtin_amdgcn_readfirstlane(block_id / num_tiles);
const SplitKBatchOffset splitk_batch_offset(kargs, k_batch);

std::array<const ADataType*, NumATensor> as_ptr;

@@ -487,7 +487,7 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
if(HasHotLoop)
{
// minus 2 because we have ping-pong double buffer.
index_t iCounter = amd_wave_read_first_lane(num_loop - 2);
index_t iCounter = __builtin_amdgcn_readfirstlane(num_loop - 2);
do
{
// ping

@@ -178,7 +178,7 @@ struct GemmPipelineAgBgCrCompV5 : public BaseGemmPipelineAgBgCrCompV5<Problem>

index_t warp_id = get_warp_id();
index_t operation_id =
amd_wave_read_first_lane(get_warp_id()); // 0 - Memory read, 1 - block-gemm
__builtin_amdgcn_readfirstlane(get_warp_id()); // 0 - Memory read, 1 - block-gemm

auto a_offset = (warp_id == 0) ? make_array(0, 0) : make_array(0, KPerBlock);
auto b_offset = (warp_id == 0) ? make_array(0, 0) : make_array(0, KPerBlock);

@@ -336,7 +336,7 @@ struct GemmPipelineAgBgCrCompV5 : public BaseGemmPipelineAgBgCrCompV5<Problem>
MemoryOpsStep(warp_id);
}

index_t num_compute_steps = amd_wave_read_first_lane(num_loop);
index_t num_compute_steps = __builtin_amdgcn_readfirstlane(num_loop);
while(num_compute_steps > 1)
{
block_sync_lds();

@@ -270,34 +270,34 @@ struct QuantGemmKernel
const std::size_t k_id = blockIdx.z)
{
constexpr auto K1 = TilePartitioner::BlockGemmShape::WarpTile::at(I2);
const index_t K_t = amd_wave_read_first_lane(kargs.k_batch * K1);
const index_t KRead = amd_wave_read_first_lane((kargs.K + K_t - 1) / K_t * K1);
const index_t K_t = __builtin_amdgcn_readfirstlane(kargs.k_batch * K1);
const index_t KRead = __builtin_amdgcn_readfirstlane((kargs.K + K_t - 1) / K_t * K1);

if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
{
a_k_split_offset = amd_wave_read_first_lane(k_id * KRead);
a_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead);
}
else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
{
a_k_split_offset = amd_wave_read_first_lane(k_id * KRead * kargs.stride_A);
a_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_A);
}

if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
{
b_k_split_offset = amd_wave_read_first_lane(k_id * KRead * kargs.stride_B);
b_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_B);
}
else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
{
b_k_split_offset = amd_wave_read_first_lane(k_id * KRead);
b_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead);
}

if(k_id < static_cast<uint32_t>(kargs.k_batch - 1))
{
splitted_k = amd_wave_read_first_lane(KRead);
splitted_k = __builtin_amdgcn_readfirstlane(KRead);
}
else
{
splitted_k = amd_wave_read_first_lane(kargs.K - KRead * (kargs.k_batch - 1));
splitted_k = __builtin_amdgcn_readfirstlane(kargs.K - KRead * (kargs.k_batch - 1));
}
}

@@ -918,8 +918,8 @@ struct QuantGemmKernel
const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);

const index_t num_loop =
amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
const index_t num_loop = __builtin_amdgcn_readfirstlane(
TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));

// Run GEMM cooperatively by whole workgroup.
const auto& a_block_window = gemm_tile_windows.at(I0);

@@ -981,10 +981,10 @@ struct QuantGemmKernel

CK_TILE_DEVICE void operator()(QuantGemmKernelArgs kargs) const
{
const auto blockId = amd_wave_read_first_lane(blockIdx.x);
const auto blockId = __builtin_amdgcn_readfirstlane(blockIdx.x);
const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockId);
const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);

const SplitKBatchOffset splitk_batch_offset(kargs);
// options

@@ -305,8 +305,8 @@ struct QuantGroupedGemmKernel
{
const auto [iM, iN] = block_idx_2d;

const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);

const typename Base::SplitKBatchOffset splitk_batch_offset(kargs, block_idx_z);

@@ -840,7 +840,7 @@ struct GroupedConvolutionBackwardDataKernel
const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);

const index_t num_loop = amd_wave_read_first_lane(TilePartitioner::GetLoopNum(
const index_t num_loop = __builtin_amdgcn_readfirstlane(TilePartitioner::GetLoopNum(
gemm_pad_views.at(I0).get_tensor_descriptor().get_length(I1)));

// Run GEMM cooperatively by whole workgroup.

@@ -891,7 +891,7 @@ struct GroupedConvolutionBackwardDataKernel
const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);

const index_t num_loop = amd_wave_read_first_lane(
const index_t num_loop = __builtin_amdgcn_readfirstlane(
TilePartitioner::GetLoopNum(gemm_tile_windows.at(I0).get_length(I1)));

// Run GEMM cooperatively by whole workgroup.

@@ -936,7 +936,7 @@ struct GroupedConvolutionBackwardDataKernel

CK_TILE_DEVICE void operator()(GroupedConvBwdDataKernelArgsSpecialized kargs) const
{
const auto blockIdX = amd_wave_read_first_lane(blockIdx.x);
const auto blockIdX = __builtin_amdgcn_readfirstlane(blockIdx.x);
const index_t group_id = FindGroupId(kargs, blockIdX);

const auto [iM, iN] = OffsettedTile1DPartitioner<TilePartitioner>::GetOffsetedTileIndex(

@@ -944,13 +944,13 @@ struct GroupedConvolutionBackwardDataKernel
kargs.c_grid_descs_m_n[group_id].get_length(I0),
kargs.c_grid_descs_m_n[group_id].get_length(I1));

const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);

const auto blockIdY = amd_wave_read_first_lane(blockIdx.y);
const auto group_offset_a = amd_wave_read_first_lane(kargs.group_stride_a * blockIdY);
const auto group_offset_b = amd_wave_read_first_lane(kargs.group_stride_b * blockIdY);
const auto group_offset_c = amd_wave_read_first_lane(kargs.group_stride_c * blockIdY);
const auto blockIdY = __builtin_amdgcn_readfirstlane(blockIdx.y);
const auto group_offset_a = __builtin_amdgcn_readfirstlane(kargs.group_stride_a * blockIdY);
const auto group_offset_b = __builtin_amdgcn_readfirstlane(kargs.group_stride_b * blockIdY);
const auto group_offset_c = __builtin_amdgcn_readfirstlane(kargs.group_stride_c * blockIdY);

// options
// conv_bwd_data = Out * Weight = In

@@ -423,20 +423,22 @@ struct GroupedConvolutionBackwardWeightKernel
__device__ SplitKBatchOffset(const GroupedConvBwdWeightKernelArgsSpecialized& kargs,
const std::size_t k_id = blockIdx.z)
{
constexpr auto K1 = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});
const index_t K_t = amd_wave_read_first_lane(kargs.k_batch * K1);
const index_t KRead = amd_wave_read_first_lane((kargs.GemmK + K_t - 1) / K_t * K1);
constexpr auto K1 = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});
const index_t K_t = __builtin_amdgcn_readfirstlane(kargs.k_batch * K1);
const index_t KRead =
__builtin_amdgcn_readfirstlane((kargs.GemmK + K_t - 1) / K_t * K1);

a_k_split_offset = amd_wave_read_first_lane(k_id * KRead);
b_k_split_offset = amd_wave_read_first_lane(k_id * KRead);
a_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead);
b_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead);

if(k_id < static_cast<uint32_t>(kargs.k_batch - 1))
{
splitted_k = amd_wave_read_first_lane(KRead);
splitted_k = __builtin_amdgcn_readfirstlane(KRead);
}
else
{
splitted_k = amd_wave_read_first_lane(kargs.GemmK - KRead * (kargs.k_batch - 1));
splitted_k =
__builtin_amdgcn_readfirstlane(kargs.GemmK - KRead * (kargs.k_batch - 1));
}
}

@@ -803,22 +805,22 @@ struct GroupedConvolutionBackwardWeightKernel

CK_TILE_DEVICE void operator()(GroupedConvBwdWeightKernelArgsSpecialized kargs) const
{
const auto blockIdX = amd_wave_read_first_lane(blockIdx.x);
const auto blockIdX = __builtin_amdgcn_readfirstlane(blockIdx.x);
const auto [iM, iN] =
TilePartitioner{kargs.GemmM, kargs.GemmN}.GetOutputTileIndex(blockIdX);
const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);

const auto blockIdZ = amd_wave_read_first_lane(blockIdx.z);
const index_t num_loop = amd_wave_read_first_lane(
const auto blockIdZ = __builtin_amdgcn_readfirstlane(blockIdx.z);
const index_t num_loop = __builtin_amdgcn_readfirstlane(
ck_tile::integer_divide_ceil(kargs.GemmK, kargs.k_batch * TilePartitioner::KPerBlock));
const index_t i_k =
amd_wave_read_first_lane(blockIdZ * num_loop * TilePartitioner::KPerBlock);
__builtin_amdgcn_readfirstlane(blockIdZ * num_loop * TilePartitioner::KPerBlock);

const auto blockIdY = amd_wave_read_first_lane(blockIdx.y);
const auto group_offset_a = amd_wave_read_first_lane(kargs.group_stride_a * blockIdY);
const auto group_offset_b = amd_wave_read_first_lane(kargs.group_stride_b * blockIdY);
const auto group_offset_c = amd_wave_read_first_lane(kargs.group_stride_c * blockIdY);
const auto blockIdY = __builtin_amdgcn_readfirstlane(blockIdx.y);
const auto group_offset_a = __builtin_amdgcn_readfirstlane(kargs.group_stride_a * blockIdY);
const auto group_offset_b = __builtin_amdgcn_readfirstlane(kargs.group_stride_b * blockIdY);
const auto group_offset_c = __builtin_amdgcn_readfirstlane(kargs.group_stride_c * blockIdY);

// options
// conv_bwd_weight = Out * In = Weight

@@ -752,7 +752,8 @@ struct GroupedConvolutionForwardKernel
const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);

const index_t num_loop = amd_wave_read_first_lane(TilePartitioner::GetLoopNum(kargs.GemmK));
const index_t num_loop =
__builtin_amdgcn_readfirstlane(TilePartitioner::GetLoopNum(kargs.GemmK));

// Run GEMM cooperatively by whole workgroup.
const auto& a_block_window = gemm_tile_windows.at(I0);

@@ -801,7 +802,8 @@ struct GroupedConvolutionForwardKernel
const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);

const index_t num_loop = amd_wave_read_first_lane(TilePartitioner::GetLoopNum(kargs.GemmK));
const index_t num_loop =
__builtin_amdgcn_readfirstlane(TilePartitioner::GetLoopNum(kargs.GemmK));

// Run GEMM cooperatively by whole workgroup.
const auto& a_block_window = gemm_tile_windows.at(I0);

@@ -820,22 +822,22 @@ struct GroupedConvolutionForwardKernel

CK_TILE_DEVICE void operator()(GroupedConvFwdKernelArgsSpecialized kargs) const
{
const auto blockIdX = amd_wave_read_first_lane(blockIdx.x);
const auto blockIdX = __builtin_amdgcn_readfirstlane(blockIdx.x);
const auto [iM, iN] =
TilePartitioner{kargs.GemmM, kargs.GemmN}.GetOutputTileIndex(blockIdX);
const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);

const auto blockIdY = amd_wave_read_first_lane(blockIdx.y);
const auto group_offset_a = amd_wave_read_first_lane(kargs.group_stride_a * blockIdY);
const auto group_offset_b = amd_wave_read_first_lane(kargs.group_stride_b * blockIdY);
const auto group_offset_c = amd_wave_read_first_lane(kargs.group_stride_c * blockIdY);
const auto blockIdY = __builtin_amdgcn_readfirstlane(blockIdx.y);
const auto group_offset_a = __builtin_amdgcn_readfirstlane(kargs.group_stride_a * blockIdY);
const auto group_offset_b = __builtin_amdgcn_readfirstlane(kargs.group_stride_b * blockIdY);
const auto group_offset_c = __builtin_amdgcn_readfirstlane(kargs.group_stride_c * blockIdY);

// Split-N handling: Get which split this workgroup handles
const auto blockIdZ = amd_wave_read_first_lane(blockIdx.z);
const auto blockIdZ = __builtin_amdgcn_readfirstlane(blockIdx.z);

// Calculate batch offset for this split
const index_t batch_offset = amd_wave_read_first_lane(blockIdZ * kargs.n_per_split);
const index_t batch_offset = __builtin_amdgcn_readfirstlane(blockIdZ * kargs.n_per_split);

// Calculate memory offsets for this split
const long_index_t input_batch_offset = static_cast<long_index_t>(batch_offset) *

@@ -175,9 +175,9 @@ struct ImageToColumn
{
const auto [M, K] = CalculateMKDims(kargs);

const index_t iM = amd_wave_read_first_lane(blockIdx.x * kMPerBlock);
const index_t iK = amd_wave_read_first_lane(blockIdx.y * kKPerBlock);
const index_t iBatch = amd_wave_read_first_lane(blockIdx.z);
const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx.x * kMPerBlock);
const index_t iK = __builtin_amdgcn_readfirstlane(blockIdx.y * kKPerBlock);
const index_t iBatch = __builtin_amdgcn_readfirstlane(blockIdx.z);

const auto in_offset = iBatch * kargs.image_g_n_c_wis_strides[I0];
const auto out_offset = iBatch * kargs.gemm_g_m_k_strides[I0];

@@ -99,7 +99,7 @@ struct Layernorm2dFwdPipelineTwoPass
// Problem::BlockShape
static constexpr index_t Block_N = Problem::BlockShape::Block_N;
index_t num_n_tile_iteration =
amd_wave_read_first_lane(integer_divide_ceil(row_size, Block_N));
__builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));

// total number of count assume current iter have no pad(only last iter has pad)
constexpr index_t count_per_iter =

@@ -119,7 +119,7 @@ struct Layernorm2dFwdPipelineTwoPass
auto mean = block_norm_reduce.template MakeMeanVarBlockTile<XTensorType>();
auto var = block_norm_reduce.template MakeMeanVarBlockTile<XTensorType>();

for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
{
auto x = load_tile(x_window);
auto x_resi = load_tile(x_residual_window);

@@ -197,7 +197,7 @@ struct Layernorm2dFwdPipelineTwoPass
move_tile_window(y_window, {0, stride_to_right_most_window});

// layernorm computation
for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
{
auto acc = make_static_distributed_tensor<ComputeDataType>(
decltype(load_tile(x_window))::get_tile_distribution());

@@ -156,7 +156,7 @@ struct Reduce
const auto merged_reduce_len =
transformed_x_tensor.get_tensor_descriptor().get_lengths().at(number<1>{});
index_t num_n_tile_iteration =
amd_wave_read_first_lane(integer_divide_ceil(merged_reduce_len, S::Block_N));
__builtin_amdgcn_readfirstlane(integer_divide_ceil(merged_reduce_len, S::Block_N));

auto block_reduce2d = Policy::template GetBlockReduce2d<Problem>();
auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();

@@ -167,7 +167,7 @@ struct Reduce
auto y_compute = block_reduce2d.template MakeYBlockTile<XTensorType>();
set_tile(y_compute, reduce_func.template GetIdentityValue<ComputeDataType>());

for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
{
const auto x = load_tile(x_window);
block_reduce2d(x, y_compute, reduce_func);

@@ -82,7 +82,7 @@ struct Rmsnorm2dFwdPipelineTwoPass
// Problem::BlockShape
static constexpr index_t Block_N = Problem::BlockShape::Block_N;
index_t num_n_tile_iteration =
amd_wave_read_first_lane(integer_divide_ceil(row_size, Block_N));
__builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));

auto reduce_square_sum_func = ReduceOp::SquareAdd{};
auto reduce_sum_func = ReduceOp::Add{};

@@ -95,7 +95,7 @@ struct Rmsnorm2dFwdPipelineTwoPass
auto square_sum = block_reduce2d.template MakeYBlockTile<ComputeTensorType>();
set_tile(square_sum, reduce_square_sum_func.GetIdentityValue<ComputeDataType>());

for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
{
auto x = load_tile(x_window);
auto x_resi = load_tile(x_residual_window);

@@ -151,7 +151,7 @@ struct Rmsnorm2dFwdPipelineTwoPass
move_tile_window(y_window, {0, stride_to_right_most_window});

// rmsnorm computation
for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
{
auto acc = make_static_distributed_tensor<ComputeDataType>(
decltype(load_tile(x_window))::get_tile_distribution());

@@ -138,7 +138,7 @@ struct MoeSmoothquant
const index_t i_topk = blockIdx.x;
const index_t i_token = blockIdx.y * Block_M;
const index_t i_token_in_thrd =
amd_wave_read_first_lane(threadIdx.x / Problem::BlockShape::ThreadPerBlock_N);
__builtin_amdgcn_readfirstlane(threadIdx.x / Problem::BlockShape::ThreadPerBlock_N);

const index_t i_expert = reinterpret_cast<const index_t*>(
kargs.p_topk_ids)[(i_token + i_token_in_thrd) * kargs.topk + i_topk];

@@ -57,7 +57,7 @@ struct SmoothquantPipelineTwoPass

static constexpr index_t Block_N = Problem::BlockShape::Block_N;
index_t num_n_tile_iteration =
amd_wave_read_first_lane(integer_divide_ceil(row_size, Block_N));
__builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));

auto reduce_absmax_func = ReduceOp::AbsMax{};
auto reduce_absmax3_func = [](auto acc_, auto v_0_, auto v_1_) {

@@ -77,7 +77,7 @@ struct SmoothquantPipelineTwoPass
auto absmax = block_reduce2d.template MakeYBlockTile<XTensorType>();
set_tile(absmax, reduce_absmax_func.GetIdentityValue<ComputeDataType>());

for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
{
const auto x = load_tile(x_window);
const auto smscale = load_tile(smscale_window);

@@ -121,7 +121,7 @@ struct SmoothquantPipelineTwoPass
move_tile_window(qy_window, {0, stride_to_right_most_window});

// recompute y and quantize y to qy
for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
{
const auto x = load_tile(x_window);
const auto smscale = load_tile(smscale_window);

@@ -96,9 +96,9 @@ struct TopkSoftmaxKernel
if(block_row_id > kargs.num_rows)
return;

index_t block_os_inp = amd_wave_read_first_lane(block_row_id * kargs.stride_input);
index_t block_os_out = amd_wave_read_first_lane(block_row_id * kargs.stride_output);
index_t num_rows_rem = amd_wave_read_first_lane(kargs.num_rows - block_row_id);
index_t block_os_inp = __builtin_amdgcn_readfirstlane(block_row_id * kargs.stride_input);
index_t block_os_out = __builtin_amdgcn_readfirstlane(block_row_id * kargs.stride_output);
index_t num_rows_rem = __builtin_amdgcn_readfirstlane(kargs.num_rows - block_row_id);

const auto input_window = [&]() {
const InputType* p_input =
