diff --git a/CHANGELOG.md b/CHANGELOG.md
index fe1e7ef345..f21795012d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,7 +6,6 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
 ### Added
 * Added support for B Tensor type pk_int4_t in the CK TILE weight preshuffle GEMM.
-* Added the new api to load different memory sizes to SGPR.
 * Added support for B Tensor Preshuffle in CK TILE Grouped GEMM.
 * Added a basic copy kernel example and supporting documentation for new CK Tile developers.
 * Added support for bf16, f32, and f16 for 2D and 3D NGCHW grouped convolution backward data
diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
index d298311409..7a9c017eb2 100644
--- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
@@ -2829,60 +2829,6 @@ __device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr)
 }
 #endif
 
-// amd_wave_read_first_lane is the SGPR function from AMD GPU device to load 1 or a series of the
-// memory to the SGPR registers.
-__device__ inline uint32_t amd_wave_read_first_lane(uint16_t v)
-{
-    return __builtin_amdgcn_readfirstlane(static_cast<uint32_t>(v));
-}
-
-__device__ inline uint32_t amd_wave_read_first_lane(uint8_t v)
-{
-    return __builtin_amdgcn_readfirstlane(static_cast<uint32_t>(v));
-}
-
-__device__ inline uint32_t amd_wave_read_first_lane(uint32_t value)
-{
-    return __builtin_amdgcn_readfirstlane(value);
-}
-
-__device__ inline int32_t amd_wave_read_first_lane(int32_t value)
-{
-    return __builtin_amdgcn_readfirstlane(value);
-}
-
-template <typename Object, std::enable_if_t<std::is_class_v<Object>, int> = 0>
-__device__ inline auto amd_wave_read_first_lane(const Object& obj)
-{
-    constexpr size_t ObjectSize = sizeof(Object);
-    constexpr size_t SGPR_size  = 4;
-    constexpr size_t NumFull    = ObjectSize / SGPR_size;
-    constexpr size_t Tail       = ObjectSize % SGPR_size;
-
-    const unsigned char* src = reinterpret_cast<const unsigned char*>(&obj);
-    alignas(Object) unsigned char dst[ObjectSize];
-
-    static_for<0, NumFull, 1>{}([&](auto Ic) {
-        constexpr size_t offset = Ic * SGPR_size;
-        uint32_t read_src;
-        __builtin_memcpy(&read_src, src + offset, SGPR_size);
-        read_src = __builtin_amdgcn_readfirstlane(read_src);
-        __builtin_memcpy(dst + offset, &read_src, SGPR_size);
-    });
-
-    if constexpr(Tail != 0)
-    {
-        constexpr size_t offset = NumFull * SGPR_size;
-        uint32_t tail_loc       = 0;
-        __builtin_memcpy(&tail_loc, src + offset, Tail);
-        tail_loc = __builtin_amdgcn_readfirstlane(tail_loc);
-        __builtin_memcpy(dst + offset, &tail_loc, Tail);
-    }
-    Object out;
-    __builtin_memcpy(&out, dst, ObjectSize);
-    return out;
-}
-
 } // namespace ck_tile
 
 #endif // !CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN
diff --git a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
index d3e64eddac..5c7ffefc6a 100644
--- a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
@@ -2585,8 +2585,9 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
     const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000;
 #if CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
-    T* lds_ptr              = lds_base_ptr + lds_offset;
-    auto const lds_ptr_sgpr = amd_wave_read_first_lane((reinterpret_cast<uintptr_t>(lds_ptr)));
+    T* lds_ptr = lds_base_ptr + lds_offset;
+    auto const lds_ptr_sgpr =
+        __builtin_amdgcn_readfirstlane((reinterpret_cast<uintptr_t>(lds_ptr)));
     asm volatile("s_mov_b32 m0, %0; \n\t"
                  "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr),
                  "v"(global_offset_bytes),
@@ -2659,60 +2660,6 @@ __device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr)
 }
 #endif
 
-// amd_wave_read_first_lane is the SGPR function from AMD GPU device to load 1 or a series of the
-// memory to the SGPR registers.
-__device__ inline uint32_t amd_wave_read_first_lane(uint16_t v)
-{
-    return __builtin_amdgcn_readfirstlane(static_cast<uint32_t>(v));
-}
-
-__device__ inline uint32_t amd_wave_read_first_lane(uint8_t v)
-{
-    return __builtin_amdgcn_readfirstlane(static_cast<uint32_t>(v));
-}
-
-__device__ inline uint32_t amd_wave_read_first_lane(uint32_t value)
-{
-    return __builtin_amdgcn_readfirstlane(value);
-}
-
-__device__ inline int32_t amd_wave_read_first_lane(int32_t value)
-{
-    return __builtin_amdgcn_readfirstlane(value);
-}
-
-template <typename Object, std::enable_if_t<std::is_class_v<Object>, int> = 0>
-__device__ inline auto amd_wave_read_first_lane(const Object& obj)
-{
-    constexpr size_t ObjectSize = sizeof(Object);
-    constexpr size_t SGPR_size  = 4;
-    constexpr size_t NumFull    = ObjectSize / SGPR_size;
-    constexpr size_t Tail       = ObjectSize % SGPR_size;
-
-    const unsigned char* src = reinterpret_cast<const unsigned char*>(&obj);
-    alignas(Object) unsigned char dst[ObjectSize];
-
-    static_for<0, NumFull, 1>{}([&](auto Ic) {
-        constexpr size_t offset = Ic * SGPR_size;
-        uint32_t read_src;
-        __builtin_memcpy(&read_src, src + offset, SGPR_size);
-        read_src = __builtin_amdgcn_readfirstlane(read_src);
-        __builtin_memcpy(dst + offset, &read_src, SGPR_size);
-    });
-
-    if constexpr(Tail != 0)
-    {
-        constexpr size_t offset = NumFull * SGPR_size;
-        uint32_t tail_loc       = 0;
-        __builtin_memcpy(&tail_loc, src + offset, Tail);
-        tail_loc = __builtin_amdgcn_readfirstlane(tail_loc);
-        __builtin_memcpy(dst + offset, &tail_loc, Tail);
-    }
-    Object out;
-    __builtin_memcpy(&out, dst, ObjectSize);
-    return out;
-}
-
 } // namespace ck_tile
 
 #endif // CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN
diff --git a/include/ck_tile/core/arch/arch.hpp b/include/ck_tile/core/arch/arch.hpp
index 28ded5439a..42f2390cde 100644
--- a/include/ck_tile/core/arch/arch.hpp
+++ b/include/ck_tile/core/arch/arch.hpp
@@ -9,8 +9,6 @@
 #include "ck_tile/core/config.hpp"
 #include "ck_tile/core/numeric/integer.hpp"
 #include "ck_tile/core/numeric/integral_constant.hpp"
-#include "ck_tile/core/arch/amd_buffer_addressing_builtins.hpp"
-#include "ck_tile/core/arch/amd_buffer_addressing.hpp"
 #include "ck_tile/core/utility/ignore.hpp"
 
 #define CK_TILE_S_CNT_MAX 0b1100'1111'0111'1111
@@ -106,7 +104,7 @@ CK_TILE_DEVICE index_t get_warp_id(bool_constant<ReturnSgpr> = {})
     const index_t warp_id = threadIdx.x / get_warp_size();
     if constexpr(ReturnSgpr)
     {
-        return amd_wave_read_first_lane(warp_id);
+        return __builtin_amdgcn_readfirstlane(warp_id);
     }
     else
     {
diff --git a/include/ck_tile/core/tensor/tile_window.hpp b/include/ck_tile/core/tensor/tile_window.hpp
index 2db5d719c0..b45106487e 100644
--- a/include/ck_tile/core/tensor/tile_window.hpp
+++ b/include/ck_tile/core/tensor/tile_window.hpp
@@ -402,7 +402,7 @@ struct tile_window_with_static_distribution
         const index_t m0_init_value =
             size_per_buf + size_per_wave * get_warp_id(/*ReturnSgpr=*/bool_constant<true>{});
         m0_set_with_memory(
-            amd_wave_read_first_lane(m0_init_value)); // This should be wave independent
+            __builtin_amdgcn_readfirstlane(m0_init_value)); // This should be wave independent
 
         using Traits = typename Base::Traits;
diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp
index 052ee4ae62..ecd4e81b22 100644
--- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp
@@ -92,13 +92,13 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass
         static constexpr index_t Block_N = Problem::BlockShape::Block_N;
         index_t num_n_tile_iteration =
-            amd_wave_read_first_lane(integer_divide_ceil(row_size, Block_N));
+            __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));
 
         using XTensorType = decltype(cast_tile<ComputeDataType>(load_tile(a_window)));
         auto square_sum   = block_reduce2d.template MakeYBlockTile<XTensorType>();
         set_tile(square_sum, reduce_square_sum_func.GetIdentityValue<ComputeDataType>());
 
-        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
         {
             const auto a = load_tile(a_window);
             const auto b = load_tile(b_window);
@@ -149,7 +149,7 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass
         if constexpr(kSaveX)
             __syncthreads();
 
-        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
         {
             auto x = [&]() {
                 if constexpr(kSaveX)
@@ -226,7 +226,7 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass
         }
         move_tile_window(gamma_window, {Block_N});
 
-        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
         {
             auto x = [&]() {
                 if constexpr(kSaveX)
diff --git a/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp b/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp
index c99571562d..b0f48f6c5b 100644
--- a/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp
+++ b/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp
@@ -84,9 +84,9 @@ struct BatchedTransposeKernel
         static constexpr ck_tile::index_t VectorSizeOutput   = Problem::VectorSizeOutput;
         static constexpr ck_tile::index_t VectorStrideOutput = 1;
 
-        const auto iM     = amd_wave_read_first_lane(blockIdx.x * kMPerBlock);
-        const auto iN     = amd_wave_read_first_lane(blockIdx.y * kNPerBlock);
-        const auto offset = amd_wave_read_first_lane(blockIdx.z * kargs.height * kargs.width);
+        const auto iM     = __builtin_amdgcn_readfirstlane(blockIdx.x * kMPerBlock);
+        const auto iN     = __builtin_amdgcn_readfirstlane(blockIdx.y * kNPerBlock);
+        const auto offset = __builtin_amdgcn_readfirstlane(blockIdx.z * kargs.height * kargs.width);
 
         const auto x_m_n = [&]() {
             const auto x_dram_naive = make_naive_tensor_view<address_space_enum::global>(
diff --git a/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp b/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp
index ab0b310510..a924279d52 100644
--- a/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp
+++ b/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp
@@ -598,8 +598,8 @@ struct FlatmmKernel
     CK_TILE_DEVICE void operator()(KernelArgs kargs) const
    {
         const auto [iM, iN] = TilePartitioner{kargs.M,
kargs.N}.GetOutputTileIndex(blockIdx.x); - const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock); - const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock); + const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); + const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); const SplitKBatchOffset splitk_batch_offset(kargs); // options diff --git a/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp index 56865498c0..fcd512056d 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp @@ -707,8 +707,8 @@ struct FmhaBatchPrefillWithPagedKVCacheKernel // divide problem const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs); - const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0); - const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1); + const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0); + const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1); long_index_t batch_offset_q = 0; long_index_t batch_offset_bias = 0; diff --git a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp index 327b41b071..b234d6944e 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp @@ -690,7 +690,7 @@ struct FmhaBwdDQDKDVKernel // divide problem const auto [i_tile_n, i_nhead, i_batch] = GetTileIndex(); - const index_t i_n0 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN0); + const index_t i_n0 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN0); long_index_t batch_offset_q = 0; long_index_t batch_offset_k = 0; @@ -1338,7 +1338,7 @@ struct FmhaBwdOGradDotOKernel // divide problem const auto [i_tile_m, i_nhead, i_batch] = GetTileIndex(); - const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * kM0); + const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * kM0); long_index_t batch_offset_o = 0; long_index_t batch_offset_do = 0; @@ -1618,7 +1618,7 @@ struct FmhaBwdConvertQGradKernel // divide problem const auto [i_tile_m, i_nhead, i_batch] = GetTileIndex(); - const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * kM0); + const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * kM0); long_index_t batch_offset_dq = 0; long_index_t batch_offset_dq_acc = 0; diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp index a82d121d62..66f51459af 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp @@ -262,8 +262,8 @@ struct FmhaFwdAppendKVKernel // divide problem const auto [i_tile, i_nhead, i_batch] = GetTileIndex(kargs); - const index_t i_m0 = amd_wave_read_first_lane(i_tile * FmhaPipeline::kM0); - const index_t i_n0 = amd_wave_read_first_lane(i_tile * FmhaPipeline::kN0); + const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile * FmhaPipeline::kM0); + const index_t i_n0 = __builtin_amdgcn_readfirstlane(i_tile * FmhaPipeline::kN0); const index_t i_cache_batch = [&, i_batch_ = i_batch] { if constexpr(kIsPagedKV) diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp 
index 98ff31a96f..e562f6dd5a 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp @@ -1060,8 +1060,8 @@ struct FmhaFwdKernel // divide problem const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs); - const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0); - const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1); + const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0); + const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1); long_index_t batch_offset_q = 0; long_index_t batch_offset_k = 0; diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_pagedkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_pagedkv_kernel.hpp index 62ac70db92..58ef6ba87e 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_pagedkv_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_pagedkv_kernel.hpp @@ -880,8 +880,8 @@ struct FmhaFwdPagedKVKernel // divide problem const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs); - const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0); - const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1); + const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0); + const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1); long_index_t batch_offset_q = 0; long_index_t batch_offset_k = 0; diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp index a6fc0f1471..cf819c4b8d 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp @@ -281,8 +281,8 @@ struct FmhaFwdSplitKVCombineKernel // divide problem const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs); - const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0); - const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1); + const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0); + const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1); long_index_t batch_offset_lse_acc = 0; long_index_t batch_offset_o_acc = 0; diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp index 80de65ead4..9293c97a31 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp @@ -589,8 +589,8 @@ struct FmhaFwdSplitKVKernel // divide problem const auto [i_tile_m, i_tile_n, i_split, i_nhead, i_batch] = GetTileIndex(kargs); - const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0); - const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1); + const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0); + const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1); long_index_t batch_offset_q = 0; long_index_t batch_offset_k = 0; // unused for paged-kvcache diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_v3_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_v3_kernel.hpp index abf9bf0aec..c5e5745817 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_v3_kernel.hpp +++ 
b/include/ck_tile/ops/fmha/kernel/fmha_fwd_v3_kernel.hpp @@ -361,8 +361,8 @@ struct FmhaFwdV3Kernel // divide problem const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs); - const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0); - const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1); + const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0); + const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1); long_index_t batch_offset_q = 0; long_index_t batch_offset_k = 0; diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_pagedkv_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_pagedkv_pipeline_qr_ks_vs.hpp index b01c127a21..9d267e1cee 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_pagedkv_pipeline_qr_ks_vs.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_pagedkv_pipeline_qr_ks_vs.hpp @@ -320,9 +320,9 @@ struct BlockFmhaFwdPagedKVPipelineQRKSVS k_block_tile = load_tile(k_dram_window); } auto physical_next_block_id_k = - amd_wave_read_first_lane(k_page_block_navigator.prefetch_table_id( + __builtin_amdgcn_readfirstlane(k_page_block_navigator.prefetch_table_id( i_page_block_k, k_dram_block_window, {kN0, 0})); - auto physical_next_block_id_v = amd_wave_read_first_lane( + auto physical_next_block_id_v = __builtin_amdgcn_readfirstlane( v_page_block_navigator.prefetch_table_id(i_page_block_v, v_dram_window, {0, kK1})); if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp index fe5e0bc345..9de640b7cf 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp @@ -321,9 +321,9 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS k_block_tile = load_tile(k_dram_window); } auto physical_next_block_id_k = - amd_wave_read_first_lane(k_page_block_navigator.prefetch_table_id( + __builtin_amdgcn_readfirstlane(k_page_block_navigator.prefetch_table_id( i_page_block_k, k_dram_block_window, {kN0, 0})); - auto physical_next_block_id_v = amd_wave_read_first_lane( + auto physical_next_block_id_v = __builtin_amdgcn_readfirstlane( v_page_block_navigator.prefetch_table_id(i_page_block_v, v_dram_window, {0, kK1})); if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) @@ -618,7 +618,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS &i_page_block_v_ = i_page_block_v, &v_dram_window_ = v_dram_window](auto i_k1) { auto physical_next_block_id_v_ = - amd_wave_read_first_lane(v_page_block_navigator.prefetch_table_id( + __builtin_amdgcn_readfirstlane(v_page_block_navigator.prefetch_table_id( i_page_block_v_, v_dram_window_, {0, kK1})); const auto v = load_tile(v_dram_window_); // load next v block_sync_lds(); diff --git a/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp index c69c15a2b0..6d95decaee 100644 --- a/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp +++ b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp @@ -240,7 +240,7 @@ struct FusedMoeGemmKernel if constexpr(UseUK) { __shared__ CK_TILE_LDS_ADDR char smem[GetSmemSize()]; - IndexDataType num_sorted_tiles = amd_wave_read_first_lane( + IndexDataType num_sorted_tiles = __builtin_amdgcn_readfirstlane( 
*reinterpret_cast<const IndexDataType*>(kargs.num_sorted_tiles_ptr));
 
             num_sorted_tiles = num_sorted_tiles / BlockShape::Block_M0;
@@ -261,7 +261,7 @@ struct FusedMoeGemmKernel
     {
         // allocate LDS
         // __shared__ char smem_ptr[GetSmemSize()];
-        IndexDataType num_sorted_tiles = amd_wave_read_first_lane(
+        IndexDataType num_sorted_tiles = __builtin_amdgcn_readfirstlane(
             *reinterpret_cast<const IndexDataType*>(kargs.num_sorted_tiles_ptr));
 
         constexpr index_t hidden_radio_0 = IsGateOnly ? 1 : 2;
@@ -283,14 +283,14 @@ struct FusedMoeGemmKernel
             return;
 
         const IndexDataType expert_id =
-            amd_wave_read_first_lane(reinterpret_cast<const IndexDataType*>(
+            __builtin_amdgcn_readfirstlane(reinterpret_cast<const IndexDataType*>(
                 kargs.sorted_expert_ids_ptr)[sorted_tile_id]);
         // index along intermediate_size
         // index_t hidden_idx = __builtin_amdgcn_readfirstlane(intermediate_tile_id *
         // BlockShape::Block_N0);
         index_t interm_idx_nr =
-            amd_wave_read_first_lane(intermediate_tile_id * BlockShape::Block_Nr0);
+            __builtin_amdgcn_readfirstlane(intermediate_tile_id * BlockShape::Block_Nr0);
 
         const auto a_coord = Pipeline::GetACoord(); // 2d thread offset, [i_row, i_col]
         const auto sorted_token_id =
diff --git a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
index 28416ec538..faeb5cf6b3 100644
--- a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
+++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
@@ -756,7 +756,7 @@ struct MoeSortingKernel
                                         void* smem) const
     {
         const index_t tid = static_cast<index_t>(threadIdx.x);
-        const index_t wid = amd_wave_read_first_lane(tid / get_warp_size());
+        const index_t wid = __builtin_amdgcn_readfirstlane(tid / get_warp_size());
         const index_t lid = __lane_id();
         constexpr index_t block_size = 256;           // blockDim.x;
         const index_t sub_tokens     = smem_rows - 2; // sub_tokens_mdiv.divisor;
diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp
index d19f0894b9..38410721ae 100644
--- a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp
+++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp
@@ -184,17 +184,17 @@ struct FusedMoeGemmPipeline_FlatmmUk
         index_t nr_1 = kargs.hidden_size / BlockShape::Warp_N1;
         index_t kr_1 = shared_intermediate_size_1 / BlockShape::Warp_K1;
 
-        const IndexDataType expert_id = amd_wave_read_first_lane(
+        const IndexDataType expert_id = __builtin_amdgcn_readfirstlane(
             reinterpret_cast<const IndexDataType*>(kargs.sorted_expert_ids_ptr)[sorted_tile_id]);
         index_t expert_stride_0 = shared_intermediate_size_0 * kargs.hidden_size;
         index_t expert_stride_1 = shared_intermediate_size_1 * kargs.hidden_size;
 
         // nr*kr*w
-        index_t interm_idx_nr0 = amd_wave_read_first_lane(
+        index_t interm_idx_nr0 = __builtin_amdgcn_readfirstlane(
             intermediate_tile_id *
             BlockShape::Block_Nr0); // intermediate_tile_id * Block_N / (N in W)
-        index_t interm_idx_kr1 = amd_wave_read_first_lane(
+        index_t interm_idx_kr1 = __builtin_amdgcn_readfirstlane(
             intermediate_tile_id *
             BlockShape::Block_Kr1); // intermediate_tile_id * Block_N / (N in W)
 
diff --git a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
index 6f9d53467f..588d903b25 100644
--- a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp
@@ -169,27 +169,27 @@ struct BatchedGemmKernel
     CK_TILE_DEVICE void operator()(BatchedGemmKernelArgs kargs) const
     {
         const auto [iM, iN] =
TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockIdx.x);
-        const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
-        const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
+        const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
+        const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
 
-        const auto i_batch  = amd_wave_read_first_lane(blockIdx.y);
-        const auto i_splitk = amd_wave_read_first_lane(blockIdx.z);
+        const auto i_batch  = __builtin_amdgcn_readfirstlane(blockIdx.y);
+        const auto i_splitk = __builtin_amdgcn_readfirstlane(blockIdx.z);
 
         const typename UniversalGemmKernel::SplitKBatchOffset splitk_batch_offset(kargs, i_splitk);
 
         // options
-        const auto batch_stride_A = amd_wave_read_first_lane(kargs.batch_stride_A);
-        const auto batch_offset_A = amd_wave_read_first_lane(i_batch * batch_stride_A);
+        const auto batch_stride_A = __builtin_amdgcn_readfirstlane(kargs.batch_stride_A);
+        const auto batch_offset_A = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_A);
         const ADataType* a_ptr = static_cast<const ADataType*>(kargs.as_ptr[0]) + batch_offset_A +
                                  splitk_batch_offset.as_k_split_offset[0];
 
-        const auto batch_stride_B = amd_wave_read_first_lane(kargs.batch_stride_B);
-        const auto batch_offset_B = amd_wave_read_first_lane(i_batch * batch_stride_B);
+        const auto batch_stride_B = __builtin_amdgcn_readfirstlane(kargs.batch_stride_B);
+        const auto batch_offset_B = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_B);
         const BDataType* b_ptr = static_cast<const BDataType*>(kargs.bs_ptr[0]) + batch_offset_B +
                                  splitk_batch_offset.bs_k_split_offset[0];
 
-        const auto batch_stride_E = amd_wave_read_first_lane(kargs.batch_stride_E);
-        const auto batch_offset_C = amd_wave_read_first_lane(i_batch * batch_stride_E);
+        const auto batch_stride_E = __builtin_amdgcn_readfirstlane(kargs.batch_stride_E);
+        const auto batch_offset_C = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_E);
         CDataType* c_ptr = static_cast<CDataType*>(kargs.e_ptr) + batch_offset_C;
 
         // allocate LDS
diff --git a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
index 673f5abc34..a891d4df55 100644
--- a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
+++ b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
@@ -73,8 +73,8 @@ struct GemmTile2DPartitioner
     CK_TILE_DEVICE static auto GetOutputTileIndex(index_t blockIdx, index_t blockIdy) noexcept
         -> const tuple<index_t, index_t>
     {
-        const index_t iM = amd_wave_read_first_lane(blockIdx);
-        const index_t iN = amd_wave_read_first_lane(blockIdy);
+        const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx);
+        const index_t iN = __builtin_amdgcn_readfirstlane(blockIdy);
         return make_tuple(iM, iN);
     }
 };
@@ -143,8 +143,8 @@ struct GemmTile1DPartitioner
     {
         const index_t NBlocks = integer_divide_ceil(N_, NPerBlock);
 
-        const index_t iM = amd_wave_read_first_lane(blockIdx / NBlocks);
-        const index_t iN = amd_wave_read_first_lane(blockIdx - iM * NBlocks);
+        const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx / NBlocks);
+        const index_t iN = __builtin_amdgcn_readfirstlane(blockIdx - iM * NBlocks);
         return make_tuple(iM, iN);
     }
 
diff --git a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
index cf9ba31943..df1d6c9e4f 100644
--- a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
@@ -272,8 +272,8 @@
const auto [iM, iN] = block_idx_2d; - const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock); - const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock); + const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); + const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); const typename Base::SplitKBatchOffset splitk_batch_offset(kargs, block_idx_z); @@ -358,8 +358,8 @@ struct GroupedGemmKernel const auto& d_block_window = gemm_tile_windows.at(Base::I2); // Get hot-loop and tail configuration - const index_t num_loop = - amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); + const index_t num_loop = __builtin_amdgcn_readfirstlane( + TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); const bool has_hot_loop = GemmPipeline::BlockHasHotloop(num_loop); const TailNumber tail_num = GemmPipeline::GetBlockLoopTailNum(num_loop); @@ -416,8 +416,8 @@ struct GroupedGemmKernel const auto& d_block_window = gemm_tile_windows.at(Base::I2); // Get hot-loop and tail configuration - const index_t num_loop = - amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); + const index_t num_loop = __builtin_amdgcn_readfirstlane( + TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); const TailNumber tail_num = GemmPipeline::GetBlockLoopTailNum(num_loop); // Run GEMM pipeline with compile-time branching diff --git a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp index ad85b5392d..5df1f092d7 100644 --- a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp @@ -271,8 +271,8 @@ struct StreamKKernel uint32_t block_idx = ck_tile::get_block_1d_id(); bool is_padding_block = - amd_wave_read_first_lane(block_idx >= kargs.tile_partitioner.sk_num_blocks && - block_idx < kargs.tile_partitioner.dp_start_block_idx); + __builtin_amdgcn_readfirstlane(block_idx >= kargs.tile_partitioner.sk_num_blocks && + block_idx < kargs.tile_partitioner.dp_start_block_idx); // Padding blocks make it such that the DP blocks are aligned with the number of CUs; they // should not partake in the GEMM @@ -289,7 +289,7 @@ struct StreamKKernel { // Determine the number of macro tiles in A and B this WG is resposible for in the // current C macro tile. - uint32_t current_iter_length = amd_wave_read_first_lane( + uint32_t current_iter_length = __builtin_amdgcn_readfirstlane( kargs.tile_partitioner.GetCurrentIterLength(iter_start, iter_end)); // Determine the 1D tile_idx and the iter_offset for this WG. 
diff --git a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp
index 51ad4e3dd1..8f44108cc4 100644
--- a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp
@@ -326,19 +326,19 @@ struct UniversalGemmKernel
        __device__ SplitKBatchOffset(const KernelArgs& kargs, const std::size_t k_id = blockIdx.z)
        {
            constexpr auto K1 = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});
-            const index_t K_t   = amd_wave_read_first_lane(kargs.k_batch * K1);
-            const index_t KRead = amd_wave_read_first_lane((kargs.K + K_t - 1) / K_t * K1);
+            const index_t K_t   = __builtin_amdgcn_readfirstlane(kargs.k_batch * K1);
+            const index_t KRead = __builtin_amdgcn_readfirstlane((kargs.K + K_t - 1) / K_t * K1);
 
            static_for<0, NumATensor, 1>{}([&](auto index) {
                using AiLayout = remove_cvref_t<std::tuple_element_t<index.value, AsLayout>>;
                if constexpr(std::is_same_v<AiLayout, tensor_layout::gemm::RowMajor>)
                {
-                    as_k_split_offset[index] = amd_wave_read_first_lane(k_id * KRead);
+                    as_k_split_offset[index] = __builtin_amdgcn_readfirstlane(k_id * KRead);
                }
                else if constexpr(std::is_same_v<AiLayout, tensor_layout::gemm::ColumnMajor>)
                {
                    as_k_split_offset[index] =
-                        amd_wave_read_first_lane(k_id * KRead * kargs.stride_As[index]);
+                        __builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_As[index]);
                }
            });
 
@@ -347,21 +347,21 @@ struct UniversalGemmKernel
                if constexpr(std::is_same_v<BiLayout, tensor_layout::gemm::RowMajor>)
                {
                    bs_k_split_offset[index] =
-                        amd_wave_read_first_lane(k_id * KRead * kargs.stride_Bs[index]);
+                        __builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_Bs[index]);
                }
                else if constexpr(std::is_same_v<BiLayout, tensor_layout::gemm::ColumnMajor>)
                {
-                    bs_k_split_offset[index] = amd_wave_read_first_lane(k_id * KRead);
+                    bs_k_split_offset[index] = __builtin_amdgcn_readfirstlane(k_id * KRead);
                }
            });
 
            if(k_id < static_cast<std::size_t>(kargs.k_batch - 1))
            {
-                splitted_k = amd_wave_read_first_lane(KRead);
+                splitted_k = __builtin_amdgcn_readfirstlane(KRead);
            }
            else
            {
-                splitted_k = amd_wave_read_first_lane(kargs.K - KRead * (kargs.k_batch - 1));
+                splitted_k = __builtin_amdgcn_readfirstlane(kargs.K - KRead * (kargs.k_batch - 1));
            }
        }
 
@@ -970,8 +970,8 @@ struct UniversalGemmKernel
         const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
         auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
 
-        const index_t num_loop =
-            amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
+        const index_t num_loop = __builtin_amdgcn_readfirstlane(
+            TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
 
         // Run GEMM cooperatively by whole workgroup.
         const auto& as_block_window = gemm_tile_windows.at(I0);
@@ -1026,8 +1026,8 @@ struct UniversalGemmKernel
         const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
         auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
 
-        const index_t num_loop =
-            amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
+        const index_t num_loop = __builtin_amdgcn_readfirstlane(
+            TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
 
         // Run GEMM cooperatively by whole workgroup.
         const auto& as_block_window = gemm_tile_windows.at(I0);
@@ -1052,10 +1052,10 @@ struct UniversalGemmKernel
     template <bool U = UsePersistentKernel, typename = std::enable_if_t<!U>>
     CK_TILE_DEVICE void operator()(KernelArgs kargs) const
     {
-        const auto blockId  = amd_wave_read_first_lane(blockIdx.x);
+        const auto blockId  = __builtin_amdgcn_readfirstlane(blockIdx.x);
         const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockId);
-        const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
-        const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
+        const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
+        const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
 
         const SplitKBatchOffset splitk_batch_offset(kargs);
 
@@ -1126,22 +1126,22 @@ struct UniversalGemmKernel
     template <bool U = UsePersistentKernel, typename = std::enable_if_t<U>, typename = void>
     CK_TILE_DEVICE void operator()(KernelArgs kargs) const
     {
-        const auto grid_size = amd_wave_read_first_lane(get_grid_size());
+        const auto grid_size = __builtin_amdgcn_readfirstlane(get_grid_size());
         const auto num_tiles =
-            amd_wave_read_first_lane(TilePartitioner::GridSize(kargs.M, kargs.N));
-        const auto num_work = amd_wave_read_first_lane(num_tiles * kargs.k_batch);
-        auto block_id       = amd_wave_read_first_lane(get_block_id());
+            __builtin_amdgcn_readfirstlane(TilePartitioner::GridSize(kargs.M, kargs.N));
+        const auto num_work = __builtin_amdgcn_readfirstlane(num_tiles * kargs.k_batch);
+        auto block_id       = __builtin_amdgcn_readfirstlane(get_block_id());
         while(block_id < num_work)
         {
             // Get the tile index for this block
-            const auto tile_idx = amd_wave_read_first_lane(block_id % num_tiles);
+            const auto tile_idx = __builtin_amdgcn_readfirstlane(block_id % num_tiles);
             const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(tile_idx);
-            const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
-            const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
+            const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
+            const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
 
             // Get the SplitK offset for this block
-            const auto k_batch = amd_wave_read_first_lane(block_id / num_tiles);
+            const auto k_batch = __builtin_amdgcn_readfirstlane(block_id / num_tiles);
             const SplitKBatchOffset splitk_batch_offset(kargs, k_batch);
 
             std::array<const ADataType*, NumATensor> as_ptr;
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
index d0466bc8b1..b362f751c6 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
@@ -487,7 +487,7 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4
         if(HasHotLoop)
         {
             // minus 2 because we have ping-pong double buffer.
-            index_t iCounter = amd_wave_read_first_lane(num_loop - 2);
+            index_t iCounter = __builtin_amdgcn_readfirstlane(num_loop - 2);
             do
             {
                 // ping
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp
index 7263ddd5a1..474d1a5a21 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp
@@ -178,7 +178,7 @@ struct GemmPipelineAgBgCrCompV5 : public BaseGemmPipelineAgBgCrCompV5
         index_t warp_id = get_warp_id();
         index_t operation_id =
-            amd_wave_read_first_lane(get_warp_id()); // 0 - Memory read, 1 - block-gemm
+            __builtin_amdgcn_readfirstlane(get_warp_id()); // 0 - Memory read, 1 - block-gemm
 
         auto a_offset = (warp_id == 0) ? make_array(0, 0) : make_array(0, KPerBlock);
         auto b_offset = (warp_id == 0) ? make_array(0, 0) : make_array(0, KPerBlock);
@@ -336,7 +336,7 @@ struct GemmPipelineAgBgCrCompV5 : public BaseGemmPipelineAgBgCrCompV5
             MemoryOpsStep(warp_id);
         }
 
-        index_t num_compute_steps = amd_wave_read_first_lane(num_loop);
+        index_t num_compute_steps = __builtin_amdgcn_readfirstlane(num_loop);
         while(num_compute_steps > 1)
         {
             block_sync_lds();
diff --git a/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp b/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp
index bcd0fd9dac..82bf75a9e3 100644
--- a/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp
+++ b/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp
@@ -270,34 +270,34 @@ struct QuantGemmKernel
                                            const std::size_t k_id = blockIdx.z)
        {
            constexpr auto K1 = TilePartitioner::BlockGemmShape::WarpTile::at(I2);
-            const index_t K_t   = amd_wave_read_first_lane(kargs.k_batch * K1);
-            const index_t KRead = amd_wave_read_first_lane((kargs.K + K_t - 1) / K_t * K1);
+            const index_t K_t   = __builtin_amdgcn_readfirstlane(kargs.k_batch * K1);
+            const index_t KRead = __builtin_amdgcn_readfirstlane((kargs.K + K_t - 1) / K_t * K1);
 
            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
            {
-                a_k_split_offset = amd_wave_read_first_lane(k_id * KRead);
+                a_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead);
            }
            else if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::ColumnMajor>)
            {
-                a_k_split_offset = amd_wave_read_first_lane(k_id * KRead * kargs.stride_A);
+                a_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_A);
            }
 
            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
            {
-                b_k_split_offset = amd_wave_read_first_lane(k_id * KRead * kargs.stride_B);
+                b_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_B);
            }
            else if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::ColumnMajor>)
            {
-                b_k_split_offset = amd_wave_read_first_lane(k_id * KRead);
+                b_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead);
            }
 
            if(k_id < static_cast<std::size_t>(kargs.k_batch - 1))
            {
-                splitted_k = amd_wave_read_first_lane(KRead);
+                splitted_k = __builtin_amdgcn_readfirstlane(KRead);
            }
            else
            {
-                splitted_k = amd_wave_read_first_lane(kargs.K - KRead * (kargs.k_batch - 1));
+                splitted_k = __builtin_amdgcn_readfirstlane(kargs.K - KRead * (kargs.k_batch - 1));
            }
        }
 
@@ -918,8 +918,8 @@ struct QuantGemmKernel
         const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
         auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
 
-        const index_t num_loop =
-            amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
+        const index_t num_loop = __builtin_amdgcn_readfirstlane(
+            TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
 
         // Run GEMM cooperatively by whole workgroup.
const auto& a_block_window = gemm_tile_windows.at(I0); @@ -981,10 +981,10 @@ struct QuantGemmKernel CK_TILE_DEVICE void operator()(QuantGemmKernelArgs kargs) const { - const auto blockId = amd_wave_read_first_lane(blockIdx.x); + const auto blockId = __builtin_amdgcn_readfirstlane(blockIdx.x); const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockId); - const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock); - const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock); + const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); + const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); const SplitKBatchOffset splitk_batch_offset(kargs); // options diff --git a/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp b/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp index 39c8e406b7..07c45117e2 100644 --- a/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp +++ b/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp @@ -305,8 +305,8 @@ struct QuantGroupedGemmKernel { const auto [iM, iN] = block_idx_2d; - const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock); - const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock); + const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); + const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); const typename Base::SplitKBatchOffset splitk_batch_offset(kargs, block_idx_z); diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp index e68a510a0c..15e697afdf 100644 --- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp +++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp @@ -840,7 +840,7 @@ struct GroupedConvolutionBackwardDataKernel const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple); auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n); - const index_t num_loop = amd_wave_read_first_lane(TilePartitioner::GetLoopNum( + const index_t num_loop = __builtin_amdgcn_readfirstlane(TilePartitioner::GetLoopNum( gemm_pad_views.at(I0).get_tensor_descriptor().get_length(I1))); // Run GEMM cooperatively by whole workgroup. @@ -891,7 +891,7 @@ struct GroupedConvolutionBackwardDataKernel const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple); auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n); - const index_t num_loop = amd_wave_read_first_lane( + const index_t num_loop = __builtin_amdgcn_readfirstlane( TilePartitioner::GetLoopNum(gemm_tile_windows.at(I0).get_length(I1))); // Run GEMM cooperatively by whole workgroup. 
@@ -936,7 +936,7 @@ struct GroupedConvolutionBackwardDataKernel
 
     CK_TILE_DEVICE void operator()(GroupedConvBwdDataKernelArgsSpecialized kargs) const
     {
-        const auto blockIdX = amd_wave_read_first_lane(blockIdx.x);
+        const auto blockIdX = __builtin_amdgcn_readfirstlane(blockIdx.x);
 
         const index_t group_id = FindGroupId(kargs, blockIdX);
         const auto [iM, iN]    = OffsettedTile1DPartitioner::GetOffsetedTileIndex(
@@ -944,13 +944,13 @@ struct GroupedConvolutionBackwardDataKernel
             kargs.c_grid_descs_m_n[group_id].get_length(I0),
             kargs.c_grid_descs_m_n[group_id].get_length(I1));
 
-        const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
-        const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
+        const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
+        const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
 
-        const auto blockIdY        = amd_wave_read_first_lane(blockIdx.y);
-        const auto group_offset_a  = amd_wave_read_first_lane(kargs.group_stride_a * blockIdY);
-        const auto group_offset_b  = amd_wave_read_first_lane(kargs.group_stride_b * blockIdY);
-        const auto group_offset_c  = amd_wave_read_first_lane(kargs.group_stride_c * blockIdY);
+        const auto blockIdY       = __builtin_amdgcn_readfirstlane(blockIdx.y);
+        const auto group_offset_a = __builtin_amdgcn_readfirstlane(kargs.group_stride_a * blockIdY);
+        const auto group_offset_b = __builtin_amdgcn_readfirstlane(kargs.group_stride_b * blockIdY);
+        const auto group_offset_c = __builtin_amdgcn_readfirstlane(kargs.group_stride_c * blockIdY);
 
         // options
         // conv_bwd_data = Out * Weight = In
diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
index b85660aea3..7bb3fedaf6 100644
--- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
+++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
@@ -423,20 +423,22 @@ struct GroupedConvolutionBackwardWeightKernel
        __device__ SplitKBatchOffset(const GroupedConvBwdWeightKernelArgsSpecialized& kargs,
                                     const std::size_t k_id = blockIdx.z)
        {
-            constexpr auto K1   = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});
-            const index_t K_t   = amd_wave_read_first_lane(kargs.k_batch * K1);
-            const index_t KRead = amd_wave_read_first_lane((kargs.GemmK + K_t - 1) / K_t * K1);
+            constexpr auto K1 = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});
+            const index_t K_t = __builtin_amdgcn_readfirstlane(kargs.k_batch * K1);
+            const index_t KRead =
+                __builtin_amdgcn_readfirstlane((kargs.GemmK + K_t - 1) / K_t * K1);
 
-            a_k_split_offset = amd_wave_read_first_lane(k_id * KRead);
-            b_k_split_offset = amd_wave_read_first_lane(k_id * KRead);
+            a_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead);
+            b_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead);
 
            if(k_id < static_cast<std::size_t>(kargs.k_batch - 1))
            {
-                splitted_k = amd_wave_read_first_lane(KRead);
+                splitted_k = __builtin_amdgcn_readfirstlane(KRead);
            }
            else
            {
-                splitted_k = amd_wave_read_first_lane(kargs.GemmK - KRead * (kargs.k_batch - 1));
+                splitted_k =
+                    __builtin_amdgcn_readfirstlane(kargs.GemmK - KRead * (kargs.k_batch - 1));
            }
        }
 
@@ -803,22 +805,22 @@ struct GroupedConvolutionBackwardWeightKernel
 
     CK_TILE_DEVICE void operator()(GroupedConvBwdWeightKernelArgsSpecialized kargs) const
     {
-        const auto blockIdX =
amd_wave_read_first_lane(blockIdx.x); + const auto blockIdX = __builtin_amdgcn_readfirstlane(blockIdx.x); const auto [iM, iN] = TilePartitioner{kargs.GemmM, kargs.GemmN}.GetOutputTileIndex(blockIdX); - const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock); - const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock); + const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); + const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); - const auto blockIdZ = amd_wave_read_first_lane(blockIdx.z); - const index_t num_loop = amd_wave_read_first_lane( + const auto blockIdZ = __builtin_amdgcn_readfirstlane(blockIdx.z); + const index_t num_loop = __builtin_amdgcn_readfirstlane( ck_tile::integer_divide_ceil(kargs.GemmK, kargs.k_batch * TilePartitioner::KPerBlock)); const index_t i_k = - amd_wave_read_first_lane(blockIdZ * num_loop * TilePartitioner::KPerBlock); + __builtin_amdgcn_readfirstlane(blockIdZ * num_loop * TilePartitioner::KPerBlock); - const auto blockIdY = amd_wave_read_first_lane(blockIdx.y); - const auto group_offset_a = amd_wave_read_first_lane(kargs.group_stride_a * blockIdY); - const auto group_offset_b = amd_wave_read_first_lane(kargs.group_stride_b * blockIdY); - const auto group_offset_c = amd_wave_read_first_lane(kargs.group_stride_c * blockIdY); + const auto blockIdY = __builtin_amdgcn_readfirstlane(blockIdx.y); + const auto group_offset_a = __builtin_amdgcn_readfirstlane(kargs.group_stride_a * blockIdY); + const auto group_offset_b = __builtin_amdgcn_readfirstlane(kargs.group_stride_b * blockIdY); + const auto group_offset_c = __builtin_amdgcn_readfirstlane(kargs.group_stride_c * blockIdY); // options // conv_bwd_weight = Out * In = Weight diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp index 0363782d33..d1eacd60cd 100644 --- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp +++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp @@ -752,7 +752,8 @@ struct GroupedConvolutionForwardKernel const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple); auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n); - const index_t num_loop = amd_wave_read_first_lane(TilePartitioner::GetLoopNum(kargs.GemmK)); + const index_t num_loop = + __builtin_amdgcn_readfirstlane(TilePartitioner::GetLoopNum(kargs.GemmK)); // Run GEMM cooperatively by whole workgroup. const auto& a_block_window = gemm_tile_windows.at(I0); @@ -801,7 +802,8 @@ struct GroupedConvolutionForwardKernel const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple); auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n); - const index_t num_loop = amd_wave_read_first_lane(TilePartitioner::GetLoopNum(kargs.GemmK)); + const index_t num_loop = + __builtin_amdgcn_readfirstlane(TilePartitioner::GetLoopNum(kargs.GemmK)); // Run GEMM cooperatively by whole workgroup. 
         const auto& a_block_window = gemm_tile_windows.at(I0);
@@ -820,22 +822,22 @@ struct GroupedConvolutionForwardKernel
 
     CK_TILE_DEVICE void operator()(GroupedConvFwdKernelArgsSpecialized kargs) const
     {
-        const auto blockIdX = amd_wave_read_first_lane(blockIdx.x);
+        const auto blockIdX = __builtin_amdgcn_readfirstlane(blockIdx.x);
 
         const auto [iM, iN] =
             TilePartitioner{kargs.GemmM, kargs.GemmN}.GetOutputTileIndex(blockIdX);
-        const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
-        const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
+        const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
+        const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
 
-        const auto blockIdY       = amd_wave_read_first_lane(blockIdx.y);
-        const auto group_offset_a = amd_wave_read_first_lane(kargs.group_stride_a * blockIdY);
-        const auto group_offset_b = amd_wave_read_first_lane(kargs.group_stride_b * blockIdY);
-        const auto group_offset_c = amd_wave_read_first_lane(kargs.group_stride_c * blockIdY);
+        const auto blockIdY       = __builtin_amdgcn_readfirstlane(blockIdx.y);
+        const auto group_offset_a = __builtin_amdgcn_readfirstlane(kargs.group_stride_a * blockIdY);
+        const auto group_offset_b = __builtin_amdgcn_readfirstlane(kargs.group_stride_b * blockIdY);
+        const auto group_offset_c = __builtin_amdgcn_readfirstlane(kargs.group_stride_c * blockIdY);
 
         // Split-N handling: Get which split this workgroup handles
-        const auto blockIdZ = amd_wave_read_first_lane(blockIdx.z);
+        const auto blockIdZ = __builtin_amdgcn_readfirstlane(blockIdx.z);
 
         // Calculate batch offset for this split
-        const index_t batch_offset = amd_wave_read_first_lane(blockIdZ * kargs.n_per_split);
+        const index_t batch_offset = __builtin_amdgcn_readfirstlane(blockIdZ * kargs.n_per_split);
 
         // Calculate memory offsets for this split
         const long_index_t input_batch_offset = static_cast<long_index_t>(batch_offset) *
diff --git a/include/ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp b/include/ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp
index bc20057e7a..eb54807d88 100644
--- a/include/ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp
+++ b/include/ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp
@@ -175,9 +175,9 @@ struct ImageToColumn
     {
         const auto [M, K] = CalculateMKDims(kargs);
 
-        const index_t iM     = amd_wave_read_first_lane(blockIdx.x * kMPerBlock);
-        const index_t iK     = amd_wave_read_first_lane(blockIdx.y * kKPerBlock);
-        const index_t iBatch = amd_wave_read_first_lane(blockIdx.z);
+        const index_t iM     = __builtin_amdgcn_readfirstlane(blockIdx.x * kMPerBlock);
+        const index_t iK     = __builtin_amdgcn_readfirstlane(blockIdx.y * kKPerBlock);
+        const index_t iBatch = __builtin_amdgcn_readfirstlane(blockIdx.z);
 
         const auto in_offset  = iBatch * kargs.image_g_n_c_wis_strides[I0];
         const auto out_offset = iBatch * kargs.gemm_g_m_k_strides[I0];
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
index 422950b143..0de1ada87c 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
@@ -99,7 +99,7 @@ struct Layernorm2dFwdPipelineTwoPass
         // Problem::BlockShape
         static constexpr index_t Block_N = Problem::BlockShape::Block_N;
         index_t num_n_tile_iteration =
-            amd_wave_read_first_lane(integer_divide_ceil(row_size, Block_N));
+            __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));
 
         // total number of count assume current iter have no pad(only last iter has pad)
         constexpr index_t count_per_iter =
@@ -119,7 +119,7 @@ struct Layernorm2dFwdPipelineTwoPass
         auto mean = block_norm_reduce.template MakeMeanVarBlockTile<XTensorType>();
         auto var  = block_norm_reduce.template MakeMeanVarBlockTile<XTensorType>();
 
-        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
         {
             auto x      = load_tile(x_window);
             auto x_resi = load_tile(x_residual_window);
@@ -197,7 +197,7 @@ struct Layernorm2dFwdPipelineTwoPass
         move_tile_window(y_window, {0, stride_to_right_most_window});
 
         // layernorm computation
-        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
         {
             auto acc = make_static_distributed_tensor<ComputeDataType>(
                 decltype(load_tile(x_window))::get_tile_distribution());
diff --git a/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp b/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp
index 83a22aaded..92a71a42c8 100644
--- a/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp
+++ b/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp
@@ -156,7 +156,7 @@ struct Reduce
         const auto merged_reduce_len =
             transformed_x_tensor.get_tensor_descriptor().get_lengths().at(number<1>{});
         index_t num_n_tile_iteration =
-            amd_wave_read_first_lane(integer_divide_ceil(merged_reduce_len, S::Block_N));
+            __builtin_amdgcn_readfirstlane(integer_divide_ceil(merged_reduce_len, S::Block_N));
 
         auto block_reduce2d      = Policy::template GetBlockReduce2d<Problem>();
         auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
@@ -167,7 +167,7 @@ struct Reduce
         auto y_compute = block_reduce2d.template MakeYBlockTile<XTensorType>();
         set_tile(y_compute, reduce_func.template GetIdentityValue<ComputeDataType>());
 
-        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
         {
             const auto x = load_tile(x_window);
             block_reduce2d(x, y_compute, reduce_func);
diff --git a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp
index ca3cdc37c4..d01f37879a 100644
--- a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp
@@ -82,7 +82,7 @@ struct Rmsnorm2dFwdPipelineTwoPass
         // Problem::BlockShape
         static constexpr index_t Block_N = Problem::BlockShape::Block_N;
         index_t num_n_tile_iteration =
-            amd_wave_read_first_lane(integer_divide_ceil(row_size, Block_N));
+            __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));
 
         auto reduce_square_sum_func = ReduceOp::SquareAdd{};
         auto reduce_sum_func        = ReduceOp::Add{};
@@ -95,7 +95,7 @@ struct Rmsnorm2dFwdPipelineTwoPass
         auto square_sum = block_reduce2d.template MakeYBlockTile<XTensorType>();
         set_tile(square_sum, reduce_square_sum_func.GetIdentityValue<ComputeDataType>());
 
-        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
         {
             auto x      = load_tile(x_window);
             auto x_resi = load_tile(x_residual_window);
@@ -151,7 +151,7 @@ struct Rmsnorm2dFwdPipelineTwoPass
         move_tile_window(y_window, {0, stride_to_right_most_window});
 
         // rmsnorm computation
-        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
         {
             auto acc = make_static_distributed_tensor<ComputeDataType>(
                 decltype(load_tile(x_window))::get_tile_distribution());
diff --git a/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp b/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp
index f6c7c0753a..2553b19fd8 100644
--- a/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp
+++ b/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp
@@ -138,7 +138,7 @@ struct MoeSmoothquant
         const index_t i_topk  = blockIdx.x;
         const index_t i_token = blockIdx.y * Block_M;
         const index_t i_token_in_thrd =
-            amd_wave_read_first_lane(threadIdx.x / Problem::BlockShape::ThreadPerBlock_N);
+            __builtin_amdgcn_readfirstlane(threadIdx.x / Problem::BlockShape::ThreadPerBlock_N);
         const index_t i_expert = reinterpret_cast<const index_t*>(
             kargs.p_topk_ids)[(i_token + i_token_in_thrd) * kargs.topk + i_topk];
 
diff --git a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
index 8b0a7274ed..ba9c6374f1 100644
--- a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
@@ -57,7 +57,7 @@ struct SmoothquantPipelineTwoPass
         static constexpr index_t Block_N = Problem::BlockShape::Block_N;
         index_t num_n_tile_iteration =
-            amd_wave_read_first_lane(integer_divide_ceil(row_size, Block_N));
+            __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));
 
         auto reduce_absmax_func  = ReduceOp::AbsMax{};
         auto reduce_absmax3_func = [](auto acc_, auto v_0_, auto v_1_) {
@@ -77,7 +77,7 @@ struct SmoothquantPipelineTwoPass
         auto absmax = block_reduce2d.template MakeYBlockTile<XTensorType>();
         set_tile(absmax, reduce_absmax_func.GetIdentityValue<ComputeDataType>());
 
-        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
         {
             const auto x       = load_tile(x_window);
             const auto smscale = load_tile(smscale_window);
@@ -121,7 +121,7 @@ struct SmoothquantPipelineTwoPass
         move_tile_window(qy_window, {0, stride_to_right_most_window});
 
         // recompute y and quantize y to qy
-        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
         {
             const auto x       = load_tile(x_window);
             const auto smscale = load_tile(smscale_window);
diff --git a/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp b/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp
index e8727ea065..277049f6b0 100644
--- a/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp
+++ b/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp
@@ -96,9 +96,9 @@ struct TopkSoftmaxKernel
         if(block_row_id > kargs.num_rows)
             return;
 
-        index_t block_os_inp = amd_wave_read_first_lane(block_row_id * kargs.stride_input);
-        index_t block_os_out = amd_wave_read_first_lane(block_row_id * kargs.stride_output);
-        index_t num_rows_rem = amd_wave_read_first_lane(kargs.num_rows - block_row_id);
+        index_t block_os_inp = __builtin_amdgcn_readfirstlane(block_row_id * kargs.stride_input);
+        index_t block_os_out = __builtin_amdgcn_readfirstlane(block_row_id * kargs.stride_output);
+        index_t num_rows_rem = __builtin_amdgcn_readfirstlane(kargs.num_rows - block_row_id);
 
         const auto input_window = [&]() {
const InputType* p_input =
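
Note (not part of the patch): __builtin_amdgcn_readfirstlane broadcasts a single 32-bit value
from the first active lane into an SGPR, so it is a drop-in replacement for the deleted scalar
overloads of amd_wave_read_first_lane. The deleted object overload additionally moved multi-dword
structs through SGPRs one dword at a time; any out-of-tree code that relied on it would need to
do that split itself. Below is a minimal sketch of that pattern, assuming a hypothetical helper
name readfirstlane_object and an object whose size is a multiple of 4 bytes (the deleted overload
also handled a 1-3 byte tail):

// Sketch only; readfirstlane_object is a hypothetical name, not a CK API.
template <typename Object>
__device__ Object readfirstlane_object(const Object& obj)
{
    static_assert(sizeof(Object) % 4 == 0, "pad Object to a dword multiple");
    Object out;
    const unsigned char* src = reinterpret_cast<const unsigned char*>(&obj);
    unsigned char* dst       = reinterpret_cast<unsigned char*>(&out);
    for(unsigned off = 0; off < sizeof(Object); off += 4)
    {
        uint32_t word;
        __builtin_memcpy(&word, src + off, 4);       // gather one dword of the object
        word = __builtin_amdgcn_readfirstlane(word); // broadcast it through an SGPR
        __builtin_memcpy(dst + off, &word, 4);       // write the uniform dword back
    }
    return out;
}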