From b8448ab68d00909f684afa7f0850d8f5c6987163 Mon Sep 17 00:00:00 2001 From: "assistant-librarian[bot]" Date: Thu, 25 Sep 2025 18:15:45 +0000 Subject: [PATCH] Merge commit '8c1a95991330118930f23e6a2ba8e76068d8ca22' into develop --- CHANGELOG.md | 1 + Jenkinsfile | 8 +-- .../quant_run_grouped_gemm_example.inc | 4 +- .../run_grouped_gemm_example.inc | 4 +- .../21_elementwise/elementwise_example.cpp | 4 +- .../elementwise_example_add_4d.cpp | 4 +- .../elementwise_example_transpose.cpp | 4 +- .../elementwise_example_unary.cpp | 4 +- include/ck/utility/amd_ck_fp8.hpp | 8 +-- .../core/arch/amd_buffer_addressing.hpp | 56 +++++++++++++++++- .../arch/amd_buffer_addressing_builtins.hpp | 7 +-- include/ck_tile/core/arch/arch.hpp | 4 +- include/ck_tile/core/tensor/buffer_view.hpp | 7 +-- include/ck_tile/core/tensor/tile_window.hpp | 2 +- ...norm2d_rdquant_fwd_pipeline_three_pass.hpp | 8 +-- .../kernel/batched_transpose_kernel.hpp | 6 +- .../ops/epilogue/cshuffle_epilogue.hpp | 46 +++++++-------- .../ops/flatmm/kernel/flatmm_kernel.hpp | 4 +- .../fmha/kernel/fmha_batch_prefill_kernel.hpp | 4 +- .../ops/fmha/kernel/fmha_bwd_kernel.hpp | 6 +- .../fmha/kernel/fmha_fwd_appendkv_kernel.hpp | 4 +- .../ops/fmha/kernel/fmha_fwd_kernel.hpp | 4 +- .../fmha/kernel/fmha_fwd_pagedkv_kernel.hpp | 4 +- .../fmha_fwd_splitkv_combine_kernel.hpp | 4 +- .../fmha/kernel/fmha_fwd_splitkv_kernel.hpp | 4 +- .../ops/fmha/kernel/fmha_fwd_v3_kernel.hpp | 4 +- ...ock_fmha_fwd_pagedkv_pipeline_qr_ks_vs.hpp | 4 +- ...ock_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp | 6 +- .../fused_moe/kernel/fused_moegemm_kernel.hpp | 8 +-- .../fused_moe/kernel/moe_sorting_kernel.hpp | 2 +- .../fused_moegemm_pipeline_flatmm_uk.hpp | 6 +- .../ops/gemm/kernel/batched_gemm_kernel.hpp | 20 +++---- .../ops/gemm/kernel/gemm_tile_partitioner.hpp | 8 +-- .../ops/gemm/kernel/grouped_gemm_kernel.hpp | 12 ++-- .../ops/gemm/kernel/streamk_gemm_kernel.hpp | 6 +- .../ops/gemm/kernel/universal_gemm_kernel.hpp | 46 +++++++-------- .../gemm_pipeline_ag_bg_cr_comp_v4.hpp | 2 +- .../gemm_pipeline_ag_bg_cr_comp_v5.hpp | 4 +- .../gemm_quant/kernel/gemm_quant_kernel.hpp | 59 ++++++++----------- .../kernel/grouped_gemm_quant_kernel.hpp | 4 +- ...ouped_convolution_backward_data_kernel.hpp | 18 +++--- ...ped_convolution_backward_weight_kernel.hpp | 36 ++++++----- .../grouped_convolution_forward_kernel.hpp | 24 ++++---- .../kernel/image_to_column_kernel.hpp | 6 +- .../layernorm2d_fwd_pipeline_two_pass.hpp | 6 +- .../ops/reduce/kernel/reduce2d_kernel.hpp | 4 +- .../rmsnorm2d_fwd_pipeline_two_pass.hpp | 6 +- .../kernel/moe_smoothquant_kernel.hpp | 2 +- .../smoothquant_pipeline_two_pass.hpp | 6 +- .../kernel/topk_softmax_kernel.hpp | 6 +- .../test_gemm_multi_abd_ut_cases_cshuffle.inc | 1 - 51 files changed, 281 insertions(+), 236 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f21795012d..fe1e7ef345 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj ### Added * Added support for B Tensor type pk_int4_t in the CK TILE weight preshuffle GEMM. +* Added a new API for loading values of different memory sizes into SGPRs. * Added support for B Tensor Preshuffle in CK TILE Grouped GEMM. * Added a basic copy kernel example and supporting documentation for new CK Tile developers.
* Added support for bf16, f32, and f16 for 2D and 3D NGCHW grouped convolution backward data diff --git a/Jenkinsfile b/Jenkinsfile index 2866b7d84e..b18c2939dc 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -859,6 +859,7 @@ def run_aiter_tests(Map conf=[:]){ sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8.py" sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8_blockscale.py" sh "python3 /home/jenkins/workspace/aiter/op_tests/test_mha.py" + sh "python3 /home/jenkins/workspace/aiter/op_tests/test_mha_varlen.py" sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe.py" sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_2stage.py" sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_blockscale.py" @@ -930,7 +931,7 @@ def run_pytorch_tests(Map conf=[:]){ } //launch develop branch daily jobs -CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;RUN_CK_TILE_FMHA_TESTS=true +CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_PERFORMANCE_TESTS=true 0 22 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true 0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;BUILD_GFX908=true;BUILD_GFX942=true;BUILD_GFX950=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true 0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true @@ -1351,7 +1352,6 @@ pipeline { } agent{ label rocmnode("gfx950") } environment{ - def docker_name = "${env.CK_DOCKERHUB}:ck_ub24.04_rocm7.0.1" setup_args = "NO_CK_BUILD" execute_args = """ ../script/cmake-ck-dev.sh ../ gfx950 && \ make -j128 tile_example_fmha_fwd tile_example_fmha_bwd && \ @@ -1359,7 +1359,7 @@ pipeline { example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx950 """ } steps{ - buildHipClangJobAndReboot(setup_args:setup_args, docker_name: docker_name, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) cleanWs() } } @@ -1567,7 +1567,7 @@ pipeline { -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ } steps{ - Build_CK_and_Reboot(setup_args: setup_args, docker_name: "${env.CK_DOCKERHUB}:ck_ub24.04_rocm7.0.1", config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') + Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') cleanWs() } } diff --git a/example/ck_tile/17_grouped_gemm/quant_run_grouped_gemm_example.inc b/example/ck_tile/17_grouped_gemm/quant_run_grouped_gemm_example.inc index 658a4dfa62..10d317a2c7 100644 --- a/example/ck_tile/17_grouped_gemm/quant_run_grouped_gemm_example.inc +++ b/example/ck_tile/17_grouped_gemm/quant_run_grouped_gemm_example.inc @@ -183,7 +183,7 @@ int run_grouped_gemm_example_with_layouts(int argc, if(!valid_input_data(group_count, Ms, Ns, Ks, stride_As, stride_Bs, stride_Cs)) { std::cout << "Please check the input data. Default values will be used." 
<< std::endl; - + // Clear existing (invalid) data before adding defaults Ms.clear(); Ns.clear(); @@ -193,7 +193,7 @@ int run_grouped_gemm_example_with_layouts(int argc, stride_Cs.clear(); stride_AQs.clear(); stride_BQs.clear(); - + for(int i = 0; i < group_count; i++) { Ms.push_back(256 + 256 * i); diff --git a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc index 026f2bd8f6..b1aa832e72 100644 --- a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc +++ b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc @@ -172,7 +172,7 @@ int run_grouped_gemm_example_with_layouts(int argc, std::cout << "Default values: Ms (256, 512, 768, 1024..), Ns (256, 768, 1280..), Ks (512, " "896, 1280..)" << std::endl; - + // Clear existing (invalid) data before adding defaults Ms.clear(); Ns.clear(); @@ -180,7 +180,7 @@ int run_grouped_gemm_example_with_layouts(int argc, stride_As.clear(); stride_Bs.clear(); stride_Cs.clear(); - + for(int i = 0; i < group_count; i++) { Ms.push_back(256 + 256 * i); diff --git a/example/ck_tile/21_elementwise/elementwise_example.cpp b/example/ck_tile/21_elementwise/elementwise_example.cpp index 94d3e70be1..e9fbeafde1 100644 --- a/example/ck_tile/21_elementwise/elementwise_example.cpp +++ b/example/ck_tile/21_elementwise/elementwise_example.cpp @@ -211,7 +211,9 @@ bool run(const ck_tile::ArgParser& arg_parser) int main(int argc, char* argv[]) { - auto [result, arg_parser] = create_args(argc, argv); + bool result = true; + ck_tile::ArgParser arg_parser; + std::tie(result, arg_parser) = create_args(argc, argv); if(!result) return -1; diff --git a/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp b/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp index ff7ec1517e..1b101c2e5f 100644 --- a/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp +++ b/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp @@ -157,7 +157,9 @@ bool run(const ck_tile::ArgParser& arg_parser) int main(int argc, char* argv[]) { - auto [result, arg_parser] = create_args(argc, argv); + bool result = true; + ck_tile::ArgParser arg_parser; + std::tie(result, arg_parser) = create_args(argc, argv); if(!result) return -1; diff --git a/example/ck_tile/21_elementwise/elementwise_example_transpose.cpp b/example/ck_tile/21_elementwise/elementwise_example_transpose.cpp index 16e9832c07..7cdb5cc0d1 100644 --- a/example/ck_tile/21_elementwise/elementwise_example_transpose.cpp +++ b/example/ck_tile/21_elementwise/elementwise_example_transpose.cpp @@ -156,7 +156,9 @@ bool run(const ck_tile::ArgParser& arg_parser) int main(int argc, char* argv[]) { - auto [result, arg_parser] = create_args(argc, argv); + bool result = true; + ck_tile::ArgParser arg_parser; + std::tie(result, arg_parser) = create_args(argc, argv); if(!result) return -1; diff --git a/example/ck_tile/21_elementwise/elementwise_example_unary.cpp b/example/ck_tile/21_elementwise/elementwise_example_unary.cpp index c5a08d910e..4e19cfd688 100644 --- a/example/ck_tile/21_elementwise/elementwise_example_unary.cpp +++ b/example/ck_tile/21_elementwise/elementwise_example_unary.cpp @@ -193,7 +193,9 @@ auto string_to_op(const std::string& op) int main(int argc, char* argv[]) { - auto [result, arg_parser] = create_args(argc, argv); + bool result = true; + ck_tile::ArgParser arg_parser; + std::tie(result, arg_parser) = create_args(argc, argv); if(!result) return -1; diff --git a/include/ck/utility/amd_ck_fp8.hpp 
b/include/ck/utility/amd_ck_fp8.hpp index 2c00f4f42f..c5525d5ff8 100644 --- a/include/ck/utility/amd_ck_fp8.hpp +++ b/include/ck/utility/amd_ck_fp8.hpp @@ -34,8 +34,8 @@ namespace ck { struct f8_fnuz_t { - using data_type = unsigned char; - data_type m_data; + using data_type = unsigned char; + data_type m_data = data_type{}; __host__ __device__ explicit constexpr f8_fnuz_t(data_type in_data) : m_data(in_data) {} __host__ __device__ explicit constexpr f8_fnuz_t() = default; __host__ __device__ bool constexpr operator==(f8_fnuz_t other) const @@ -47,8 +47,8 @@ struct bf8_fnuz_t { - using data_type = unsigned char; - data_type m_data; + using data_type = unsigned char; + data_type m_data = data_type{}; __host__ __device__ explicit constexpr bf8_fnuz_t(data_type in_data) : m_data(in_data) {} __host__ __device__ explicit constexpr bf8_fnuz_t() = default; __host__ __device__ bool constexpr operator==(bf8_fnuz_t other) const diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp index 7a9c017eb2..7bc5ca5df8 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp @@ -2788,7 +2788,7 @@ CK_TILE_DEVICE void amd_buffer_atomic_max(const thread_buffer& src_thread_ } #if defined(__gfx950__) -template +template __device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr) { #define __LDS_ADDR __attribute__((address_space(3))) @@ -2829,6 +2829,60 @@ __device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr) } #endif +// amd_wave_read_first_lane broadcasts a wave-uniform value from the first active lane into SGPRs. +// Overloads cover 8/16/32-bit integers and, below, any trivially copyable object (in 4-byte chunks). +__device__ inline uint32_t amd_wave_read_first_lane(uint16_t v) +{ + return __builtin_amdgcn_readfirstlane(static_cast<uint32_t>(v)); +} + +__device__ inline uint32_t amd_wave_read_first_lane(uint8_t v) +{ + return __builtin_amdgcn_readfirstlane(static_cast<uint32_t>(v)); +} + +__device__ inline uint32_t amd_wave_read_first_lane(uint32_t value) +{ + return __builtin_amdgcn_readfirstlane(value); +} + +__device__ inline int32_t amd_wave_read_first_lane(int32_t value) +{ + return __builtin_amdgcn_readfirstlane(value); +} + +template <typename Object, std::enable_if_t<std::is_trivially_copyable_v<Object>, int> = 0> +__device__ inline auto amd_wave_read_first_lane(const Object& obj) +{ + constexpr size_t ObjectSize = sizeof(Object); + constexpr size_t SGPR_size = 4; + constexpr size_t NumFull = ObjectSize / SGPR_size; + constexpr size_t Tail = ObjectSize % SGPR_size; + + const unsigned char* src = reinterpret_cast<const unsigned char*>(&obj); + alignas(Object) unsigned char dst[ObjectSize]; + + static_for<0, NumFull, 1>{}([&](auto Ic) { + constexpr size_t offset = Ic * SGPR_size; + uint32_t read_src; + __builtin_memcpy(&read_src, src + offset, SGPR_size); + read_src = __builtin_amdgcn_readfirstlane(read_src); + __builtin_memcpy(dst + offset, &read_src, SGPR_size); + }); + + if constexpr(Tail != 0) + { + constexpr size_t offset = NumFull * SGPR_size; + uint32_t tail_loc = 0; + __builtin_memcpy(&tail_loc, src + offset, Tail); + tail_loc = __builtin_amdgcn_readfirstlane(tail_loc); + __builtin_memcpy(dst + offset, &tail_loc, Tail); + } + Object out; + __builtin_memcpy(&out, dst, ObjectSize); + return out; +} + } // namespace ck_tile #endif // !CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN diff --git a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp index 4e0a86119a..ce5a8075df 100 ---
a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp @@ -2639,9 +2639,8 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr, const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000; #if CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM - T* lds_ptr = lds_base_ptr + lds_offset; - auto const lds_ptr_sgpr = - __builtin_amdgcn_readfirstlane((reinterpret_cast(lds_ptr))); + T* lds_ptr = lds_base_ptr + lds_offset; + auto const lds_ptr_sgpr = amd_wave_read_first_lane((reinterpret_cast(lds_ptr))); asm volatile("s_mov_b32 m0, %0; \n\t" "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr), "v"(global_offset_bytes), @@ -2673,7 +2672,7 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr, } #if defined(__gfx950__) -template +template __device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr) { #define __LDS_ADDR __attribute__((address_space(3))) diff --git a/include/ck_tile/core/arch/arch.hpp b/include/ck_tile/core/arch/arch.hpp index 42f2390cde..28ded5439a 100644 --- a/include/ck_tile/core/arch/arch.hpp +++ b/include/ck_tile/core/arch/arch.hpp @@ -9,6 +9,8 @@ #include "ck_tile/core/config.hpp" #include "ck_tile/core/numeric/integer.hpp" #include "ck_tile/core/numeric/integral_constant.hpp" +#include "ck_tile/core/arch/amd_buffer_addressing_builtins.hpp" +#include "ck_tile/core/arch/amd_buffer_addressing.hpp" #include "ck_tile/core/utility/ignore.hpp" #define CK_TILE_S_CNT_MAX 0b1100'1111'0111'1111 @@ -104,7 +106,7 @@ CK_TILE_DEVICE index_t get_warp_id(bool_constant = {}) const index_t warp_id = threadIdx.x / get_warp_size(); if constexpr(ReturnSgpr) { - return __builtin_amdgcn_readfirstlane(warp_id); + return amd_wave_read_first_lane(warp_id); } else { diff --git a/include/ck_tile/core/tensor/buffer_view.hpp b/include/ck_tile/core/tensor/buffer_view.hpp index d1e770ef42..3b747dae84 100644 --- a/include/ck_tile/core/tensor/buffer_view.hpp +++ b/include/ck_tile/core/tensor/buffer_view.hpp @@ -875,10 +875,9 @@ struct buffer_view, t_per_x, addr_space>( - p_data_ + i + linear_offset); + constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; + return amd_transpose_load_to_vgpr, t_per_x>(p_data_ + i + + linear_offset); #else return X{numeric>::zero()}; #endif diff --git a/include/ck_tile/core/tensor/tile_window.hpp b/include/ck_tile/core/tensor/tile_window.hpp index b45106487e..2db5d719c0 100644 --- a/include/ck_tile/core/tensor/tile_window.hpp +++ b/include/ck_tile/core/tensor/tile_window.hpp @@ -402,7 +402,7 @@ struct tile_window_with_static_distribution const index_t m0_init_value = size_per_buf + size_per_wave * get_warp_id(/*ReturnSgpr=*/bool_constant{}); m0_set_with_memory( - __builtin_amdgcn_readfirstlane(m0_init_value)); // This should be wave independent + amd_wave_read_first_lane(m0_init_value)); // This should be wave independent using Traits = typename Base::Traits; diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp index ecd4e81b22..052ee4ae62 100644 --- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp +++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp @@ -92,13 +92,13 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass static constexpr 
index_t Block_N = Problem::BlockShape::Block_N; index_t num_n_tile_iteration = - __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N)); + amd_wave_read_first_lane(integer_divide_ceil(row_size, Block_N)); using XTensorType = decltype(cast_tile(load_tile(a_window))); auto square_sum = block_reduce2d.template MakeYBlockTile(); set_tile(square_sum, reduce_square_sum_func.GetIdentityValue()); - for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) + for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN) { const auto a = load_tile(a_window); const auto b = load_tile(b_window); @@ -149,7 +149,7 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass if constexpr(kSaveX) __syncthreads(); - for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) + for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN) { auto x = [&]() { if constexpr(kSaveX) @@ -226,7 +226,7 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass } move_tile_window(gamma_window, {Block_N}); - for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) + for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN) { auto x = [&]() { if constexpr(kSaveX) diff --git a/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp b/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp index b0f48f6c5b..c99571562d 100644 --- a/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp +++ b/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp @@ -84,9 +84,9 @@ struct BatchedTransposeKernel static constexpr ck_tile::index_t VectorSizeOutput = Problem::VectorSizeOutput; static constexpr ck_tile::index_t VectorStrideOutput = 1; - const auto iM = __builtin_amdgcn_readfirstlane(blockIdx.x * kMPerBlock); - const auto iN = __builtin_amdgcn_readfirstlane(blockIdx.y * kNPerBlock); - const auto offset = __builtin_amdgcn_readfirstlane(blockIdx.z * kargs.height * kargs.width); + const auto iM = amd_wave_read_first_lane(blockIdx.x * kMPerBlock); + const auto iN = amd_wave_read_first_lane(blockIdx.y * kNPerBlock); + const auto offset = amd_wave_read_first_lane(blockIdx.z * kargs.height * kargs.width); const auto x_m_n = [&]() { const auto x_dram_naive = make_naive_tensor_view( diff --git a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp index 585a5f5b42..e0a39a5aea 100644 --- a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp +++ b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp @@ -9,25 +9,9 @@ #include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp" #include +#include namespace ck_tile { - -template -concept HasDataType = requires { typename T::DataType; }; - -template -struct GetDataType -{ - using type = float; -}; - -template - requires HasDataType -struct GetDataType -{ - using type = typename T::DataType; // Use T::ScaleN::DataType -}; - template + template CK_TILE_DEVICE void scale_tile(LdsTile& lds_tile, ScaleM& scale_m_window, ScaleN& scale_n_window) { @@ -334,7 +318,7 @@ struct CShuffleEpilogue constexpr index_t num_access = SFC::get_num_of_access(); if constexpr(iAccess != num_access - 1) { - constexpr auto step = SFC::get_forward_step(iAccess); + constexpr auto step = SFC::get_forward_step(number{}); move_tile_window(scale_m_window, {step.at(number<0>{}), step.at(number<1>{})}); move_tile_window(scale_n_window, {step.at(number<0>{}), step.at(number<1>{})}); @@ -342,10 
+326,10 @@ struct CShuffleEpilogue } } - template + template CK_TILE_DEVICE void slice_acc_tile(const OAccTile& o_acc_tile, LdsTile& lds_tile) { - constexpr auto idx_y_start = SFC::get_index(iAccess); + constexpr auto idx_y_start = SFC::get_index(number{}); constexpr auto mIter = number{}) / (MPerIterationShuffle)>{}; constexpr auto nIter = number{}) / (NPerIterationShuffle)>{}; @@ -400,13 +384,13 @@ struct CShuffleEpilogue /** * @brief Move both the output and D tensors windows for the next access. */ - template + template CK_TILE_DEVICE void move_windows(OutDramWindow& out_dram_window, DDramWindows& d_dram_windows) { constexpr index_t num_access = SFC::get_num_of_access(); if constexpr(iAccess != num_access - 1) { - constexpr auto step = SFC::get_forward_step(iAccess); + constexpr auto step = SFC::get_forward_step(number{}); // move the output dram window move_tile_window(out_dram_window, {step.at(number<0>{}), step.at(number<1>{})}); @@ -423,6 +407,18 @@ struct CShuffleEpilogue { }; + template + struct ScaleDataType + { + using DataType = float; + }; + + template + struct ScaleDataType> + { + using DataType = typename T::DataType; + }; + template && std::is_same_v; // Tiles to hold row/col scales when present - using SMType = typename GetDataType>::type; - using SNType = typename GetDataType>::type; + using SMType = typename ScaleDataType::DataType; + using SNType = typename ScaleDataType::DataType; auto sm_tile = make_static_distributed_tensor(dram_tile_distribution); auto sn_tile = make_static_distributed_tensor(dram_tile_distribution); diff --git a/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp b/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp index a924279d52..ab0b310510 100644 --- a/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp +++ b/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp @@ -598,8 +598,8 @@ struct FlatmmKernel CK_TILE_DEVICE void operator()(KernelArgs kargs) const { const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockIdx.x); - const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); - const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); + const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock); + const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock); const SplitKBatchOffset splitk_batch_offset(kargs); // options diff --git a/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp index fcd512056d..56865498c0 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp @@ -707,8 +707,8 @@ struct FmhaBatchPrefillWithPagedKVCacheKernel // divide problem const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs); - const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0); - const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1); + const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0); + const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1); long_index_t batch_offset_q = 0; long_index_t batch_offset_bias = 0; diff --git a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp index b234d6944e..327b41b071 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp +++ 
b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp @@ -690,7 +690,7 @@ struct FmhaBwdDQDKDVKernel // divide problem const auto [i_tile_n, i_nhead, i_batch] = GetTileIndex(); - const index_t i_n0 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN0); + const index_t i_n0 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN0); long_index_t batch_offset_q = 0; long_index_t batch_offset_k = 0; @@ -1338,7 +1338,7 @@ struct FmhaBwdOGradDotOKernel // divide problem const auto [i_tile_m, i_nhead, i_batch] = GetTileIndex(); - const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * kM0); + const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * kM0); long_index_t batch_offset_o = 0; long_index_t batch_offset_do = 0; @@ -1618,7 +1618,7 @@ struct FmhaBwdConvertQGradKernel // divide problem const auto [i_tile_m, i_nhead, i_batch] = GetTileIndex(); - const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * kM0); + const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * kM0); long_index_t batch_offset_dq = 0; long_index_t batch_offset_dq_acc = 0; diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp index 66f51459af..a82d121d62 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp @@ -262,8 +262,8 @@ struct FmhaFwdAppendKVKernel // divide problem const auto [i_tile, i_nhead, i_batch] = GetTileIndex(kargs); - const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile * FmhaPipeline::kM0); - const index_t i_n0 = __builtin_amdgcn_readfirstlane(i_tile * FmhaPipeline::kN0); + const index_t i_m0 = amd_wave_read_first_lane(i_tile * FmhaPipeline::kM0); + const index_t i_n0 = amd_wave_read_first_lane(i_tile * FmhaPipeline::kN0); const index_t i_cache_batch = [&, i_batch_ = i_batch] { if constexpr(kIsPagedKV) diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp index 29950435fa..ec8921b74c 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp @@ -1062,8 +1062,8 @@ struct FmhaFwdKernel // divide problem const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs); - const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0); - const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1); + const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0); + const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1); long_index_t batch_offset_q = 0; long_index_t batch_offset_k = 0; diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_pagedkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_pagedkv_kernel.hpp index 58ef6ba87e..62ac70db92 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_pagedkv_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_pagedkv_kernel.hpp @@ -880,8 +880,8 @@ struct FmhaFwdPagedKVKernel // divide problem const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs); - const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0); - const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1); + const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0); + const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1); long_index_t batch_offset_q = 0; long_index_t batch_offset_k = 0; diff 
--git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp index cf819c4b8d..a6fc0f1471 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp @@ -281,8 +281,8 @@ struct FmhaFwdSplitKVCombineKernel // divide problem const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs); - const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0); - const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1); + const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0); + const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1); long_index_t batch_offset_lse_acc = 0; long_index_t batch_offset_o_acc = 0; diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp index 9293c97a31..80de65ead4 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp @@ -589,8 +589,8 @@ struct FmhaFwdSplitKVKernel // divide problem const auto [i_tile_m, i_tile_n, i_split, i_nhead, i_batch] = GetTileIndex(kargs); - const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0); - const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1); + const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0); + const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1); long_index_t batch_offset_q = 0; long_index_t batch_offset_k = 0; // unused for paged-kvcache diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_v3_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_v3_kernel.hpp index c5e5745817..abf9bf0aec 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_v3_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_v3_kernel.hpp @@ -361,8 +361,8 @@ struct FmhaFwdV3Kernel // divide problem const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs); - const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0); - const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1); + const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0); + const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1); long_index_t batch_offset_q = 0; long_index_t batch_offset_k = 0; diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_pagedkv_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_pagedkv_pipeline_qr_ks_vs.hpp index 9d267e1cee..b01c127a21 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_pagedkv_pipeline_qr_ks_vs.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_pagedkv_pipeline_qr_ks_vs.hpp @@ -320,9 +320,9 @@ struct BlockFmhaFwdPagedKVPipelineQRKSVS k_block_tile = load_tile(k_dram_window); } auto physical_next_block_id_k = - __builtin_amdgcn_readfirstlane(k_page_block_navigator.prefetch_table_id( + amd_wave_read_first_lane(k_page_block_navigator.prefetch_table_id( i_page_block_k, k_dram_block_window, {kN0, 0})); - auto physical_next_block_id_v = __builtin_amdgcn_readfirstlane( + auto physical_next_block_id_v = amd_wave_read_first_lane( v_page_block_navigator.prefetch_table_id(i_page_block_v, v_dram_window, {0, kK1})); if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) diff 
--git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp index 9de640b7cf..fe5e0bc345 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp @@ -321,9 +321,9 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS k_block_tile = load_tile(k_dram_window); } auto physical_next_block_id_k = - __builtin_amdgcn_readfirstlane(k_page_block_navigator.prefetch_table_id( + amd_wave_read_first_lane(k_page_block_navigator.prefetch_table_id( i_page_block_k, k_dram_block_window, {kN0, 0})); - auto physical_next_block_id_v = __builtin_amdgcn_readfirstlane( + auto physical_next_block_id_v = amd_wave_read_first_lane( v_page_block_navigator.prefetch_table_id(i_page_block_v, v_dram_window, {0, kK1})); if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) @@ -618,7 +618,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS &i_page_block_v_ = i_page_block_v, &v_dram_window_ = v_dram_window](auto i_k1) { auto physical_next_block_id_v_ = - __builtin_amdgcn_readfirstlane(v_page_block_navigator.prefetch_table_id( + amd_wave_read_first_lane(v_page_block_navigator.prefetch_table_id( i_page_block_v_, v_dram_window_, {0, kK1})); const auto v = load_tile(v_dram_window_); // load next v block_sync_lds(); diff --git a/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp index 6d95decaee..c69c15a2b0 100644 --- a/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp +++ b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp @@ -240,7 +240,7 @@ struct FusedMoeGemmKernel if constexpr(UseUK) { __shared__ CK_TILE_LDS_ADDR char smem[GetSmemSize()]; - IndexDataType num_sorted_tiles = __builtin_amdgcn_readfirstlane( + IndexDataType num_sorted_tiles = amd_wave_read_first_lane( *reinterpret_cast(kargs.num_sorted_tiles_ptr)); num_sorted_tiles = num_sorted_tiles / BlockShape::Block_M0; @@ -261,7 +261,7 @@ struct FusedMoeGemmKernel { // allocate LDS // __shared__ char smem_ptr[GetSmemSize()]; - IndexDataType num_sorted_tiles = __builtin_amdgcn_readfirstlane( + IndexDataType num_sorted_tiles = amd_wave_read_first_lane( *reinterpret_cast(kargs.num_sorted_tiles_ptr)); constexpr index_t hidden_radio_0 = IsGateOnly ? 
1 : 2; @@ -283,14 +283,14 @@ struct FusedMoeGemmKernel return; const IndexDataType expert_id = - __builtin_amdgcn_readfirstlane(reinterpret_cast( + amd_wave_read_first_lane(reinterpret_cast( kargs.sorted_expert_ids_ptr)[sorted_tile_id]); // index along intermediate_size // index_t hidden_idx = __builtin_amdgcn_readfirstlane(intermediate_tile_id * // BlockShape::Block_N0); index_t interm_idx_nr = - __builtin_amdgcn_readfirstlane(intermediate_tile_id * BlockShape::Block_Nr0); + amd_wave_read_first_lane(intermediate_tile_id * BlockShape::Block_Nr0); const auto a_coord = Pipeline::GetACoord(); // 2d thread offset, [i_row, i_col] const auto sorted_token_id = diff --git a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp index faeb5cf6b3..28416ec538 100644 --- a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp +++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp @@ -756,7 +756,7 @@ struct MoeSortingKernel void* smem) const { const index_t tid = static_cast(threadIdx.x); - const index_t wid = __builtin_amdgcn_readfirstlane(tid / get_warp_size()); + const index_t wid = amd_wave_read_first_lane(tid / get_warp_size()); const index_t lid = __lane_id(); constexpr index_t block_size = 256; // blockDim.x; const index_t sub_tokens = smem_rows - 2; // sub_tokens_mdiv.divisor; diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp index 38410721ae..d19f0894b9 100644 --- a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp +++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp @@ -184,17 +184,17 @@ struct FusedMoeGemmPipeline_FlatmmUk index_t nr_1 = kargs.hidden_size / BlockShape::Warp_N1; index_t kr_1 = shared_intermediate_size_1 / BlockShape::Warp_K1; - const IndexDataType expert_id = __builtin_amdgcn_readfirstlane( + const IndexDataType expert_id = amd_wave_read_first_lane( reinterpret_cast(kargs.sorted_expert_ids_ptr)[sorted_tile_id]); index_t expert_stride_0 = shared_intermediate_size_0 * kargs.hidden_size; index_t expert_stride_1 = shared_intermediate_size_1 * kargs.hidden_size; // nr*kr*w - index_t interm_idx_nr0 = __builtin_amdgcn_readfirstlane( + index_t interm_idx_nr0 = amd_wave_read_first_lane( intermediate_tile_id * BlockShape::Block_Nr0); // intermediate_tile_id * Block_N / (N in W) - index_t interm_idx_kr1 = __builtin_amdgcn_readfirstlane( + index_t interm_idx_kr1 = amd_wave_read_first_lane( intermediate_tile_id * BlockShape::Block_Kr1); // intermediate_tile_id * Block_N / (N in W) diff --git a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp index 588d903b25..6f9d53467f 100644 --- a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp @@ -169,27 +169,27 @@ struct BatchedGemmKernel CK_TILE_DEVICE void operator()(BatchedGemmKernelArgs kargs) const { const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockIdx.x); - const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); - const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); + const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock); + const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock); - const auto i_batch = 
__builtin_amdgcn_readfirstlane(blockIdx.y); - const auto i_splitk = __builtin_amdgcn_readfirstlane(blockIdx.z); + const auto i_batch = amd_wave_read_first_lane(blockIdx.y); + const auto i_splitk = amd_wave_read_first_lane(blockIdx.z); const typename UniversalGemmKernel::SplitKBatchOffset splitk_batch_offset(kargs, i_splitk); // options - const auto batch_stride_A = __builtin_amdgcn_readfirstlane(kargs.batch_stride_A); - const auto batch_offset_A = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_A); + const auto batch_stride_A = amd_wave_read_first_lane(kargs.batch_stride_A); + const auto batch_offset_A = amd_wave_read_first_lane(i_batch * batch_stride_A); const ADataType* a_ptr = static_cast(kargs.as_ptr[0]) + batch_offset_A + splitk_batch_offset.as_k_split_offset[0]; - const auto batch_stride_B = __builtin_amdgcn_readfirstlane(kargs.batch_stride_B); - const auto batch_offset_B = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_B); + const auto batch_stride_B = amd_wave_read_first_lane(kargs.batch_stride_B); + const auto batch_offset_B = amd_wave_read_first_lane(i_batch * batch_stride_B); const BDataType* b_ptr = static_cast(kargs.bs_ptr[0]) + batch_offset_B + splitk_batch_offset.bs_k_split_offset[0]; - const auto batch_stride_E = __builtin_amdgcn_readfirstlane(kargs.batch_stride_E); - const auto batch_offset_C = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_E); + const auto batch_stride_E = amd_wave_read_first_lane(kargs.batch_stride_E); + const auto batch_offset_C = amd_wave_read_first_lane(i_batch * batch_stride_E); CDataType* c_ptr = static_cast(kargs.e_ptr) + batch_offset_C; // allocate LDS diff --git a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp index a891d4df55..673f5abc34 100644 --- a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp +++ b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp @@ -73,8 +73,8 @@ struct GemmTile2DPartitioner CK_TILE_DEVICE static auto GetOutputTileIndex(index_t blockIdx, index_t blockIdy) noexcept -> const tuple { - const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx); - const index_t iN = __builtin_amdgcn_readfirstlane(blockIdy); + const index_t iM = amd_wave_read_first_lane(blockIdx); + const index_t iN = amd_wave_read_first_lane(blockIdy); return make_tuple(iM, iN); } }; @@ -143,8 +143,8 @@ struct GemmTile1DPartitioner { const index_t NBlocks = integer_divide_ceil(N_, NPerBlock); - const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx / NBlocks); - const index_t iN = __builtin_amdgcn_readfirstlane(blockIdx - iM * NBlocks); + const index_t iM = amd_wave_read_first_lane(blockIdx / NBlocks); + const index_t iN = amd_wave_read_first_lane(blockIdx - iM * NBlocks); return make_tuple(iM, iN); } diff --git a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp index df1d6c9e4f..cf9ba31943 100644 --- a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp @@ -272,8 +272,8 @@ struct GroupedGemmKernel const auto [iM, iN] = block_idx_2d; - const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); - const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); + const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock); + const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock); const typename Base::SplitKBatchOffset 
splitk_batch_offset(kargs, block_idx_z); @@ -358,8 +358,8 @@ struct GroupedGemmKernel const auto& d_block_window = gemm_tile_windows.at(Base::I2); // Get hot-loop and tail configuration - const index_t num_loop = __builtin_amdgcn_readfirstlane( - TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); + const index_t num_loop = + amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); const bool has_hot_loop = GemmPipeline::BlockHasHotloop(num_loop); const TailNumber tail_num = GemmPipeline::GetBlockLoopTailNum(num_loop); @@ -416,8 +416,8 @@ struct GroupedGemmKernel const auto& d_block_window = gemm_tile_windows.at(Base::I2); // Get hot-loop and tail configuration - const index_t num_loop = __builtin_amdgcn_readfirstlane( - TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); + const index_t num_loop = + amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); const TailNumber tail_num = GemmPipeline::GetBlockLoopTailNum(num_loop); // Run GEMM pipeline with compile-time branching diff --git a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp index 5df1f092d7..ad85b5392d 100644 --- a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp @@ -271,8 +271,8 @@ struct StreamKKernel uint32_t block_idx = ck_tile::get_block_1d_id(); bool is_padding_block = - __builtin_amdgcn_readfirstlane(block_idx >= kargs.tile_partitioner.sk_num_blocks && - block_idx < kargs.tile_partitioner.dp_start_block_idx); + amd_wave_read_first_lane(block_idx >= kargs.tile_partitioner.sk_num_blocks && + block_idx < kargs.tile_partitioner.dp_start_block_idx); // Padding blocks make it such that the DP blocks are aligned with the number of CUs; they // should not partake in the GEMM @@ -289,7 +289,7 @@ struct StreamKKernel { // Determine the number of macro tiles in A and B this WG is resposible for in the // current C macro tile. - uint32_t current_iter_length = __builtin_amdgcn_readfirstlane( + uint32_t current_iter_length = amd_wave_read_first_lane( kargs.tile_partitioner.GetCurrentIterLength(iter_start, iter_end)); // Determine the 1D tile_idx and the iter_offset for this WG. 
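A minimal sketch of the pattern these hunks standardize; the kernel name, arguments, and buffers below are hypothetical and not part of the patch. Block-uniform values such as tile offsets are explicitly broadcast from lane 0 so the compiler keeps them in scalar registers (SGPRs) rather than replicating them per lane in vector registers; amd_wave_read_first_lane wraps this builtin and extends it to 8/16-bit integers and whole structs.

// Sketch: every lane of the wave computes the same tile offset, but the
// compiler cannot always prove that uniformity on its own.
__global__ void example_tile_copy(const float* in, float* out, int m_per_block)
{
    // Broadcasting lane 0's copy makes the uniformity explicit and
    // frees vector registers for genuinely per-lane data.
    const int i_m  = __builtin_amdgcn_readfirstlane(static_cast<int>(blockIdx.x) * m_per_block);
    const int lane = threadIdx.x; // per-lane value, stays in VGPRs
    out[i_m + lane] = in[i_m + lane];
}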
diff --git a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp index 8f44108cc4..51ad4e3dd1 100644 --- a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp @@ -326,19 +326,19 @@ struct UniversalGemmKernel __device__ SplitKBatchOffset(const KernelArgs& kargs, const std::size_t k_id = blockIdx.z) { constexpr auto K1 = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{}); - const index_t K_t = __builtin_amdgcn_readfirstlane(kargs.k_batch * K1); - const index_t KRead = __builtin_amdgcn_readfirstlane((kargs.K + K_t - 1) / K_t * K1); + const index_t K_t = amd_wave_read_first_lane(kargs.k_batch * K1); + const index_t KRead = amd_wave_read_first_lane((kargs.K + K_t - 1) / K_t * K1); static_for<0, NumATensor, 1>{}([&](auto index) { using AiLayout = remove_cvref_t>; if constexpr(std::is_same_v) { - as_k_split_offset[index] = __builtin_amdgcn_readfirstlane(k_id * KRead); + as_k_split_offset[index] = amd_wave_read_first_lane(k_id * KRead); } else if constexpr(std::is_same_v) { as_k_split_offset[index] = - __builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_As[index]); + amd_wave_read_first_lane(k_id * KRead * kargs.stride_As[index]); } }); @@ -347,21 +347,21 @@ struct UniversalGemmKernel if constexpr(std::is_same_v) { bs_k_split_offset[index] = - __builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_Bs[index]); + amd_wave_read_first_lane(k_id * KRead * kargs.stride_Bs[index]); } else if constexpr(std::is_same_v) { - bs_k_split_offset[index] = __builtin_amdgcn_readfirstlane(k_id * KRead); + bs_k_split_offset[index] = amd_wave_read_first_lane(k_id * KRead); } }); if(k_id < static_cast(kargs.k_batch - 1)) { - splitted_k = __builtin_amdgcn_readfirstlane(KRead); + splitted_k = amd_wave_read_first_lane(KRead); } else { - splitted_k = __builtin_amdgcn_readfirstlane(kargs.K - KRead * (kargs.k_batch - 1)); + splitted_k = amd_wave_read_first_lane(kargs.K - KRead * (kargs.k_batch - 1)); } } @@ -970,8 +970,8 @@ struct UniversalGemmKernel const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple); auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n); - const index_t num_loop = __builtin_amdgcn_readfirstlane( - TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); + const index_t num_loop = + amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); // Run GEMM cooperatively by whole workgroup. const auto& as_block_window = gemm_tile_windows.at(I0); @@ -1026,8 +1026,8 @@ struct UniversalGemmKernel const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple); auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n); - const index_t num_loop = __builtin_amdgcn_readfirstlane( - TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); + const index_t num_loop = + amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); // Run GEMM cooperatively by whole workgroup. 
const auto& as_block_window = gemm_tile_windows.at(I0); @@ -1052,10 +1052,10 @@ struct UniversalGemmKernel template > CK_TILE_DEVICE void operator()(KernelArgs kargs) const { - const auto blockId = __builtin_amdgcn_readfirstlane(blockIdx.x); + const auto blockId = amd_wave_read_first_lane(blockIdx.x); const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockId); - const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); - const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); + const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock); + const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock); const SplitKBatchOffset splitk_batch_offset(kargs); @@ -1126,22 +1126,22 @@ struct UniversalGemmKernel template , typename = void> CK_TILE_DEVICE void operator()(KernelArgs kargs) const { - const auto grid_size = __builtin_amdgcn_readfirstlane(get_grid_size()); + const auto grid_size = amd_wave_read_first_lane(get_grid_size()); const auto num_tiles = - __builtin_amdgcn_readfirstlane(TilePartitioner::GridSize(kargs.M, kargs.N)); - const auto num_work = __builtin_amdgcn_readfirstlane(num_tiles * kargs.k_batch); - auto block_id = __builtin_amdgcn_readfirstlane(get_block_id()); + amd_wave_read_first_lane(TilePartitioner::GridSize(kargs.M, kargs.N)); + const auto num_work = amd_wave_read_first_lane(num_tiles * kargs.k_batch); + auto block_id = amd_wave_read_first_lane(get_block_id()); while(block_id < num_work) { // Get the tile index for this block - const auto tile_idx = __builtin_amdgcn_readfirstlane(block_id % num_tiles); + const auto tile_idx = amd_wave_read_first_lane(block_id % num_tiles); const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(tile_idx); - const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); - const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); + const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock); + const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock); // Get the SplitK offset for this block - const auto k_batch = __builtin_amdgcn_readfirstlane(block_id / num_tiles); + const auto k_batch = amd_wave_read_first_lane(block_id / num_tiles); const SplitKBatchOffset splitk_batch_offset(kargs, k_batch); std::array as_ptr; diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp index b362f751c6..d0466bc8b1 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp @@ -487,7 +487,7 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4 if(HasHotLoop) { // minus 2 because we have ping-pong double buffer. 
- index_t iCounter = __builtin_amdgcn_readfirstlane(num_loop - 2); + index_t iCounter = amd_wave_read_first_lane(num_loop - 2); do { // ping diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp index 474d1a5a21..7263ddd5a1 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp @@ -178,7 +178,7 @@ struct GemmPipelineAgBgCrCompV5 : public BaseGemmPipelineAgBgCrCompV5 index_t warp_id = get_warp_id(); index_t operation_id = - __builtin_amdgcn_readfirstlane(get_warp_id()); // 0 - Memory read, 1 - block-gemm + amd_wave_read_first_lane(get_warp_id()); // 0 - Memory read, 1 - block-gemm auto a_offset = (warp_id == 0) ? make_array(0, 0) : make_array(0, KPerBlock); auto b_offset = (warp_id == 0) ? make_array(0, 0) : make_array(0, KPerBlock); @@ -336,7 +336,7 @@ struct GemmPipelineAgBgCrCompV5 : public BaseGemmPipelineAgBgCrCompV5 MemoryOpsStep(warp_id); } - index_t num_compute_steps = __builtin_amdgcn_readfirstlane(num_loop); + index_t num_compute_steps = amd_wave_read_first_lane(num_loop); while(num_compute_steps > 1) { block_sync_lds(); diff --git a/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp b/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp index 82bf75a9e3..0c9c816672 100644 --- a/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp +++ b/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp @@ -18,73 +18,64 @@ namespace ck_tile { namespace detail { // Helper templates for safe type extraction -template <typename T, typename Default> +template <typename T, typename Default, typename = void> struct get_aq_layout_or { using type = Default; }; template <typename T, typename Default> - requires requires { typename T::AQLayout; } -struct get_aq_layout_or<T, Default> +struct get_aq_layout_or<T, Default, std::void_t<typename T::AQLayout>> { using type = typename T::AQLayout; }; -template <typename T, typename Default> +template <typename T, typename Default, typename = void> struct get_bq_layout_or { using type = Default; }; template <typename T, typename Default> - requires requires { typename T::BQLayout; } -struct get_bq_layout_or<T, Default> +struct get_bq_layout_or<T, Default, std::void_t<typename T::BQLayout>> { using type = typename T::BQLayout; }; -template <typename T, typename Default> +template <typename T, typename Default, typename = void> struct get_aq_data_type_or { using type = Default; }; template <typename T, typename Default> - requires requires { typename T::AQDataType; } -struct get_aq_data_type_or<T, Default> +struct get_aq_data_type_or<T, Default, std::void_t<typename T::AQDataType>> { using type = typename T::AQDataType; }; -template <typename T, typename Default> +template <typename T, typename Default, typename = void> struct get_bq_data_type_or { using type = Default; }; template <typename T, typename Default> - requires requires { typename T::BQDataType; } -struct get_bq_data_type_or<T, Default> +struct get_bq_data_type_or<T, Default, std::void_t<typename T::BQDataType>> { using type = typename T::BQDataType; }; -template <typename T> -concept HasStaticPreshuffleQuant = requires { - { T::PreshuffleQuant } -> std::convertible_to<bool>; -}; - -template <typename T> +template <typename T, typename = void> struct is_quantpreshuffle_enabled { static constexpr bool value = false; }; -template <HasStaticPreshuffleQuant T> -struct is_quantpreshuffle_enabled<T> +template <typename T> +struct is_quantpreshuffle_enabled<T, std::void_t<decltype(T::PreshuffleQuant)>> { - static constexpr auto value = T::PreshuffleQuant; + static constexpr bool value = T::PreshuffleQuant; }; } // namespace detail @@ -270,34 +261,34 @@ struct QuantGemmKernel const std::size_t k_id = blockIdx.z) { constexpr auto K1 = TilePartitioner::BlockGemmShape::WarpTile::at(I2); - const index_t K_t = __builtin_amdgcn_readfirstlane(kargs.k_batch * K1); - const index_t KRead = __builtin_amdgcn_readfirstlane((kargs.K + K_t - 1) / K_t * K1); + const index_t K_t = amd_wave_read_first_lane(kargs.k_batch * K1); + const index_t KRead = amd_wave_read_first_lane((kargs.K + K_t - 1) / K_t * K1); if constexpr(std::is_same_v) { - a_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead); +
a_k_split_offset = amd_wave_read_first_lane(k_id * KRead); } else if constexpr(std::is_same_v) { - a_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_A); + a_k_split_offset = amd_wave_read_first_lane(k_id * KRead * kargs.stride_A); } if constexpr(std::is_same_v) { - b_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_B); + b_k_split_offset = amd_wave_read_first_lane(k_id * KRead * kargs.stride_B); } else if constexpr(std::is_same_v) { - b_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead); + b_k_split_offset = amd_wave_read_first_lane(k_id * KRead); } if(k_id < static_cast(kargs.k_batch - 1)) { - splitted_k = __builtin_amdgcn_readfirstlane(KRead); + splitted_k = amd_wave_read_first_lane(KRead); } else { - splitted_k = __builtin_amdgcn_readfirstlane(kargs.K - KRead * (kargs.k_batch - 1)); + splitted_k = amd_wave_read_first_lane(kargs.K - KRead * (kargs.k_batch - 1)); } } @@ -918,8 +909,8 @@ struct QuantGemmKernel const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple); auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n); - const index_t num_loop = __builtin_amdgcn_readfirstlane( - TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); + const index_t num_loop = + amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); // Run GEMM cooperatively by whole workgroup. const auto& a_block_window = gemm_tile_windows.at(I0); @@ -981,10 +972,10 @@ struct QuantGemmKernel CK_TILE_DEVICE void operator()(QuantGemmKernelArgs kargs) const { - const auto blockId = __builtin_amdgcn_readfirstlane(blockIdx.x); + const auto blockId = amd_wave_read_first_lane(blockIdx.x); const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockId); - const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); - const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); + const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock); + const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock); const SplitKBatchOffset splitk_batch_offset(kargs); // options diff --git a/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp b/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp index 07c45117e2..39c8e406b7 100644 --- a/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp +++ b/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp @@ -305,8 +305,8 @@ struct QuantGroupedGemmKernel { const auto [iM, iN] = block_idx_2d; - const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); - const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); + const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock); + const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock); const typename Base::SplitKBatchOffset splitk_batch_offset(kargs, block_idx_z); diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp index 15e697afdf..e68a510a0c 100644 --- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp +++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp @@ -840,7 +840,7 @@ struct GroupedConvolutionBackwardDataKernel const 
auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple); auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n); - const index_t num_loop = __builtin_amdgcn_readfirstlane(TilePartitioner::GetLoopNum( + const index_t num_loop = amd_wave_read_first_lane(TilePartitioner::GetLoopNum( gemm_pad_views.at(I0).get_tensor_descriptor().get_length(I1))); // Run GEMM cooperatively by whole workgroup. @@ -891,7 +891,7 @@ struct GroupedConvolutionBackwardDataKernel const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple); auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n); - const index_t num_loop = __builtin_amdgcn_readfirstlane( + const index_t num_loop = amd_wave_read_first_lane( TilePartitioner::GetLoopNum(gemm_tile_windows.at(I0).get_length(I1))); // Run GEMM cooperatively by whole workgroup. @@ -936,7 +936,7 @@ struct GroupedConvolutionBackwardDataKernel CK_TILE_DEVICE void operator()(GroupedConvBwdDataKernelArgsSpecialized kargs) const { - const auto blockIdX = __builtin_amdgcn_readfirstlane(blockIdx.x); + const auto blockIdX = amd_wave_read_first_lane(blockIdx.x); const index_t group_id = FindGroupId(kargs, blockIdX); const auto [iM, iN] = OffsettedTile1DPartitioner::GetOffsetedTileIndex( @@ -944,13 +944,13 @@ struct GroupedConvolutionBackwardDataKernel kargs.c_grid_descs_m_n[group_id].get_length(I0), kargs.c_grid_descs_m_n[group_id].get_length(I1)); - const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); - const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); + const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock); + const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock); - const auto blockIdY = __builtin_amdgcn_readfirstlane(blockIdx.y); - const auto group_offset_a = __builtin_amdgcn_readfirstlane(kargs.group_stride_a * blockIdY); - const auto group_offset_b = __builtin_amdgcn_readfirstlane(kargs.group_stride_b * blockIdY); - const auto group_offset_c = __builtin_amdgcn_readfirstlane(kargs.group_stride_c * blockIdY); + const auto blockIdY = amd_wave_read_first_lane(blockIdx.y); + const auto group_offset_a = amd_wave_read_first_lane(kargs.group_stride_a * blockIdY); + const auto group_offset_b = amd_wave_read_first_lane(kargs.group_stride_b * blockIdY); + const auto group_offset_c = amd_wave_read_first_lane(kargs.group_stride_c * blockIdY); // options // conv_bwd_data = Out * Weight = In diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp index 7bb3fedaf6..b85660aea3 100644 --- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp +++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp @@ -423,22 +423,20 @@ struct GroupedConvolutionBackwardWeightKernel __device__ SplitKBatchOffset(const GroupedConvBwdWeightKernelArgsSpecialized& kargs, const std::size_t k_id = blockIdx.z) { - constexpr auto K1 = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{}); - const index_t K_t = __builtin_amdgcn_readfirstlane(kargs.k_batch * K1); - const index_t KRead = - __builtin_amdgcn_readfirstlane((kargs.GemmK + K_t - 1) / K_t * K1); + constexpr auto K1 = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{}); + const index_t K_t = amd_wave_read_first_lane(kargs.k_batch * K1); 
diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
index 7bb3fedaf6..b85660aea3 100644
--- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
+++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
@@ -423,22 +423,20 @@ struct GroupedConvolutionBackwardWeightKernel
         __device__ SplitKBatchOffset(const GroupedConvBwdWeightKernelArgsSpecialized& kargs,
                                      const std::size_t k_id = blockIdx.z)
         {
-            constexpr auto K1  = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});
-            const index_t K_t  = __builtin_amdgcn_readfirstlane(kargs.k_batch * K1);
-            const index_t KRead =
-                __builtin_amdgcn_readfirstlane((kargs.GemmK + K_t - 1) / K_t * K1);
+            constexpr auto K1   = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});
+            const index_t K_t   = amd_wave_read_first_lane(kargs.k_batch * K1);
+            const index_t KRead = amd_wave_read_first_lane((kargs.GemmK + K_t - 1) / K_t * K1);
 
-            a_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead);
-            b_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead);
+            a_k_split_offset = amd_wave_read_first_lane(k_id * KRead);
+            b_k_split_offset = amd_wave_read_first_lane(k_id * KRead);
 
             if(k_id < static_cast<uint32_t>(kargs.k_batch - 1))
             {
-                splitted_k = __builtin_amdgcn_readfirstlane(KRead);
+                splitted_k = amd_wave_read_first_lane(KRead);
             }
             else
             {
-                splitted_k =
-                    __builtin_amdgcn_readfirstlane(kargs.GemmK - KRead * (kargs.k_batch - 1));
+                splitted_k = amd_wave_read_first_lane(kargs.GemmK - KRead * (kargs.k_batch - 1));
             }
         }
 
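The split-K bookkeeping above is easy to sanity-check on the host: KRead rounds the K extent up to a multiple of K_t and converts it back to elements per batch, and the last k-batch takes whatever remains. A minimal sketch with illustrative values (K1, GemmK, and k_batch here are assumptions, not numbers from the patch):

    // Host-side replay of the SplitKBatchOffset arithmetic shown above.
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const int32_t K1 = 16, GemmK = 1000, k_batch = 4;
        const int32_t K_t   = k_batch * K1;                 // 64
        const int32_t KRead = (GemmK + K_t - 1) / K_t * K1; // ceil(1000/64)*16 = 256
        for(int32_t k_id = 0; k_id < k_batch; ++k_id)
        {
            const int32_t k_split_offset = k_id * KRead;
            const int32_t splitted_k =
                (k_id < k_batch - 1) ? KRead : GemmK - KRead * (k_batch - 1);
            std::printf("k_id=%d offset=%d len=%d\n", k_id, k_split_offset, splitted_k);
        }
        return 0;
    }

With these values, batches 0..2 each own 256 elements of K at offsets 0, 256, and 512, and the last batch gets the remainder 1000 - 3*256 = 232, so the splits exactly cover GemmK.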
@@ -805,22 +803,22 @@ struct GroupedConvolutionBackwardWeightKernel
     CK_TILE_DEVICE void operator()(GroupedConvBwdWeightKernelArgsSpecialized kargs) const
     {
-        const auto blockIdX = __builtin_amdgcn_readfirstlane(blockIdx.x);
+        const auto blockIdX = amd_wave_read_first_lane(blockIdx.x);
 
         const auto [iM, iN] =
             TilePartitioner{kargs.GemmM, kargs.GemmN}.GetOutputTileIndex(blockIdX);
-        const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
-        const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
+        const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
+        const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
 
-        const auto blockIdZ    = __builtin_amdgcn_readfirstlane(blockIdx.z);
-        const index_t num_loop = __builtin_amdgcn_readfirstlane(
+        const auto blockIdZ    = amd_wave_read_first_lane(blockIdx.z);
+        const index_t num_loop = amd_wave_read_first_lane(
             ck_tile::integer_divide_ceil(kargs.GemmK, kargs.k_batch * TilePartitioner::KPerBlock));
         const index_t i_k =
-            __builtin_amdgcn_readfirstlane(blockIdZ * num_loop * TilePartitioner::KPerBlock);
+            amd_wave_read_first_lane(blockIdZ * num_loop * TilePartitioner::KPerBlock);
 
-        const auto blockIdY       = __builtin_amdgcn_readfirstlane(blockIdx.y);
-        const auto group_offset_a = __builtin_amdgcn_readfirstlane(kargs.group_stride_a * blockIdY);
-        const auto group_offset_b = __builtin_amdgcn_readfirstlane(kargs.group_stride_b * blockIdY);
-        const auto group_offset_c = __builtin_amdgcn_readfirstlane(kargs.group_stride_c * blockIdY);
+        const auto blockIdY       = amd_wave_read_first_lane(blockIdx.y);
+        const auto group_offset_a = amd_wave_read_first_lane(kargs.group_stride_a * blockIdY);
+        const auto group_offset_b = amd_wave_read_first_lane(kargs.group_stride_b * blockIdY);
+        const auto group_offset_c = amd_wave_read_first_lane(kargs.group_stride_c * blockIdY);
 
         // options
         // conv_bwd_weight = Out * In = Weight
diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
index d1eacd60cd..0363782d33 100644
--- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
+++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
@@ -752,8 +752,7 @@ struct GroupedConvolutionForwardKernel
         const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
         auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
 
-        const index_t num_loop =
-            __builtin_amdgcn_readfirstlane(TilePartitioner::GetLoopNum(kargs.GemmK));
+        const index_t num_loop = amd_wave_read_first_lane(TilePartitioner::GetLoopNum(kargs.GemmK));
 
         // Run GEMM cooperatively by whole workgroup.
         const auto& a_block_window = gemm_tile_windows.at(I0);
@@ -802,8 +801,7 @@ struct GroupedConvolutionForwardKernel
         const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
         auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
 
-        const index_t num_loop =
-            __builtin_amdgcn_readfirstlane(TilePartitioner::GetLoopNum(kargs.GemmK));
+        const index_t num_loop = amd_wave_read_first_lane(TilePartitioner::GetLoopNum(kargs.GemmK));
 
         // Run GEMM cooperatively by whole workgroup.
         const auto& a_block_window = gemm_tile_windows.at(I0);
@@ -822,22 +820,22 @@ struct GroupedConvolutionForwardKernel
     CK_TILE_DEVICE void operator()(GroupedConvFwdKernelArgsSpecialized kargs) const
     {
-        const auto blockIdX = __builtin_amdgcn_readfirstlane(blockIdx.x);
+        const auto blockIdX = amd_wave_read_first_lane(blockIdx.x);
 
         const auto [iM, iN] =
             TilePartitioner{kargs.GemmM, kargs.GemmN}.GetOutputTileIndex(blockIdX);
-        const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
-        const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
+        const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
+        const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
 
-        const auto blockIdY       = __builtin_amdgcn_readfirstlane(blockIdx.y);
-        const auto group_offset_a = __builtin_amdgcn_readfirstlane(kargs.group_stride_a * blockIdY);
-        const auto group_offset_b = __builtin_amdgcn_readfirstlane(kargs.group_stride_b * blockIdY);
-        const auto group_offset_c = __builtin_amdgcn_readfirstlane(kargs.group_stride_c * blockIdY);
+        const auto blockIdY       = amd_wave_read_first_lane(blockIdx.y);
+        const auto group_offset_a = amd_wave_read_first_lane(kargs.group_stride_a * blockIdY);
+        const auto group_offset_b = amd_wave_read_first_lane(kargs.group_stride_b * blockIdY);
+        const auto group_offset_c = amd_wave_read_first_lane(kargs.group_stride_c * blockIdY);
 
         // Split-N handling: Get which split this workgroup handles
-        const auto blockIdZ = __builtin_amdgcn_readfirstlane(blockIdx.z);
+        const auto blockIdZ = amd_wave_read_first_lane(blockIdx.z);
 
         // Calculate batch offset for this split
-        const index_t batch_offset = __builtin_amdgcn_readfirstlane(blockIdZ * kargs.n_per_split);
+        const index_t batch_offset = amd_wave_read_first_lane(blockIdZ * kargs.n_per_split);
 
         // Calculate memory offsets for this split
         const long_index_t input_batch_offset = static_cast<long_index_t>(batch_offset) *
diff --git a/include/ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp b/include/ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp
index eb54807d88..bc20057e7a 100644
--- a/include/ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp
+++ b/include/ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp
@@ -175,9 +175,9 @@ struct ImageToColumn
     {
         const auto [M, K] = CalculateMKDims(kargs);
 
-        const index_t iM     = __builtin_amdgcn_readfirstlane(blockIdx.x * kMPerBlock);
-        const index_t iK     = __builtin_amdgcn_readfirstlane(blockIdx.y * kKPerBlock);
-        const index_t iBatch = __builtin_amdgcn_readfirstlane(blockIdx.z);
+        const index_t iM     = amd_wave_read_first_lane(blockIdx.x * kMPerBlock);
+        const index_t iK     = amd_wave_read_first_lane(blockIdx.y * kKPerBlock);
+        const index_t iBatch = amd_wave_read_first_lane(blockIdx.z);
 
         const auto in_offset  = iBatch * kargs.image_g_n_c_wis_strides[I0];
         const auto out_offset = iBatch * kargs.gemm_g_m_k_strides[I0];
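One detail in the forward-convolution hunk is worth spelling out: the per-split element offset is widened to long_index_t before the multiply, so a large batch offset times a large stride cannot wrap in 32-bit arithmetic. A host-side sketch of why the order of the cast matters (all numbers here are illustrative, not values from the patch):

    // Widen first, then multiply: the 32-bit product wraps mod 2^32,
    // while the widened product preserves the full value.
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const int32_t batch_offset = 1 << 16; // e.g. blockIdZ * n_per_split
        const int32_t stride       = 1 << 17; // elements between batches
        const int64_t widened = static_cast<int64_t>(batch_offset) * stride; // 2^33
        const uint32_t wrapped =
            static_cast<uint32_t>(batch_offset) * static_cast<uint32_t>(stride); // 0
        std::printf("widened=%lld wrapped=%u\n",
                    static_cast<long long>(widened), wrapped);
        return 0;
    }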
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
index 0de1ada87c..422950b143 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
@@ -99,7 +99,7 @@ struct Layernorm2dFwdPipelineTwoPass
         // Problem::BlockShape
         static constexpr index_t Block_N = Problem::BlockShape::Block_N;
         index_t num_n_tile_iteration =
-            __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));
+            amd_wave_read_first_lane(integer_divide_ceil(row_size, Block_N));
 
         // total number of count assume current iter have no pad(only last iter has pad)
         constexpr index_t count_per_iter =
@@ -119,7 +119,7 @@ struct Layernorm2dFwdPipelineTwoPass
         auto mean = block_norm_reduce.template MakeMeanVarBlockTile();
         auto var  = block_norm_reduce.template MakeMeanVarBlockTile();
 
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
         {
             auto x      = load_tile(x_window);
             auto x_resi = load_tile(x_residual_window);
@@ -197,7 +197,7 @@ struct Layernorm2dFwdPipelineTwoPass
         move_tile_window(y_window, {0, stride_to_right_most_window});
 
         // layernorm computation
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
         {
             auto acc = make_static_distributed_tensor(
                 decltype(load_tile(x_window))::get_tile_distribution());
diff --git a/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp b/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp
index 92a71a42c8..83a22aaded 100644
--- a/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp
+++ b/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp
@@ -156,7 +156,7 @@ struct Reduce
         const auto merged_reduce_len =
             transformed_x_tensor.get_tensor_descriptor().get_lengths().at(number<1>{});
         index_t num_n_tile_iteration =
-            __builtin_amdgcn_readfirstlane(integer_divide_ceil(merged_reduce_len, S::Block_N));
+            amd_wave_read_first_lane(integer_divide_ceil(merged_reduce_len, S::Block_N));
 
         auto block_reduce2d      = Policy::template GetBlockReduce2d();
         auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync();
@@ -167,7 +167,7 @@ struct Reduce
         auto y_compute = block_reduce2d.template MakeYBlockTile();
         set_tile(y_compute, reduce_func.template GetIdentityValue());
 
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
         {
             const auto x = load_tile(x_window);
             block_reduce2d(x, y_compute, reduce_func);
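A recurring idiom in these pipelines is starting the tile loop at amd_wave_read_first_lane(0): seeding the induction variable from the first-lane helper plausibly pins the counter to a scalar register, so the loop control is provably uniform, and the trip count is a ceil-division of the row length by the tile width. A plain C++ sketch of the two-pass structure these hunks touch (all names and sizes are illustrative, not ck_tile APIs):

    // Pass 1 accumulates statistics tile by tile; pass 2 revisits the
    // same tiles to produce normalized output.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main()
    {
        const int row_size = 1000, Block_N = 256;
        const int num_n_tile_iteration = (row_size + Block_N - 1) / Block_N; // 4
        std::vector<float> x(row_size, 2.0f), y(row_size);

        // Pass 1: mean/variance across all tiles of the row.
        double sum = 0.0, sq_sum = 0.0;
        for(int iN = 0; iN < num_n_tile_iteration; ++iN)
            for(int j = iN * Block_N; j < std::min((iN + 1) * Block_N, row_size); ++j)
            {
                sum += x[j];
                sq_sum += x[j] * x[j];
            }
        const double mean = sum / row_size;
        const double var  = sq_sum / row_size - mean * mean;

        // Pass 2: walk the tiles again and normalize.
        const double inv_std = 1.0 / std::sqrt(var + 1e-6);
        for(int iN = 0; iN < num_n_tile_iteration; ++iN)
            for(int j = iN * Block_N; j < std::min((iN + 1) * Block_N, row_size); ++j)
                y[j] = static_cast<float>((x[j] - mean) * inv_std);

        std::printf("tiles=%d mean=%f var=%f\n", num_n_tile_iteration, mean, var);
        return 0;
    }

Only the last tile can be padded, which is why the kernels count a full tile per iteration and correct at the boundary.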
diff --git a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp
index d01f37879a..ca3cdc37c4 100644
--- a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp
@@ -82,7 +82,7 @@ struct Rmsnorm2dFwdPipelineTwoPass
         // Problem::BlockShape
         static constexpr index_t Block_N = Problem::BlockShape::Block_N;
         index_t num_n_tile_iteration =
-            __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));
+            amd_wave_read_first_lane(integer_divide_ceil(row_size, Block_N));
 
         auto reduce_square_sum_func = ReduceOp::SquareAdd{};
         auto reduce_sum_func        = ReduceOp::Add{};
@@ -95,7 +95,7 @@ struct Rmsnorm2dFwdPipelineTwoPass
         auto square_sum = block_reduce2d.template MakeYBlockTile();
         set_tile(square_sum, reduce_square_sum_func.GetIdentityValue());
 
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
         {
             auto x      = load_tile(x_window);
             auto x_resi = load_tile(x_residual_window);
@@ -151,7 +151,7 @@ struct Rmsnorm2dFwdPipelineTwoPass
         move_tile_window(y_window, {0, stride_to_right_most_window});
 
         // rmsnorm computation
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
         {
             auto acc = make_static_distributed_tensor(
                 decltype(load_tile(x_window))::get_tile_distribution());
diff --git a/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp b/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp
index 2553b19fd8..f6c7c0753a 100644
--- a/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp
+++ b/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp
@@ -138,7 +138,7 @@ struct MoeSmoothquant
         const index_t i_topk  = blockIdx.x;
         const index_t i_token = blockIdx.y * Block_M;
         const index_t i_token_in_thrd =
-            __builtin_amdgcn_readfirstlane(threadIdx.x / Problem::BlockShape::ThreadPerBlock_N);
+            amd_wave_read_first_lane(threadIdx.x / Problem::BlockShape::ThreadPerBlock_N);
 
         const index_t i_expert = reinterpret_cast<const index_t*>(
             kargs.p_topk_ids)[(i_token + i_token_in_thrd) * kargs.topk + i_topk];
diff --git a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
index ba9c6374f1..8b0a7274ed 100644
--- a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
@@ -57,7 +57,7 @@ struct SmoothquantPipelineTwoPass
         static constexpr index_t Block_N = Problem::BlockShape::Block_N;
 
         index_t num_n_tile_iteration =
-            __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));
+            amd_wave_read_first_lane(integer_divide_ceil(row_size, Block_N));
 
         auto reduce_absmax_func  = ReduceOp::AbsMax{};
         auto reduce_absmax3_func = [](auto acc_, auto v_0_, auto v_1_) {
@@ -77,7 +77,7 @@ struct SmoothquantPipelineTwoPass
         auto absmax = block_reduce2d.template MakeYBlockTile();
         set_tile(absmax, reduce_absmax_func.GetIdentityValue());
 
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
         {
             const auto x       = load_tile(x_window);
             const auto smscale = load_tile(smscale_window);
@@ -121,7 +121,7 @@ struct SmoothquantPipelineTwoPass
         move_tile_window(qy_window, {0, stride_to_right_most_window});
 
         // recompute y and quantize y to qy
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
         {
             const auto x       = load_tile(x_window);
             const auto smscale = load_tile(smscale_window);
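The smoothquant two-pass pipeline follows the same shape as the normalization pipelines: pass 1 reduces the per-row absmax of the smooth-scaled activations, pass 2 recomputes the scaled values and quantizes them. A host-side sketch of that flow; note the absmax/127 scale is the conventional int8 choice and an assumption here, not something stated in the patch:

    // Two passes over one row: find absmax of y = x * smscale, then
    // recompute y and quantize to int8.
    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main()
    {
        const int row_size = 8;
        std::vector<float> x(row_size, -3.5f), smscale(row_size, 0.5f);

        // Pass 1: per-row absmax of the scaled activations.
        float absmax = 0.0f;
        for(int j = 0; j < row_size; ++j)
            absmax = std::fmax(absmax, std::fabs(x[j] * smscale[j]));

        // Pass 2: recompute y and quantize with scale = absmax / 127.
        const float scale = absmax / 127.0f;
        std::vector<int8_t> qy(row_size);
        for(int j = 0; j < row_size; ++j)
            qy[j] = static_cast<int8_t>(std::lround(x[j] * smscale[j] / scale));

        std::printf("absmax=%f scale=%f qy[0]=%d\n", absmax, scale, qy[0]);
        return 0;
    }

Recomputing y in the second pass instead of caching it is what keeps the pipeline's footprint at one tile, at the cost of re-reading x and smscale.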
diff --git a/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp b/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp
index 277049f6b0..e8727ea065 100644
--- a/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp
+++ b/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp
@@ -96,9 +96,9 @@ struct TopkSoftmaxKernel
         if(block_row_id > kargs.num_rows)
             return;
 
-        index_t block_os_inp = __builtin_amdgcn_readfirstlane(block_row_id * kargs.stride_input);
-        index_t block_os_out = __builtin_amdgcn_readfirstlane(block_row_id * kargs.stride_output);
-        index_t num_rows_rem = __builtin_amdgcn_readfirstlane(kargs.num_rows - block_row_id);
+        index_t block_os_inp = amd_wave_read_first_lane(block_row_id * kargs.stride_input);
+        index_t block_os_out = amd_wave_read_first_lane(block_row_id * kargs.stride_output);
+        index_t num_rows_rem = amd_wave_read_first_lane(kargs.num_rows - block_row_id);
 
         const auto input_window = [&]() {
             const InputType* p_input =
diff --git a/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_ut_cases_cshuffle.inc b/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_ut_cases_cshuffle.inc
index e9a8ed74f2..33eb404fbe 100644
--- a/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_ut_cases_cshuffle.inc
+++ b/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_ut_cases_cshuffle.inc
@@ -1,6 +1,5 @@
 #pragma once
 
-
 TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch2CShuffle_512x512x512)
 {
     constexpr int M = 512;