From b8448ab68d00909f684afa7f0850d8f5c6987163 Mon Sep 17 00:00:00 2001 From: "assistant-librarian[bot]" Date: Thu, 25 Sep 2025 18:15:45 +0000 Subject: [PATCH] Merge commit '8c1a95991330118930f23e6a2ba8e76068d8ca22' into develop --- CHANGELOG.md | 1 + Jenkinsfile | 8 +-- .../quant_run_grouped_gemm_example.inc | 4 +- .../run_grouped_gemm_example.inc | 4 +- .../21_elementwise/elementwise_example.cpp | 4 +- .../elementwise_example_add_4d.cpp | 4 +- .../elementwise_example_transpose.cpp | 4 +- .../elementwise_example_unary.cpp | 4 +- include/ck/utility/amd_ck_fp8.hpp | 8 +-- .../core/arch/amd_buffer_addressing.hpp | 56 +++++++++++++++++- .../arch/amd_buffer_addressing_builtins.hpp | 7 +-- include/ck_tile/core/arch/arch.hpp | 4 +- include/ck_tile/core/tensor/buffer_view.hpp | 7 +-- include/ck_tile/core/tensor/tile_window.hpp | 2 +- ...norm2d_rdquant_fwd_pipeline_three_pass.hpp | 8 +-- .../kernel/batched_transpose_kernel.hpp | 6 +- .../ops/epilogue/cshuffle_epilogue.hpp | 46 +++++++-------- .../ops/flatmm/kernel/flatmm_kernel.hpp | 4 +- .../fmha/kernel/fmha_batch_prefill_kernel.hpp | 4 +- .../ops/fmha/kernel/fmha_bwd_kernel.hpp | 6 +- .../fmha/kernel/fmha_fwd_appendkv_kernel.hpp | 4 +- .../ops/fmha/kernel/fmha_fwd_kernel.hpp | 4 +- .../fmha/kernel/fmha_fwd_pagedkv_kernel.hpp | 4 +- .../fmha_fwd_splitkv_combine_kernel.hpp | 4 +- .../fmha/kernel/fmha_fwd_splitkv_kernel.hpp | 4 +- .../ops/fmha/kernel/fmha_fwd_v3_kernel.hpp | 4 +- ...ock_fmha_fwd_pagedkv_pipeline_qr_ks_vs.hpp | 4 +- ...ock_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp | 6 +- .../fused_moe/kernel/fused_moegemm_kernel.hpp | 8 +-- .../fused_moe/kernel/moe_sorting_kernel.hpp | 2 +- .../fused_moegemm_pipeline_flatmm_uk.hpp | 6 +- .../ops/gemm/kernel/batched_gemm_kernel.hpp | 20 +++---- .../ops/gemm/kernel/gemm_tile_partitioner.hpp | 8 +-- .../ops/gemm/kernel/grouped_gemm_kernel.hpp | 12 ++-- .../ops/gemm/kernel/streamk_gemm_kernel.hpp | 6 +- .../ops/gemm/kernel/universal_gemm_kernel.hpp | 46 +++++++-------- .../gemm_pipeline_ag_bg_cr_comp_v4.hpp | 2 +- .../gemm_pipeline_ag_bg_cr_comp_v5.hpp | 4 +- .../gemm_quant/kernel/gemm_quant_kernel.hpp | 59 ++++++++----------- .../kernel/grouped_gemm_quant_kernel.hpp | 4 +- ...ouped_convolution_backward_data_kernel.hpp | 18 +++--- ...ped_convolution_backward_weight_kernel.hpp | 36 ++++++----- .../grouped_convolution_forward_kernel.hpp | 24 ++++---- .../kernel/image_to_column_kernel.hpp | 6 +- .../layernorm2d_fwd_pipeline_two_pass.hpp | 6 +- .../ops/reduce/kernel/reduce2d_kernel.hpp | 4 +- .../rmsnorm2d_fwd_pipeline_two_pass.hpp | 6 +- .../kernel/moe_smoothquant_kernel.hpp | 2 +- .../smoothquant_pipeline_two_pass.hpp | 6 +- .../kernel/topk_softmax_kernel.hpp | 6 +- .../test_gemm_multi_abd_ut_cases_cshuffle.inc | 1 - 51 files changed, 281 insertions(+), 236 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f21795012d..fe1e7ef345 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj ### Added * Added support for B Tensor type pk_int4_t in the CK TILE weight preshuffle GEMM. +* Added a new API for loading values of different memory sizes into SGPRs. * Added support for B Tensor Preshuffle in CK TILE Grouped GEMM. * Added a basic copy kernel example and supporting documentation for new CK Tile developers.
* Added support for bf16, f32, and f16 for 2D and 3D NGCHW grouped convolution backward data diff --git a/Jenkinsfile b/Jenkinsfile index 2866b7d84e..b18c2939dc 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -859,6 +859,7 @@ def run_aiter_tests(Map conf=[:]){ sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8.py" sh "python3 /home/jenkins/workspace/aiter/op_tests/test_gemm_a8w8_blockscale.py" sh "python3 /home/jenkins/workspace/aiter/op_tests/test_mha.py" + sh "python3 /home/jenkins/workspace/aiter/op_tests/test_mha_varlen.py" sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe.py" sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_2stage.py" sh "python3 /home/jenkins/workspace/aiter/op_tests/test_moe_blockscale.py" @@ -930,7 +931,7 @@ def run_pytorch_tests(Map conf=[:]){ } //launch develop branch daily jobs -CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;RUN_CK_TILE_FMHA_TESTS=true +CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_PERFORMANCE_TESTS=true 0 22 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true 0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;BUILD_GFX908=true;BUILD_GFX942=true;BUILD_GFX950=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true 0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true @@ -1351,7 +1352,6 @@ pipeline { } agent{ label rocmnode("gfx950") } environment{ - def docker_name = "${env.CK_DOCKERHUB}:ck_ub24.04_rocm7.0.1" setup_args = "NO_CK_BUILD" execute_args = """ ../script/cmake-ck-dev.sh ../ gfx950 && \ make -j128 tile_example_fmha_fwd tile_example_fmha_bwd && \ @@ -1359,7 +1359,7 @@ pipeline { example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx950 """ } steps{ - buildHipClangJobAndReboot(setup_args:setup_args, docker_name: docker_name, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) cleanWs() } } @@ -1567,7 +1567,7 @@ pipeline { -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ } steps{ - Build_CK_and_Reboot(setup_args: setup_args, docker_name: "${env.CK_DOCKERHUB}:ck_ub24.04_rocm7.0.1", config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') + Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') cleanWs() } } diff --git a/example/ck_tile/17_grouped_gemm/quant_run_grouped_gemm_example.inc b/example/ck_tile/17_grouped_gemm/quant_run_grouped_gemm_example.inc index 658a4dfa62..10d317a2c7 100644 --- a/example/ck_tile/17_grouped_gemm/quant_run_grouped_gemm_example.inc +++ b/example/ck_tile/17_grouped_gemm/quant_run_grouped_gemm_example.inc @@ -183,7 +183,7 @@ int run_grouped_gemm_example_with_layouts(int argc, if(!valid_input_data(group_count, Ms, Ns, Ks, stride_As, stride_Bs, stride_Cs)) { std::cout << "Please check the input data. Default values will be used." 
<< std::endl; - + // Clear existing (invalid) data before adding defaults Ms.clear(); Ns.clear(); @@ -193,7 +193,7 @@ int run_grouped_gemm_example_with_layouts(int argc, stride_Cs.clear(); stride_AQs.clear(); stride_BQs.clear(); - + for(int i = 0; i < group_count; i++) { Ms.push_back(256 + 256 * i); diff --git a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc index 026f2bd8f6..b1aa832e72 100644 --- a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc +++ b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc @@ -172,7 +172,7 @@ int run_grouped_gemm_example_with_layouts(int argc, std::cout << "Default values: Ms (256, 512, 768, 1024..), Ns (256, 768, 1280..), Ks (512, " "896, 1280..)" << std::endl; - + // Clear existing (invalid) data before adding defaults Ms.clear(); Ns.clear(); @@ -180,7 +180,7 @@ int run_grouped_gemm_example_with_layouts(int argc, stride_As.clear(); stride_Bs.clear(); stride_Cs.clear(); - + for(int i = 0; i < group_count; i++) { Ms.push_back(256 + 256 * i); diff --git a/example/ck_tile/21_elementwise/elementwise_example.cpp b/example/ck_tile/21_elementwise/elementwise_example.cpp index 94d3e70be1..e9fbeafde1 100644 --- a/example/ck_tile/21_elementwise/elementwise_example.cpp +++ b/example/ck_tile/21_elementwise/elementwise_example.cpp @@ -211,7 +211,9 @@ bool run(const ck_tile::ArgParser& arg_parser) int main(int argc, char* argv[]) { - auto [result, arg_parser] = create_args(argc, argv); + bool result = true; + ck_tile::ArgParser arg_parser; + std::tie(result, arg_parser) = create_args(argc, argv); if(!result) return -1; diff --git a/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp b/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp index ff7ec1517e..1b101c2e5f 100644 --- a/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp +++ b/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp @@ -157,7 +157,9 @@ bool run(const ck_tile::ArgParser& arg_parser) int main(int argc, char* argv[]) { - auto [result, arg_parser] = create_args(argc, argv); + bool result = true; + ck_tile::ArgParser arg_parser; + std::tie(result, arg_parser) = create_args(argc, argv); if(!result) return -1; diff --git a/example/ck_tile/21_elementwise/elementwise_example_transpose.cpp b/example/ck_tile/21_elementwise/elementwise_example_transpose.cpp index 16e9832c07..7cdb5cc0d1 100644 --- a/example/ck_tile/21_elementwise/elementwise_example_transpose.cpp +++ b/example/ck_tile/21_elementwise/elementwise_example_transpose.cpp @@ -156,7 +156,9 @@ bool run(const ck_tile::ArgParser& arg_parser) int main(int argc, char* argv[]) { - auto [result, arg_parser] = create_args(argc, argv); + bool result = true; + ck_tile::ArgParser arg_parser; + std::tie(result, arg_parser) = create_args(argc, argv); if(!result) return -1; diff --git a/example/ck_tile/21_elementwise/elementwise_example_unary.cpp b/example/ck_tile/21_elementwise/elementwise_example_unary.cpp index c5a08d910e..4e19cfd688 100644 --- a/example/ck_tile/21_elementwise/elementwise_example_unary.cpp +++ b/example/ck_tile/21_elementwise/elementwise_example_unary.cpp @@ -193,7 +193,9 @@ auto string_to_op(const std::string& op) int main(int argc, char* argv[]) { - auto [result, arg_parser] = create_args(argc, argv); + bool result = true; + ck_tile::ArgParser arg_parser; + std::tie(result, arg_parser) = create_args(argc, argv); if(!result) return -1; diff --git a/include/ck/utility/amd_ck_fp8.hpp 
b/include/ck/utility/amd_ck_fp8.hpp index 2c00f4f42f..c5525d5ff8 100644 --- a/include/ck/utility/amd_ck_fp8.hpp +++ b/include/ck/utility/amd_ck_fp8.hpp @@ -34,8 +34,8 @@ namespace ck { struct f8_fnuz_t { - using data_type = unsigned char; - data_type m_data; + using data_type = unsigned char; + data_type m_data = data_type{}; __host__ __device__ explicit constexpr f8_fnuz_t(data_type in_data) : m_data(in_data) {} __host__ __device__ explicit constexpr f8_fnuz_t() = default; __host__ __device__ bool constexpr operator==(f8_fnuz_t other) const @@ -47,8 +47,8 @@ struct bf8_fnuz_t { - using data_type = unsigned char; - data_type m_data; + using data_type = unsigned char; + data_type m_data = data_type{}; __host__ __device__ explicit constexpr bf8_fnuz_t(data_type in_data) : m_data(in_data) {} __host__ __device__ explicit constexpr bf8_fnuz_t() = default; __host__ __device__ bool constexpr operator==(bf8_fnuz_t other) const diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp index 7a9c017eb2..7bc5ca5df8 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp @@ -2788,7 +2788,7 @@ CK_TILE_DEVICE void amd_buffer_atomic_max(const thread_buffer& src_thread_ } #if defined(__gfx950__) -template +template __device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr) { #define __LDS_ADDR __attribute__((address_space(3))) @@ -2829,6 +2829,60 @@ __device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr) } #endif +// amd_wave_read_first_lane broadcasts a wave-uniform value from the first active lane into SGPRs. +// Overloads cover 8/16/32-bit integers and, below, any trivially copyable object (in 4-byte chunks). +__device__ inline uint32_t amd_wave_read_first_lane(uint16_t v) +{ + return __builtin_amdgcn_readfirstlane(static_cast<uint32_t>(v)); +} + +__device__ inline uint32_t amd_wave_read_first_lane(uint8_t v) +{ + return __builtin_amdgcn_readfirstlane(static_cast<uint32_t>(v)); +} + +__device__ inline uint32_t amd_wave_read_first_lane(uint32_t value) +{ + return __builtin_amdgcn_readfirstlane(value); +} + +__device__ inline int32_t amd_wave_read_first_lane(int32_t value) +{ + return __builtin_amdgcn_readfirstlane(value); +} + +template <typename Object, std::enable_if_t<std::is_trivially_copyable_v<Object>, int> = 0> +__device__ inline auto amd_wave_read_first_lane(const Object& obj) +{ + constexpr size_t ObjectSize = sizeof(Object); + constexpr size_t SGPR_size = 4; + constexpr size_t NumFull = ObjectSize / SGPR_size; + constexpr size_t Tail = ObjectSize % SGPR_size; + + const unsigned char* src = reinterpret_cast<const unsigned char*>(&obj); + alignas(Object) unsigned char dst[ObjectSize]; + + static_for<0, NumFull, 1>{}([&](auto Ic) { + constexpr size_t offset = Ic * SGPR_size; + uint32_t read_src; + __builtin_memcpy(&read_src, src + offset, SGPR_size); + read_src = __builtin_amdgcn_readfirstlane(read_src); + __builtin_memcpy(dst + offset, &read_src, SGPR_size); + }); + + if constexpr(Tail != 0) + { + constexpr size_t offset = NumFull * SGPR_size; + uint32_t tail_loc = 0; + __builtin_memcpy(&tail_loc, src + offset, Tail); + tail_loc = __builtin_amdgcn_readfirstlane(tail_loc); + __builtin_memcpy(dst + offset, &tail_loc, Tail); + } + Object out; + __builtin_memcpy(&out, dst, ObjectSize); + return out; +} + } // namespace ck_tile #endif // !CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN diff --git a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp index 4e0a86119a..ce5a8075df 100 ---
a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp @@ -2639,9 +2639,8 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr, const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000; #if CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM - T* lds_ptr = lds_base_ptr + lds_offset; - auto const lds_ptr_sgpr = - __builtin_amdgcn_readfirstlane((reinterpret_cast(lds_ptr))); + T* lds_ptr = lds_base_ptr + lds_offset; + auto const lds_ptr_sgpr = amd_wave_read_first_lane((reinterpret_cast(lds_ptr))); asm volatile("s_mov_b32 m0, %0; \n\t" "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr), "v"(global_offset_bytes), @@ -2673,7 +2672,7 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr, } #if defined(__gfx950__) -template +template __device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr) { #define __LDS_ADDR __attribute__((address_space(3))) diff --git a/include/ck_tile/core/arch/arch.hpp b/include/ck_tile/core/arch/arch.hpp index 42f2390cde..28ded5439a 100644 --- a/include/ck_tile/core/arch/arch.hpp +++ b/include/ck_tile/core/arch/arch.hpp @@ -9,6 +9,8 @@ #include "ck_tile/core/config.hpp" #include "ck_tile/core/numeric/integer.hpp" #include "ck_tile/core/numeric/integral_constant.hpp" +#include "ck_tile/core/arch/amd_buffer_addressing_builtins.hpp" +#include "ck_tile/core/arch/amd_buffer_addressing.hpp" #include "ck_tile/core/utility/ignore.hpp" #define CK_TILE_S_CNT_MAX 0b1100'1111'0111'1111 @@ -104,7 +106,7 @@ CK_TILE_DEVICE index_t get_warp_id(bool_constant = {}) const index_t warp_id = threadIdx.x / get_warp_size(); if constexpr(ReturnSgpr) { - return __builtin_amdgcn_readfirstlane(warp_id); + return amd_wave_read_first_lane(warp_id); } else { diff --git a/include/ck_tile/core/tensor/buffer_view.hpp b/include/ck_tile/core/tensor/buffer_view.hpp index d1e770ef42..3b747dae84 100644 --- a/include/ck_tile/core/tensor/buffer_view.hpp +++ b/include/ck_tile/core/tensor/buffer_view.hpp @@ -875,10 +875,9 @@ struct buffer_view, t_per_x, addr_space>( - p_data_ + i + linear_offset); + constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; + return amd_transpose_load_to_vgpr, t_per_x>(p_data_ + i + + linear_offset); #else return X{numeric>::zero()}; #endif diff --git a/include/ck_tile/core/tensor/tile_window.hpp b/include/ck_tile/core/tensor/tile_window.hpp index b45106487e..2db5d719c0 100644 --- a/include/ck_tile/core/tensor/tile_window.hpp +++ b/include/ck_tile/core/tensor/tile_window.hpp @@ -402,7 +402,7 @@ struct tile_window_with_static_distribution const index_t m0_init_value = size_per_buf + size_per_wave * get_warp_id(/*ReturnSgpr=*/bool_constant{}); m0_set_with_memory( - __builtin_amdgcn_readfirstlane(m0_init_value)); // This should be wave independent + amd_wave_read_first_lane(m0_init_value)); // This should be wave independent using Traits = typename Base::Traits; diff --git a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp index ecd4e81b22..052ee4ae62 100644 --- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp +++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp @@ -92,13 +92,13 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass static constexpr 
index_t Block_N = Problem::BlockShape::Block_N; index_t num_n_tile_iteration = - __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N)); + amd_wave_read_first_lane(integer_divide_ceil(row_size, Block_N)); using XTensorType = decltype(cast_tile(load_tile(a_window))); auto square_sum = block_reduce2d.template MakeYBlockTile(); set_tile(square_sum, reduce_square_sum_func.GetIdentityValue()); - for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) + for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN) { const auto a = load_tile(a_window); const auto b = load_tile(b_window); @@ -149,7 +149,7 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass if constexpr(kSaveX) __syncthreads(); - for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) + for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN) { auto x = [&]() { if constexpr(kSaveX) @@ -226,7 +226,7 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass } move_tile_window(gamma_window, {Block_N}); - for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN) + for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN) { auto x = [&]() { if constexpr(kSaveX) diff --git a/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp b/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp index b0f48f6c5b..c99571562d 100644 --- a/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp +++ b/include/ck_tile/ops/batched_transpose/kernel/batched_transpose_kernel.hpp @@ -84,9 +84,9 @@ struct BatchedTransposeKernel static constexpr ck_tile::index_t VectorSizeOutput = Problem::VectorSizeOutput; static constexpr ck_tile::index_t VectorStrideOutput = 1; - const auto iM = __builtin_amdgcn_readfirstlane(blockIdx.x * kMPerBlock); - const auto iN = __builtin_amdgcn_readfirstlane(blockIdx.y * kNPerBlock); - const auto offset = __builtin_amdgcn_readfirstlane(blockIdx.z * kargs.height * kargs.width); + const auto iM = amd_wave_read_first_lane(blockIdx.x * kMPerBlock); + const auto iN = amd_wave_read_first_lane(blockIdx.y * kNPerBlock); + const auto offset = amd_wave_read_first_lane(blockIdx.z * kargs.height * kargs.width); const auto x_m_n = [&]() { const auto x_dram_naive = make_naive_tensor_view( diff --git a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp index 585a5f5b42..e0a39a5aea 100644 --- a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp +++ b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp @@ -9,25 +9,9 @@ #include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp" #include +#include namespace ck_tile { - -template -concept HasDataType = requires { typename T::DataType; }; - -template -struct GetDataType -{ - using type = float; -}; - -template - requires HasDataType -struct GetDataType -{ - using type = typename T::DataType; // Use T::ScaleN::DataType -}; - template + template CK_TILE_DEVICE void scale_tile(LdsTile& lds_tile, ScaleM& scale_m_window, ScaleN& scale_n_window) { @@ -334,7 +318,7 @@ struct CShuffleEpilogue constexpr index_t num_access = SFC::get_num_of_access(); if constexpr(iAccess != num_access - 1) { - constexpr auto step = SFC::get_forward_step(iAccess); + constexpr auto step = SFC::get_forward_step(number{}); move_tile_window(scale_m_window, {step.at(number<0>{}), step.at(number<1>{})}); move_tile_window(scale_n_window, {step.at(number<0>{}), step.at(number<1>{})}); @@ -342,10 
+326,10 @@ struct CShuffleEpilogue } } - template + template CK_TILE_DEVICE void slice_acc_tile(const OAccTile& o_acc_tile, LdsTile& lds_tile) { - constexpr auto idx_y_start = SFC::get_index(iAccess); + constexpr auto idx_y_start = SFC::get_index(number{}); constexpr auto mIter = number{}) / (MPerIterationShuffle)>{}; constexpr auto nIter = number{}) / (NPerIterationShuffle)>{}; @@ -400,13 +384,13 @@ struct CShuffleEpilogue /** * @brief Move both the output and D tensors windows for the next access. */ - template + template CK_TILE_DEVICE void move_windows(OutDramWindow& out_dram_window, DDramWindows& d_dram_windows) { constexpr index_t num_access = SFC::get_num_of_access(); if constexpr(iAccess != num_access - 1) { - constexpr auto step = SFC::get_forward_step(iAccess); + constexpr auto step = SFC::get_forward_step(number{}); // move the output dram window move_tile_window(out_dram_window, {step.at(number<0>{}), step.at(number<1>{})}); @@ -423,6 +407,18 @@ struct CShuffleEpilogue { }; + template + struct ScaleDataType + { + using DataType = float; + }; + + template + struct ScaleDataType> + { + using DataType = typename T::DataType; + }; + template && std::is_same_v; // Tiles to hold row/col scales when present - using SMType = typename GetDataType>::type; - using SNType = typename GetDataType>::type; + using SMType = typename ScaleDataType::DataType; + using SNType = typename ScaleDataType::DataType; auto sm_tile = make_static_distributed_tensor(dram_tile_distribution); auto sn_tile = make_static_distributed_tensor(dram_tile_distribution); diff --git a/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp b/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp index a924279d52..ab0b310510 100644 --- a/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp +++ b/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp @@ -598,8 +598,8 @@ struct FlatmmKernel CK_TILE_DEVICE void operator()(KernelArgs kargs) const { const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockIdx.x); - const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); - const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); + const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock); + const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock); const SplitKBatchOffset splitk_batch_offset(kargs); // options diff --git a/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp index fcd512056d..56865498c0 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp @@ -707,8 +707,8 @@ struct FmhaBatchPrefillWithPagedKVCacheKernel // divide problem const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs); - const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0); - const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1); + const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0); + const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1); long_index_t batch_offset_q = 0; long_index_t batch_offset_bias = 0; diff --git a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp index b234d6944e..327b41b071 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp +++ 
b/include/ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp @@ -690,7 +690,7 @@ struct FmhaBwdDQDKDVKernel // divide problem const auto [i_tile_n, i_nhead, i_batch] = GetTileIndex(); - const index_t i_n0 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN0); + const index_t i_n0 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN0); long_index_t batch_offset_q = 0; long_index_t batch_offset_k = 0; @@ -1338,7 +1338,7 @@ struct FmhaBwdOGradDotOKernel // divide problem const auto [i_tile_m, i_nhead, i_batch] = GetTileIndex(); - const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * kM0); + const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * kM0); long_index_t batch_offset_o = 0; long_index_t batch_offset_do = 0; @@ -1618,7 +1618,7 @@ struct FmhaBwdConvertQGradKernel // divide problem const auto [i_tile_m, i_nhead, i_batch] = GetTileIndex(); - const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * kM0); + const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * kM0); long_index_t batch_offset_dq = 0; long_index_t batch_offset_dq_acc = 0; diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp index 66f51459af..a82d121d62 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp @@ -262,8 +262,8 @@ struct FmhaFwdAppendKVKernel // divide problem const auto [i_tile, i_nhead, i_batch] = GetTileIndex(kargs); - const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile * FmhaPipeline::kM0); - const index_t i_n0 = __builtin_amdgcn_readfirstlane(i_tile * FmhaPipeline::kN0); + const index_t i_m0 = amd_wave_read_first_lane(i_tile * FmhaPipeline::kM0); + const index_t i_n0 = amd_wave_read_first_lane(i_tile * FmhaPipeline::kN0); const index_t i_cache_batch = [&, i_batch_ = i_batch] { if constexpr(kIsPagedKV) diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp index 29950435fa..ec8921b74c 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp @@ -1062,8 +1062,8 @@ struct FmhaFwdKernel // divide problem const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs); - const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0); - const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1); + const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0); + const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1); long_index_t batch_offset_q = 0; long_index_t batch_offset_k = 0; diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_pagedkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_pagedkv_kernel.hpp index 58ef6ba87e..62ac70db92 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_pagedkv_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_pagedkv_kernel.hpp @@ -880,8 +880,8 @@ struct FmhaFwdPagedKVKernel // divide problem const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs); - const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0); - const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1); + const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0); + const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1); long_index_t batch_offset_q = 0; long_index_t batch_offset_k = 0; diff 
--git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp index cf819c4b8d..a6fc0f1471 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_combine_kernel.hpp @@ -281,8 +281,8 @@ struct FmhaFwdSplitKVCombineKernel // divide problem const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs); - const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0); - const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1); + const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0); + const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1); long_index_t batch_offset_lse_acc = 0; long_index_t batch_offset_o_acc = 0; diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp index 9293c97a31..80de65ead4 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp @@ -589,8 +589,8 @@ struct FmhaFwdSplitKVKernel // divide problem const auto [i_tile_m, i_tile_n, i_split, i_nhead, i_batch] = GetTileIndex(kargs); - const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0); - const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1); + const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0); + const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1); long_index_t batch_offset_q = 0; long_index_t batch_offset_k = 0; // unused for paged-kvcache diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_v3_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_v3_kernel.hpp index c5e5745817..abf9bf0aec 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_v3_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_v3_kernel.hpp @@ -361,8 +361,8 @@ struct FmhaFwdV3Kernel // divide problem const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs); - const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0); - const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1); + const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0); + const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1); long_index_t batch_offset_q = 0; long_index_t batch_offset_k = 0; diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_pagedkv_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_pagedkv_pipeline_qr_ks_vs.hpp index 9d267e1cee..b01c127a21 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_pagedkv_pipeline_qr_ks_vs.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_pagedkv_pipeline_qr_ks_vs.hpp @@ -320,9 +320,9 @@ struct BlockFmhaFwdPagedKVPipelineQRKSVS k_block_tile = load_tile(k_dram_window); } auto physical_next_block_id_k = - __builtin_amdgcn_readfirstlane(k_page_block_navigator.prefetch_table_id( + amd_wave_read_first_lane(k_page_block_navigator.prefetch_table_id( i_page_block_k, k_dram_block_window, {kN0, 0})); - auto physical_next_block_id_v = __builtin_amdgcn_readfirstlane( + auto physical_next_block_id_v = amd_wave_read_first_lane( v_page_block_navigator.prefetch_table_id(i_page_block_v, v_dram_window, {0, kK1})); if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) diff 
--git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp index 9de640b7cf..fe5e0bc345 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp @@ -321,9 +321,9 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS k_block_tile = load_tile(k_dram_window); } auto physical_next_block_id_k = - __builtin_amdgcn_readfirstlane(k_page_block_navigator.prefetch_table_id( + amd_wave_read_first_lane(k_page_block_navigator.prefetch_table_id( i_page_block_k, k_dram_block_window, {kN0, 0})); - auto physical_next_block_id_v = __builtin_amdgcn_readfirstlane( + auto physical_next_block_id_v = amd_wave_read_first_lane( v_page_block_navigator.prefetch_table_id(i_page_block_v, v_dram_window, {0, kK1})); if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS) @@ -618,7 +618,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS &i_page_block_v_ = i_page_block_v, &v_dram_window_ = v_dram_window](auto i_k1) { auto physical_next_block_id_v_ = - __builtin_amdgcn_readfirstlane(v_page_block_navigator.prefetch_table_id( + amd_wave_read_first_lane(v_page_block_navigator.prefetch_table_id( i_page_block_v_, v_dram_window_, {0, kK1})); const auto v = load_tile(v_dram_window_); // load next v block_sync_lds(); diff --git a/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp index 6d95decaee..c69c15a2b0 100644 --- a/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp +++ b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp @@ -240,7 +240,7 @@ struct FusedMoeGemmKernel if constexpr(UseUK) { __shared__ CK_TILE_LDS_ADDR char smem[GetSmemSize()]; - IndexDataType num_sorted_tiles = __builtin_amdgcn_readfirstlane( + IndexDataType num_sorted_tiles = amd_wave_read_first_lane( *reinterpret_cast(kargs.num_sorted_tiles_ptr)); num_sorted_tiles = num_sorted_tiles / BlockShape::Block_M0; @@ -261,7 +261,7 @@ struct FusedMoeGemmKernel { // allocate LDS // __shared__ char smem_ptr[GetSmemSize()]; - IndexDataType num_sorted_tiles = __builtin_amdgcn_readfirstlane( + IndexDataType num_sorted_tiles = amd_wave_read_first_lane( *reinterpret_cast(kargs.num_sorted_tiles_ptr)); constexpr index_t hidden_radio_0 = IsGateOnly ? 
1 : 2; @@ -283,14 +283,14 @@ struct FusedMoeGemmKernel return; const IndexDataType expert_id = - __builtin_amdgcn_readfirstlane(reinterpret_cast( + amd_wave_read_first_lane(reinterpret_cast( kargs.sorted_expert_ids_ptr)[sorted_tile_id]); // index along intermediate_size // index_t hidden_idx = __builtin_amdgcn_readfirstlane(intermediate_tile_id * // BlockShape::Block_N0); index_t interm_idx_nr = - __builtin_amdgcn_readfirstlane(intermediate_tile_id * BlockShape::Block_Nr0); + amd_wave_read_first_lane(intermediate_tile_id * BlockShape::Block_Nr0); const auto a_coord = Pipeline::GetACoord(); // 2d thread offset, [i_row, i_col] const auto sorted_token_id = diff --git a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp index faeb5cf6b3..28416ec538 100644 --- a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp +++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp @@ -756,7 +756,7 @@ struct MoeSortingKernel void* smem) const { const index_t tid = static_cast(threadIdx.x); - const index_t wid = __builtin_amdgcn_readfirstlane(tid / get_warp_size()); + const index_t wid = amd_wave_read_first_lane(tid / get_warp_size()); const index_t lid = __lane_id(); constexpr index_t block_size = 256; // blockDim.x; const index_t sub_tokens = smem_rows - 2; // sub_tokens_mdiv.divisor; diff --git a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp index 38410721ae..d19f0894b9 100644 --- a/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp +++ b/include/ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_uk.hpp @@ -184,17 +184,17 @@ struct FusedMoeGemmPipeline_FlatmmUk index_t nr_1 = kargs.hidden_size / BlockShape::Warp_N1; index_t kr_1 = shared_intermediate_size_1 / BlockShape::Warp_K1; - const IndexDataType expert_id = __builtin_amdgcn_readfirstlane( + const IndexDataType expert_id = amd_wave_read_first_lane( reinterpret_cast(kargs.sorted_expert_ids_ptr)[sorted_tile_id]); index_t expert_stride_0 = shared_intermediate_size_0 * kargs.hidden_size; index_t expert_stride_1 = shared_intermediate_size_1 * kargs.hidden_size; // nr*kr*w - index_t interm_idx_nr0 = __builtin_amdgcn_readfirstlane( + index_t interm_idx_nr0 = amd_wave_read_first_lane( intermediate_tile_id * BlockShape::Block_Nr0); // intermediate_tile_id * Block_N / (N in W) - index_t interm_idx_kr1 = __builtin_amdgcn_readfirstlane( + index_t interm_idx_kr1 = amd_wave_read_first_lane( intermediate_tile_id * BlockShape::Block_Kr1); // intermediate_tile_id * Block_N / (N in W) diff --git a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp index 588d903b25..6f9d53467f 100644 --- a/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp @@ -169,27 +169,27 @@ struct BatchedGemmKernel CK_TILE_DEVICE void operator()(BatchedGemmKernelArgs kargs) const { const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockIdx.x); - const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); - const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); + const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock); + const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock); - const auto i_batch = 
__builtin_amdgcn_readfirstlane(blockIdx.y); - const auto i_splitk = __builtin_amdgcn_readfirstlane(blockIdx.z); + const auto i_batch = amd_wave_read_first_lane(blockIdx.y); + const auto i_splitk = amd_wave_read_first_lane(blockIdx.z); const typename UniversalGemmKernel::SplitKBatchOffset splitk_batch_offset(kargs, i_splitk); // options - const auto batch_stride_A = __builtin_amdgcn_readfirstlane(kargs.batch_stride_A); - const auto batch_offset_A = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_A); + const auto batch_stride_A = amd_wave_read_first_lane(kargs.batch_stride_A); + const auto batch_offset_A = amd_wave_read_first_lane(i_batch * batch_stride_A); const ADataType* a_ptr = static_cast(kargs.as_ptr[0]) + batch_offset_A + splitk_batch_offset.as_k_split_offset[0]; - const auto batch_stride_B = __builtin_amdgcn_readfirstlane(kargs.batch_stride_B); - const auto batch_offset_B = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_B); + const auto batch_stride_B = amd_wave_read_first_lane(kargs.batch_stride_B); + const auto batch_offset_B = amd_wave_read_first_lane(i_batch * batch_stride_B); const BDataType* b_ptr = static_cast(kargs.bs_ptr[0]) + batch_offset_B + splitk_batch_offset.bs_k_split_offset[0]; - const auto batch_stride_E = __builtin_amdgcn_readfirstlane(kargs.batch_stride_E); - const auto batch_offset_C = __builtin_amdgcn_readfirstlane(i_batch * batch_stride_E); + const auto batch_stride_E = amd_wave_read_first_lane(kargs.batch_stride_E); + const auto batch_offset_C = amd_wave_read_first_lane(i_batch * batch_stride_E); CDataType* c_ptr = static_cast(kargs.e_ptr) + batch_offset_C; // allocate LDS diff --git a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp index a891d4df55..673f5abc34 100644 --- a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp +++ b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp @@ -73,8 +73,8 @@ struct GemmTile2DPartitioner CK_TILE_DEVICE static auto GetOutputTileIndex(index_t blockIdx, index_t blockIdy) noexcept -> const tuple { - const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx); - const index_t iN = __builtin_amdgcn_readfirstlane(blockIdy); + const index_t iM = amd_wave_read_first_lane(blockIdx); + const index_t iN = amd_wave_read_first_lane(blockIdy); return make_tuple(iM, iN); } }; @@ -143,8 +143,8 @@ struct GemmTile1DPartitioner { const index_t NBlocks = integer_divide_ceil(N_, NPerBlock); - const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx / NBlocks); - const index_t iN = __builtin_amdgcn_readfirstlane(blockIdx - iM * NBlocks); + const index_t iM = amd_wave_read_first_lane(blockIdx / NBlocks); + const index_t iN = amd_wave_read_first_lane(blockIdx - iM * NBlocks); return make_tuple(iM, iN); } diff --git a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp index df1d6c9e4f..cf9ba31943 100644 --- a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp @@ -272,8 +272,8 @@ struct GroupedGemmKernel const auto [iM, iN] = block_idx_2d; - const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); - const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); + const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock); + const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock); const typename Base::SplitKBatchOffset 
splitk_batch_offset(kargs, block_idx_z); @@ -358,8 +358,8 @@ struct GroupedGemmKernel const auto& d_block_window = gemm_tile_windows.at(Base::I2); // Get hot-loop and tail configuration - const index_t num_loop = __builtin_amdgcn_readfirstlane( - TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); + const index_t num_loop = + amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); const bool has_hot_loop = GemmPipeline::BlockHasHotloop(num_loop); const TailNumber tail_num = GemmPipeline::GetBlockLoopTailNum(num_loop); @@ -416,8 +416,8 @@ struct GroupedGemmKernel const auto& d_block_window = gemm_tile_windows.at(Base::I2); // Get hot-loop and tail configuration - const index_t num_loop = __builtin_amdgcn_readfirstlane( - TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); + const index_t num_loop = + amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); const TailNumber tail_num = GemmPipeline::GetBlockLoopTailNum(num_loop); // Run GEMM pipeline with compile-time branching diff --git a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp index 5df1f092d7..ad85b5392d 100644 --- a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp @@ -271,8 +271,8 @@ struct StreamKKernel uint32_t block_idx = ck_tile::get_block_1d_id(); bool is_padding_block = - __builtin_amdgcn_readfirstlane(block_idx >= kargs.tile_partitioner.sk_num_blocks && - block_idx < kargs.tile_partitioner.dp_start_block_idx); + amd_wave_read_first_lane(block_idx >= kargs.tile_partitioner.sk_num_blocks && + block_idx < kargs.tile_partitioner.dp_start_block_idx); // Padding blocks make it such that the DP blocks are aligned with the number of CUs; they // should not partake in the GEMM @@ -289,7 +289,7 @@ struct StreamKKernel { // Determine the number of macro tiles in A and B this WG is resposible for in the // current C macro tile. - uint32_t current_iter_length = __builtin_amdgcn_readfirstlane( + uint32_t current_iter_length = amd_wave_read_first_lane( kargs.tile_partitioner.GetCurrentIterLength(iter_start, iter_end)); // Determine the 1D tile_idx and the iter_offset for this WG. 
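A minimal sketch of the pattern these hunks standardize; the kernel name, arguments, and buffers below are hypothetical and not part of the patch. Block-uniform values such as tile offsets are explicitly broadcast from lane 0 so the compiler keeps them in scalar registers (SGPRs) rather than replicating them per lane in vector registers; amd_wave_read_first_lane wraps this builtin and extends it to 8/16-bit integers and whole structs.

// Sketch: every lane of the wave computes the same tile offset, but the
// compiler cannot always prove that uniformity on its own.
__global__ void example_tile_copy(const float* in, float* out, int m_per_block)
{
    // Broadcasting lane 0's copy makes the uniformity explicit and
    // frees vector registers for genuinely per-lane data.
    const int i_m  = __builtin_amdgcn_readfirstlane(static_cast<int>(blockIdx.x) * m_per_block);
    const int lane = threadIdx.x; // per-lane value, stays in VGPRs
    out[i_m + lane] = in[i_m + lane];
}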
diff --git a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp index 8f44108cc4..51ad4e3dd1 100644 --- a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp @@ -326,19 +326,19 @@ struct UniversalGemmKernel __device__ SplitKBatchOffset(const KernelArgs& kargs, const std::size_t k_id = blockIdx.z) { constexpr auto K1 = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{}); - const index_t K_t = __builtin_amdgcn_readfirstlane(kargs.k_batch * K1); - const index_t KRead = __builtin_amdgcn_readfirstlane((kargs.K + K_t - 1) / K_t * K1); + const index_t K_t = amd_wave_read_first_lane(kargs.k_batch * K1); + const index_t KRead = amd_wave_read_first_lane((kargs.K + K_t - 1) / K_t * K1); static_for<0, NumATensor, 1>{}([&](auto index) { using AiLayout = remove_cvref_t>; if constexpr(std::is_same_v) { - as_k_split_offset[index] = __builtin_amdgcn_readfirstlane(k_id * KRead); + as_k_split_offset[index] = amd_wave_read_first_lane(k_id * KRead); } else if constexpr(std::is_same_v) { as_k_split_offset[index] = - __builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_As[index]); + amd_wave_read_first_lane(k_id * KRead * kargs.stride_As[index]); } }); @@ -347,21 +347,21 @@ struct UniversalGemmKernel if constexpr(std::is_same_v) { bs_k_split_offset[index] = - __builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_Bs[index]); + amd_wave_read_first_lane(k_id * KRead * kargs.stride_Bs[index]); } else if constexpr(std::is_same_v) { - bs_k_split_offset[index] = __builtin_amdgcn_readfirstlane(k_id * KRead); + bs_k_split_offset[index] = amd_wave_read_first_lane(k_id * KRead); } }); if(k_id < static_cast(kargs.k_batch - 1)) { - splitted_k = __builtin_amdgcn_readfirstlane(KRead); + splitted_k = amd_wave_read_first_lane(KRead); } else { - splitted_k = __builtin_amdgcn_readfirstlane(kargs.K - KRead * (kargs.k_batch - 1)); + splitted_k = amd_wave_read_first_lane(kargs.K - KRead * (kargs.k_batch - 1)); } } @@ -970,8 +970,8 @@ struct UniversalGemmKernel const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple); auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n); - const index_t num_loop = __builtin_amdgcn_readfirstlane( - TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); + const index_t num_loop = + amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); // Run GEMM cooperatively by whole workgroup. const auto& as_block_window = gemm_tile_windows.at(I0); @@ -1026,8 +1026,8 @@ struct UniversalGemmKernel const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple); auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n); - const index_t num_loop = __builtin_amdgcn_readfirstlane( - TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); + const index_t num_loop = + amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); // Run GEMM cooperatively by whole workgroup. 
const auto& as_block_window = gemm_tile_windows.at(I0); @@ -1052,10 +1052,10 @@ struct UniversalGemmKernel template > CK_TILE_DEVICE void operator()(KernelArgs kargs) const { - const auto blockId = __builtin_amdgcn_readfirstlane(blockIdx.x); + const auto blockId = amd_wave_read_first_lane(blockIdx.x); const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockId); - const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); - const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); + const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock); + const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock); const SplitKBatchOffset splitk_batch_offset(kargs); @@ -1126,22 +1126,22 @@ struct UniversalGemmKernel template , typename = void> CK_TILE_DEVICE void operator()(KernelArgs kargs) const { - const auto grid_size = __builtin_amdgcn_readfirstlane(get_grid_size()); + const auto grid_size = amd_wave_read_first_lane(get_grid_size()); const auto num_tiles = - __builtin_amdgcn_readfirstlane(TilePartitioner::GridSize(kargs.M, kargs.N)); - const auto num_work = __builtin_amdgcn_readfirstlane(num_tiles * kargs.k_batch); - auto block_id = __builtin_amdgcn_readfirstlane(get_block_id()); + amd_wave_read_first_lane(TilePartitioner::GridSize(kargs.M, kargs.N)); + const auto num_work = amd_wave_read_first_lane(num_tiles * kargs.k_batch); + auto block_id = amd_wave_read_first_lane(get_block_id()); while(block_id < num_work) { // Get the tile index for this block - const auto tile_idx = __builtin_amdgcn_readfirstlane(block_id % num_tiles); + const auto tile_idx = amd_wave_read_first_lane(block_id % num_tiles); const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(tile_idx); - const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); - const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); + const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock); + const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock); // Get the SplitK offset for this block - const auto k_batch = __builtin_amdgcn_readfirstlane(block_id / num_tiles); + const auto k_batch = amd_wave_read_first_lane(block_id / num_tiles); const SplitKBatchOffset splitk_batch_offset(kargs, k_batch); std::array as_ptr; diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp index b362f751c6..d0466bc8b1 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp @@ -487,7 +487,7 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4 if(HasHotLoop) { // minus 2 because we have ping-pong double buffer. 
- index_t iCounter = __builtin_amdgcn_readfirstlane(num_loop - 2); + index_t iCounter = amd_wave_read_first_lane(num_loop - 2); do { // ping diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp index 474d1a5a21..7263ddd5a1 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp @@ -178,7 +178,7 @@ struct GemmPipelineAgBgCrCompV5 : public BaseGemmPipelineAgBgCrCompV5 index_t warp_id = get_warp_id(); index_t operation_id = - __builtin_amdgcn_readfirstlane(get_warp_id()); // 0 - Memory read, 1 - block-gemm + amd_wave_read_first_lane(get_warp_id()); // 0 - Memory read, 1 - block-gemm auto a_offset = (warp_id == 0) ? make_array(0, 0) : make_array(0, KPerBlock); auto b_offset = (warp_id == 0) ? make_array(0, 0) : make_array(0, KPerBlock); @@ -336,7 +336,7 @@ struct GemmPipelineAgBgCrCompV5 : public BaseGemmPipelineAgBgCrCompV5 MemoryOpsStep(warp_id); } - index_t num_compute_steps = __builtin_amdgcn_readfirstlane(num_loop); + index_t num_compute_steps = amd_wave_read_first_lane(num_loop); while(num_compute_steps > 1) { block_sync_lds(); diff --git a/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp b/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp index 82bf75a9e3..0c9c816672 100644 --- a/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp +++ b/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp @@ -18,73 +18,64 @@ namespace ck_tile { namespace detail { // Helper templates for safe type extraction -template <typename T, typename Default> +template <typename T, typename Default, typename = void> struct get_aq_layout_or { using type = Default; }; template <typename T, typename Default> - requires requires { typename T::AQLayout; } -struct get_aq_layout_or<T, Default> +struct get_aq_layout_or<T, Default, std::void_t<typename T::AQLayout>> { using type = typename T::AQLayout; }; -template <typename T, typename Default> +template <typename T, typename Default, typename = void> struct get_bq_layout_or { using type = Default; }; template <typename T, typename Default> - requires requires { typename T::BQLayout; } -struct get_bq_layout_or<T, Default> +struct get_bq_layout_or<T, Default, std::void_t<typename T::BQLayout>> { using type = typename T::BQLayout; }; -template <typename T, typename Default> +template <typename T, typename Default, typename = void> struct get_aq_data_type_or { using type = Default; }; template <typename T, typename Default> - requires requires { typename T::AQDataType; } -struct get_aq_data_type_or<T, Default> +struct get_aq_data_type_or<T, Default, std::void_t<typename T::AQDataType>> { using type = typename T::AQDataType; }; -template <typename T, typename Default> +template <typename T, typename Default, typename = void> struct get_bq_data_type_or { using type = Default; }; template <typename T, typename Default> - requires requires { typename T::BQDataType; } -struct get_bq_data_type_or<T, Default> +struct get_bq_data_type_or<T, Default, std::void_t<typename T::BQDataType>> { using type = typename T::BQDataType; }; -template <typename T> -concept HasStaticPreshuffleQuant = requires { - { T::PreshuffleQuant } -> std::convertible_to<bool>; -}; - -template <typename T> +template <typename T, typename = void> struct is_quantpreshuffle_enabled { static constexpr bool value = false; }; -template <HasStaticPreshuffleQuant T> -struct is_quantpreshuffle_enabled<T> +template <typename T> +struct is_quantpreshuffle_enabled<T, std::void_t<decltype(T::PreshuffleQuant)>> { - static constexpr auto value = T::PreshuffleQuant; + static constexpr bool value = T::PreshuffleQuant; }; } // namespace detail @@ -270,34 +261,34 @@ struct QuantGemmKernel const std::size_t k_id = blockIdx.z) { constexpr auto K1 = TilePartitioner::BlockGemmShape::WarpTile::at(I2); - const index_t K_t = __builtin_amdgcn_readfirstlane(kargs.k_batch * K1); - const index_t KRead = __builtin_amdgcn_readfirstlane((kargs.K + K_t - 1) / K_t * K1); + const index_t K_t = amd_wave_read_first_lane(kargs.k_batch * K1); + const index_t KRead = amd_wave_read_first_lane((kargs.K + K_t - 1) / K_t * K1); if constexpr(std::is_same_v) { - a_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead); +
a_k_split_offset = amd_wave_read_first_lane(k_id * KRead); } else if constexpr(std::is_same_v) { - a_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_A); + a_k_split_offset = amd_wave_read_first_lane(k_id * KRead * kargs.stride_A); } if constexpr(std::is_same_v) { - b_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead * kargs.stride_B); + b_k_split_offset = amd_wave_read_first_lane(k_id * KRead * kargs.stride_B); } else if constexpr(std::is_same_v) { - b_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead); + b_k_split_offset = amd_wave_read_first_lane(k_id * KRead); } if(k_id < static_cast(kargs.k_batch - 1)) { - splitted_k = __builtin_amdgcn_readfirstlane(KRead); + splitted_k = amd_wave_read_first_lane(KRead); } else { - splitted_k = __builtin_amdgcn_readfirstlane(kargs.K - KRead * (kargs.k_batch - 1)); + splitted_k = amd_wave_read_first_lane(kargs.K - KRead * (kargs.k_batch - 1)); } } @@ -918,8 +909,8 @@ struct QuantGemmKernel const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple); auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n); - const index_t num_loop = __builtin_amdgcn_readfirstlane( - TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); + const index_t num_loop = + amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k)); // Run GEMM cooperatively by whole workgroup. const auto& a_block_window = gemm_tile_windows.at(I0); @@ -981,10 +972,10 @@ struct QuantGemmKernel CK_TILE_DEVICE void operator()(QuantGemmKernelArgs kargs) const { - const auto blockId = __builtin_amdgcn_readfirstlane(blockIdx.x); + const auto blockId = amd_wave_read_first_lane(blockIdx.x); const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockId); - const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); - const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); + const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock); + const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock); const SplitKBatchOffset splitk_batch_offset(kargs); // options diff --git a/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp b/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp index 07c45117e2..39c8e406b7 100644 --- a/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp +++ b/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp @@ -305,8 +305,8 @@ struct QuantGroupedGemmKernel { const auto [iM, iN] = block_idx_2d; - const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); - const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); + const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock); + const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock); const typename Base::SplitKBatchOffset splitk_batch_offset(kargs, block_idx_z); diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp index 15e697afdf..e68a510a0c 100644 --- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp +++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp @@ -840,7 +840,7 @@ struct GroupedConvolutionBackwardDataKernel const 
auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple); auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n); - const index_t num_loop = __builtin_amdgcn_readfirstlane(TilePartitioner::GetLoopNum( + const index_t num_loop = amd_wave_read_first_lane(TilePartitioner::GetLoopNum( gemm_pad_views.at(I0).get_tensor_descriptor().get_length(I1))); // Run GEMM cooperatively by whole workgroup. @@ -891,7 +891,7 @@ struct GroupedConvolutionBackwardDataKernel const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple); auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n); - const index_t num_loop = __builtin_amdgcn_readfirstlane( + const index_t num_loop = amd_wave_read_first_lane( TilePartitioner::GetLoopNum(gemm_tile_windows.at(I0).get_length(I1))); // Run GEMM cooperatively by whole workgroup. @@ -936,7 +936,7 @@ struct GroupedConvolutionBackwardDataKernel CK_TILE_DEVICE void operator()(GroupedConvBwdDataKernelArgsSpecialized kargs) const { - const auto blockIdX = __builtin_amdgcn_readfirstlane(blockIdx.x); + const auto blockIdX = amd_wave_read_first_lane(blockIdx.x); const index_t group_id = FindGroupId(kargs, blockIdX); const auto [iM, iN] = OffsettedTile1DPartitioner::GetOffsetedTileIndex( @@ -944,13 +944,13 @@ struct GroupedConvolutionBackwardDataKernel kargs.c_grid_descs_m_n[group_id].get_length(I0), kargs.c_grid_descs_m_n[group_id].get_length(I1)); - const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock); - const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock); + const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock); + const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock); - const auto blockIdY = __builtin_amdgcn_readfirstlane(blockIdx.y); - const auto group_offset_a = __builtin_amdgcn_readfirstlane(kargs.group_stride_a * blockIdY); - const auto group_offset_b = __builtin_amdgcn_readfirstlane(kargs.group_stride_b * blockIdY); - const auto group_offset_c = __builtin_amdgcn_readfirstlane(kargs.group_stride_c * blockIdY); + const auto blockIdY = amd_wave_read_first_lane(blockIdx.y); + const auto group_offset_a = amd_wave_read_first_lane(kargs.group_stride_a * blockIdY); + const auto group_offset_b = amd_wave_read_first_lane(kargs.group_stride_b * blockIdY); + const auto group_offset_c = amd_wave_read_first_lane(kargs.group_stride_c * blockIdY); // options // conv_bwd_data = Out * Weight = In diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp index 7bb3fedaf6..b85660aea3 100644 --- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp +++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp @@ -423,22 +423,20 @@ struct GroupedConvolutionBackwardWeightKernel __device__ SplitKBatchOffset(const GroupedConvBwdWeightKernelArgsSpecialized& kargs, const std::size_t k_id = blockIdx.z) { - constexpr auto K1 = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{}); - const index_t K_t = __builtin_amdgcn_readfirstlane(kargs.k_batch * K1); - const index_t KRead = - __builtin_amdgcn_readfirstlane((kargs.GemmK + K_t - 1) / K_t * K1); + constexpr auto K1 = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{}); + const index_t K_t = amd_wave_read_first_lane(kargs.k_batch * K1); 
diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
index 7bb3fedaf6..b85660aea3 100644
--- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
+++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
@@ -423,22 +423,20 @@ struct GroupedConvolutionBackwardWeightKernel
         __device__ SplitKBatchOffset(const GroupedConvBwdWeightKernelArgsSpecialized& kargs,
                                      const std::size_t k_id = blockIdx.z)
         {
-            constexpr auto K1  = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});
-            const index_t K_t  = __builtin_amdgcn_readfirstlane(kargs.k_batch * K1);
-            const index_t KRead =
-                __builtin_amdgcn_readfirstlane((kargs.GemmK + K_t - 1) / K_t * K1);
+            constexpr auto K1   = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});
+            const index_t K_t   = amd_wave_read_first_lane(kargs.k_batch * K1);
+            const index_t KRead = amd_wave_read_first_lane((kargs.GemmK + K_t - 1) / K_t * K1);
 
-            a_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead);
-            b_k_split_offset = __builtin_amdgcn_readfirstlane(k_id * KRead);
+            a_k_split_offset = amd_wave_read_first_lane(k_id * KRead);
+            b_k_split_offset = amd_wave_read_first_lane(k_id * KRead);
 
             if(k_id < static_cast<uint32_t>(kargs.k_batch - 1))
             {
-                splitted_k = __builtin_amdgcn_readfirstlane(KRead);
+                splitted_k = amd_wave_read_first_lane(KRead);
             }
             else
             {
-                splitted_k =
-                    __builtin_amdgcn_readfirstlane(kargs.GemmK - KRead * (kargs.k_batch - 1));
+                splitted_k = amd_wave_read_first_lane(kargs.GemmK - KRead * (kargs.k_batch - 1));
             }
         }
 
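The split-K bookkeeping above is easy to sanity-check on the host: KRead rounds the K extent up to a multiple of K_t and converts it back to elements per batch, and the last k-batch takes whatever remains. A minimal sketch with illustrative values (K1, GemmK, and k_batch here are assumptions, not numbers from the patch):

    // Host-side replay of the SplitKBatchOffset arithmetic shown above.
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const int32_t K1 = 16, GemmK = 1000, k_batch = 4;
        const int32_t K_t   = k_batch * K1;                 // 64
        const int32_t KRead = (GemmK + K_t - 1) / K_t * K1; // ceil(1000/64)*16 = 256
        for(int32_t k_id = 0; k_id < k_batch; ++k_id)
        {
            const int32_t k_split_offset = k_id * KRead;
            const int32_t splitted_k =
                (k_id < k_batch - 1) ? KRead : GemmK - KRead * (k_batch - 1);
            std::printf("k_id=%d offset=%d len=%d\n", k_id, k_split_offset, splitted_k);
        }
        return 0;
    }

With these values, batches 0..2 each own 256 elements of K at offsets 0, 256, and 512, and the last batch gets the remainder 1000 - 3*256 = 232, so the splits exactly cover GemmK.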
@@ -805,22 +803,22 @@ struct GroupedConvolutionBackwardWeightKernel
     CK_TILE_DEVICE void operator()(GroupedConvBwdWeightKernelArgsSpecialized kargs) const
     {
-        const auto blockIdX = __builtin_amdgcn_readfirstlane(blockIdx.x);
+        const auto blockIdX = amd_wave_read_first_lane(blockIdx.x);
 
         const auto [iM, iN] =
             TilePartitioner{kargs.GemmM, kargs.GemmN}.GetOutputTileIndex(blockIdX);
-        const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
-        const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
+        const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
+        const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
 
-        const auto blockIdZ    = __builtin_amdgcn_readfirstlane(blockIdx.z);
-        const index_t num_loop = __builtin_amdgcn_readfirstlane(
+        const auto blockIdZ    = amd_wave_read_first_lane(blockIdx.z);
+        const index_t num_loop = amd_wave_read_first_lane(
             ck_tile::integer_divide_ceil(kargs.GemmK, kargs.k_batch * TilePartitioner::KPerBlock));
         const index_t i_k =
-            __builtin_amdgcn_readfirstlane(blockIdZ * num_loop * TilePartitioner::KPerBlock);
+            amd_wave_read_first_lane(blockIdZ * num_loop * TilePartitioner::KPerBlock);
 
-        const auto blockIdY       = __builtin_amdgcn_readfirstlane(blockIdx.y);
-        const auto group_offset_a = __builtin_amdgcn_readfirstlane(kargs.group_stride_a * blockIdY);
-        const auto group_offset_b = __builtin_amdgcn_readfirstlane(kargs.group_stride_b * blockIdY);
-        const auto group_offset_c = __builtin_amdgcn_readfirstlane(kargs.group_stride_c * blockIdY);
+        const auto blockIdY       = amd_wave_read_first_lane(blockIdx.y);
+        const auto group_offset_a = amd_wave_read_first_lane(kargs.group_stride_a * blockIdY);
+        const auto group_offset_b = amd_wave_read_first_lane(kargs.group_stride_b * blockIdY);
+        const auto group_offset_c = amd_wave_read_first_lane(kargs.group_stride_c * blockIdY);
 
         // options
         // conv_bwd_weight = Out * In = Weight
diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
index d1eacd60cd..0363782d33 100644
--- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
+++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
@@ -752,8 +752,7 @@ struct GroupedConvolutionForwardKernel
         const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
         auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
 
-        const index_t num_loop =
-            __builtin_amdgcn_readfirstlane(TilePartitioner::GetLoopNum(kargs.GemmK));
+        const index_t num_loop = amd_wave_read_first_lane(TilePartitioner::GetLoopNum(kargs.GemmK));
 
         // Run GEMM cooperatively by whole workgroup.
         const auto& a_block_window = gemm_tile_windows.at(I0);
@@ -802,8 +801,7 @@ struct GroupedConvolutionForwardKernel
         const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
         auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
 
-        const index_t num_loop =
-            __builtin_amdgcn_readfirstlane(TilePartitioner::GetLoopNum(kargs.GemmK));
+        const index_t num_loop = amd_wave_read_first_lane(TilePartitioner::GetLoopNum(kargs.GemmK));
 
         // Run GEMM cooperatively by whole workgroup.
         const auto& a_block_window = gemm_tile_windows.at(I0);
@@ -822,22 +820,22 @@ struct GroupedConvolutionForwardKernel
     CK_TILE_DEVICE void operator()(GroupedConvFwdKernelArgsSpecialized kargs) const
     {
-        const auto blockIdX = __builtin_amdgcn_readfirstlane(blockIdx.x);
+        const auto blockIdX = amd_wave_read_first_lane(blockIdx.x);
 
         const auto [iM, iN] =
             TilePartitioner{kargs.GemmM, kargs.GemmN}.GetOutputTileIndex(blockIdX);
-        const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
-        const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
+        const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
+        const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
 
-        const auto blockIdY       = __builtin_amdgcn_readfirstlane(blockIdx.y);
-        const auto group_offset_a = __builtin_amdgcn_readfirstlane(kargs.group_stride_a * blockIdY);
-        const auto group_offset_b = __builtin_amdgcn_readfirstlane(kargs.group_stride_b * blockIdY);
-        const auto group_offset_c = __builtin_amdgcn_readfirstlane(kargs.group_stride_c * blockIdY);
+        const auto blockIdY       = amd_wave_read_first_lane(blockIdx.y);
+        const auto group_offset_a = amd_wave_read_first_lane(kargs.group_stride_a * blockIdY);
+        const auto group_offset_b = amd_wave_read_first_lane(kargs.group_stride_b * blockIdY);
+        const auto group_offset_c = amd_wave_read_first_lane(kargs.group_stride_c * blockIdY);
 
         // Split-N handling: Get which split this workgroup handles
-        const auto blockIdZ = __builtin_amdgcn_readfirstlane(blockIdx.z);
+        const auto blockIdZ = amd_wave_read_first_lane(blockIdx.z);
 
         // Calculate batch offset for this split
-        const index_t batch_offset = __builtin_amdgcn_readfirstlane(blockIdZ * kargs.n_per_split);
+        const index_t batch_offset = amd_wave_read_first_lane(blockIdZ * kargs.n_per_split);
 
         // Calculate memory offsets for this split
         const long_index_t input_batch_offset = static_cast<long_index_t>(batch_offset) *
diff --git a/include/ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp b/include/ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp
index eb54807d88..bc20057e7a 100644
--- a/include/ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp
+++ b/include/ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp
@@ -175,9 +175,9 @@ struct ImageToColumn
     {
         const auto [M, K] = CalculateMKDims(kargs);
 
-        const index_t iM     = __builtin_amdgcn_readfirstlane(blockIdx.x * kMPerBlock);
-        const index_t iK     = __builtin_amdgcn_readfirstlane(blockIdx.y * kKPerBlock);
-        const index_t iBatch = __builtin_amdgcn_readfirstlane(blockIdx.z);
+        const index_t iM     = amd_wave_read_first_lane(blockIdx.x * kMPerBlock);
+        const index_t iK     = amd_wave_read_first_lane(blockIdx.y * kKPerBlock);
+        const index_t iBatch = amd_wave_read_first_lane(blockIdx.z);
 
         const auto in_offset  = iBatch * kargs.image_g_n_c_wis_strides[I0];
         const auto out_offset = iBatch * kargs.gemm_g_m_k_strides[I0];
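One detail in the forward-convolution hunk is worth spelling out: the per-split element offset is widened to long_index_t before the multiply, so a large batch offset times a large stride cannot wrap in 32-bit arithmetic. A host-side sketch of why the order of the cast matters (all numbers here are illustrative, not values from the patch):

    // Widen first, then multiply: the 32-bit product wraps mod 2^32,
    // while the widened product preserves the full value.
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const int32_t batch_offset = 1 << 16; // e.g. blockIdZ * n_per_split
        const int32_t stride       = 1 << 17; // elements between batches
        const int64_t widened = static_cast<int64_t>(batch_offset) * stride; // 2^33
        const uint32_t wrapped =
            static_cast<uint32_t>(batch_offset) * static_cast<uint32_t>(stride); // 0
        std::printf("widened=%lld wrapped=%u\n",
                    static_cast<long long>(widened), wrapped);
        return 0;
    }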
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
index 0de1ada87c..422950b143 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
@@ -99,7 +99,7 @@ struct Layernorm2dFwdPipelineTwoPass
         // Problem::BlockShape
         static constexpr index_t Block_N = Problem::BlockShape::Block_N;
         index_t num_n_tile_iteration =
-            __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));
+            amd_wave_read_first_lane(integer_divide_ceil(row_size, Block_N));
 
         // total number of count assume current iter have no pad(only last iter has pad)
         constexpr index_t count_per_iter =
@@ -119,7 +119,7 @@ struct Layernorm2dFwdPipelineTwoPass
         auto mean = block_norm_reduce.template MakeMeanVarBlockTile();
         auto var  = block_norm_reduce.template MakeMeanVarBlockTile();
 
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
         {
             auto x      = load_tile(x_window);
             auto x_resi = load_tile(x_residual_window);
@@ -197,7 +197,7 @@ struct Layernorm2dFwdPipelineTwoPass
         move_tile_window(y_window, {0, stride_to_right_most_window});
 
         // layernorm computation
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
         {
             auto acc = make_static_distributed_tensor(
                 decltype(load_tile(x_window))::get_tile_distribution());
diff --git a/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp b/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp
index 92a71a42c8..83a22aaded 100644
--- a/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp
+++ b/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp
@@ -156,7 +156,7 @@ struct Reduce
         const auto merged_reduce_len =
             transformed_x_tensor.get_tensor_descriptor().get_lengths().at(number<1>{});
         index_t num_n_tile_iteration =
-            __builtin_amdgcn_readfirstlane(integer_divide_ceil(merged_reduce_len, S::Block_N));
+            amd_wave_read_first_lane(integer_divide_ceil(merged_reduce_len, S::Block_N));
 
         auto block_reduce2d      = Policy::template GetBlockReduce2d();
         auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync();
@@ -167,7 +167,7 @@ struct Reduce
         auto y_compute = block_reduce2d.template MakeYBlockTile();
         set_tile(y_compute, reduce_func.template GetIdentityValue());
 
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
         {
             const auto x = load_tile(x_window);
             block_reduce2d(x, y_compute, reduce_func);
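A recurring idiom in these pipelines is starting the tile loop at amd_wave_read_first_lane(0): seeding the induction variable from the first-lane helper plausibly pins the counter to a scalar register, so the loop control is provably uniform, and the trip count is a ceil-division of the row length by the tile width. A plain C++ sketch of the two-pass structure these hunks touch (all names and sizes are illustrative, not ck_tile APIs):

    // Pass 1 accumulates statistics tile by tile; pass 2 revisits the
    // same tiles to produce normalized output.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main()
    {
        const int row_size = 1000, Block_N = 256;
        const int num_n_tile_iteration = (row_size + Block_N - 1) / Block_N; // 4
        std::vector<float> x(row_size, 2.0f), y(row_size);

        // Pass 1: mean/variance across all tiles of the row.
        double sum = 0.0, sq_sum = 0.0;
        for(int iN = 0; iN < num_n_tile_iteration; ++iN)
            for(int j = iN * Block_N; j < std::min((iN + 1) * Block_N, row_size); ++j)
            {
                sum += x[j];
                sq_sum += x[j] * x[j];
            }
        const double mean = sum / row_size;
        const double var  = sq_sum / row_size - mean * mean;

        // Pass 2: walk the tiles again and normalize.
        const double inv_std = 1.0 / std::sqrt(var + 1e-6);
        for(int iN = 0; iN < num_n_tile_iteration; ++iN)
            for(int j = iN * Block_N; j < std::min((iN + 1) * Block_N, row_size); ++j)
                y[j] = static_cast<float>((x[j] - mean) * inv_std);

        std::printf("tiles=%d mean=%f var=%f\n", num_n_tile_iteration, mean, var);
        return 0;
    }

Only the last tile can be padded, which is why the kernels count a full tile per iteration and correct at the boundary.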
diff --git a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp
index d01f37879a..ca3cdc37c4 100644
--- a/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/rmsnorm2d/pipeline/rmsnorm2d_fwd_pipeline_two_pass.hpp
@@ -82,7 +82,7 @@ struct Rmsnorm2dFwdPipelineTwoPass
         // Problem::BlockShape
         static constexpr index_t Block_N = Problem::BlockShape::Block_N;
         index_t num_n_tile_iteration =
-            __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));
+            amd_wave_read_first_lane(integer_divide_ceil(row_size, Block_N));
 
         auto reduce_square_sum_func = ReduceOp::SquareAdd{};
         auto reduce_sum_func        = ReduceOp::Add{};
@@ -95,7 +95,7 @@ struct Rmsnorm2dFwdPipelineTwoPass
         auto square_sum = block_reduce2d.template MakeYBlockTile();
         set_tile(square_sum, reduce_square_sum_func.GetIdentityValue());
 
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
         {
             auto x      = load_tile(x_window);
             auto x_resi = load_tile(x_residual_window);
@@ -151,7 +151,7 @@ struct Rmsnorm2dFwdPipelineTwoPass
         move_tile_window(y_window, {0, stride_to_right_most_window});
 
         // rmsnorm computation
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
         {
             auto acc = make_static_distributed_tensor(
                 decltype(load_tile(x_window))::get_tile_distribution());
diff --git a/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp b/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp
index 2553b19fd8..f6c7c0753a 100644
--- a/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp
+++ b/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp
@@ -138,7 +138,7 @@ struct MoeSmoothquant
         const index_t i_topk  = blockIdx.x;
         const index_t i_token = blockIdx.y * Block_M;
         const index_t i_token_in_thrd =
-            __builtin_amdgcn_readfirstlane(threadIdx.x / Problem::BlockShape::ThreadPerBlock_N);
+            amd_wave_read_first_lane(threadIdx.x / Problem::BlockShape::ThreadPerBlock_N);
 
         const index_t i_expert = reinterpret_cast<const index_t*>(
             kargs.p_topk_ids)[(i_token + i_token_in_thrd) * kargs.topk + i_topk];
diff --git a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
index ba9c6374f1..8b0a7274ed 100644
--- a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
@@ -57,7 +57,7 @@ struct SmoothquantPipelineTwoPass
         static constexpr index_t Block_N = Problem::BlockShape::Block_N;
 
         index_t num_n_tile_iteration =
-            __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));
+            amd_wave_read_first_lane(integer_divide_ceil(row_size, Block_N));
 
         auto reduce_absmax_func  = ReduceOp::AbsMax{};
         auto reduce_absmax3_func = [](auto acc_, auto v_0_, auto v_1_) {
@@ -77,7 +77,7 @@ struct SmoothquantPipelineTwoPass
         auto absmax = block_reduce2d.template MakeYBlockTile();
         set_tile(absmax, reduce_absmax_func.GetIdentityValue());
 
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
         {
             const auto x       = load_tile(x_window);
             const auto smscale = load_tile(smscale_window);
@@ -121,7 +121,7 @@ struct SmoothquantPipelineTwoPass
         move_tile_window(qy_window, {0, stride_to_right_most_window});
 
         // recompute y and quantize y to qy
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
         {
             const auto x       = load_tile(x_window);
             const auto smscale = load_tile(smscale_window);
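The smoothquant two-pass pipeline follows the same shape as the normalization pipelines: pass 1 reduces the per-row absmax of the smooth-scaled activations, pass 2 recomputes the scaled values and quantizes them. A host-side sketch of that flow; note the absmax/127 scale is the conventional int8 choice and an assumption here, not something stated in the patch:

    // Two passes over one row: find absmax of y = x * smscale, then
    // recompute y and quantize to int8.
    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main()
    {
        const int row_size = 8;
        std::vector<float> x(row_size, -3.5f), smscale(row_size, 0.5f);

        // Pass 1: per-row absmax of the scaled activations.
        float absmax = 0.0f;
        for(int j = 0; j < row_size; ++j)
            absmax = std::fmax(absmax, std::fabs(x[j] * smscale[j]));

        // Pass 2: recompute y and quantize with scale = absmax / 127.
        const float scale = absmax / 127.0f;
        std::vector<int8_t> qy(row_size);
        for(int j = 0; j < row_size; ++j)
            qy[j] = static_cast<int8_t>(std::lround(x[j] * smscale[j] / scale));

        std::printf("absmax=%f scale=%f qy[0]=%d\n", absmax, scale, qy[0]);
        return 0;
    }

Recomputing y in the second pass instead of caching it is what keeps the pipeline's footprint at one tile, at the cost of re-reading x and smscale.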
diff --git a/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp b/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp
index 277049f6b0..e8727ea065 100644
--- a/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp
+++ b/include/ck_tile/ops/topk_softmax/kernel/topk_softmax_kernel.hpp
@@ -96,9 +96,9 @@ struct TopkSoftmaxKernel
         if(block_row_id > kargs.num_rows)
             return;
 
-        index_t block_os_inp = __builtin_amdgcn_readfirstlane(block_row_id * kargs.stride_input);
-        index_t block_os_out = __builtin_amdgcn_readfirstlane(block_row_id * kargs.stride_output);
-        index_t num_rows_rem = __builtin_amdgcn_readfirstlane(kargs.num_rows - block_row_id);
+        index_t block_os_inp = amd_wave_read_first_lane(block_row_id * kargs.stride_input);
+        index_t block_os_out = amd_wave_read_first_lane(block_row_id * kargs.stride_output);
+        index_t num_rows_rem = amd_wave_read_first_lane(kargs.num_rows - block_row_id);
 
         const auto input_window = [&]() {
             const InputType* p_input =
diff --git a/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_ut_cases_cshuffle.inc b/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_ut_cases_cshuffle.inc
index e9a8ed74f2..33eb404fbe 100644
--- a/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_ut_cases_cshuffle.inc
+++ b/test/ck_tile/gemm_multi_abd/test_gemm_multi_abd_ut_cases_cshuffle.inc
@@ -1,6 +1,5 @@
 #pragma once
 
-
 TYPED_TEST(TestCkTileGemmMultiABD, TestCkTileGemmMultiABDKBatch2CShuffle_512x512x512)
 {
     constexpr int M = 512;