Fix for Add the API to load SGPR (#2913)

* Revert "Revert "[CK-Tile] Add the API to load SGPR (#2878)" (#2904)"

  This reverts commit f161b5b738.

* Fix: sgpr minor issue
* cyclic dependency resolved
* clang formatted
* removing unused variable
* clang formatted

---------

Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
2026-05-02 20:51:23 +00:00 · 2025-09-25 10:32:42 -07:00
parent 64e61b8647
commit b56e5d1d79
41 changed files with 224 additions and 173 deletions
--- a/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp
+++ b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp
@@ -240,7 +240,7 @@ struct FusedMoeGemmKernel
        if constexpr(UseUK)
        {
            __shared__ CK_TILE_LDS_ADDR char smem[GetSmemSize()];
-            IndexDataType num_sorted_tiles = __builtin_amdgcn_readfirstlane(
+            IndexDataType num_sorted_tiles = amd_wave_read_first_lane(
                *reinterpret_cast<const IndexDataType*>(kargs.num_sorted_tiles_ptr));

            num_sorted_tiles = num_sorted_tiles / BlockShape::Block_M0;
@@ -261,7 +261,7 @@ struct FusedMoeGemmKernel
        {
            // allocate LDS
            // __shared__ char smem_ptr[GetSmemSize()];
-            IndexDataType num_sorted_tiles = __builtin_amdgcn_readfirstlane(
+            IndexDataType num_sorted_tiles = amd_wave_read_first_lane(
                *reinterpret_cast<const IndexDataType*>(kargs.num_sorted_tiles_ptr));
            constexpr index_t hidden_radio_0 = IsGateOnly ? 1 : 2;

@@ -283,14 +283,14 @@ struct FusedMoeGemmKernel
                return;

            const IndexDataType expert_id =
-                __builtin_amdgcn_readfirstlane(reinterpret_cast<const IndexDataType*>(
+                amd_wave_read_first_lane(reinterpret_cast<const IndexDataType*>(
                    kargs.sorted_expert_ids_ptr)[sorted_tile_id]);

            // index along intermediate_size
            // index_t hidden_idx = __builtin_amdgcn_readfirstlane(intermediate_tile_id *
            // BlockShape::Block_N0);
            index_t interm_idx_nr =
-                __builtin_amdgcn_readfirstlane(intermediate_tile_id * BlockShape::Block_Nr0);
+                amd_wave_read_first_lane(intermediate_tile_id * BlockShape::Block_Nr0);

            const auto a_coord = Pipeline::GetACoord(); // 2d thread offset, [i_row, i_col]
            const auto sorted_token_id =
--- a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
+++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
@@ -756,7 +756,7 @@ struct MoeSortingKernel
                                   void* smem) const
    {
        const index_t tid            = static_cast<index_t>(threadIdx.x);
-        const index_t wid            = __builtin_amdgcn_readfirstlane(tid / get_warp_size());
+        const index_t wid            = amd_wave_read_first_lane(tid / get_warp_size());
        const index_t lid            = __lane_id();
        constexpr index_t block_size = 256;           // blockDim.x;
        const index_t sub_tokens     = smem_rows - 2; // sub_tokens_mdiv.divisor;