[CK-Tile] Add the API to load SGPR (#2878)

* Have a workable version for SGPR
* Have a workable version for atomic add
* Revert "have a workable version for atomic add"
  This reverts commit 792377a590c26cfff9c8f545d9a9e8484a7422eb.
* Substitute with the new SGPR read API
* Update the CHANGELOG
* Have a workable version for atomic add
* Revert "have a workable version for atomic add"
  This reverts commit 792377a590c26cfff9c8f545d9a9e8484a7422eb.
* Change to static for logic
* Have a workable version for atomic add
* Revert "have a workable version for atomic add"
  This reverts commit 792377a590c26cfff9c8f545d9a9e8484a7422eb.
2026-05-03 05:01:25 +00:00 · 2025-09-23 01:23:56 -07:00
parent b6e8994386
commit 2cbbf5dcb3
40 changed files with 273 additions and 167 deletions
--- a/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp
+++ b/include/ck_tile/ops/smoothquant/kernel/moe_smoothquant_kernel.hpp
@@ -138,7 +138,7 @@ struct MoeSmoothquant
        const index_t i_topk  = blockIdx.x;
        const index_t i_token = blockIdx.y * Block_M;
        const index_t i_token_in_thrd =
-            __builtin_amdgcn_readfirstlane(threadIdx.x / Problem::BlockShape::ThreadPerBlock_N);
+            amd_wave_read_first_lane(threadIdx.x / Problem::BlockShape::ThreadPerBlock_N);

        const index_t i_expert = reinterpret_cast<const index_t*>(
            kargs.p_topk_ids)[(i_token + i_token_in_thrd) * kargs.topk + i_topk];
--- a/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/smoothquant/pipeline/smoothquant_pipeline_two_pass.hpp
@@ -57,7 +57,7 @@ struct SmoothquantPipelineTwoPass

        static constexpr index_t Block_N = Problem::BlockShape::Block_N;
        index_t num_n_tile_iteration =
-            __builtin_amdgcn_readfirstlane(integer_divide_ceil(row_size, Block_N));
+            amd_wave_read_first_lane(integer_divide_ceil(row_size, Block_N));

        auto reduce_absmax_func  = ReduceOp::AbsMax{};
        auto reduce_absmax3_func = [](auto acc_, auto v_0_, auto v_1_) {
@@ -77,7 +77,7 @@ struct SmoothquantPipelineTwoPass
        auto absmax       = block_reduce2d.template MakeYBlockTile<XTensorType>();
        set_tile(absmax, reduce_absmax_func.GetIdentityValue<ComputeDataType>());

-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
        {
            const auto x       = load_tile(x_window);
            const auto smscale = load_tile(smscale_window);
@@ -121,7 +121,7 @@ struct SmoothquantPipelineTwoPass
        move_tile_window(qy_window, {0, stride_to_right_most_window});

        // recompute y and quantize y to qy
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
+        for(int iN = amd_wave_read_first_lane(0); iN < num_n_tile_iteration; ++iN)
        {
            const auto x       = load_tile(x_window);
            const auto smscale = load_tile(smscale_window);