Fix for "Add the API to load SGPR" (#2913)

* Revert "Revert "[CK-Tile] Add the API to load SGPR (#2878)" (#2904)"

This reverts commit f161b5b738.

* Fix: sgpr minor issue

* cyclic dependency resolved

* clang formatted

* removing unused variable

* clang formatted

---------

Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
This commit is contained in:
Khushbu Agarwal
2025-09-25 10:32:42 -07:00
committed by GitHub
parent 64e61b8647
commit b56e5d1d79
41 changed files with 224 additions and 173 deletions

View File

@@ -707,8 +707,8 @@ struct FmhaBatchPrefillWithPagedKVCacheKernel
// divide problem
const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
long_index_t batch_offset_q = 0;
long_index_t batch_offset_bias = 0;

View File

@@ -690,7 +690,7 @@ struct FmhaBwdDQDKDVKernel
// divide problem
const auto [i_tile_n, i_nhead, i_batch] = GetTileIndex();
const index_t i_n0 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN0);
const index_t i_n0 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN0);
long_index_t batch_offset_q = 0;
long_index_t batch_offset_k = 0;
@@ -1338,7 +1338,7 @@ struct FmhaBwdOGradDotOKernel
// divide problem
const auto [i_tile_m, i_nhead, i_batch] = GetTileIndex();
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * kM0);
const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * kM0);
long_index_t batch_offset_o = 0;
long_index_t batch_offset_do = 0;
@@ -1618,7 +1618,7 @@ struct FmhaBwdConvertQGradKernel
// divide problem
const auto [i_tile_m, i_nhead, i_batch] = GetTileIndex();
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * kM0);
const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * kM0);
long_index_t batch_offset_dq = 0;
long_index_t batch_offset_dq_acc = 0;

View File

@@ -262,8 +262,8 @@ struct FmhaFwdAppendKVKernel
// divide problem
const auto [i_tile, i_nhead, i_batch] = GetTileIndex(kargs);
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile * FmhaPipeline::kM0);
const index_t i_n0 = __builtin_amdgcn_readfirstlane(i_tile * FmhaPipeline::kN0);
const index_t i_m0 = amd_wave_read_first_lane(i_tile * FmhaPipeline::kM0);
const index_t i_n0 = amd_wave_read_first_lane(i_tile * FmhaPipeline::kN0);
const index_t i_cache_batch = [&, i_batch_ = i_batch] {
if constexpr(kIsPagedKV)

View File

@@ -1062,8 +1062,8 @@ struct FmhaFwdKernel
// divide problem
const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
long_index_t batch_offset_q = 0;
long_index_t batch_offset_k = 0;

View File

@@ -880,8 +880,8 @@ struct FmhaFwdPagedKVKernel
// divide problem
const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
long_index_t batch_offset_q = 0;
long_index_t batch_offset_k = 0;

View File

@@ -281,8 +281,8 @@ struct FmhaFwdSplitKVCombineKernel
// divide problem
const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
long_index_t batch_offset_lse_acc = 0;
long_index_t batch_offset_o_acc = 0;

View File

@@ -589,8 +589,8 @@ struct FmhaFwdSplitKVKernel
// divide problem
const auto [i_tile_m, i_tile_n, i_split, i_nhead, i_batch] = GetTileIndex(kargs);
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
long_index_t batch_offset_q = 0;
long_index_t batch_offset_k = 0; // unused for paged-kvcache

View File

@@ -361,8 +361,8 @@ struct FmhaFwdV3Kernel
// divide problem
const auto [i_tile_m, i_tile_n, i_nhead, i_batch] = GetTileIndex(kargs);
const index_t i_m0 = __builtin_amdgcn_readfirstlane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = __builtin_amdgcn_readfirstlane(i_tile_n * FmhaPipeline::kN1);
const index_t i_m0 = amd_wave_read_first_lane(i_tile_m * FmhaPipeline::kM0);
const index_t i_n1 = amd_wave_read_first_lane(i_tile_n * FmhaPipeline::kN1);
long_index_t batch_offset_q = 0;
long_index_t batch_offset_k = 0;

View File

@@ -320,9 +320,9 @@ struct BlockFmhaFwdPagedKVPipelineQRKSVS
k_block_tile = load_tile(k_dram_window);
}
auto physical_next_block_id_k =
__builtin_amdgcn_readfirstlane(k_page_block_navigator.prefetch_table_id(
amd_wave_read_first_lane(k_page_block_navigator.prefetch_table_id(
i_page_block_k, k_dram_block_window, {kN0, 0}));
auto physical_next_block_id_v = __builtin_amdgcn_readfirstlane(
auto physical_next_block_id_v = amd_wave_read_first_lane(
v_page_block_navigator.prefetch_table_id(i_page_block_v, v_dram_window, {0, kK1}));
if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)

View File

@@ -321,9 +321,9 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
k_block_tile = load_tile(k_dram_window);
}
auto physical_next_block_id_k =
__builtin_amdgcn_readfirstlane(k_page_block_navigator.prefetch_table_id(
amd_wave_read_first_lane(k_page_block_navigator.prefetch_table_id(
i_page_block_k, k_dram_block_window, {kN0, 0}));
auto physical_next_block_id_v = __builtin_amdgcn_readfirstlane(
auto physical_next_block_id_v = amd_wave_read_first_lane(
v_page_block_navigator.prefetch_table_id(i_page_block_v, v_dram_window, {0, kK1}));
if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
@@ -618,7 +618,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
&i_page_block_v_ = i_page_block_v,
&v_dram_window_ = v_dram_window](auto i_k1) {
auto physical_next_block_id_v_ =
__builtin_amdgcn_readfirstlane(v_page_block_navigator.prefetch_table_id(
amd_wave_read_first_lane(v_page_block_navigator.prefetch_table_id(
i_page_block_v_, v_dram_window_, {0, kK1}));
const auto v = load_tile(v_dram_window_); // load next v
block_sync_lds();