Fix the calculation of the number of instructions used by sched_group_barrier

2026-07-15 11:34:54 +00:00 · 2025-07-15 08:59:48 +00:00
parent 0206b3442a
commit c01695597e
1 changed files with 5 additions and 4 deletions
--- a/example/ck_tile/18_hstu_attention/hstu_attention_fwd_pipeline.hpp
+++ b/example/ck_tile/18_hstu_attention/hstu_attention_fwd_pipeline.hpp
@@ -363,9 +363,10 @@ struct HstuAttentionFwdPipelineQRKSVS
                if constexpr(HstuMask::kUseLocal)
                {
                    constexpr index_t V_VMEM_LOAD_INST = (kN1 * kK1) / kBlockSize / kAlignmentV;
-                    constexpr index_t K_VMEM_LOAD_INST = (kN0 * kK0) / kBlockSize / kAlignmentV;
-                    constexpr index_t K_LDS_WRITE_INST =
-                        (kN0 * kK0) / kBlockSize / Policy::template GetSmemKPackK<Problem>();
+                    constexpr index_t K_VMEM_LOAD_INST =
+                        (kK1 * kSubQKHeaddim) / kBlockSize / kAlignmentK;
+                    constexpr index_t K_LDS_WRITE_INST = (kK1 * kSubQKHeaddim) / kBlockSize /
+                                                         Policy::template GetSmemKPackK<Problem>();
                    constexpr index_t MFMA_INST       = (kM0 * kSubQKHeaddim) / kBlockSize / 4;
                    constexpr index_t K_LDS_READ_INST = MFMA_INST / kGemmNumRepM;

@@ -401,7 +402,7 @@ struct HstuAttentionFwdPipelineQRKSVS

                    __builtin_amdgcn_sched_group_barrier(0x00000008, kGemmNumRepM, 0);

-                    __builtin_amdgcn_sched_barrier(0);
+                    __builtin_amdgcn_sched_barrier(0x00000001);
                }
                else
                {