From e98473853487c1a658d982605d47ad9bd2e4d856 Mon Sep 17 00:00:00 2001 From: Yi DING Date: Fri, 19 Sep 2025 21:45:02 +0800 Subject: [PATCH] [CK_TILE] FMHA BWD Fix Decode Accuracy (#2881) * [CK_TILE] FMHA BWD Fix Decode Accuracy * use s_waitcnt utils [ROCm/composable_kernel commit: 6cf3fdd21c502249767f814a087fbd9be88013eb] --- .../block_fmha_bwd_dq_dk_dv_pipeline_trload_qr_qtr_dor.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_trload_qr_qtr_dor.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_trload_qr_qtr_dor.hpp index 8c8d2af486..6d90429407 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_trload_qr_qtr_dor.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_trload_qr_qtr_dor.hpp @@ -489,7 +489,7 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR move_tile_window(k_dram_window, {kN0, 0}); async_load_tile(v_lds_write_window, v_dram_window); move_tile_window(v_dram_window, {kN0, 0}); - // __builtin_amdgcn_s_waitcnt(0); + s_waitcnt(); k_reg_tensor = load_tile(k_lds_read_window); v_reg_tensor = load_tile(v_lds_read_window); kt_reg_tensor = load_tile_transpose(kt_lds_read_window); @@ -636,7 +636,7 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR } }(); store_tile(bias_lds_write_window, dbias); - __builtin_amdgcn_s_waitcnt(3952); + s_waitcnt(); block_sync_lds(); auto shuffled_dbias_tile = load_tile(dbias_lds_read_window); auto dbias_tile = make_static_distributed_tensor( @@ -664,7 +664,7 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR } store_tile(ds_lds_window, ds_gemm); } - __builtin_amdgcn_s_waitcnt(3952); + s_waitcnt(); block_sync_lds(); if constexpr(is_epilogue) {