diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_trload_qr_qtr_dor.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_trload_qr_qtr_dor.hpp index 8c8d2af486..6d90429407 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_trload_qr_qtr_dor.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_trload_qr_qtr_dor.hpp @@ -489,7 +489,7 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR move_tile_window(k_dram_window, {kN0, 0}); async_load_tile(v_lds_write_window, v_dram_window); move_tile_window(v_dram_window, {kN0, 0}); - // __builtin_amdgcn_s_waitcnt(0); + s_waitcnt(); k_reg_tensor = load_tile(k_lds_read_window); v_reg_tensor = load_tile(v_lds_read_window); kt_reg_tensor = load_tile_transpose(kt_lds_read_window); @@ -636,7 +636,7 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR } }(); store_tile(bias_lds_write_window, dbias); - __builtin_amdgcn_s_waitcnt(3952); + s_waitcnt(); block_sync_lds(); auto shuffled_dbias_tile = load_tile(dbias_lds_read_window); auto dbias_tile = make_static_distributed_tensor( @@ -664,7 +664,7 @@ struct BlockFmhaBwdDQDKDVPipelineTrLoadQRQTRDOR } store_tile(ds_lds_window, ds_gemm); } - __builtin_amdgcn_s_waitcnt(3952); + s_waitcnt(); block_sync_lds(); if constexpr(is_epilogue) {