diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp index 97db0f95c4..34ba8d6c47 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp @@ -163,7 +163,9 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP amd_wave_read_first_lane(integer_divide_ceil(seqlen_q_end - seqlen_q_start, kM0)); // check early exit if no work to do. - if(num_total_loop <= 0) + // __builtin_expect is load-bearing: omitting it causes incorrect AGPR allocation in + // the dK/dV accumulation loop on some compiler versions, leading to wrong results. + if(__builtin_expect(num_total_loop <= 0, 0)) { // Note: here dk_acc&dv_acc are all cleared, return it return make_tuple(dk_acc, dv_acc);