From b29bf601b0ed25e66524425f7b19799b937e18d9 Mon Sep 17 00:00:00 2001 From: Anthony Chang Date: Tue, 21 Jun 2022 23:15:31 +0800 Subject: [PATCH] bring up to date with the usage of __builtin_amdgcn_sched_barrier (#293) [ROCm/composable_kernel commit: 1ae241092f47a7bf78857a8545f84790e70bf1aa] --- .../gpu/block/blockwise_gemm_xdlops.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp index a989cb5297..b93d5ff839 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp @@ -438,7 +438,7 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1 make_tuple(n0, I0, I0, I0), b_thread_buf); }); - __builtin_amdgcn_sched_barrier(); + __builtin_amdgcn_sched_barrier(0); // NOTE: Synchronize threads in a workgroup at the start of each MAC cluster, but except // the first, as we can shorten non-MAC cluster a bit and there's no observable negative // impact. The desired effect is waves in a workgroup executing MAC in sync. This avoids @@ -448,7 +448,7 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1 if constexpr(k.value != 0 || KPerInnerLoop == KPerThread) { asm volatile("s_barrier" ::); - __builtin_amdgcn_sched_barrier(); + __builtin_amdgcn_sched_barrier(0); } static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) { static_for<0, MRepeat, 1>{}([&](auto m0) { @@ -480,9 +480,9 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1 k_.value == KPerInnerLoop - KPack && m0.value == MRepeat - 1 && n0.value == NRepeat - 1) { - __builtin_amdgcn_sched_barrier(); + __builtin_amdgcn_sched_barrier(0); block_sync_lds(); - __builtin_amdgcn_sched_barrier(); + __builtin_amdgcn_sched_barrier(0); } // TODO: insert setprio in more precise manner since we @@ -493,16 +493,16 @@ struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1 c_thread_buf.GetVectorTypeReference(Number{})); if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0) { - __builtin_amdgcn_sched_barrier(); + __builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_s_setprio(1); - __builtin_amdgcn_sched_barrier(); + __builtin_amdgcn_sched_barrier(0); } }); }); }); - __builtin_amdgcn_sched_barrier(); + __builtin_amdgcn_sched_barrier(0); __builtin_amdgcn_s_setprio(0); - __builtin_amdgcn_sched_barrier(); + __builtin_amdgcn_sched_barrier(0); }); }