diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v1.hpp index f25648efa6..004a867898 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v1.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v1.hpp @@ -289,6 +289,10 @@ struct BlockwiseGemmWmmaops_pipeline_v1((i + 2) % num_loop_per_scale == 0); + if constexpr(ck::is_same::value == false) + { + block_sync_lds(); + } a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); @@ -631,6 +635,10 @@ struct BlockwiseGemmWmmaops_pipeline_v1((i + 2) % num_loop_per_scale == 0); + if constexpr(ck::is_same::value == false) + { + block_sync_lds(); + } a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);