From 6575a782c6679e56d813b49c25ee81be2daf0b1d Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Fri, 4 Jul 2025 15:49:52 +0200 Subject: [PATCH] Always force output clearing for grouped conv bwd data (#2446) * Always force output clearing * don't run set zero for residual --------- Co-authored-by: Bartlomiej Kocot [ROCm/composable_kernel commit: 3d70c638d1a217869fe0d90636232d239786b4e5] --- ...uped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp index efb91bd13d..89a304fda4 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp @@ -611,7 +611,19 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 // If stride is larger than windows size then we will have some empty places image_covered_strides &= conv_filter_strides[d] <= b_g_k_c_xs_lengths[d + I3]; } + bool if_d_is_output_mem = false; + const void* out_mem_void = static_cast<const void*>(p_e); + static_for<0, NumDTensor, 1>{}([&](auto i) { + if(p_ds[i] == out_mem_void) + { + if_d_is_output_mem = true; + } + }); + bwd_needs_zero_out = k_batch_ > 1 || !image_covered_dilation || !image_covered_strides; + + // Temporary workaround until the above conditions are proven or fixed. + bwd_needs_zero_out = !if_d_is_output_mem; e_space_size_bytes = ck::accumulate_n( e_g_n_c_wis_lengths_.begin(), NDimSpatial + I3, 1, std::multiplies<>()) *