diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp index 024ea6645b..27e5c4af3f 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp @@ -1058,7 +1058,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 auto launch_kernel = [&]() { // constexpr bool has_main_loop = has_main_k_block_loop.value; constexpr index_t GroupPerBlock = 64; - constexpr index_t BatchPerBlock = 64; + constexpr index_t BatchPerBlock = 8; const auto kernel = kernel_grouped_conv_bwd_data_optimized