diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp index 56bdda0090..b1aa3803a9 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp @@ -180,8 +180,8 @@ __global__ void #if CK_USE_LAUNCH_BOUNDS __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #endif - kernel_grouped_conv_bwd_data_optimized(const ABDataType* __restrict__ p_weight, - const ABDataType* __restrict__ p_gradOut, + kernel_grouped_conv_bwd_data_optimized(const ABDataType* __restrict__ p_gradOut, + const ABDataType* __restrict__ p_weight, EDataType* __restrict__ p_gradIn) { int grp_idx = 64 * blockIdx.x;