diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
index 24dfddda16..917dca9e9c 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
@@ -6,8 +6,8 @@
 #include "tensor_descriptor_helper.hpp"
 #include "ConstantMatrixDescriptor.hpp"
 #include "blockwise_generic_tensor_slice_copy.hpp"
-#include "blockwise_gemm.hpp"
 #include "threadwise_generic_tensor_slice_copy.hpp"
+#include "blockwise_gemm.hpp"
 
 namespace ck {
 
@@ -115,16 +115,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
         constexpr index_t KBlockWork = K / KPerBlock;
         constexpr index_t BBlockWork = B / BPerBlock;
 
-        constexpr auto block_work_desc = transform_tensor_descriptor(
-            make_native_tensor_descriptor_packed(Sequence<KBlockWork, BBlockWork>{}),
-            make_tuple(Merge<Sequence<KBlockWork, BBlockWork>>{}),
-            make_tuple(Sequence<0, 1>{}),
-            make_tuple(Sequence<0>{}));
+        constexpr auto block_work_desc =
+            make_cluster_descriptor(Sequence<KBlockWork, BBlockWork>{});
 
-        const auto block_work_multi_id = block_work_desc.CalculateLowerIndex(get_block_1d_id());
+        const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id());
 
-        const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock;
-        const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock;
+        const index_t k_block_data_on_global = block_work_id[0] * KPerBlock;
+        const index_t b_block_data_on_global = block_work_id[1] * BPerBlock;
 
         // input tensor
         //     global memory
@@ -185,11 +182,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
 
         // weight tensor
        //     tensor descriptor in device memory, src of blockwise copy
-        constexpr auto wei_k_e_global_desc =
-            unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3);
-
-        constexpr auto wei_e_k_global_desc =
-            reorder_tensor_descriptor_given_upper2lower(wei_k_e_global_desc, Sequence<1, 0>{});
+        constexpr auto wei_e_k_global_desc = reorder_tensor_descriptor_given_upper2lower(
+            unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3), Sequence<1, 0>{});
 
         // tensor descriptor in LDS, dst of blockwise copy
         //     be careful of LDS alignment
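The change above swaps a hand-built Merge transform for the new
make_cluster_descriptor helper (added in tensor_descriptor_helper.hpp further
down in this patch). The mapping it performs is a row-major unflattening of
the block id. A minimal scalar model of that mapping, assuming the default
ArrangeOrder (this is an illustration, not the CK implementation):

    #include <cassert>
    #include <cstdio>

    // Model of block_work_desc.CalculateClusterIndex() for the 2D case
    // (KBlockWork x BBlockWork): with the default row-major arrange order,
    // the B dimension varies fastest with the flat block id.
    void calculate_cluster_index_2d(
        int id_1d, int k_block_work, int b_block_work, int& k, int& b)
    {
        assert(id_1d < k_block_work * b_block_work);
        k = id_1d / b_block_work;
        b = id_1d % b_block_work;
    }

    int main()
    {
        int k, b;
        calculate_cluster_index_2d(13, 4, 8, k, b); // KBlockWork 4, BBlockWork 8
        std::printf("block 13 -> k %d, b %d\n", k, b); // block 13 -> k 1, b 5
        return 0;
    }

The k/b offsets are then scaled by KPerBlock and BPerBlock exactly as before;
only the descriptor plumbing changed.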
"blockwise_gemm.hpp" namespace ck { @@ -103,13 +103,12 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf constexpr index_t BBlockWork = B / BPerBlock; constexpr auto block_work_desc = - make_ConstantTensorDescriptor_packed(Sequence{}); + make_cluster_descriptor(Sequence{}); - const auto block_work_multi_id = - block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); + const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id()); - const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock; - const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock; + const index_t k_block_data_on_global = block_work_id[0] * KPerBlock; + const index_t b_block_data_on_global = block_work_id[1] * BPerBlock; // input tensor // global mem @@ -157,21 +156,10 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf InBlockCopyDataPerAccess_B>( {0, b_block_data_on_global}, {0, 0}); -// weight tensor -// global mem -#if 0 - constexpr auto wei_e_k_global_desc = - transform_tensor_descriptor(wei_k_c_y_x_global_desc, - make_tuple(Merge>{}, PassThrough{}), - make_tuple(Sequence<1, 2, 3>{}, Sequence<0>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); -#else // hack - constexpr auto wei_e_k_global_desc_old = - WeiGlobalDesc::Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{}); - - constexpr auto wei_e_k_global_desc = make_native_tensor_descriptor( - wei_e_k_global_desc_old.GetLengths(), wei_e_k_global_desc_old.GetStrides()); -#endif + // weight tensor + // global mem + constexpr auto wei_e_k_global_desc = reorder_tensor_descriptor_given_upper2lower( + unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3), Sequence<1, 0>{}); // LDS // be careful of LDS alignment @@ -267,9 +255,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf // LDS double buffer: preload data into LDS { - blockwise_in_copy.template Run( + blockwise_in_copy.template Run( p_in_global, p_in_block_double); - blockwise_wei_copy.template Run( + blockwise_wei_copy.template Run( p_wei_global, p_wei_block_double); } @@ -292,8 +280,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf Float* p_wei_block_next = even_loop ? 
@@ -292,8 +280,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
             Float* p_wei_block_next =
                 even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double;
 
-            Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
-            Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
+            Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()];
+            Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()];
 
             blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
             blockwise_wei_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
@@ -301,26 +289,26 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
             __syncthreads();
 
             // LDS double buffer: load next data from device mem
-            blockwise_in_copy.template RunLoadRegisterBuffer(
-                p_in_global, p_in_register_buffer);
-            blockwise_wei_copy.template RunLoadRegisterBuffer(
-                p_wei_global, p_wei_register_buffer);
+            blockwise_in_copy
+                .template RunLoadThreadBuffer(
+                    p_in_global, p_in_thread_buffer);
+            blockwise_wei_copy
+                .template RunLoadThreadBuffer(
+                    p_wei_global, p_wei_thread_buffer);
 
             // LDS double buffer: GEMM on current data
             blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);
 
             // LDS double buffer: store next data to LDS
-            blockwise_in_copy.template RunStoreRegisterBuffer(
-                p_in_register_buffer, p_in_block_next);
-            blockwise_wei_copy.template RunStoreRegisterBuffer(
-                p_wei_register_buffer, p_wei_block_next);
+            blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, p_in_block_next);
+            blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, p_wei_block_next);
         }
     }
 
     // LDS double buffer: tail
     {
-        Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
-        Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
+        Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()];
+        Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()];
 
         // even iteration
         blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
@@ -329,19 +317,19 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
         __syncthreads();
 
         // LDS double buffer: load next data from device mem
-        blockwise_in_copy.template RunLoadRegisterBuffer(
-            p_in_global, p_in_register_buffer);
-        blockwise_wei_copy.template RunLoadRegisterBuffer(
-            p_wei_global, p_wei_register_buffer);
+        blockwise_in_copy.template RunLoadThreadBuffer(
+            p_in_global, p_in_thread_buffer);
+        blockwise_wei_copy.template RunLoadThreadBuffer(
+            p_wei_global, p_wei_thread_buffer);
 
         // LDS double buffer: GEMM on current data
         blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);
 
         // LDS double buffer: store next data to LDS
-        blockwise_in_copy.template RunStoreRegisterBuffer(
-            p_in_register_buffer, p_in_block_double + in_block_space);
-        blockwise_wei_copy.template RunStoreRegisterBuffer(
-            p_wei_register_buffer, p_wei_block_double + wei_block_space);
+        blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer,
+                                               p_in_block_double + in_block_space);
+        blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer,
+                                                p_wei_block_double + wei_block_space);
 
         // odd iteration
         __syncthreads();
@@ -402,10 +390,14 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
                          b_thread_data_on_global / B1,
                          b_thread_data_on_global % B1})
 #if 1
-                .template Run_generic
-#elif 1
+                .template Run_generic
+#else
                 // tweaking
                 .template Run_optimized_dst_address_calculation
 #endif
                 (p_out_thread, p_out_global);
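The RegisterBuffer -> ThreadBuffer renaming above is mechanical, but the loop
it lands in is the heart of the kernel, so a compact model helps. A host-side
sketch of the same double-buffering schedule, simplified to one tile per
iteration (consume_tile stands in for blockwise_gemm.Run; all names here are
illustrative, not CK APIs):

    #include <algorithm>
    #include <vector>

    void consume_tile(const float*) {} // stand-in for blockwise_gemm.Run

    // assumes num_tiles >= 1 and global_src.size() >= num_tiles * tile_size
    void double_buffered_loop(const std::vector<float>& global_src,
                              int num_tiles,
                              int tile_size)
    {
        std::vector<float> lds(2 * tile_size);       // two LDS halves
        std::vector<float> thread_buffer(tile_size); // per-thread staging

        // "preload data into LDS": tile 0 goes into the first half
        std::copy(global_src.begin(), global_src.begin() + tile_size, lds.begin());

        for(int i = 0; i + 1 < num_tiles; ++i)
        {
            float* now  = lds.data() + (i % 2) * tile_size;
            float* next = lds.data() + ((i + 1) % 2) * tile_size;

            // "RunLoadThreadBuffer": fetch tile i+1 while tile i is consumed
            std::copy(global_src.begin() + (i + 1) * tile_size,
                      global_src.begin() + (i + 2) * tile_size,
                      thread_buffer.begin());

            consume_tile(now); // GEMM on current data

            // "RunStoreThreadBuffer": publish tile i+1 into the other half
            std::copy(thread_buffer.begin(), thread_buffer.end(), next);
        }

        consume_tile(lds.data() + ((num_tiles - 1) % 2) * tile_size); // tail
    }

On the GPU, issuing the global-memory fetch into registers before the GEMM
lets the memory latency overlap the math; the rename makes it clear the
staging buffer lives in per-thread registers, not in LDS.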
diff --git a/composable_kernel/include/tensor_description/multi_index_transform.hpp b/composable_kernel/include/tensor_description/multi_index_transform.hpp
index cd70d05f2a..103904a908 100644
--- a/composable_kernel/include/tensor_description/multi_index_transform.hpp
+++ b/composable_kernel/include/tensor_description/multi_index_transform.hpp
@@ -132,7 +132,7 @@ struct Merge
     __host__ __device__ static constexpr auto GetUpperLengths()
     {
-        return Sequence<accumulate_on_sequence(
-            LowerLengths{}, math::multiplies<index_t>{}, Number<1>{})>{};
     }
diff --git a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp
index d4d80a2138..65fe698509 100644
--- a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp
+++ b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp
@@ -149,6 +149,7 @@ __host__ __device__ constexpr auto unfold_tensor_descriptor(NativeTensorDescriptor
 }
 
 #if 0
+// not implemented
 template
@@ ... @@
+template <typename Lengths,
+          typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type>
+struct ClusterDescriptor
+{
+    static constexpr index_t nDim = Lengths::Size();
+
+    static constexpr auto mDesc = transform_tensor_descriptor(
+        make_native_tensor_descriptor_packed(Lengths{}),
+        make_tuple(Merge<decltype(Lengths::ReorderGivenNew2Old(ArrangeOrder{}))>{}),
+        make_tuple(ArrangeOrder{}),
+        make_tuple(Sequence<0>{}));
+
+    __host__ __device__ constexpr ClusterDescriptor()
+    {
+        static_assert(Lengths::Size() == nDim && ArrangeOrder::Size() == nDim,
+                      "wrong! size not the same");
+
+        static_assert(is_valid_sequence_map<ArrangeOrder>{}, "wrong! ArrangeOrder is wrong");
+    }
+
+    __host__ __device__ static constexpr index_t GetElementSize() { return mDesc.GetElementSize(); }
+
+    __host__ __device__ static constexpr auto CalculateClusterIndex(index_t idx_1d)
+    {
+        return mDesc.CalculateLowerIndex(MultiIndex<1>{idx_1d});
+    }
+};
+
+template <typename Lengths,
+          typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type>
+__host__ __device__ constexpr auto make_cluster_descriptor(
+    Lengths, ArrangeOrder order = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type{})
+{
+    return ClusterDescriptor<Lengths, decltype(order)>{};
+}
+
 template <typename... Ts>
 __host__ __device__ void
 print_tensor_descriptor(const char* s, const NativeTensorDescriptor<Ts...>& desc)
diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
index c7005515a1..b662440b1e 100644
--- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -2,13 +2,10 @@
 #define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_HPP
 
 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMergedTensorDescriptor.hpp"
-#include "tensor_coordinate.hpp"
-#include "tensor_view.hpp"
-#include "threadwise_generic_tensor_slice_copy.hpp"
 #include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
 #include "tensor_coordinate_v2.hpp"
+#include "threadwise_generic_tensor_slice_copy.hpp"
 
 #ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
@@ -16,6 +13,8 @@
 
 namespace ck {
 
+#if 0
+
 // Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor.
 // Memory layout (ordering of dimensions) can be different between src and dst.
 // These functions assume each thread is reading and writing a normal (not merged) tensor,
@@ -677,6 +676,8 @@ struct BlockwiseGenericTensorSliceCopy_v3
     ThreadwiseStore mThreadwiseStore;
 };
 
+#endif
+
 template <index_t BlockSize,
@@ ... @@ struct BlockwiseGenericTensorSliceCopy_v4
         static_assert(is_same<SliceLengths,
                               decltype(ThreadSliceLengths{} * ThreadClusterLengths{})>{},
                      "wrong! threads should be mapped to cover entire slicing window");
 
-#if 1
-        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
-            ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
-#else
-        constexpr auto thread_cluster_lengths_in_arrange_order =
-            ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{});
-
-        constexpr auto thread_cluster_desc = transform_tensor_descriptor(
-            make_native_tensor_descriptor_packed(thread_cluster_lengths_in_arrange_order),
-            make_tuple(Merge{}),
-            make_tuple(arithmetic)
-
-                ::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
-
-        static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
-                      "wrong! BlockSize not consistent with ThreadClusterLengths");
-
-        constexpr auto thread_cluster_id = transform_tensor_descriptor(
-            make_native_tensor_descriptor_packed(Sequence{}),
-            make_tuple(Merge{}),
-            make_tuple(Sequence<0, 1>{}),
-            make_tuple(Sequence<0>{}));
-
-        const auto block_work_multi_id = block_work_desc.CalculateLowerIndex(get_block_1d_id());
-#endif
+        // map threads to cluster
+        constexpr auto thread_cluster_desc =
+            make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
 
         static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
                       "wrong! BlockSize not consistent with ThreadClusterLengths");
 
         const auto thread_cluster_id =
-            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
+            thread_cluster_desc.CalculateClusterIndex(get_thread_local_1d_id());
 
-        const auto data_cluster_id =
-            reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});
-
-        const auto thread_data_id_begin = data_cluster_id * ThreadSliceLengths{};
+        const auto thread_data_id_begin = thread_cluster_id * ThreadSliceLengths{};
 
         mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
         mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>());
diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
index 0fe14ecb9d..8012d27519 100644
--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -2,11 +2,8 @@
 #define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_HPP
 
 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMergedTensorDescriptor.hpp"
-#include "tensor_coordinate.hpp"
-#include "tensor_view.hpp"
 #include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
 #include "tensor_coordinate_v2.hpp"
 
 #ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
@@ -23,6 +20,8 @@
 
 namespace ck {
 
+#if 0
+
 // This threadwise copy allows vector access of src and dst.
 // It allows the dimensions of vector access to be different on src and dst.
 // It also allows the vector size to be different on src and dst.
@@ -1121,6 +1120,8 @@ struct ThreadwiseGenericTensorSliceCopy_v3r1
     DstSlice mDstSlice;
 };
 
+#endif
+
 // This version uses multi-index transformation.
 // This threadwise copy allows vector access of src and dst.
 // It allows the vector size to be different on src and dst.
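Worth spelling out what the blockwise-copy change buys: the old path computed
the thread index in arrange order and then reordered it back to data order
with reorder_array_given_old2new, while ClusterDescriptor folds the reorder
into the descriptor itself, so CalculateClusterIndex already returns
data-order coordinates. A scalar model for the 2D case (illustration only,
not the CK code):

    #include <array>
    #include <cstdio>

    // Model of ClusterDescriptor<Lengths, ArrangeOrder> for 2 dimensions:
    // the dimension listed last in ArrangeOrder varies fastest with the
    // flat thread id; the result comes back already in data order.
    std::array<int, 2> calculate_cluster_index(int id,
                                               std::array<int, 2> lengths,
                                               std::array<int, 2> arrange_order)
    {
        std::array<int, 2> idx{};
        for(int i = 1; i >= 0; --i) // walk arrange order from fastest to slowest
        {
            int dim  = arrange_order[i];
            idx[dim] = id % lengths[dim];
            id /= lengths[dim];
        }
        return idx;
    }

    int main()
    {
        // ThreadClusterLengths {4, 8}, ThreadClusterArrangeOrder {1, 0}:
        // dim 0 is fastest, so thread 13 -> (13 % 4, 13 / 4) = (1, 3)
        auto idx = calculate_cluster_index(13, {4, 8}, {1, 0});
        std::printf("thread 13 -> (%d, %d)\n", idx[0], idx[1]);
        return 0;
    }

Each thread then scales its cluster index by ThreadSliceLengths to find the
origin of its slice, as in thread_data_id_begin above.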
diff --git a/composable_kernel/include/utility/sequence.hpp b/composable_kernel/include/utility/sequence.hpp
index 85855780e3..ac4cf5eb5e 100644
--- a/composable_kernel/include/utility/sequence.hpp
+++ b/composable_kernel/include/utility/sequence.hpp
@@ -473,6 +473,13 @@ struct sequence_sort_impl<Sequence<ValueX>, Sequence<IdX>, Compare>
     using sorted_ids = Sequence<IdX>;
 };
 
+template <typename Compare>
+struct sequence_sort_impl<Sequence<>, Sequence<>, Compare>
+{
+    using sorted_values = Sequence<>;
+    using sorted_ids    = Sequence<>;
+};
+
 template <typename Values, typename Compare>
 struct sequence_sort
 {
diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
index 0a0f988b1f..844af7f8dc 100644
--- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
@@ -177,52 +177,52 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc,
 
     printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
 
+    constexpr auto gridwise_conv =
+#if 0
+        GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded
+#else
+        GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer
+#endif
+        {};
+
     for(index_t i = 0; i < nrepeat; ++i)
     {
-        constexpr auto gridwise_conv =
-#if 0
-            GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded
-#else
-            GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer
-#endif
-            {};
-
         float time = launch_kernel(run_gridwise_convolution_kernel,
                                    dim3(GridSize),
                                    dim3(BlockSize),
diff --git a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp
index 974b89e0c9..3e01cef737 100644
--- a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp
@@ -51,7 +51,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(InDesc,
     wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
 
-#if 1
+#if 0
     constexpr index_t BlockSize = 256;
 
     constexpr index_t BPerBlock = 128;
diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp
index 98139081ca..d325e5f71f 100644
--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp
@@ -14,7 +14,7 @@
 //#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
 //#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
+//#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp"
 //#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp"
@@ -103,7 +103,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
 
-#elif 0
+#elif 1
     // 1x1 filter, 8x8 image
     // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
     constexpr index_t N = 64;
@@ -366,6 +366,10 @@ int main(int argc, char* argv[])
     ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
     ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
     ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
print_sequence("LeftPads", LeftPads{}); + print_sequence("RightPads", RightPads{}); + print_sequence("ConvStrides", ConvStrides{}); + print_sequence("ConvDilations", ConvDilations{}); using in_data_t = float; using out_data_t = float; @@ -444,7 +448,7 @@ int main(int argc, char* argv[]) ConvStrides{}, ConvDilations{}, nrepeat); -#elif 1 +#elif 0 device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(in_nchw_desc, in_nchw, wei_kcyx_desc, @@ -486,7 +490,7 @@ int main(int argc, char* argv[]) ConvStrides{}, ConvDilations{}, nrepeat); -#elif 0 +#elif 1 device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(in_nchw_desc, in_nchw, wei_kcyx_desc,