From 49a79b7ab467db9c9ea5d2986c40bc2a2682eb39 Mon Sep 17 00:00:00 2001 From: Jakub Piasecki Date: Fri, 23 May 2025 12:46:26 +0000 Subject: [PATCH] tmp save, trying processing full kperblock but it doesnt work --- include/ck/stream_config.hpp | 4 +- ...d_preshuffle_multiple_abd_xdl_cshuffle.hpp | 51 ++++--- ...onv_preshuffle_multiple_d_xdl_cshuffle.hpp | 139 +++++++++++++----- .../gpu/grid/gridwise_gemm_pipeline_v1.hpp | 12 +- .../grid/gridwise_gemm_pipeline_v1_nchw.hpp | 73 +++++++-- .../transform_conv_fwd_to_gemm.hpp | 2 + ...ouped_conv_fwd_preshuffle_xdl_instance.hpp | 44 +++--- .../device_grouped_conv_fwd_xdl_instance.hpp | 2 +- 8 files changed, 230 insertions(+), 97 deletions(-) diff --git a/include/ck/stream_config.hpp b/include/ck/stream_config.hpp index 37ba250cf5..63c11ca0d1 100644 --- a/include/ck/stream_config.hpp +++ b/include/ck/stream_config.hpp @@ -11,8 +11,8 @@ struct StreamConfig hipStream_t stream_id_ = nullptr; bool time_kernel_ = false; int log_level_ = 0; - int cold_niters_ = 5; - int nrepeat_ = 50; + int cold_niters_ = 0;//5; + int nrepeat_ = 1;//50; bool flush_cache = false; int rotating_count = 1; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_preshuffle_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_preshuffle_multiple_abd_xdl_cshuffle.hpp index 4fad2bc2e9..8b0ad65437 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_preshuffle_multiple_abd_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_preshuffle_multiple_abd_xdl_cshuffle.hpp @@ -1264,6 +1264,7 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle { namespace ctc = tensor_layout::convolution; + const index_t W = arg.a_g_n_c_wis_lengths_[I4]; // assume 2d for now const index_t G = arg.b_g_k_c_xs_lengths_[I0]; const index_t K = arg.b_g_k_c_xs_lengths_[I1]; const index_t C = arg.b_g_k_c_xs_lengths_[I2]; @@ -1351,6 +1352,7 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle return false; } } + //printf("checking a vector \n"); // check vector access of A // FIXME: layout @@ -1362,7 +1364,7 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle is_same_v || is_same_v) { // Check access per C - if(!(ABlockTransferSrcVectorDim == 2 && C % ABlockTransferSrcScalarPerVector == 0)) + if(!(ABlockTransferSrcVectorDim == 2 && W % ABlockTransferSrcScalarPerVector == 0)) // changed from C % to H% { // If not possible, check access per G if(!(ABlockTransferSrcVectorDim == 1 && (C == 1 || NumGroupsToMerge == 1) && @@ -1371,15 +1373,17 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle is_NGCDHW_NGKDHW()) && G % ABlockTransferSrcScalarPerVector == 0)) { + printf("checking a vector failed \n"); return false; } } } else { + printf("checking a vector wtf \n"); return false; } - + // printf("checking b vector \n"); // check vector access of B // FIXME: layout if constexpr(is_same_v || is_same_v || @@ -1448,30 +1452,30 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle if constexpr(is_NGCHW_NGKHW() || is_NGCDHW_NGKDHW()) { - if((G * C) % CDEBlockTransferScalarPerVector_NPerBlock != 0) - { - return false; - } + // if((G * C) % CDEBlockTransferScalarPerVector_NPerBlock != 0) + // { + // return false; + // } - if((G * K) % CDEBlockTransferScalarPerVector_NPerBlock != 0) - { - return false; - } + // if((G * K) % CDEBlockTransferScalarPerVector_NPerBlock != 0) + // { + // return false; + // } - const index_t input_spatial_acum = ck::accumulate_n( - arg.a_g_n_c_wis_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>()); - const index_t output_spatial_acum = ck::accumulate_n( - arg.e_g_n_k_wos_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>()); + // const index_t input_spatial_acum = ck::accumulate_n( + // arg.a_g_n_c_wis_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>()); + // const index_t output_spatial_acum = ck::accumulate_n( + // arg.e_g_n_k_wos_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>()); - if(input_spatial_acum % CDEBlockTransferScalarPerVector_NPerBlock != 0) - { - return false; - } + // if(input_spatial_acum % CDEBlockTransferScalarPerVector_NPerBlock != 0) + // { + // return false; + // } - if(output_spatial_acum % CDEBlockTransferScalarPerVector_NPerBlock != 0) - { - return false; - } + // if(output_spatial_acum % CDEBlockTransferScalarPerVector_NPerBlock != 0) + // { + // return false; + // } // if(!arg.p_workspace_) // { @@ -1497,7 +1501,7 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle { return false; } - + // printf("checking e vector \n"); // check vector access of E if constexpr(is_same_v || is_same_v || is_same_v || is_same_v || @@ -1532,6 +1536,7 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle } else { + //printf("gonig into gemm validation\n"); return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_, arg.b_grid_desc_n_k_, arg.ds_grid_desc_m_n_, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_conv_preshuffle_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_conv_preshuffle_multiple_d_xdl_cshuffle.hpp index 508085529b..be616e1c45 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_conv_preshuffle_multiple_d_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_conv_preshuffle_multiple_d_xdl_cshuffle.hpp @@ -162,19 +162,19 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle make_tuple(NSlice, PaddedCSlice, HSlice, WSlice), make_tuple(PaddedCSlice * HSlice * WSlice, HSlice * WSlice, WSlice, I1)); - constexpr auto nchw_slice_sliced_desc = transform_tensor_descriptor( - nchw_slice_desc, - make_tuple(make_pass_through_transform(NSlice), - make_slice_transform(PaddedCSlice, I0, CSlice), - make_pass_through_transform(HSlice), - make_pass_through_transform(WSlice)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + // constexpr auto nchw_slice_sliced_desc = transform_tensor_descriptor( + // nchw_slice_desc, + // make_tuple(make_pass_through_transform(NSlice), + // make_slice_transform(PaddedCSlice, I0, CSlice), + // make_pass_through_transform(HSlice), + // make_pass_through_transform(WSlice)), + // make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + // make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); constexpr auto nchw_padded_slice_desc = transform_tensor_descriptor( - nchw_slice_sliced_desc, + nchw_slice_desc, make_tuple(make_pass_through_transform(NSlice), - make_pass_through_transform(CSlice), + make_pass_through_transform(PaddedCSlice), make_pass_through_transform(HSlice), make_pad_transform(WSlice, InLeftPadW, InRightPadW)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), @@ -183,7 +183,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle constexpr auto n_c_y_ho_x_wo_desc = transform_tensor_descriptor( nchw_padded_slice_desc, make_tuple(make_pass_through_transform(NSlice), - make_pass_through_transform(CSlice), + make_pass_through_transform(PaddedCSlice), make_embed_transform(make_tuple(Y, HoutSlice), make_tuple(ConvDilationH, ConvStrideH)), make_embed_transform(make_tuple(X, WoutSlice), @@ -196,19 +196,26 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle constexpr auto mk_desc = transform_tensor_descriptor( n_c_y_ho_x_wo_desc, make_tuple(make_merge_transform(make_tuple(NSlice, HoutSlice, WoutSlice)), - make_merge_transform(make_tuple(CSlice, Y, X))), + make_merge_transform(make_tuple(PaddedCSlice, Y, X))), make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - constexpr auto FinalMKPad = KPerBlock - TrueKPerBlock; - - constexpr auto mk_pad_desc = transform_tensor_descriptor( + constexpr auto mk_desc_sliced = transform_tensor_descriptor( mk_desc, make_tuple(make_pass_through_transform(NSlice * HoutSlice * WoutSlice), - make_right_pad_transform(CSlice * Y * X, FinalMKPad)), + make_slice_transform(PaddedCSlice * Y * X, I0, KPerBlock)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); + // constexpr auto FinalMKPad = KPerBlock - TrueKPerBlock; + + // constexpr auto mk_pad_desc = transform_tensor_descriptor( + // mk_desc, + // make_tuple(make_pass_through_transform(NSlice * HoutSlice * WoutSlice), + // make_right_pad_transform(PaddedCSlice * Y * X, FinalMKPad)), + // make_tuple(Sequence<0>{}, Sequence<1>{}), + // make_tuple(Sequence<0>{}, Sequence<1>{})); + // if(threadIdx.x == 0 && blockIdx.x == 0) { // printf("nchw pad slice%d\n", nchw_padded_slice_desc.GetElementSize()); // printf("nhowocyx %d\n", nhowocyx_desc.GetElementSize()); @@ -218,7 +225,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle constexpr auto AK0 = KPerBlock / AK1; - return transform_tensor_descriptor(mk_pad_desc, + return transform_tensor_descriptor(mk_desc_sliced, make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), make_pass_through_transform(MPerBlock)), make_tuple(Sequence<1>{}, Sequence<0>{}), @@ -233,6 +240,16 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle make_tuple(Number{} * AK1, AK1, I1)); } + // __host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1_padded() + // { + // // B matrix in LDS memory, dst of blockwise copy + // //constexpr BK0 = PaddedCSlice * X * Y; + + // return make_naive_tensor_descriptor( + // make_tuple(BK0PerBlock, Number{}, BK1), + // make_tuple(Number{} * BK1, BK1, I1)); + // } + __host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1() { // B matrix in LDS memory, dst of blockwise copy @@ -516,7 +533,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) { - const index_t num_loop = K / TrueKPerBlock; + const index_t num_loop = K / KPerBlock; return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); } @@ -620,6 +637,27 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle __device__ __host__ static constexpr auto GetMPerBlock() { return MPerBlock; } + template + __host__ __device__ static auto GetAGridDescriptor_NCHW_BlockSlice(const AGridDesc_NCHW& a_grid_desc_nchw, const index_t& m_block_idx) + { + + + const index_t Hout = PadH ? a_grid_desc_nchw.GetLength(I2) - Y + 1 : a_grid_desc_nchw.GetLength(I2); + + //const index_t m_block_data_idx_on_grid_nchw_n = __builtin_amdgcn_readfirstlane((m_block_idx * HoutSlice) / Hout); + //const index_t m_block_data_idx_on_grid_nchw_h = __builtin_amdgcn_readfirstlane((m_block_idx * HoutSlice) % Hout); + + return transform_tensor_descriptor( + a_grid_desc_nchw, + make_tuple(make_pass_through_transform(a_grid_desc_nchw.GetLength(I0)), + make_pass_through_transform(a_grid_desc_nchw.GetLength(I1)), + make_slice_transform(a_grid_desc_nchw.GetLength(I2), (m_block_idx * HoutSlice) % Hout, HSlice), + make_pass_through_transform(a_grid_desc_nchw.GetLength(I3))), + + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + } + template ,//ABlockTransferThreadClusterArrangeOrder, ADataType, AComputeDataType, - decltype(a_grid_desc_nchw), + decltype(a_grid_desc_nchw_block_slice), decltype(a_block1_desc_cslice_hslice_wslice), Sequence<0, 1, 2, 3>,//ABlockTransferSrcAccessOrder, Sequence<0, 1, 2, 3>,//Sequence<1, 0, 2>, @@ -742,8 +782,8 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle AThreadTransferSrcResetCoordinateAfterRun, true, NumGemmKPrefetchStage>( - a_grid_desc_nchw, - make_multi_index(m_block_data_idx_on_grid_nchw_n, 0, m_block_data_idx_on_grid_nchw_h, 0), + a_grid_desc_nchw_block_slice, + make_multi_index(m_block_data_idx_on_grid_nchw_n, 0, 0, 0), // H slice is done inside of grid desc a_element_op, a_block1_desc_cslice_hslice_wslice, make_multi_index(0, 0, 0, 0), @@ -770,7 +810,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle ABlockTransferDstScalarPerVector_AK1,//ABlockTransferDstScalarPerVector_AK1, 1, 1, - true,//AThreadTransferSrcResetCoordinateAfterRun, + false,// was triue nad working before changes//AThreadTransferSrcResetCoordinateAfterRun, true, NumGemmKPrefetchStage>( a_block1_desc_nhowo_cyx, @@ -904,9 +944,40 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle b_block_desc_bk0_n_bk1.GetElementSpaceSize()); //constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0); - constexpr auto a_block_slice_copy_step = make_multi_index(0, CSlice, 0, 0); + //constexpr auto a_block_slice_copy_step = make_multi_index(0, CSlice, 0, 0); // make multiple steps + // template + // using MultiIndex = StaticallyIndexedArray; + [[maybe_unused]] StaticallyIndexedArray a_block1_slice_copy_steps[9] = { + make_multi_index(0, CSlice, 1, 0), + make_multi_index(0, CSlice, 2, 0), + make_multi_index(0, CSlice, 2, 0), + make_multi_index(0, CSlice, 1, 0), + make_multi_index(0, CSlice, 2, 0), + make_multi_index(0, CSlice, 2, 0), + make_multi_index(0, CSlice, 1, 0), + make_multi_index(0, CSlice, 2, 0), + make_multi_index(0, CSlice, 2, 0) + }; + + [[maybe_unused]] StaticallyIndexedArray a_block2_slice_copy_steps[9] = { + make_multi_index(0, 0, 2), // 2 + make_multi_index(0, 0, -1), // -1 + make_multi_index(0, 0, -1), // -1 + make_multi_index(0, 0, 2), // 2 + make_multi_index(0, 0, -1), // -1 + make_multi_index(0, 0, -1), // -1 + make_multi_index(0, 0, 2), // 2 + make_multi_index(0, 0, -1), // -1 + make_multi_index(0, 0, -1) // -1 + }; + // constexpr auto a_block_slice_copy_step = make_multi_index(0, CSlice, 1, 0); // make multiple steps + // constexpr auto a_block_slice_copy_step = make_multi_index(0, CSlice, 2, 0); // make multiple steps + // constexpr auto a_block_slice_copy_step = make_multi_index(0, CSlice, 2, 0); // make multiple steps + //constexpr auto a_block2_slice_copy_step = make_multi_index(0, 0, 0, 2); // as Y % mod(KPerBlock - CSlice*Y*X) + //constexpr auto a_block2_slice_copy_step = make_multi_index(0, 0, 0, 1); // + //constexpr auto a_block2_slice_copy_step = make_multi_index(0, 0, 0, 0); //constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0); - constexpr auto b_block_slice_copy_step = make_multi_index(TrueKPerBlock / BK1, 0, TrueKPerBlock % BK1); + constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0); // gridwise GEMM pipeline const auto gridwise_gemm_pipeline = @@ -914,15 +985,14 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane( (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) / - TrueKPerBlock) + ((a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) % - TrueKPerBlock != 0); // TrueKPerBlock ? + KPerBlock); // TrueKPerBlock ? - // if(threadIdx.x == 0) { - // printf("hasmainkblockloop: %d\n", HasMainKBlockLoop); - // printf("Running %d k block iters", num_k_block_main_loop); - // } + if(threadIdx.x == 0 && blockIdx.x == 0) { + printf("hasmainkblockloop: %d\n", HasMainKBlockLoop); + printf("Running %d k block iters\n", num_k_block_main_loop); + } - gridwise_gemm_pipeline.template Run(a_grid_desc_nchw, // global + gridwise_gemm_pipeline.template Run(a_grid_desc_nchw_block_slice, // global a_block1_desc_cslice_hslice_wslice, // lds1 write a_block1_desc_nhowo_cyx, // lds1 read a_block2_desc_ak0_m_ak1, // lds2 write/read @@ -931,7 +1001,8 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle a_grid_buf, a_block_buf1, a_block_buf2, - a_block_slice_copy_step, + a_block1_slice_copy_steps, + a_block2_slice_copy_steps, b_grid_desc_bk0_n_bk1, b_block_desc_bk0_n_bk1, b_blockwise_copy, @@ -1028,7 +1099,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle ThreadwiseTensorSliceTransfer_v1r3 // if(threadIdx.x == 0){ // for(int m=0; m<128; ++m) { - // for(int k0=0; k0<8; ++k0) { + // for(int k0=0; k0<4; ++k0) { // for(int k1=0;k1<8;++k1) { // printf("A[%d][%d]=%f\n", m, k0*8+k1, static_cast(a_block_buf[k0*8*128 + m*8 + k1])); // } @@ -102,6 +102,16 @@ struct GridwiseGemmPipeline_v1<1, true, true> a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); + block_sync_lds(); + // if(threadIdx.x == 0){ + // for(int m=0; m<128; ++m) { + // for(int k0=0; k0<4; ++k0) { + // for(int k1=0;k1<8;++k1) { + // printf("A[%d][%d]=%f\n", m, k0*8+k1, static_cast(a_block_buf[k0*8*128 + m*8 + k1])); + // } + // } + // } + // } ++i; } while(i < (num_loop - 1)); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1_nchw.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1_nchw.hpp index 36fb73432b..6f9f11be6b 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1_nchw.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1_nchw.hpp @@ -36,7 +36,8 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true> typename AGridBuffer, typename ABlock1Buffer, typename ABlock2Buffer, - typename ABlockTransferStep, + typename ABlock1TransferSteps, + typename ABlock2TransferSteps, typename BGridDesc, typename BBlockDesc, typename BBlockTransfer, @@ -54,7 +55,8 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true> [[maybe_unused]] const AGridBuffer& a_grid_buf, [[maybe_unused]] ABlock1Buffer& a_block1_buf, [[maybe_unused]] ABlock2Buffer& a_block2_buf, - [[maybe_unused]] const ABlockTransferStep& a_block_copy_step, + [[maybe_unused]] const ABlock1TransferSteps& a_block1_copy_steps, + [[maybe_unused]] const ABlock2TransferSteps& a_block2_copy_steps, [[maybe_unused]] const BGridDesc& b_grid_desc, [[maybe_unused]] const BBlockDesc& b_block_desc, [[maybe_unused]] BBlockTransfer& b_blockwise_copy, @@ -69,7 +71,7 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true> a_blockwise_global_to_lds1_copy.RunRead(a_grid_desc, a_grid_buf); b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); - a_blockwise_global_to_lds1_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + a_blockwise_global_to_lds1_copy.MoveSrcSliceWindow(a_grid_desc, a_block1_copy_steps[0]); b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); //block_sync_lds(); a_blockwise_global_to_lds1_copy.RunWrite(a_block1_desc_nchw_slice, a_block1_buf); @@ -83,9 +85,11 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true> a_blockwise_lds1_to_lds2_copy.RunRead(a_block1_desc_nhowo_cyx, a_block1_buf);//a_block2_buf); ?? jaki bufor a_blockwise_lds1_to_lds2_copy.RunWrite(a_block2_desc_ak0_m_k1, a_block2_buf); //block_sync_lds(); + a_blockwise_lds1_to_lds2_copy.MoveSrcSliceWindow(a_block1_desc_nhowo_cyx, a_block2_copy_steps[0]); // if(threadIdx.x == 0) { - // for(int c=0; c<4; ++c) { + // printf("Next iter\n"); + // for(int c=0; c<5; ++c) { // for(int h=0; h<2;++h) { // for(int w=0; w<64;++w) { // printf("Ag[%d][%d][%d]=%f\n", c, h, w, static_cast(a_grid_buf[c * 64 * 2 + h * 64 + w])); @@ -93,9 +97,9 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true> // } // } - // // A[3][2][33]=0.000000 + // // // A[3][2][33]=0.000000 - // for(int c=0; c<8; ++c) { + // for(int c=0; c<4; ++c) { // for(int h=0; h<4;++h) { // for(int w=0; w<64;++w) { // printf("A[%d][%d][%d]=%f\n", c, h, w, static_cast(a_block1_buf[c * 64 * 4 + h * 64 + w])); @@ -104,14 +108,14 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true> // } // for(int m=0; m<128; ++m) { - // for(int k0=0; k0<8; ++k0) { + // for(int k0=0; k0<4; ++k0) { // for(int k1=0;k1<8;++k1) { // printf("A[%d][%d]=%f\n", m, k0*8+k1, static_cast(a_block2_buf[k0*8*128 + m*8 + k1])); - // int k = k0*8+k1; - // int x = k%3; - // int y = (k/3)%3; - // int c = k/(9); + // // int k = k0*8+k1; + // // int x = k%3; + // // int y = (k/3)%3; + // // int c = k/(9); // // A[m][k] = A[c, y+m/64,x+m%64] z oryginalnego obrazka @@ -142,6 +146,7 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true> do { // ask bartek how to structure this pipeline a_blockwise_global_to_lds1_copy.RunRead(a_grid_desc, a_grid_buf); // A Global -> VGPR + block_sync_lds(); b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); // B Global -> VGPR // @@ -151,13 +156,53 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true> block_sync_lds(); a_blockwise_lds1_to_lds2_copy.RunRead(a_block1_desc_nhowo_cyx, a_block1_buf); // A LDS1 -> LDS2 - a_blockwise_global_to_lds1_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + a_blockwise_lds1_to_lds2_copy.MoveSrcSliceWindow(a_block1_desc_nhowo_cyx, a_block2_copy_steps[i+1]); + a_blockwise_global_to_lds1_copy.MoveSrcSliceWindow(a_grid_desc, a_block1_copy_steps[i+1]); b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); - //block_sync_lds(); a_blockwise_lds1_to_lds2_copy.RunWrite(a_block2_desc_ak0_m_k1, a_block2_buf); b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); - //block_sync_lds(); + // block_sync_lds(); // delete later + + // if(threadIdx.x == 0) { + // printf("Next iter\n"); + + // // // A[3][2][33]=0.000000 + + // for(int c=0; c<4; ++c) { + // for(int h=0; h<4;++h) { + // for(int w=0; w<64;++w) { + // printf("A[%d][%d][%d]=%f\n", c, h, w, static_cast(a_block1_buf[c * 64 * 4 + h * 64 + w])); + // } + // } + // } + + // for(int m=0; m<128; ++m) { + // for(int k0=0; k0<4; ++k0) { + // for(int k1=0;k1<8;++k1) { + // printf("A[%d][%d]=%f\n", m, k0*8+k1, static_cast(a_block2_buf[k0*8*128 + m*8 + k1])); + + // // int k = k0*8+k1; + // // int x = k%3; + // // int y = (k/3)%3; + // // int c = k/(9); + + // // A[m][k] = A[c, y+m/64,x+m%64] z oryginalnego obrazka + + // // if(x+m%64 > 0 && x+m%64 < 127) { + // // auto lds1 = static_cast(a_block1_buf[c * 64 * 4 + (m/64 + y) * 64 + m%64 + x - 1]); + // // auto lds2 = static_cast(a_block2_buf[k0*8*128 + m*8 + k1]); + // // printf("lds2[%d][%d]:%f lds1[%d][%d][%d]:%f %s\n", m, k0*8+k1, lds2, c, y + m/64, m%64 + x - 1, lds1, (lds1 > lds2)? "diff" : ""); + + // // } + + // // if(lds1 > lds2) { + // // printf("diff lds1[%d][%d][%d]:%f lds2[%d][%d]:%f\n", c, y + m/64, m%64 + x, lds1, m, k0*8+k1 - 1, lds2); + // // } + // } + // } + // } + // } ++i; } while(i < (num_loop - 1)); diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp index eb15d8eeca..fb6c0c4466 100644 --- a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp +++ b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp @@ -1477,6 +1477,8 @@ struct TransformConvFwdToGemm } } +// NGCHW x KCYX +// NHoWoCYX KCYX = NHoWoK template , S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, - DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, - DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, - DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, - DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, - DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 0, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 0, 1, 1, S<1, 16, 1, 4>, 1>, + // DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + // DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + // DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + // DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + // DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 2>, // DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, // DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, - DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, - DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, - DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 1>, - DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, -// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, - DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, // best +// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, +// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, +// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 1>, +// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, +// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, +// // DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, +// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, +// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, +// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, +// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, +// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, +// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 1>, // best DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8>, // best DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, // best - DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8>, // best + DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8> // best - DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8> +// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, +// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8> // clang-format on diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp index a3b9228508..39c8837ec3 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp @@ -127,7 +127,7 @@ using device_grouped_conv_fwd_xdl_f16_instances = std::tuple< //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // // generic instance - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 0, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 0, 1, 1, S<1, 16, 1, 4>, 1> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 0, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 0, 1, 1, S<1, 16, 1, 4>, 1> // DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, // DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, // DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,