diff --git a/include/ck/stream_config.hpp b/include/ck/stream_config.hpp index 63c11ca0d1..37ba250cf5 100644 --- a/include/ck/stream_config.hpp +++ b/include/ck/stream_config.hpp @@ -11,8 +11,8 @@ struct StreamConfig hipStream_t stream_id_ = nullptr; bool time_kernel_ = false; int log_level_ = 0; - int cold_niters_ = 0;//5; - int nrepeat_ = 1;//50; + int cold_niters_ = 5; + int nrepeat_ = 50; bool flush_cache = false; int rotating_count = 1; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_preshuffle_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_preshuffle_multiple_abd_xdl_cshuffle.hpp index 8b0ad65437..4fad2bc2e9 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_preshuffle_multiple_abd_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_preshuffle_multiple_abd_xdl_cshuffle.hpp @@ -1264,7 +1264,6 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle { namespace ctc = tensor_layout::convolution; - const index_t W = arg.a_g_n_c_wis_lengths_[I4]; // assume 2d for now const index_t G = arg.b_g_k_c_xs_lengths_[I0]; const index_t K = arg.b_g_k_c_xs_lengths_[I1]; const index_t C = arg.b_g_k_c_xs_lengths_[I2]; @@ -1352,7 +1351,6 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle return false; } } - //printf("checking a vector \n"); // check vector access of A // FIXME: layout @@ -1364,7 +1362,7 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle is_same_v || is_same_v) { // Check access per C - if(!(ABlockTransferSrcVectorDim == 2 && W % ABlockTransferSrcScalarPerVector == 0)) // changed from C % to H% + if(!(ABlockTransferSrcVectorDim == 2 && C % ABlockTransferSrcScalarPerVector == 0)) { // If not possible, check access per G if(!(ABlockTransferSrcVectorDim == 1 && (C == 1 || NumGroupsToMerge == 1) && @@ -1373,17 +1371,15 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle is_NGCDHW_NGKDHW()) && G % ABlockTransferSrcScalarPerVector == 0)) { - printf("checking a vector failed \n"); return false; } } } else { - printf("checking a vector wtf \n"); return false; } - // printf("checking b vector \n"); + // check vector access of B // FIXME: layout if constexpr(is_same_v || is_same_v || @@ -1452,30 +1448,30 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle if constexpr(is_NGCHW_NGKHW() || is_NGCDHW_NGKDHW()) { - // if((G * C) % CDEBlockTransferScalarPerVector_NPerBlock != 0) - // { - // return false; - // } + if((G * C) % CDEBlockTransferScalarPerVector_NPerBlock != 0) + { + return false; + } - // if((G * K) % CDEBlockTransferScalarPerVector_NPerBlock != 0) - // { - // return false; - // } + if((G * K) % CDEBlockTransferScalarPerVector_NPerBlock != 0) + { + return false; + } - // const index_t input_spatial_acum = ck::accumulate_n( - // arg.a_g_n_c_wis_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>()); - // const index_t output_spatial_acum = ck::accumulate_n( - // arg.e_g_n_k_wos_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>()); + const index_t input_spatial_acum = ck::accumulate_n( + arg.a_g_n_c_wis_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>()); + const index_t output_spatial_acum = ck::accumulate_n( + arg.e_g_n_k_wos_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>()); - // if(input_spatial_acum % CDEBlockTransferScalarPerVector_NPerBlock != 0) - // { - // return false; - // } + if(input_spatial_acum % CDEBlockTransferScalarPerVector_NPerBlock != 0) + { + return false; + } - // if(output_spatial_acum % CDEBlockTransferScalarPerVector_NPerBlock != 0) - // { - // return false; - // } + if(output_spatial_acum % CDEBlockTransferScalarPerVector_NPerBlock != 0) + { + return false; + } // if(!arg.p_workspace_) // { @@ -1501,7 +1497,7 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle { return false; } - // printf("checking e vector \n"); + // check vector access of E if constexpr(is_same_v || is_same_v || is_same_v || is_same_v || @@ -1536,7 +1532,6 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle } else { - //printf("gonig into gemm validation\n"); return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_, arg.b_grid_desc_n_k_, arg.ds_grid_desc_m_n_, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_conv_preshuffle_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_conv_preshuffle_multiple_d_xdl_cshuffle.hpp index be616e1c45..508085529b 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_conv_preshuffle_multiple_d_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_conv_preshuffle_multiple_d_xdl_cshuffle.hpp @@ -162,19 +162,19 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle make_tuple(NSlice, PaddedCSlice, HSlice, WSlice), make_tuple(PaddedCSlice * HSlice * WSlice, HSlice * WSlice, WSlice, I1)); - // constexpr auto nchw_slice_sliced_desc = transform_tensor_descriptor( - // nchw_slice_desc, - // make_tuple(make_pass_through_transform(NSlice), - // make_slice_transform(PaddedCSlice, I0, CSlice), - // make_pass_through_transform(HSlice), - // make_pass_through_transform(WSlice)), - // make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - // make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - - constexpr auto nchw_padded_slice_desc = transform_tensor_descriptor( + constexpr auto nchw_slice_sliced_desc = transform_tensor_descriptor( nchw_slice_desc, make_tuple(make_pass_through_transform(NSlice), - make_pass_through_transform(PaddedCSlice), + make_slice_transform(PaddedCSlice, I0, CSlice), + make_pass_through_transform(HSlice), + make_pass_through_transform(WSlice)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + constexpr auto nchw_padded_slice_desc = transform_tensor_descriptor( + nchw_slice_sliced_desc, + make_tuple(make_pass_through_transform(NSlice), + make_pass_through_transform(CSlice), make_pass_through_transform(HSlice), make_pad_transform(WSlice, InLeftPadW, InRightPadW)), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), @@ -183,7 +183,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle constexpr auto n_c_y_ho_x_wo_desc = transform_tensor_descriptor( nchw_padded_slice_desc, make_tuple(make_pass_through_transform(NSlice), - make_pass_through_transform(PaddedCSlice), + make_pass_through_transform(CSlice), make_embed_transform(make_tuple(Y, HoutSlice), make_tuple(ConvDilationH, ConvStrideH)), make_embed_transform(make_tuple(X, WoutSlice), @@ -196,26 +196,19 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle constexpr auto mk_desc = transform_tensor_descriptor( n_c_y_ho_x_wo_desc, make_tuple(make_merge_transform(make_tuple(NSlice, HoutSlice, WoutSlice)), - make_merge_transform(make_tuple(PaddedCSlice, Y, X))), + make_merge_transform(make_tuple(CSlice, Y, X))), make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - constexpr auto mk_desc_sliced = transform_tensor_descriptor( + constexpr auto FinalMKPad = KPerBlock - TrueKPerBlock; + + constexpr auto mk_pad_desc = transform_tensor_descriptor( mk_desc, make_tuple(make_pass_through_transform(NSlice * HoutSlice * WoutSlice), - make_slice_transform(PaddedCSlice * Y * X, I0, KPerBlock)), + make_right_pad_transform(CSlice * Y * X, FinalMKPad)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - // constexpr auto FinalMKPad = KPerBlock - TrueKPerBlock; - - // constexpr auto mk_pad_desc = transform_tensor_descriptor( - // mk_desc, - // make_tuple(make_pass_through_transform(NSlice * HoutSlice * WoutSlice), - // make_right_pad_transform(PaddedCSlice * Y * X, FinalMKPad)), - // make_tuple(Sequence<0>{}, Sequence<1>{}), - // make_tuple(Sequence<0>{}, Sequence<1>{})); - // if(threadIdx.x == 0 && blockIdx.x == 0) { // printf("nchw pad slice%d\n", nchw_padded_slice_desc.GetElementSize()); // printf("nhowocyx %d\n", nhowocyx_desc.GetElementSize()); @@ -225,7 +218,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle constexpr auto AK0 = KPerBlock / AK1; - return transform_tensor_descriptor(mk_desc_sliced, + return transform_tensor_descriptor(mk_pad_desc, make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), make_pass_through_transform(MPerBlock)), make_tuple(Sequence<1>{}, Sequence<0>{}), @@ -240,16 +233,6 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle make_tuple(Number{} * AK1, AK1, I1)); } - // __host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1_padded() - // { - // // B matrix in LDS memory, dst of blockwise copy - // //constexpr BK0 = PaddedCSlice * X * Y; - - // return make_naive_tensor_descriptor( - // make_tuple(BK0PerBlock, Number{}, BK1), - // make_tuple(Number{} * BK1, BK1, I1)); - // } - __host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1() { // B matrix in LDS memory, dst of blockwise copy @@ -533,7 +516,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) { - const index_t num_loop = K / KPerBlock; + const index_t num_loop = K / TrueKPerBlock; return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); } @@ -637,27 +620,6 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle __device__ __host__ static constexpr auto GetMPerBlock() { return MPerBlock; } - template - __host__ __device__ static auto GetAGridDescriptor_NCHW_BlockSlice(const AGridDesc_NCHW& a_grid_desc_nchw, const index_t& m_block_idx) - { - - - const index_t Hout = PadH ? a_grid_desc_nchw.GetLength(I2) - Y + 1 : a_grid_desc_nchw.GetLength(I2); - - //const index_t m_block_data_idx_on_grid_nchw_n = __builtin_amdgcn_readfirstlane((m_block_idx * HoutSlice) / Hout); - //const index_t m_block_data_idx_on_grid_nchw_h = __builtin_amdgcn_readfirstlane((m_block_idx * HoutSlice) % Hout); - - return transform_tensor_descriptor( - a_grid_desc_nchw, - make_tuple(make_pass_through_transform(a_grid_desc_nchw.GetLength(I0)), - make_pass_through_transform(a_grid_desc_nchw.GetLength(I1)), - make_slice_transform(a_grid_desc_nchw.GetLength(I2), (m_block_idx * HoutSlice) % Hout, HSlice), - make_pass_through_transform(a_grid_desc_nchw.GetLength(I3))), - - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - } - template ,//ABlockTransferThreadClusterArrangeOrder, ADataType, AComputeDataType, - decltype(a_grid_desc_nchw_block_slice), + decltype(a_grid_desc_nchw), decltype(a_block1_desc_cslice_hslice_wslice), Sequence<0, 1, 2, 3>,//ABlockTransferSrcAccessOrder, Sequence<0, 1, 2, 3>,//Sequence<1, 0, 2>, @@ -782,8 +742,8 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle AThreadTransferSrcResetCoordinateAfterRun, true, NumGemmKPrefetchStage>( - a_grid_desc_nchw_block_slice, - make_multi_index(m_block_data_idx_on_grid_nchw_n, 0, 0, 0), // H slice is done inside of grid desc + a_grid_desc_nchw, + make_multi_index(m_block_data_idx_on_grid_nchw_n, 0, m_block_data_idx_on_grid_nchw_h, 0), a_element_op, a_block1_desc_cslice_hslice_wslice, make_multi_index(0, 0, 0, 0), @@ -810,7 +770,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle ABlockTransferDstScalarPerVector_AK1,//ABlockTransferDstScalarPerVector_AK1, 1, 1, - false,// was triue nad working before changes//AThreadTransferSrcResetCoordinateAfterRun, + true,//AThreadTransferSrcResetCoordinateAfterRun, true, NumGemmKPrefetchStage>( a_block1_desc_nhowo_cyx, @@ -944,40 +904,9 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle b_block_desc_bk0_n_bk1.GetElementSpaceSize()); //constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0); - //constexpr auto a_block_slice_copy_step = make_multi_index(0, CSlice, 0, 0); // make multiple steps - // template - // using MultiIndex = StaticallyIndexedArray; - [[maybe_unused]] StaticallyIndexedArray a_block1_slice_copy_steps[9] = { - make_multi_index(0, CSlice, 1, 0), - make_multi_index(0, CSlice, 2, 0), - make_multi_index(0, CSlice, 2, 0), - make_multi_index(0, CSlice, 1, 0), - make_multi_index(0, CSlice, 2, 0), - make_multi_index(0, CSlice, 2, 0), - make_multi_index(0, CSlice, 1, 0), - make_multi_index(0, CSlice, 2, 0), - make_multi_index(0, CSlice, 2, 0) - }; - - [[maybe_unused]] StaticallyIndexedArray a_block2_slice_copy_steps[9] = { - make_multi_index(0, 0, 2), // 2 - make_multi_index(0, 0, -1), // -1 - make_multi_index(0, 0, -1), // -1 - make_multi_index(0, 0, 2), // 2 - make_multi_index(0, 0, -1), // -1 - make_multi_index(0, 0, -1), // -1 - make_multi_index(0, 0, 2), // 2 - make_multi_index(0, 0, -1), // -1 - make_multi_index(0, 0, -1) // -1 - }; - // constexpr auto a_block_slice_copy_step = make_multi_index(0, CSlice, 1, 0); // make multiple steps - // constexpr auto a_block_slice_copy_step = make_multi_index(0, CSlice, 2, 0); // make multiple steps - // constexpr auto a_block_slice_copy_step = make_multi_index(0, CSlice, 2, 0); // make multiple steps - //constexpr auto a_block2_slice_copy_step = make_multi_index(0, 0, 0, 2); // as Y % mod(KPerBlock - CSlice*Y*X) - //constexpr auto a_block2_slice_copy_step = make_multi_index(0, 0, 0, 1); // - //constexpr auto a_block2_slice_copy_step = make_multi_index(0, 0, 0, 0); + constexpr auto a_block_slice_copy_step = make_multi_index(0, CSlice, 0, 0); //constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0); - constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(TrueKPerBlock / BK1, 0, TrueKPerBlock % BK1); // gridwise GEMM pipeline const auto gridwise_gemm_pipeline = @@ -985,14 +914,15 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane( (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) / - KPerBlock); // TrueKPerBlock ? + TrueKPerBlock) + ((a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) % + TrueKPerBlock != 0); // TrueKPerBlock ? - if(threadIdx.x == 0 && blockIdx.x == 0) { - printf("hasmainkblockloop: %d\n", HasMainKBlockLoop); - printf("Running %d k block iters\n", num_k_block_main_loop); - } + // if(threadIdx.x == 0) { + // printf("hasmainkblockloop: %d\n", HasMainKBlockLoop); + // printf("Running %d k block iters", num_k_block_main_loop); + // } - gridwise_gemm_pipeline.template Run(a_grid_desc_nchw_block_slice, // global + gridwise_gemm_pipeline.template Run(a_grid_desc_nchw, // global a_block1_desc_cslice_hslice_wslice, // lds1 write a_block1_desc_nhowo_cyx, // lds1 read a_block2_desc_ak0_m_ak1, // lds2 write/read @@ -1001,8 +931,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle a_grid_buf, a_block_buf1, a_block_buf2, - a_block1_slice_copy_steps, - a_block2_slice_copy_steps, + a_block_slice_copy_step, b_grid_desc_bk0_n_bk1, b_block_desc_bk0_n_bk1, b_blockwise_copy, @@ -1099,7 +1028,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle ThreadwiseTensorSliceTransfer_v1r3 // if(threadIdx.x == 0){ // for(int m=0; m<128; ++m) { - // for(int k0=0; k0<4; ++k0) { + // for(int k0=0; k0<8; ++k0) { // for(int k1=0;k1<8;++k1) { // printf("A[%d][%d]=%f\n", m, k0*8+k1, static_cast(a_block_buf[k0*8*128 + m*8 + k1])); // } @@ -102,16 +102,6 @@ struct GridwiseGemmPipeline_v1<1, true, true> a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); - block_sync_lds(); - // if(threadIdx.x == 0){ - // for(int m=0; m<128; ++m) { - // for(int k0=0; k0<4; ++k0) { - // for(int k1=0;k1<8;++k1) { - // printf("A[%d][%d]=%f\n", m, k0*8+k1, static_cast(a_block_buf[k0*8*128 + m*8 + k1])); - // } - // } - // } - // } ++i; } while(i < (num_loop - 1)); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1_nchw.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1_nchw.hpp index 6f9f11be6b..36fb73432b 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1_nchw.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1_nchw.hpp @@ -36,8 +36,7 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true> typename AGridBuffer, typename ABlock1Buffer, typename ABlock2Buffer, - typename ABlock1TransferSteps, - typename ABlock2TransferSteps, + typename ABlockTransferStep, typename BGridDesc, typename BBlockDesc, typename BBlockTransfer, @@ -55,8 +54,7 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true> [[maybe_unused]] const AGridBuffer& a_grid_buf, [[maybe_unused]] ABlock1Buffer& a_block1_buf, [[maybe_unused]] ABlock2Buffer& a_block2_buf, - [[maybe_unused]] const ABlock1TransferSteps& a_block1_copy_steps, - [[maybe_unused]] const ABlock2TransferSteps& a_block2_copy_steps, + [[maybe_unused]] const ABlockTransferStep& a_block_copy_step, [[maybe_unused]] const BGridDesc& b_grid_desc, [[maybe_unused]] const BBlockDesc& b_block_desc, [[maybe_unused]] BBlockTransfer& b_blockwise_copy, @@ -71,7 +69,7 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true> a_blockwise_global_to_lds1_copy.RunRead(a_grid_desc, a_grid_buf); b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); - a_blockwise_global_to_lds1_copy.MoveSrcSliceWindow(a_grid_desc, a_block1_copy_steps[0]); + a_blockwise_global_to_lds1_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); //block_sync_lds(); a_blockwise_global_to_lds1_copy.RunWrite(a_block1_desc_nchw_slice, a_block1_buf); @@ -85,11 +83,9 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true> a_blockwise_lds1_to_lds2_copy.RunRead(a_block1_desc_nhowo_cyx, a_block1_buf);//a_block2_buf); ?? jaki bufor a_blockwise_lds1_to_lds2_copy.RunWrite(a_block2_desc_ak0_m_k1, a_block2_buf); //block_sync_lds(); - a_blockwise_lds1_to_lds2_copy.MoveSrcSliceWindow(a_block1_desc_nhowo_cyx, a_block2_copy_steps[0]); // if(threadIdx.x == 0) { - // printf("Next iter\n"); - // for(int c=0; c<5; ++c) { + // for(int c=0; c<4; ++c) { // for(int h=0; h<2;++h) { // for(int w=0; w<64;++w) { // printf("Ag[%d][%d][%d]=%f\n", c, h, w, static_cast(a_grid_buf[c * 64 * 2 + h * 64 + w])); @@ -97,9 +93,9 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true> // } // } - // // // A[3][2][33]=0.000000 + // // A[3][2][33]=0.000000 - // for(int c=0; c<4; ++c) { + // for(int c=0; c<8; ++c) { // for(int h=0; h<4;++h) { // for(int w=0; w<64;++w) { // printf("A[%d][%d][%d]=%f\n", c, h, w, static_cast(a_block1_buf[c * 64 * 4 + h * 64 + w])); @@ -108,14 +104,14 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true> // } // for(int m=0; m<128; ++m) { - // for(int k0=0; k0<4; ++k0) { + // for(int k0=0; k0<8; ++k0) { // for(int k1=0;k1<8;++k1) { // printf("A[%d][%d]=%f\n", m, k0*8+k1, static_cast(a_block2_buf[k0*8*128 + m*8 + k1])); - // // int k = k0*8+k1; - // // int x = k%3; - // // int y = (k/3)%3; - // // int c = k/(9); + // int k = k0*8+k1; + // int x = k%3; + // int y = (k/3)%3; + // int c = k/(9); // // A[m][k] = A[c, y+m/64,x+m%64] z oryginalnego obrazka @@ -146,7 +142,6 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true> do { // ask bartek how to structure this pipeline a_blockwise_global_to_lds1_copy.RunRead(a_grid_desc, a_grid_buf); // A Global -> VGPR - block_sync_lds(); b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); // B Global -> VGPR // @@ -156,53 +151,13 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true> block_sync_lds(); a_blockwise_lds1_to_lds2_copy.RunRead(a_block1_desc_nhowo_cyx, a_block1_buf); // A LDS1 -> LDS2 - a_blockwise_lds1_to_lds2_copy.MoveSrcSliceWindow(a_block1_desc_nhowo_cyx, a_block2_copy_steps[i+1]); - a_blockwise_global_to_lds1_copy.MoveSrcSliceWindow(a_grid_desc, a_block1_copy_steps[i+1]); + a_blockwise_global_to_lds1_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + //block_sync_lds(); a_blockwise_lds1_to_lds2_copy.RunWrite(a_block2_desc_ak0_m_k1, a_block2_buf); b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); - // block_sync_lds(); // delete later - - // if(threadIdx.x == 0) { - // printf("Next iter\n"); - - // // // A[3][2][33]=0.000000 - - // for(int c=0; c<4; ++c) { - // for(int h=0; h<4;++h) { - // for(int w=0; w<64;++w) { - // printf("A[%d][%d][%d]=%f\n", c, h, w, static_cast(a_block1_buf[c * 64 * 4 + h * 64 + w])); - // } - // } - // } - - // for(int m=0; m<128; ++m) { - // for(int k0=0; k0<4; ++k0) { - // for(int k1=0;k1<8;++k1) { - // printf("A[%d][%d]=%f\n", m, k0*8+k1, static_cast(a_block2_buf[k0*8*128 + m*8 + k1])); - - // // int k = k0*8+k1; - // // int x = k%3; - // // int y = (k/3)%3; - // // int c = k/(9); - - // // A[m][k] = A[c, y+m/64,x+m%64] z oryginalnego obrazka - - // // if(x+m%64 > 0 && x+m%64 < 127) { - // // auto lds1 = static_cast(a_block1_buf[c * 64 * 4 + (m/64 + y) * 64 + m%64 + x - 1]); - // // auto lds2 = static_cast(a_block2_buf[k0*8*128 + m*8 + k1]); - // // printf("lds2[%d][%d]:%f lds1[%d][%d][%d]:%f %s\n", m, k0*8+k1, lds2, c, y + m/64, m%64 + x - 1, lds1, (lds1 > lds2)? "diff" : ""); - - // // } - - // // if(lds1 > lds2) { - // // printf("diff lds1[%d][%d][%d]:%f lds2[%d][%d]:%f\n", c, y + m/64, m%64 + x, lds1, m, k0*8+k1 - 1, lds2); - // // } - // } - // } - // } - // } + //block_sync_lds(); ++i; } while(i < (num_loop - 1)); diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp index fb6c0c4466..eb15d8eeca 100644 --- a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp +++ b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp @@ -1477,8 +1477,6 @@ struct TransformConvFwdToGemm } } -// NGCHW x KCYX -// NHoWoCYX KCYX = NHoWoK template , S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 0, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 0, 1, 1, S<1, 16, 1, 4>, 1>, - // DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, - // DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, - // DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, - // DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, - // DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 2>, + DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, // DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, // DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, -// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, -// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, -// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 1>, -// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, -// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, -// // DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, -// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, -// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, -// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, -// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, -// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, -// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, - DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 1>, // best + DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 1>, + DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, +// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, // best DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8>, // best DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, // best - DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8> // best + DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 8>, // best -// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, -// DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8> + DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8> // clang-format on diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp index 39c8837ec3..a3b9228508 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp @@ -127,7 +127,7 @@ using device_grouped_conv_fwd_xdl_f16_instances = std::tuple< //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // // generic instance - DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 0, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 0, 1, 1, S<1, 16, 1, 4>, 1> + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 0, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 0, 1, 1, S<1, 16, 1, 4>, 1> // DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, // DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, // DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>,