Revert "tmp save, trying processing full kperblock but it doesnt work"

This reverts commit 49a79b7ab4.
2026-07-02 21:27:45 +00:00 · 2025-05-23 12:46:37 +00:00
parent 49a79b7ab4
commit a503b3c237
8 changed files with 98 additions and 231 deletions
--- a/include/ck/stream_config.hpp
+++ b/include/ck/stream_config.hpp
@@ -11,8 +11,8 @@ struct StreamConfig
    hipStream_t stream_id_ = nullptr;
    bool time_kernel_      = false;
    int log_level_         = 0;
-    int cold_niters_       = 0;//5; 
-    int nrepeat_           = 1;//50;
+    int cold_niters_       = 5;
+    int nrepeat_           = 50;

    bool flush_cache   = false;
    int rotating_count = 1;
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_preshuffle_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_preshuffle_multiple_abd_xdl_cshuffle.hpp
@@ -1264,7 +1264,6 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle
    {
        namespace ctc = tensor_layout::convolution;

-        const index_t W = arg.a_g_n_c_wis_lengths_[I4]; // assume 2d for now
        const index_t G = arg.b_g_k_c_xs_lengths_[I0];
        const index_t K = arg.b_g_k_c_xs_lengths_[I1];
        const index_t C = arg.b_g_k_c_xs_lengths_[I2];
@@ -1352,7 +1351,6 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle
                return false;
            }
        }
-        //printf("checking a vector \n");

        // check vector access of A
        // FIXME: layout
@@ -1364,7 +1362,7 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle
                     is_same_v<ALayout, ctc::NGCHW> || is_same_v<ALayout, ctc::NGCDHW>)
        {
            // Check access per C
-            if(!(ABlockTransferSrcVectorDim == 2 && W % ABlockTransferSrcScalarPerVector == 0)) // changed from C % to H%
+            if(!(ABlockTransferSrcVectorDim == 2 && C % ABlockTransferSrcScalarPerVector == 0))
            {
                // If not possible, check access per G
                if(!(ABlockTransferSrcVectorDim == 1 && (C == 1 || NumGroupsToMerge == 1) &&
@@ -1373,17 +1371,15 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle
                      is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>()) &&
                     G % ABlockTransferSrcScalarPerVector == 0))
                {
-                    printf("checking a vector failed \n");
                    return false;
                }
            }
        }
        else
        {
-            printf("checking a vector wtf \n");
            return false;
        }
-        // printf("checking b vector \n");
+
        // check vector access of B
        // FIXME: layout
        if constexpr(is_same_v<BLayout, ctc::G_K_X_C> || is_same_v<BLayout, ctc::G_K_YX_C> ||
@@ -1452,30 +1448,30 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle
        if constexpr(is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() ||
                     is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>())
        {
-            // if((G * C) % CDEBlockTransferScalarPerVector_NPerBlock != 0)
-            // {
-            //     return false;
-            // }
+            if((G * C) % CDEBlockTransferScalarPerVector_NPerBlock != 0)
+            {
+                return false;
+            }

-            // if((G * K) % CDEBlockTransferScalarPerVector_NPerBlock != 0)
-            // {
-            //     return false;
-            // }
+            if((G * K) % CDEBlockTransferScalarPerVector_NPerBlock != 0)
+            {
+                return false;
+            }

-            // const index_t input_spatial_acum = ck::accumulate_n<index_t>(
-            //     arg.a_g_n_c_wis_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>());
-            // const index_t output_spatial_acum = ck::accumulate_n<index_t>(
-            //     arg.e_g_n_k_wos_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>());
+            const index_t input_spatial_acum = ck::accumulate_n<index_t>(
+                arg.a_g_n_c_wis_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>());
+            const index_t output_spatial_acum = ck::accumulate_n<index_t>(
+                arg.e_g_n_k_wos_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>());

-            // if(input_spatial_acum % CDEBlockTransferScalarPerVector_NPerBlock != 0)
-            // {
-            //     return false;
-            // }
+            if(input_spatial_acum % CDEBlockTransferScalarPerVector_NPerBlock != 0)
+            {
+                return false;
+            }

-            // if(output_spatial_acum % CDEBlockTransferScalarPerVector_NPerBlock != 0)
-            // {
-            //     return false;
-            // }
+            if(output_spatial_acum % CDEBlockTransferScalarPerVector_NPerBlock != 0)
+            {
+                return false;
+            }

            // if(!arg.p_workspace_)
            // {
@@ -1501,7 +1497,7 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle
        {
            return false;
        }
-        // printf("checking e vector \n");
+
        // check vector access of E
        if constexpr(is_same_v<ELayout, ctc::G_NW_K> || is_same_v<ELayout, ctc::G_NHW_K> ||
                     is_same_v<ELayout, ctc::G_NDHW_K> || is_same_v<ELayout, ctc::GNWK> ||
@@ -1536,7 +1532,6 @@ struct DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle
        }
        else
        {
-            //printf("gonig into gemm validation\n");
            return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
                                               arg.b_grid_desc_n_k_,
                                               arg.ds_grid_desc_m_n_,
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_conv_preshuffle_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_conv_preshuffle_multiple_d_xdl_cshuffle.hpp
@@ -162,19 +162,19 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
            make_tuple(NSlice, PaddedCSlice, HSlice, WSlice),
            make_tuple(PaddedCSlice * HSlice * WSlice, HSlice * WSlice, WSlice, I1));

-        // constexpr auto nchw_slice_sliced_desc = transform_tensor_descriptor(
-        //             nchw_slice_desc,
-        //             make_tuple(make_pass_through_transform(NSlice),
-        //                        make_slice_transform(PaddedCSlice, I0, CSlice),
-        //                        make_pass_through_transform(HSlice),
-        //                        make_pass_through_transform(WSlice)),
-        //             make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-        //             make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-
-        constexpr auto nchw_padded_slice_desc = transform_tensor_descriptor(
+        constexpr auto nchw_slice_sliced_desc = transform_tensor_descriptor(
                    nchw_slice_desc,
                    make_tuple(make_pass_through_transform(NSlice),
-                               make_pass_through_transform(PaddedCSlice),
+                               make_slice_transform(PaddedCSlice, I0, CSlice),
+                               make_pass_through_transform(HSlice),
+                               make_pass_through_transform(WSlice)),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+
+        constexpr auto nchw_padded_slice_desc = transform_tensor_descriptor(
+                    nchw_slice_sliced_desc,
+                    make_tuple(make_pass_through_transform(NSlice),
+                               make_pass_through_transform(CSlice),
                               make_pass_through_transform(HSlice),
                               make_pad_transform(WSlice, InLeftPadW, InRightPadW)),
                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
@@ -183,7 +183,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
        constexpr auto n_c_y_ho_x_wo_desc = transform_tensor_descriptor(
                    nchw_padded_slice_desc,
                    make_tuple(make_pass_through_transform(NSlice),
-                               make_pass_through_transform(PaddedCSlice),
+                               make_pass_through_transform(CSlice),
                               make_embed_transform(make_tuple(Y, HoutSlice),
                                                    make_tuple(ConvDilationH, ConvStrideH)),
                               make_embed_transform(make_tuple(X, WoutSlice),
@@ -196,26 +196,19 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
        constexpr auto mk_desc = transform_tensor_descriptor(
                    n_c_y_ho_x_wo_desc,
                    make_tuple(make_merge_transform(make_tuple(NSlice, HoutSlice, WoutSlice)),
-                               make_merge_transform(make_tuple(PaddedCSlice, Y, X))),
+                               make_merge_transform(make_tuple(CSlice, Y, X))),
                    make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{}),
                    make_tuple(Sequence<0>{}, Sequence<1>{}));

-        constexpr auto mk_desc_sliced = transform_tensor_descriptor(
+        constexpr auto FinalMKPad = KPerBlock - TrueKPerBlock;
+
+        constexpr auto mk_pad_desc = transform_tensor_descriptor(
                    mk_desc,
                    make_tuple(make_pass_through_transform(NSlice * HoutSlice * WoutSlice),
-                               make_slice_transform(PaddedCSlice * Y * X, I0, KPerBlock)),
+                               make_right_pad_transform(CSlice * Y * X, FinalMKPad)),
                    make_tuple(Sequence<0>{}, Sequence<1>{}),
                    make_tuple(Sequence<0>{}, Sequence<1>{}));

-        // constexpr auto FinalMKPad = KPerBlock - TrueKPerBlock;
-
-        // constexpr auto mk_pad_desc = transform_tensor_descriptor(
-        //             mk_desc,
-        //             make_tuple(make_pass_through_transform(NSlice * HoutSlice * WoutSlice),
-        //                        make_right_pad_transform(PaddedCSlice * Y * X, FinalMKPad)),
-        //             make_tuple(Sequence<0>{}, Sequence<1>{}),
-        //             make_tuple(Sequence<0>{}, Sequence<1>{}));
-
        // if(threadIdx.x == 0 && blockIdx.x == 0) {
        //     printf("nchw pad slice%d\n", nchw_padded_slice_desc.GetElementSize());
        //     printf("nhowocyx %d\n", nhowocyx_desc.GetElementSize());
@@ -225,7 +218,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle

        constexpr auto AK0 = KPerBlock / AK1;

-        return transform_tensor_descriptor(mk_desc_sliced,
+        return transform_tensor_descriptor(mk_pad_desc,
                                           make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)),
                                                      make_pass_through_transform(MPerBlock)),
                                           make_tuple(Sequence<1>{}, Sequence<0>{}),
@@ -240,16 +233,6 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
            make_tuple(Number<MPerBlock + ABlockLdsExtraM>{} * AK1, AK1, I1));
    }

-    // __host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1_padded()
-    // {
-    //     // B matrix in LDS memory, dst of blockwise copy
-    //     //constexpr BK0 = PaddedCSlice * X * Y; 
-
-    //     return make_naive_tensor_descriptor(
-    //         make_tuple(BK0PerBlock, Number<NPerBlock>{}, BK1),
-    //         make_tuple(Number<NPerBlock + BBlockLdsExtraN>{} * BK1, BK1, I1));
-    // }
-
    __host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
    {
        // B matrix in LDS memory, dst of blockwise copy
@@ -533,7 +516,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle

    __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
    {
-        const index_t num_loop = K / KPerBlock;
+        const index_t num_loop = K / TrueKPerBlock;

        return GridwiseGemmPipe::CalculateHasMainLoop(num_loop);
    }
@@ -637,27 +620,6 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle

    __device__ __host__ static constexpr auto GetMPerBlock() { return MPerBlock; }

-    template <typename AGridDesc_NCHW>
-    __host__ __device__ static auto GetAGridDescriptor_NCHW_BlockSlice(const AGridDesc_NCHW& a_grid_desc_nchw, const index_t& m_block_idx) 
-    {
-
-        
-        const index_t Hout = PadH ? a_grid_desc_nchw.GetLength(I2) - Y + 1 : a_grid_desc_nchw.GetLength(I2);
-
-        //const index_t m_block_data_idx_on_grid_nchw_n = __builtin_amdgcn_readfirstlane((m_block_idx * HoutSlice) / Hout);
-        //const index_t m_block_data_idx_on_grid_nchw_h = __builtin_amdgcn_readfirstlane((m_block_idx * HoutSlice) % Hout);
-
-        return transform_tensor_descriptor(
-                    a_grid_desc_nchw,
-                    make_tuple(make_pass_through_transform(a_grid_desc_nchw.GetLength(I0)),
-                               make_pass_through_transform(a_grid_desc_nchw.GetLength(I1)),
-                               make_slice_transform(a_grid_desc_nchw.GetLength(I2), (m_block_idx * HoutSlice) % Hout, HSlice),
-                               make_pass_through_transform(a_grid_desc_nchw.GetLength(I3))),
-
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
-    }
-
    template <bool HasMainKBlockLoop,
              typename AGridDesc_AK0_M_AK1,
              typename AGridDesc_NCHW,
@@ -718,13 +680,11 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
        const index_t Hout = PadH ? a_grid_desc_nchw.GetLength(I2) - Y + 1 : a_grid_desc_nchw.GetLength(I2);

        const index_t m_block_data_idx_on_grid_nchw_n = __builtin_amdgcn_readfirstlane((block_work_idx[I0]  * HoutSlice) / Hout);
-        //const index_t m_block_data_idx_on_grid_nchw_h = __builtin_amdgcn_readfirstlane((block_work_idx[I0] * HoutSlice) % Hout);
+        const index_t m_block_data_idx_on_grid_nchw_h = __builtin_amdgcn_readfirstlane((block_work_idx[I0] * HoutSlice) % Hout);

        const index_t n_block_data_idx_on_grid =
            __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock);

-        const auto a_grid_desc_nchw_block_slice = GetAGridDescriptor_NCHW_BlockSlice(a_grid_desc_nchw, block_work_idx[I0]);
-
        // if(threadIdx.x == 0) {
        //     printf("BlockIdx.x %d, mblock idx %d n %d h%d nblock idx %d Hout %d", blockIdx.x, block_work_idx[I0],
        //          m_block_data_idx_on_grid_nchw_n, m_block_data_idx_on_grid_nchw_h, n_block_data_idx_on_grid, Hout);
@@ -769,7 +729,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
                                                Sequence<0, 1, 2, 3>,//ABlockTransferThreadClusterArrangeOrder,
                                                ADataType,
                                                AComputeDataType,
-                                                decltype(a_grid_desc_nchw_block_slice),
+                                                decltype(a_grid_desc_nchw),
                                                decltype(a_block1_desc_cslice_hslice_wslice),
                                                Sequence<0, 1, 2, 3>,//ABlockTransferSrcAccessOrder,
                                                Sequence<0, 1, 2, 3>,//Sequence<1, 0, 2>,
@@ -782,8 +742,8 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
                                                AThreadTransferSrcResetCoordinateAfterRun,
                                                true,
                                                NumGemmKPrefetchStage>(
-                a_grid_desc_nchw_block_slice,
-                make_multi_index(m_block_data_idx_on_grid_nchw_n, 0, 0, 0), // H slice is done inside of grid desc
+                a_grid_desc_nchw,
+                make_multi_index(m_block_data_idx_on_grid_nchw_n, 0, m_block_data_idx_on_grid_nchw_h, 0),
                a_element_op,
                a_block1_desc_cslice_hslice_wslice,
                make_multi_index(0, 0, 0, 0),
@@ -810,7 +770,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
                                                ABlockTransferDstScalarPerVector_AK1,//ABlockTransferDstScalarPerVector_AK1,
                                                1,
                                                1,
-                                                false,// was triue nad working before changes//AThreadTransferSrcResetCoordinateAfterRun,
+                                                true,//AThreadTransferSrcResetCoordinateAfterRun,
                                                true,
                                                NumGemmKPrefetchStage>(
                a_block1_desc_nhowo_cyx,
@@ -944,40 +904,9 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
            b_block_desc_bk0_n_bk1.GetElementSpaceSize());

        //constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0);
-        //constexpr auto a_block_slice_copy_step = make_multi_index(0, CSlice, 0, 0); // make multiple steps
-        // template <index_t N>
-        // using MultiIndex = StaticallyIndexedArray<index_t, N>;
-        [[maybe_unused]] StaticallyIndexedArray<index_t, 4> a_block1_slice_copy_steps[9] = {
-            make_multi_index(0, CSlice, 1, 0),
-            make_multi_index(0, CSlice, 2, 0),
-            make_multi_index(0, CSlice, 2, 0),
-            make_multi_index(0, CSlice, 1, 0),
-            make_multi_index(0, CSlice, 2, 0),
-            make_multi_index(0, CSlice, 2, 0),
-            make_multi_index(0, CSlice, 1, 0),
-            make_multi_index(0, CSlice, 2, 0),
-            make_multi_index(0, CSlice, 2, 0)
-        };
-
-        [[maybe_unused]] StaticallyIndexedArray<index_t, 3> a_block2_slice_copy_steps[9] = {
-            make_multi_index(0, 0, 2), // 2
-            make_multi_index(0, 0, -1), // -1
-            make_multi_index(0, 0, -1), // -1
-            make_multi_index(0, 0, 2), // 2
-            make_multi_index(0, 0, -1), // -1
-            make_multi_index(0, 0, -1), // -1
-            make_multi_index(0, 0, 2), // 2
-            make_multi_index(0, 0, -1), // -1
-            make_multi_index(0, 0, -1) // -1
-        };
-        // constexpr auto a_block_slice_copy_step = make_multi_index(0, CSlice, 1, 0); // make multiple steps
-        // constexpr auto a_block_slice_copy_step = make_multi_index(0, CSlice, 2, 0); // make multiple steps
-        // constexpr auto a_block_slice_copy_step = make_multi_index(0, CSlice, 2, 0); // make multiple steps
-     //constexpr auto a_block2_slice_copy_step = make_multi_index(0, 0, 0, 2); // as Y % mod(KPerBlock - CSlice*Y*X)
-     //constexpr auto a_block2_slice_copy_step = make_multi_index(0, 0, 0, 1); // 
-     //constexpr auto a_block2_slice_copy_step = make_multi_index(0, 0, 0, 0);
+        constexpr auto a_block_slice_copy_step = make_multi_index(0, CSlice, 0, 0);
        //constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0);
-        constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0);
+        constexpr auto b_block_slice_copy_step = make_multi_index(TrueKPerBlock / BK1, 0, TrueKPerBlock % BK1);

        // gridwise GEMM pipeline
        const auto gridwise_gemm_pipeline =
@@ -985,14 +914,15 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle

        const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
            (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
-            KPerBlock); // TrueKPerBlock ?
+            TrueKPerBlock) + ((a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) %
+            TrueKPerBlock != 0); // TrueKPerBlock ?

-        if(threadIdx.x == 0 && blockIdx.x == 0) {
-            printf("hasmainkblockloop: %d\n", HasMainKBlockLoop);
-            printf("Running %d k block iters\n", num_k_block_main_loop);
-        }
+        // if(threadIdx.x == 0) {
+        //     printf("hasmainkblockloop: %d\n", HasMainKBlockLoop);
+        //     printf("Running %d k block iters", num_k_block_main_loop);
+        // }

-        gridwise_gemm_pipeline.template Run<HasMainKBlockLoop>(a_grid_desc_nchw_block_slice, // global
+        gridwise_gemm_pipeline.template Run<HasMainKBlockLoop>(a_grid_desc_nchw, // global
                                                               a_block1_desc_cslice_hslice_wslice, // lds1 write
                                                               a_block1_desc_nhowo_cyx, // lds1 read
                                                               a_block2_desc_ak0_m_ak1, // lds2 write/read
@@ -1001,8 +931,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
                                                               a_grid_buf,
                                                               a_block_buf1,
                                                               a_block_buf2,
-                                                               a_block1_slice_copy_steps,
-                                                               a_block2_slice_copy_steps,
+                                                               a_block_slice_copy_step,
                                                               b_grid_desc_bk0_n_bk1,
                                                               b_block_desc_bk0_n_bk1,
                                                               b_blockwise_copy,
@@ -1099,7 +1028,7 @@ struct GridwiseGemmConvFwdPreshuffleMultipleD_xdl_cshuffle
                ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
                                                   CShuffleDataType,
                                                   decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
-                                                   decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2), // MN
+                                                   decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
                                                   ck::tensor_operation::element_wise::PassThrough,
                                                   Sequence<CShuffleMXdlPerWavePerShuffle,
                                                            CShuffleNXdlPerWavePerShuffle,
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp
@@ -72,7 +72,7 @@ struct GridwiseGemmPipeline_v1<1, true, true>

        // if(threadIdx.x == 0){
        //     for(int m=0; m<128; ++m) {
-        //         for(int k0=0; k0<4; ++k0) {
+        //         for(int k0=0; k0<8; ++k0) {
        //             for(int k1=0;k1<8;++k1) {
        //                 printf("A[%d][%d]=%f\n", m, k0*8+k1, static_cast<float>(a_block_buf[k0*8*128 + m*8 + k1]));
        //             }
@@ -102,16 +102,6 @@ struct GridwiseGemmPipeline_v1<1, true, true>

                a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
                b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
-                block_sync_lds();
-                // if(threadIdx.x == 0){
-                //     for(int m=0; m<128; ++m) {
-                //         for(int k0=0; k0<4; ++k0) {
-                //             for(int k1=0;k1<8;++k1) {
-                //                 printf("A[%d][%d]=%f\n", m, k0*8+k1, static_cast<float>(a_block_buf[k0*8*128 + m*8 + k1]));
-                //             }
-                //         }
-                //     }
-                // }

                ++i;
            } while(i < (num_loop - 1));
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1_nchw.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1_nchw.hpp
@@ -36,8 +36,7 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true>
              typename AGridBuffer,
              typename ABlock1Buffer,
              typename ABlock2Buffer,
-              typename ABlock1TransferSteps,
-              typename ABlock2TransferSteps,
+              typename ABlockTransferStep,
              typename BGridDesc,
              typename BBlockDesc,
              typename BBlockTransfer,
@@ -55,8 +54,7 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true>
                               [[maybe_unused]] const AGridBuffer& a_grid_buf,
                               [[maybe_unused]] ABlock1Buffer& a_block1_buf,
                               [[maybe_unused]] ABlock2Buffer& a_block2_buf,
-                               [[maybe_unused]] const ABlock1TransferSteps& a_block1_copy_steps,
-                               [[maybe_unused]] const ABlock2TransferSteps& a_block2_copy_steps,
+                               [[maybe_unused]] const ABlockTransferStep& a_block_copy_step,
                               [[maybe_unused]] const BGridDesc& b_grid_desc,
                               [[maybe_unused]] const BBlockDesc& b_block_desc,
                               [[maybe_unused]] BBlockTransfer& b_blockwise_copy,
@@ -71,7 +69,7 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true>
        a_blockwise_global_to_lds1_copy.RunRead(a_grid_desc, a_grid_buf);
        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);

-        a_blockwise_global_to_lds1_copy.MoveSrcSliceWindow(a_grid_desc, a_block1_copy_steps[0]);
+        a_blockwise_global_to_lds1_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
        //block_sync_lds();
        a_blockwise_global_to_lds1_copy.RunWrite(a_block1_desc_nchw_slice, a_block1_buf);
@@ -85,11 +83,9 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true>
        a_blockwise_lds1_to_lds2_copy.RunRead(a_block1_desc_nhowo_cyx, a_block1_buf);//a_block2_buf); ?? jaki bufor
        a_blockwise_lds1_to_lds2_copy.RunWrite(a_block2_desc_ak0_m_k1, a_block2_buf);
        //block_sync_lds();
-        a_blockwise_lds1_to_lds2_copy.MoveSrcSliceWindow(a_block1_desc_nhowo_cyx, a_block2_copy_steps[0]);

        // if(threadIdx.x == 0) {
-        //     printf("Next iter\n");
-        //     for(int c=0; c<5; ++c) {
+        //     for(int c=0; c<4; ++c) {
        //         for(int h=0; h<2;++h) {
        //             for(int w=0; w<64;++w) {
        //                 printf("Ag[%d][%d][%d]=%f\n", c, h, w, static_cast<float>(a_grid_buf[c * 64 * 2 + h * 64 + w]));
@@ -97,9 +93,9 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true>
        //         }
        //     }

-        // //        // A[3][2][33]=0.000000
+        //        // A[3][2][33]=0.000000

-        //     for(int c=0; c<4; ++c) {
+        //     for(int c=0; c<8; ++c) {
        //         for(int h=0; h<4;++h) {
        //             for(int w=0; w<64;++w) {
        //                 printf("A[%d][%d][%d]=%f\n", c, h, w, static_cast<float>(a_block1_buf[c * 64 * 4 + h * 64 + w]));
@@ -108,14 +104,14 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true>
        //     }

        //     for(int m=0; m<128; ++m) {
-        //         for(int k0=0; k0<4; ++k0) {
+        //         for(int k0=0; k0<8; ++k0) {
        //             for(int k1=0;k1<8;++k1) {
        //                 printf("A[%d][%d]=%f\n", m, k0*8+k1, static_cast<float>(a_block2_buf[k0*8*128 + m*8 + k1]));
                    
-        //                 // int k = k0*8+k1;
-        //                 // int x = k%3;
-        //                 // int y = (k/3)%3;
-        //                 // int c = k/(9);
+        //                 int k = k0*8+k1;
+        //                 int x = k%3;
+        //                 int y = (k/3)%3;
+        //                 int c = k/(9);

        //                 // A[m][k] = A[c, y+m/64,x+m%64] z oryginalnego obrazka

@@ -146,7 +142,6 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true>
            do
            { // ask bartek how to structure this pipeline
                a_blockwise_global_to_lds1_copy.RunRead(a_grid_desc, a_grid_buf);  // A Global -> VGPR
-                block_sync_lds();
                b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);                 // B Global -> VGPR
                //

@@ -156,53 +151,13 @@ struct GridwiseGemmPipeline_v1_nchw<1, true, true>
                block_sync_lds();
                a_blockwise_lds1_to_lds2_copy.RunRead(a_block1_desc_nhowo_cyx, a_block1_buf); // A LDS1 -> LDS2

-                a_blockwise_lds1_to_lds2_copy.MoveSrcSliceWindow(a_block1_desc_nhowo_cyx, a_block2_copy_steps[i+1]);
-                a_blockwise_global_to_lds1_copy.MoveSrcSliceWindow(a_grid_desc, a_block1_copy_steps[i+1]);
+                a_blockwise_global_to_lds1_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
                b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+                //block_sync_lds();

                a_blockwise_lds1_to_lds2_copy.RunWrite(a_block2_desc_ak0_m_k1, a_block2_buf);
                b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
-                // block_sync_lds(); // delete later
-
-        // if(threadIdx.x == 0) {
-        //     printf("Next iter\n");
-
-        // //        // A[3][2][33]=0.000000
-
-        //     for(int c=0; c<4; ++c) {
-        //         for(int h=0; h<4;++h) {
-        //             for(int w=0; w<64;++w) {
-        //                 printf("A[%d][%d][%d]=%f\n", c, h, w, static_cast<float>(a_block1_buf[c * 64 * 4 + h * 64 + w]));
-        //             }         
-        //         }
-        //     }
-
-        //     for(int m=0; m<128; ++m) {
-        //         for(int k0=0; k0<4; ++k0) {
-        //             for(int k1=0;k1<8;++k1) {
-        //                 printf("A[%d][%d]=%f\n", m, k0*8+k1, static_cast<float>(a_block2_buf[k0*8*128 + m*8 + k1]));
-                    
-        //                 // int k = k0*8+k1;
-        //                 // int x = k%3;
-        //                 // int y = (k/3)%3;
-        //                 // int c = k/(9);
-
-        //                 // A[m][k] = A[c, y+m/64,x+m%64] z oryginalnego obrazka
-
-        //                 // if(x+m%64 > 0 && x+m%64 < 127) {
-        //                 //     auto lds1 = static_cast<float>(a_block1_buf[c * 64 * 4 + (m/64 + y) * 64 + m%64 + x - 1]);
-        //                 //     auto lds2 = static_cast<float>(a_block2_buf[k0*8*128 + m*8 + k1]);
-        //                 //     printf("lds2[%d][%d]:%f lds1[%d][%d][%d]:%f %s\n", m, k0*8+k1, lds2, c, y + m/64, m%64 + x - 1, lds1, (lds1 > lds2)? "diff" : "");
-
-        //                 // }
-
-        //                     // if(lds1 > lds2) {
-        //                     //     printf("diff lds1[%d][%d][%d]:%f lds2[%d][%d]:%f\n", c, y + m/64, m%64 + x, lds1, m, k0*8+k1 - 1, lds2);
-        //                     // }
-        //             }
-        //         }
-        //     }
-        // }
+                //block_sync_lds();

                ++i;
            } while(i < (num_loop - 1));
--- a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp
@@ -1477,8 +1477,6 @@ struct TransformConvFwdToGemm
        }
    }

-// NGCHW x KCYX 
-// NHoWoCYX KCYX = NHoWoK

    template <typename CLayout,
              index_t NDimSp = NDimSpatial,
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_preshuffle_xdl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_preshuffle_xdl_instance.hpp
@@ -59,35 +59,35 @@ using device_grouped_conv_fwd_preshuffle_xdl_f16_generic_instances = std::tuple<
  //####################################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave|      |      |  |  |     |     |    Lengths_N_C_H_W| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
  //####################################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |      |      |  |  |     |     |                   |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
  // generic instance
-  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,   128,    64,    32,   8,   8,   16,   16,    8,    4,     4,    64, 3, 3, true, true,      S<1, 4, 4, 4>,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         0,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         0,           1,           1,               S<1, 16, 1, 4>,               1>,
-  // DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,   128,    64,    64,   8,   8,   32,   32,    4,    2,     4,    64, 3, 3, true, true,      S<1, 8, 4, 2>,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
-  // DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,   128,    64,    32,   8,   8,   32,   32,    4,    2,     4,    64, 3, 3, true, true,      S<1, 4, 4, 4>,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
-  // DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,   128,    32,    32,   8,   8,   32,   32,    4,    1,     4,    64, 3, 3, true, true,      S<1, 4, 4, 4>,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
-  // DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,   128,    32,    32,   8,   8,   16,   16,    8,    2,     4,    64, 3, 3, true, true,      S<1, 4, 4, 4>,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
-  // DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,   128,    32,    64,   8,   8,   16,   16,    8,    2,     4,    64, 3, 3, true, true,      S<1, 8, 4, 2>,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               2>,
+  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,   128,    64,    64,   8,   8,   16,   16,    8,    4,     4,    64, 3, 3, true, true,      S<1, 8, 4, 2>,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
+  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,   128,    64,    64,   8,   8,   32,   32,    4,    2,     4,    64, 3, 3, true, true,      S<1, 8, 4, 2>,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
+  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,   128,    64,    32,   8,   8,   32,   32,    4,    2,     4,    64, 3, 3, true, true,      S<1, 4, 4, 4>,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
+  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,   128,    32,    32,   8,   8,   32,   32,    4,    1,     4,    64, 3, 3, true, true,      S<1, 4, 4, 4>,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
+  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,   128,    32,    32,   8,   8,   16,   16,    8,    2,     4,    64, 3, 3, true, true,      S<1, 4, 4, 4>,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
+  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,   128,    32,    64,   8,   8,   16,   16,    8,    2,     4,    64, 3, 3, true, true,      S<1, 8, 4, 2>,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,

 //   DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     3,    64, 3, 3, true, true,      S<1, 4, 1, 8>,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
 //   DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   16,   16,    4,    2,     3,    64, 3, 3, true, true,      S<1, 4, 1, 8>,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
-//   DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    64,   8,   8,   32,   32,    2,    1,     3,    64, 3, 3, true, true,      S<1, 8, 1, 8>,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
-//   DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    64,   8,   8,   16,   16,    4,    2,     3,    64, 3, 3, true, true,      S<1, 8, 1, 8>,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
-//   DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     4,    64, 3, 3, true, true,      S<1, 4, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,               1>,  
-//   DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     4,    64, 3, 3, true, true,      S<1, 4, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-//   DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     4,    64, 3, 3, true, true,      S<1, 4, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-// //   DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     3,    64, 3, 3, true, true,      S<1, 4, 1, 8>,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
-//   DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     4,    64, 3, 3, true, true,      S<1, 4, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-//   DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     6,    64, 3, 3, true, true,      S<1, 4, 2, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-//   DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     4,    64, 3, 3, true, true,      S<1, 4, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-//   DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     4,    64, 3, 3, true, true,      S<1, 4, 4, 4>,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-//   DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     4,    64, 3, 3, true, true,      S<1, 4, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-//   DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     3,    64, 3, 3, true, true,      S<1, 4, 1, 8>,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
-  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     4,    64, 3, 3, true, true,      S<1, 4, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,               1>, // best
+  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    64,   8,   8,   32,   32,    2,    1,     3,    64, 3, 3, true, true,      S<1, 8, 1, 8>,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
+  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    64,   8,   8,   16,   16,    4,    2,     3,    64, 3, 3, true, true,      S<1, 8, 1, 8>,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
+  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     4,    64, 3, 3, true, true,      S<1, 4, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,               1>,  
+  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     4,    64, 3, 3, true, true,      S<1, 4, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     4,    64, 3, 3, true, true,      S<1, 4, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+//   DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     3,    64, 3, 3, true, true,      S<1, 4, 1, 8>,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
+  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     4,    64, 3, 3, true, true,      S<1, 4, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     6,    64, 3, 3, true, true,      S<1, 4, 2, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     4,    64, 3, 3, true, true,      S<1, 4, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     4,    64, 3, 3, true, true,      S<1, 4, 4, 4>,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     4,    64, 3, 3, true, true,      S<1, 4, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,     3,    64, 3, 3, true, true,      S<1, 4, 1, 8>,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8>,
+  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     4,    64, 3, 3, true, true,      S<1, 4, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>, // best
  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     4,    64, 3, 3, true, true,      S<1, 4, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 64, 1, 4>,               8>, // best

  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    64,   8,   8,   32,   32,    2,    1,     4,    64, 3, 3, true, true,      S<1, 8, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>, // best
-  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    64,   8,   8,   32,   32,    2,    1,     4,    64, 3, 3, true, true,      S<1, 8, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 64, 1, 4>,               8> // best
+  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,   128,    64,    64,   8,   8,   32,   32,    2,    1,     4,    64, 3, 3, true, true,      S<1, 8, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 64, 1, 4>,               8>, // best

-//   DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     3,    64, 3, 3, true, true,      S<1, 4, 1, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-//   DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     4,    64, 3, 3, true, true,      S<1, 4, 4, 4>,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>
+  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,     3,    64, 3, 3, true, true,      S<1, 4, 1, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+  DeviceGroupedConvFwdPreshuffleMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,   128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     4,    64, 3, 3, true, true,      S<1, 4, 4, 4>,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>


  // clang-format on
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp
@@ -127,7 +127,7 @@ using device_grouped_conv_fwd_xdl_f16_instances = std::tuple<
        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
        // // generic instance
-        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    128,    64,    32,   8,   8,   16,   16,    8,    4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         0,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         0,           1,           1,               S<1, 16, 1, 4>,               1>
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,    128,    64,    64,   8,   8,   16,   16,    8,    4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,         0,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,         0,           1,           1,               S<1, 16, 1, 4>,               1>
        // DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,   112,    64,    64,   8,   8,   16,   16,    7,    4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
        // DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,   112,    64,    64,   8,   8,   16,   16,    7,    4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,
        // DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,        1,    64,   112,    64,    64,   8,   8,   16,   16,    7,    4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1>,