added ThreadwiseGenericTensorSliceCopy_v2r1
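The new ThreadwiseGenericTensorSliceCopy_v2r1 generalizes v2 by letting the source and destination sides each choose their own dimension-access order, vector-access dimension, and vector width, staging all data through a small register buffer so the two access patterns are fully decoupled. A minimal standalone sketch of that staging idea (plain C++ with illustrative names and a 2-D row-major layout assumed; not the CK source):

#include <cstdio>
#include <cstddef>

// Minimal sketch: a two-phase slice copy that stages the whole slice in a
// local buffer, so the read pattern from src and the write pattern to dst
// can be chosen independently, as v2r1 does.
template <std::size_t H, std::size_t W>
void threadwise_slice_copy(const float* src, std::size_t src_row_stride,
                           float* dst, std::size_t dst_row_stride)
{
    float buffer[H * W]; // packed buffer, analogous to p_buffer in Run()

    // phase 1: read src row by row (one access order)
    for(std::size_t h = 0; h < H; ++h)
        for(std::size_t w = 0; w < W; ++w)
            buffer[h * W + w] = src[h * src_row_stride + w];

    // phase 2: write dst column by column (a different access order);
    // legal only because the buffer already holds the whole slice
    for(std::size_t w = 0; w < W; ++w)
        for(std::size_t h = 0; h < H; ++h)
            dst[h * dst_row_stride + w] = buffer[h * W + w];
}

int main()
{
    float src[32], dst[32];
    for(int i = 0; i < 32; ++i) src[i] = static_cast<float>(i);
    threadwise_slice_copy<4, 8>(src, 8, dst, 8);
    std::printf("%g\n", dst[31]); // prints 31
}

Because the whole slice lands in the buffer first, the write loops are free to traverse in a different order than the read loops, which is what lets v2r1 take separate SrcDimAccessOrder and DstDimAccessOrder parameters.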
@@ -47,7 +47,8 @@ template <index_t GridSize,
 class WeiBlockCopySrcAccessOrder,
 class WeiBlockCopyDstAccessOrder,
 index_t WeiBlockCopySrcDataPerRead_E,
-index_t WeiBlockCopyDstDataPerWrite_K>
+index_t WeiBlockCopyDstDataPerWrite_K,
+index_t OutThreadCopyDataPerAccess_W>
 struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 {
 __device__ void Run(const Float* const __restrict__ p_in_global,

@@ -155,7 +156,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
 "GemmDataPerReadB alignment requirement is not satisfied");
 
-#if 1
+#if 0
 // input blockwise copy
 // slice a merged tensor, reorder and copy to a normal tensor
 // this copy operator already has blockwise offset built-in

@@ -184,7 +185,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 decltype(in_e_n1_b_n2_block_desc.GetLengths()),
 InBlockCopySubLengths_E_N1_B_N2,
 InBlockCopyClusterLengths_E_N1_B_N2,
-InBlockCopyThreadClusterArrangeOrder>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
+InBlockCopyThreadClusterArrangeOrder,
+InBlockCopySrcAccessOrder,
+InBlockCopyDstAccessOrder,
+2,
+3,
+InBlockCopySrcDataPerRead_B,
+InBlockCopyDstDataPerWrite_N2>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
 #endif
 
 // weight tensor

@@ -198,7 +205,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 Sequence<EPerBlock, KPerBlock>{},
 Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
 
-#if 1
+#if 0
 // operator for blockwise copy of weight into LDS
 // slice a tensor, and copy it into another tensor
 // this copy operator already has blockwise offset built-in

@@ -227,7 +234,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 decltype(wei_e_k_block_desc.GetLengths()),
 WeiBlockCopySubLengths_E_K,
 WeiBlockCopyClusterLengths_E_K,
-WeiBlockCopyThreadClusterArrangeOrder>({0, k_block_data_on_global}, {0, 0});
+WeiBlockCopyThreadClusterArrangeOrder,
+WeiBlockCopySrcAccessOrder,
+WeiBlockCopyDstAccessOrder,
+0,
+1,
+WeiBlockCopySrcDataPerRead_E,
+WeiBlockCopyDstDataPerWrite_K>({0, k_block_data_on_global}, {0, 0});
 #endif
 
 // GEMM definition

@@ -322,7 +335,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
 Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
 
-#if 1
+#if 0
 blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
 // blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{},
 // True);

@@ -354,7 +367,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
 Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
 
-#if 1
+#if 0
 blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
 // blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
 p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);

@@ -434,7 +447,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
 k_thread_data_on_global, 0, b_thread_data_on_global, 0);
 
-#if 1
+#if 0
 ThreadwiseGenericTensorSliceCopy_v1r2<
 decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
 decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),

@@ -445,12 +458,18 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
 1>(make_zero_array<index_t, 8>(), make_zero_array<index_t, 8>())
 .Run(p_out_thread, p_out_thread_on_global);
 #elif 1
-ThreadwiseGenericTensorSliceCopy_v2<
+ThreadwiseGenericTensorSliceCopy_v2r1<
 decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
 decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
 NormalTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc)>,
 MergedTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc)>,
-decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths())>(
+decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths()),
+arithmetic_sequence_gen<0, 8, 1>::type,
+arithmetic_sequence_gen<0, 8, 1>::type,
+7,
+7,
+1,
+1>(
 {0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
 .Run(p_out_thread, p_out_thread_on_global);
 #endif

@@ -44,7 +44,8 @@ template <index_t GridSize,
 class WeiBlockCopySrcAccessOrder,
 class WeiBlockCopyDstAccessOrder,
 index_t WeiBlockCopySrcDataPerRead_E,
-index_t WeiBlockCopyDstDataPerWrite_K>
+index_t WeiBlockCopyDstDataPerWrite_K,
+index_t OutThreadCopyDataPerAccess_B>
 struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
 {
 __device__ void Run(const Float* const __restrict__ p_in_global,

@@ -138,7 +139,13 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
 decltype(in_e_b_block_desc.GetLengths()),
 InBlockCopySubLengths_E_B,
 InBlockCopyClusterLengths_E_B,
-InBlockCopyThreadClusterArrangeOrder>(
+InBlockCopyThreadClusterArrangeOrder,
+InBlockCopySrcAccessOrder,
+InBlockCopyDstAccessOrder,
+1,
+1,
+InBlockCopyDataPerAccess_B,
+InBlockCopyDataPerAccess_B>(
 {0, b_block_data_on_global}, {0, 0});
 
 // weight tensor

@@ -164,7 +171,13 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
 decltype(wei_e_k_block_desc.GetLengths()),
 WeiBlockCopySubLengths_E_K,
 WeiBlockCopyClusterLengths_E_K,
-WeiBlockCopyThreadClusterArrangeOrder>({0, k_block_data_on_global}, {0, 0});
+WeiBlockCopyThreadClusterArrangeOrder,
+WeiBlockCopySrcAccessOrder,
+WeiBlockCopyDstAccessOrder,
+0,
+1,
+WeiBlockCopySrcDataPerRead_E,
+WeiBlockCopyDstDataPerWrite_K>({0, k_block_data_on_global}, {0, 0});
 
 // GEMM definition
 // c_mtx += transpose(a_mtx) * b_mtx

@@ -349,15 +362,21 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
 using OutThreadCopySliceLengths =
 Sequence<GemmMRepeat, GemmMPerThreadSubC, GemmNPerThreadSubC>;
 
-auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2<
+auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2r1<
 decltype(out_k0_k1_b_thread_desc),
 decltype(out_k0_k1_b_global_desc),
 NormalTensorCoordinate<decltype(out_k0_k1_b_thread_desc)>,
 MergedTensorCoordinate<decltype(out_k0_k1_b_global_desc)>,
-OutThreadCopySliceLengths>({0, 0, 0},
-{k_thread_data_on_global / K1,
-k_thread_data_on_global % K1,
-b_thread_data_on_global});
+OutThreadCopySliceLengths,
+arithmetic_sequence_gen<0, 3, 1>::type,
+arithmetic_sequence_gen<0, 3, 1>::type,
+2,
+2,
+OutThreadCopyDataPerAccess_B,
+OutThreadCopyDataPerAccess_B>({0, 0, 0},
+{k_thread_data_on_global / K1,
+k_thread_data_on_global % K1,
+b_thread_data_on_global});
 
 for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
 {

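The slice origin above decomposes the flat K index into the (k0, k1) coordinates of the out_k0_k1_b descriptor. A small worked check of that div/mod split (illustrative values; the real K1 comes from the GEMM decomposition):

#include <cassert>

// The origin above is {k_thread_data_on_global / K1,
// k_thread_data_on_global % K1, b_thread_data_on_global},
// i.e. the flat index splits as k = k0 * K1 + k1.
int main()
{
    const int K1 = 4;                       // illustrative
    const int k_thread_data_on_global = 13; // illustrative

    const int k0 = k_thread_data_on_global / K1; // 3
    const int k1 = k_thread_data_on_global % K1; // 1

    assert(k0 * K1 + k1 == k_thread_data_on_global);
    return 0;
}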
@@ -412,7 +412,13 @@ template <index_t BlockSize,
 class SliceLengths,
 class SubLengths,
 class ThreadClusterLengths,
-class ThreadClusterArrangeOrder>
+class ThreadClusterArrangeOrder,
+class SrcDimAccessOrder,
+class DstDimAccessOrder,
+index_t SrcVectorAccessDim,
+index_t DstVectorAccessDim,
+index_t SrcDataPerAccess,
+index_t DstDataPerAccess>
 struct BlockwiseGenericTensorSliceCopy_v2
 {
 static constexpr index_t nDim = SrcDesc::GetNumOfDimension();

@@ -496,6 +502,7 @@ struct BlockwiseGenericTensorSliceCopy_v2
 private:
 using RegisterBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));
 
+#if 0
 using ThreadwiseLoad =
 ThreadwiseGenericTensorSliceCopy_v2<SrcDesc,
 RegisterBufferDesc,

@@ -509,6 +516,33 @@ struct BlockwiseGenericTensorSliceCopy_v2
 NormalTensorCoordinate<RegisterBufferDesc>,
 DstCoordinate,
 SubLengths>;
+#else
+using ThreadwiseLoad =
+ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc,
+RegisterBufferDesc,
+SrcCoordinate,
+NormalTensorCoordinate<RegisterBufferDesc>,
+SubLengths,
+SrcDimAccessOrder,
+SrcDimAccessOrder,
+SrcVectorAccessDim,
+SrcVectorAccessDim,
+SrcDataPerAccess,
+1>;
+
+using ThreadwiseStore =
+ThreadwiseGenericTensorSliceCopy_v2r1<RegisterBufferDesc,
+DstDesc,
+NormalTensorCoordinate<RegisterBufferDesc>,
+DstCoordinate,
+SubLengths,
+DstDimAccessOrder,
+DstDimAccessOrder,
+DstVectorAccessDim,
+DstVectorAccessDim,
+1,
+DstDataPerAccess>;
+#endif
 ThreadwiseLoad mThreadwiseLoad;
 ThreadwiseStore mThreadwiseStore;
 };

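In the new pair above, the load leg vectorizes only the source side (SrcDataPerAccess, 1) and the store leg only the destination side (1, DstDataPerAccess): the packed register buffer in between is accessed one scalar at a time, so each leg can pick its own width. A standalone sketch of that decoupling (illustrative, not CK code; memcpy stands in for a vector load/store):

#include <cstdio>
#include <cstring>

// The staging buffer lets the read width and the write width differ:
// here the load leg moves 4 floats per access and the store leg 2.
int main()
{
    float src[8], dst[8], buffer[8];
    for(int i = 0; i < 8; ++i) src[i] = static_cast<float>(i);

    for(int i = 0; i < 8; i += 4) // load leg: width-4 accesses on src
        std::memcpy(&buffer[i], &src[i], 4 * sizeof(float));

    for(int i = 0; i < 8; i += 2) // store leg: width-2 accesses on dst
        std::memcpy(&dst[i], &buffer[i], 2 * sizeof(float));

    std::printf("%g\n", dst[7]); // prints 7
    return 0;
}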
@@ -18,6 +18,10 @@
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
 #endif
 
+#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
+#endif
+
 namespace ck {
 
 // This threadwise copy allows vector access of src and dst.

@@ -590,5 +594,313 @@ struct ThreadwiseGenericTensorSliceCopy_v2
 DstCoordinate mDstSliceOrigin;
 };
 
+#if 1
+// This threadwise copy allows vector access of src and dst.
+// It allows the dimensions of vector access to be different on src and dst.
+// It also allows the vector size to be different on src and dst.
+// It also allows the order of access to be different on src and dst.
+// It uses registers as a buffer to hold all data moving from src to dst.
+// It is designed for copying a small amount of data, where src and dst are
+// device memory or LDS.
+// When copying a large amount of data, let's hope the compiler will reduce the
+// registers used for the buffer.
+template <class SrcDesc,
+class DstDesc,
+class SrcCoordinate,
+class DstCoordinate,
+class SliceLengths,
+class SrcDimAccessOrder,
+class DstDimAccessOrder,
+index_t SrcVectorAccessDim,
+index_t DstVectorAccessDim,
+index_t SrcDataPerAccess,
+index_t DstDataPerAccess>
+struct ThreadwiseGenericTensorSliceCopy_v2r1
+{
+static constexpr index_t nDim = SliceLengths::GetSize();
+
+__device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1(SrcCoordinate src_slice_origin,
+DstCoordinate dst_slice_origin)
+: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
+{
+}
+
+__device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1()
+: ThreadwiseGenericTensorSliceCopy_v2r1(make_zero_array<index_t, nDim>(),
+make_zero_array<index_t, nDim>())
+{
+}
+
+__device__ void SetSrcSliceOrigin(SrcCoordinate src_slice_origin)
+{
+mSrcSliceOrigin = src_slice_origin;
+}
+
+__device__ void SetDstSliceOrigin(DstCoordinate dst_slice_origin)
+{
+mDstSliceOrigin = dst_slice_origin;
+}
+
+template <class TDesc, class Lengths>
+struct IsolateMergedDimLengths
+{
+template <class IDim>
+__device__ constexpr index_t operator()(IDim idim) const
+{
+return TDesc::ContainMultipleOriginalDimensions(idim) ? Lengths{}[idim] : 1;
+}
+};
+
+template <class TData>
+__device__ void Run(const TData* p_src, TData* p_dst) const
+{
+constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
+
+TData p_buffer_[buffer_desc.GetElementSpace()];
+TData* p_buffer = p_buffer_;
+
+// copy data from src into buffer
+{
+using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
+
+constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
+constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
+
+constexpr auto src_access_lengths = SliceLengths::Modify(
+src_vector_access_dim,
+SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
+
+// Offsets w.r.t. merged dimensions need to be calculated at run-time; offsets
+// w.r.t. normal dimensions are known at compile time.
+// Below is a hack to isolate merged dimension ids from normal dimension ids, so the
+// corresponding offsets can be calculated separately at run-time and compile-time.
+// src_merged_dim_access_lengths has the same value as src_access_lengths on src's
+// merged dimensions, and has value = 1 on normal dimensions;
+// src_normal_dim_access_lengths has the same value as src_access_lengths on src's
+// normal dimensions, and has value = 1 on merged dimensions.
+constexpr auto src_merged_dim_access_lengths = typename sequence_gen<
+nDim,
+IsolateMergedDimLengths<SrcDesc, decltype(src_access_lengths)>>::type{};
+
+constexpr auto src_normal_dim_access_lengths =
+src_access_lengths + Number<1>{} - src_merged_dim_access_lengths;
+
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
+// offset w.r.t. merged dimensions needs to be computed at run-time
+static_ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}([&](
+auto src_merged_dim_access_id_) {
+
+constexpr auto src_merged_dim_access_id = decltype(src_merged_dim_access_id_){};
+
+constexpr auto src_merged_dim_data_id = src_merged_dim_access_id.Modify(
+src_vector_access_dim,
+src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access);
+
+const TData* p_src_tmp =
+p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
+
+// offset w.r.t. normal dimensions can be computed at compile-time
+static_ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}([&](
+auto src_normal_dim_access_id_) {
+
+constexpr auto src_normal_dim_access_id = decltype(src_normal_dim_access_id_){};
+
+constexpr auto src_normal_dim_data_id = src_normal_dim_access_id.Modify(
+src_vector_access_dim,
+src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access);
+
+constexpr index_t src_normal_offset =
+SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);
+
+// load vector from src
+const src_vector_t vector_data =
+*reinterpret_cast<const src_vector_t*>(&p_src_tmp[src_normal_offset]);
+
+// unpack vector into buffer
+static_for<0, SrcDataPerAccess, 1>{}([&](auto i) {
+constexpr auto scalar_id =
+typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
+src_vector_access_dim, i);
+
+constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
+src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);
+
+p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
+});
+});
+});
+#else
+ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}([&](
+auto src_merged_dim_access_id) {
+
+auto src_merged_dim_data_id = src_merged_dim_access_id;
+src_merged_dim_data_id(src_vector_access_dim) =
+src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access;
+
+const TData* p_src_tmp =
+p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
+
+// these should be compile-time known
+ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}([&](
+auto src_normal_dim_access_id) {
+
+auto src_normal_dim_data_id = src_normal_dim_access_id;
+src_normal_dim_data_id(src_vector_access_dim) =
+src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access;
+
+const index_t src_normal_offset =
+SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);
+
+// load vector from src
+const src_vector_t vector_data =
+*reinterpret_cast<const src_vector_t*>(&p_src_tmp[src_normal_offset]);
+
+// unpack vector into buffer
+for(index_t i = 0; i < SrcDataPerAccess; ++i)
+{
+auto scalar_id = make_zero_array<index_t, nDim>();
+scalar_id(src_vector_access_dim) = i;
+
+const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
+src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);
+
+p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
+}
+});
+});
+#endif
+}
+
+// copy data from buffer into dst
+{
+using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
+
+constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
+constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
+
+constexpr auto dst_access_lengths = SliceLengths::Modify(
+dst_vector_access_dim,
+SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);
+
+constexpr auto dst_merged_dim_access_lengths = typename sequence_gen<
+nDim,
+IsolateMergedDimLengths<DstDesc, decltype(dst_access_lengths)>>::type{};
+
+constexpr auto dst_normal_dim_access_lengths =
+dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths;
+
+#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
+// offset w.r.t. merged dimensions needs to be computed at run-time
+static_ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}([&](
+auto dst_merged_dim_access_id_) {
+
+constexpr auto dst_merged_dim_access_id = decltype(dst_merged_dim_access_id_){};
+
+constexpr auto dst_merged_dim_data_id = dst_merged_dim_access_id.Modify(
+dst_vector_access_dim,
+dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access);
+
+TData* p_dst_tmp = p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
+
+// offset w.r.t. normal dimensions can be computed at compile-time
+static_ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}([&](
+auto dst_normal_dim_access_id_) {
+constexpr auto dst_normal_dim_access_id = decltype(dst_normal_dim_access_id_){};
+
+constexpr auto dst_normal_dim_data_id = dst_normal_dim_access_id.Modify(
+dst_vector_access_dim,
+dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access);
+
+dst_vector_t vector_data;
+
+// pack vector from buffer
+static_for<0, DstDataPerAccess, 1>{}([&](auto i) {
+constexpr auto scalar_id =
+typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
+dst_vector_access_dim, i);
+
+constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
+dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);
+
+reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
+});
+
+constexpr index_t dst_normal_offset =
+DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
+
+// write vector into dst
+*reinterpret_cast<dst_vector_t*>(&p_dst_tmp[dst_normal_offset]) = vector_data;
+});
+});
+#else
+// offset w.r.t. merged dimensions needs to be computed at run-time
+ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}([&](
+auto dst_merged_dim_access_id) {
+
+auto dst_merged_dim_data_id = dst_merged_dim_access_id;
+dst_merged_dim_data_id(dst_vector_access_dim) =
+dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
+
+TData* p_dst_tmp = p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
+
+// offset w.r.t. normal dimensions can be computed at compile-time
+ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}([&](
+auto dst_normal_dim_access_id) {
+
+auto dst_normal_dim_data_id = dst_normal_dim_access_id;
+dst_normal_dim_data_id(dst_vector_access_dim) =
+dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
+
+dst_vector_t vector_data;
+
+// pack vector from buffer
+for(index_t i = 0; i < DstDataPerAccess; ++i)
+{
+auto scalar_id = make_zero_array<index_t, nDim>();
+scalar_id(dst_vector_access_dim) = i;
+
+const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
+dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);
+
+reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
+}
+
+const index_t dst_normal_offset =
+DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
+
+// write vector into dst
+*reinterpret_cast<dst_vector_t*>(&p_dst_tmp[dst_normal_offset]) = vector_data;
+});
+});
+#endif
+}
+}
+
+// T can be Sequence or Array
+template <class T, bool PositiveDirection>
+__device__ void MoveSrcSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
+{
+static_if<PositiveDirection>{}([&](auto) {
+mSrcSliceOrigin += step_sizes;
+}).Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
+}
+
+template <class T, bool PositiveDirection>
+__device__ void MoveDstSlicingWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
+{
+static_if<PositiveDirection>{}([&](auto) {
+mDstSliceOrigin += step_sizes;
+}).Else([&](auto) { mDstSliceOrigin -= step_sizes; });
+}
+
+private:
+SrcCoordinate mSrcSliceOrigin;
+DstCoordinate mDstSliceOrigin;
+};
+#endif
+
 } // namespace ck
 #endif

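The access-length arithmetic in Run() divides the slice length along the vector-access dimension by the data-per-access width, so the loops iterate over vector transactions rather than scalars. A worked example under the divisibility assumption the code relies on (illustrative lengths):

#include <cassert>
#include <cstddef>

// stand-ins for SliceLengths, SrcVectorAccessDim, SrcDataPerAccess
int main()
{
    std::size_t slice_lengths[2] = {4, 8};
    const std::size_t vector_access_dim = 1;
    const std::size_t data_per_access = 4;

    // src_access_lengths = SliceLengths with the vectorized dimension
    // divided: {4, 8 / 4} = {4, 2}
    std::size_t access_lengths[2] = {slice_lengths[0], slice_lengths[1]};
    access_lengths[vector_access_dim] /= data_per_access;
    assert(access_lengths[0] == 4 && access_lengths[1] == 2);

    // access id {i, j} maps back to data id {i, 4 * j};
    // 4 * 2 = 8 vector loads cover all 32 scalars of the slice
    return 0;
}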
@@ -11,6 +11,7 @@
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
 
 namespace ck {
 

@@ -13,6 +13,7 @@
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
 
 namespace ck {
 

@@ -3,7 +3,7 @@
 #include "device.hpp"
 #include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
+//#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp"
 
 template <class T,

@@ -94,6 +94,8 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
 
 constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
 constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+
+constexpr index_t OutThreadCopyDataPerAccess_W = 1;
 #elif 1
 // each thread holds 64 data
 constexpr index_t BlockSize = 256;

@@ -214,7 +216,8 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
 WeiBlockCopySrcAccessOrder,
 WeiBlockCopyDstAccessOrder,
 WeiBlockCopySrcDataPerRead_E,
-WeiBlockCopyDstDataPerWrite_K>{};
+WeiBlockCopyDstDataPerWrite_K,
+OutThreadCopyDataPerAccess_W>{};
 
 float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
 dim3(GridSize),

@@ -3,7 +3,7 @@
 #include "device.hpp"
 #include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
+//#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
 
 using namespace ck;

@@ -55,7 +55,6 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
 out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
 
 #if 1
-// 1x1 filter, 8x8 image
 constexpr index_t BlockSize = 256;
 
 constexpr index_t BPerBlock = 128;

@@ -86,8 +85,45 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
 using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
 using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
 
-constexpr index_t WeiBlockCopySrcDataPerRead_E = 1;
+constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
 constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+
+constexpr index_t OutThreadCopyDataPerAccess_B = 1;
+#elif 0 // debug
+constexpr index_t BlockSize = 256;
+
+constexpr index_t BPerBlock = 128;
+constexpr index_t KPerBlock = 128;
+constexpr index_t EPerBlock = 8;
+
+constexpr index_t GemmMPerThreadSubC = 4;
+constexpr index_t GemmNPerThreadSubC = 4;
+constexpr index_t GemmMLevel0Cluster = 4;
+constexpr index_t GemmNLevel0Cluster = 4;
+constexpr index_t GemmMLevel1Cluster = 4;
+constexpr index_t GemmNLevel1Cluster = 4;
+constexpr index_t GemmKPerThreadLoop = 1;
+constexpr index_t GemmDataPerReadA = 4;
+constexpr index_t GemmDataPerReadB = 4;
+
+using InBlockCopySubLengths_E_B = Sequence<1, 4>;
+using InBlockCopyClusterLengths_E_B = Sequence<8, 32>;
+using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
+using InBlockCopySrcAccessOrder = Sequence<0, 1>; // [E, B]
+using InBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, B]
+
+constexpr index_t InBlockCopyDataPerAccess_B = 1;
+
+using WeiBlockCopySubLengths_E_K = Sequence<4, 1>;
+using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>;
+using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
+using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
+using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
+
+constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
+constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+
+constexpr index_t OutThreadCopyDataPerAccess_B = 1;
 #elif 1
 // 1x1 filter, 8x8 image
 constexpr index_t BlockSize = 256;

@@ -106,13 +142,13 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
 constexpr index_t GemmDataPerReadA = 4;
 constexpr index_t GemmDataPerReadB = 4;
 
-using InBlockCopySubLengths_E_B = Sequence<2, 2>;
-using InBlockCopyClusterLengths_E_B = Sequence<4, 64>;
+using InBlockCopySubLengths_E_B = Sequence<1, 4>;
+using InBlockCopyClusterLengths_E_B = Sequence<8, 32>;
 using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
 using InBlockCopySrcAccessOrder = Sequence<0, 1>; // [E, B]
 using InBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, B]
 
-constexpr index_t InBlockCopyDataPerAccess_B = 1;
+constexpr index_t InBlockCopyDataPerAccess_B = 4;
 
 using WeiBlockCopySubLengths_E_K = Sequence<4, 1>;
 using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>;

@@ -120,8 +156,10 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
 using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
 using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
 
-constexpr index_t WeiBlockCopySrcDataPerRead_E = 1;
+constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
 constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+
+constexpr index_t OutThreadCopyDataPerAccess_B = 4;
 #endif
 
 constexpr index_t B = N * Ho * Wo;

@@ -169,7 +207,8 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
 WeiBlockCopySrcAccessOrder,
 WeiBlockCopyDstAccessOrder,
 WeiBlockCopySrcDataPerRead_E,
-WeiBlockCopyDstDataPerWrite_K>{};
+WeiBlockCopyDstDataPerWrite_K,
+OutThreadCopyDataPerAccess_B>{};
 
 for(index_t i = 0; i < nrepeat; ++i)
 {