diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp
index 22c5d11564..625e0a1f85 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp
@@ -6,6 +6,7 @@
 #include "ConstantMatrixDescriptor.hpp"
 #include "blockwise_2d_tensor_op.hpp"
 #include "blockwise_4d_tensor_op.hpp"
+#include "blockwise_generic_tensor_slice_copy.hpp"
 #include "threadwise_tensor_slice_copy.hpp"
 #include "threadwise_4d_tensor_op.hpp"
 #include "blockwise_batched_gemm.hpp"
@@ -122,8 +123,9 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
         constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
             Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
 
-        // blockwise copy
-        // input: format is [C, Hi, Wi, N]
+// blockwise copy
+// input: format is [C, Hi, Wi, N]
+#if 0
         const auto blockwise_in_copy =
             Blockwise4dTensorCopy3<BlockSize,
                                    Float,
@@ -132,6 +134,39 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
                                    decltype(in_c_h_w_n_block_desc.GetLengths()),
                                    InBlockCopyClusterLengths_CHWN,
                                    InBlockCopyDataPerRead_N>{};
+#elif 0
+        using InBlockCopySubLengths_CHWN =
+            decltype(in_c_h_w_n_block_desc.GetLengths() / InBlockCopyClusterLengths_CHWN{});
+
+        auto blockwise_in_copy =
+            BlockwiseGenericTensorSliceCopy_v1<BlockSize,
+                                               Float,
+                                               decltype(in_c_h_w_n_global_desc),
+                                               decltype(in_c_h_w_n_block_desc),
+                                               decltype(in_c_h_w_n_block_desc.GetLengths()),
+                                               InBlockCopySubLengths_CHWN,
+                                               InBlockCopyClusterLengths_CHWN,
+                                               Sequence<0, 1, 2, 3>,
+                                               Sequence<0, 1, 2, 3>,
+                                               Sequence<0, 1, 2, 3>,
+                                               1,
+                                               1>({0, 0, 0, 0}, {0, 0, 0, 0});
+#elif 1
+        using InBlockCopySubLengths_CHWN =
+            decltype(in_c_h_w_n_block_desc.GetLengths() / InBlockCopyClusterLengths_CHWN{});
+
+        auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v2<
+            BlockSize,
+            Float,
+            decltype(in_c_h_w_n_global_desc),
+            decltype(in_c_h_w_n_block_desc),
+            NormalTensorCoordinate<decltype(in_c_h_w_n_global_desc)>,
+            NormalTensorCoordinate<decltype(in_c_h_w_n_block_desc)>,
+            decltype(in_c_h_w_n_block_desc.GetLengths()),
+            InBlockCopySubLengths_CHWN,
+            InBlockCopyClusterLengths_CHWN,
+            Sequence<0, 1, 2, 3>>({0, 0, 0, 0}, {0, 0, 0, 0});
+#endif
 
         // blockwise wei copy
         //   format is [CPerBlock, X * KPerBlock]
diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
index 1c5d6ef5e4..4ab5cf8efb 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
@@ -224,7 +224,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
 
         // c_thread_mtx definition: this is a mess
         // TODO:: more elegent way of defining c_thread_mtx
-        constexpr auto c_k0k2_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor(
+        constexpr auto c_k0k2_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor_packed(
             Number<GemmMRepeat * GemmMPerThreadSubC>{}, Number<N1 * N2>{});
 
         const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2<
@@ -258,9 +258,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
                                                 GemmDataPerReadB);
 
         constexpr index_t in_block_space =
-            in_e_n1_b_n2_block_desc.GetElementSpace(Number<max_align>{});
+            math::integer_least_multiple(in_e_n1_b_n2_block_desc.GetElementSpace(), max_align);
 
-        constexpr index_t wei_block_space = wei_e_k_block_desc.GetElementSpace(Number<max_align>{});
+        constexpr index_t wei_block_space =
+            math::integer_least_multiple(wei_e_k_block_desc.GetElementSpace(), max_align);
 
         __shared__ Float p_in_block[in_block_space];
         __shared__ Float p_wei_block[wei_block_space];
diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
index 40a44ee5df..2fd7c9c6bd 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
@@ -161,7 +161,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
             Float,
             decltype(wei_e_k_global_desc),
             decltype(wei_e_k_block_desc),
-            MergedTensorCoordinate<decltype(wei_e_k_global_desc)>,
+            NormalTensorCoordinate<decltype(wei_e_k_global_desc)>,
             NormalTensorCoordinate<decltype(wei_e_k_block_desc)>,
             decltype(wei_e_k_block_desc.GetLengths()),
             WeiBlockCopySubLengths_E_K,
@@ -301,7 +301,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
                                                         b_thread_data_on_global % B1});
 
             threadwise_out_copy.Run(p_out_thread, p_out_thread_on_global);
-#else
+#elif 0
             // This is a hack, because slicing a merged dimension is not supported yet.
             // This should be replaced with logic above, once slicing a merged dimension support
             // become available
@@ -328,6 +328,49 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
                  k_thread_data_on_global % K1,
                  b_thread_data_on_global});
 
+            for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
+            {
+                threadwise_out_copy.Run(p_out_thread, p_out_global);
+
+                threadwise_out_copy.MoveSrcSlicingWindow({0, 0, GemmNPerThreadSubC}, true);
+                threadwise_out_copy.MoveDstSlicingWindow({0, 0, B1}, true);
+            }
+#elif 1
+            // This is a hack, because slicing a merged dimension is not supported yet.
+            // This should be replaced with logic above, once slicing a merged dimension support
+            // become available
+            //     dst descriptor
+            constexpr auto out_k0_k1_b_global_desc =
+                make_ConstantMergedTensorDescriptor(out_n_k_h_w_global_desc.Fold(I1, Number<K1>{}),
+                                                    Sequence<1>{},
+                                                    Sequence<2>{},
+                                                    Sequence<0, 3, 4>{});
+
+            //     src descriptor
+            constexpr auto out_k0_k1_b_thread_desc = make_ConstantTensorDescriptor_packed(
+                Sequence<GemmMRepeat, GemmMPerThreadSubC, GemmNRepeat * GemmNPerThreadSubC>{});
+
+            auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2<
+                Float,
+#if 1 // debug
+                decltype(out_k0_k1_b_thread_desc),
+                decltype(out_k0_k1_b_global_desc),
+                NormalTensorCoordinate<decltype(out_k0_k1_b_thread_desc)>,
+                MergedTensorCoordinate<decltype(out_k0_k1_b_global_desc)>,
+#else
+                decltype(out_k0_k1_b_thread_desc),
+                decltype(
+                    make_ConstantTensorDescriptor_packed(out_k0_k1_b_global_desc.GetLengths())),
+                NormalTensorCoordinate<decltype(out_k0_k1_b_thread_desc)>,
+                NormalTensorCoordinate<decltype(
+                    make_ConstantTensorDescriptor_packed(out_k0_k1_b_global_desc.GetLengths()))>,
+#endif
+                Sequence<GemmMRepeat, GemmMPerThreadSubC, GemmNPerThreadSubC>>(
+                {0, 0, 0},
+                {k_thread_data_on_global / K1,
+                 k_thread_data_on_global % K1,
+                 b_thread_data_on_global});
+
             for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
             {
                 threadwise_out_copy.Run(p_out_thread, p_out_global);
diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp
index 4ab3894636..98de85bf6a 100644
--- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp
+++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp
@@ -93,6 +93,17 @@ struct NormalTensorCoordinate
         return coord;
     }
 
+    // reposition point of origin, and return compensated offset
+    __host__ __device__ constexpr index_t RepositionOrigin()
+    {
+        index_t offset_diff = mOffset;
+
+        mIndex  = make_zero_array<index_t, nDim>();
+        mOffset = 0;
+
+        return offset_diff;
+    }
+
     // private:
     Array<index_t, nDim> mIndex;
     index_t mOffset;
@@ -305,6 +316,29 @@ struct MergedTensorCoordinate
         return coord;
     }
 
+    // reposition point of origin, and return compensated offset
+    __host__ __device__ constexpr index_t RepositionOrigin()
+    {
+        index_t offset_diff = 0;
+
+        static_for<0, nDim, 1>{}([&](auto idim_) {
+            constexpr auto idim = decltype(idim_){};
+
+            static_if<!tensor_desc_type::ContainMultipleOriginalDimensions(idim)>{}([&](auto) {
+                constexpr auto idim_original =
+                    tensor_desc_type::GetContainedOriginalDimensions(idim).Front();
+
+                mIndex(idim)                  = 0;
+                mOriginalIndex(idim_original) = 0;
+                mOffset -= mPartialOffsets[idim];
+                offset_diff += mPartialOffsets[idim];
+                mPartialOffsets(idim) = 0;
+            });
+        });
+
+        return offset_diff;
+    }
+
     // private:
     Array<index_t, nDim> mIndex;
     Array<index_t, nOriginalDim> mOriginalIndex;
@@ -312,18 +346,5 @@ struct MergedTensorCoordinate
     index_t mOffset;
 };
 
-#if 0
-// implementation of MergedTensorCoordinate, when index_t is signed integer
-// mPartialOffsets is not needed, if index_t is signed integer type
-template<>
-struct TensorCoordinate<signed_t>
-{
-    private:
-    Array<_t, nDim> mIndex;
-    Array<_t, nOriginalDim> mOriginalIndex;
-    index_t mOffset;
-};
-#endif
-
 } // namespace ck
 #endif
diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
index d005cb84a2..7844856079 100644
--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -140,11 +140,23 @@ struct ThreadwiseGenericTensorSliceCopy_v2
 
     __device__ void Run(const TData* p_src, TData* p_dst) const
     {
+#if 0
         static_ford<SliceLengths>{}([&](auto data_id) {
             p_dst[(mDstSliceOrigin + data_id).GetOffset()] =
                 p_src[(mSrcSliceOrigin + data_id).GetOffset()];
-
         });
+#elif 1
+        auto src_slice_origin = mSrcSliceOrigin;
+        auto dst_slice_origin = mDstSliceOrigin;
+
+        p_src += src_slice_origin.RepositionOrigin();
+        p_dst += dst_slice_origin.RepositionOrigin();
+
+        static_ford<SliceLengths>{}([&](auto data_id) {
+            p_dst[(dst_slice_origin + data_id).GetOffset()] =
+                p_src[(src_slice_origin + data_id).GetOffset()];
+        });
+#endif
     }
 
     __device__ void MoveSrcSlicingWindow(Array<index_t, nDim> step_sizes, bool positive_direction)
diff --git a/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp b/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
index 713834400d..0f5305d196 100644
--- a/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
@@ -478,9 +478,9 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
             GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
 #elif 0
             GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
-#elif 0
-            GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
 #elif 1
+            GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
+#elif 0
             GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
 #endif
             <GridSize,
diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp
index 0deb5d329c..56d20bc20a 100644
--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp
@@ -85,7 +85,7 @@ int main(int argc, char* argv[])
 
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
-#elif 1
+#elif 0
     // 3x3, 34x34
     constexpr index_t N  = 128;
     constexpr index_t C  = 256;
@@ -520,7 +520,7 @@ int main(int argc, char* argv[])
 #if 0
     device_convolution_direct_v2_nchw_kcyx_nkhw
         (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 1
+#elif 0
     device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(
         in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
 #elif 0