diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp
index d9d948415f..6cedb7f02a 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp
@@ -7,6 +7,10 @@
 #include "blockwise_generic_tensor_slice_copy.hpp"
 #include "threadwise_generic_tensor_slice_copy.hpp"
 #include "blockwise_batched_gemm.hpp"
+#include "blockwise_2d_tensor_op.hpp"
+#include "blockwise_4d_tensor_op.hpp"
+#include "threadwise_tensor_slice_copy.hpp"
+#include "threadwise_4d_tensor_op.hpp"
 
 namespace ck {
 
@@ -129,10 +133,20 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
         constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
             Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});
 
+#if 1
         // blockwise copy
         // input: format is [C, Hi, Wi, N]
+        const auto blockwise_in_copy =
+            Blockwise4dTensorCopy3<BlockSize,
+                                   Float,
+                                   decltype(in_c_h_w_n_global_desc),
+                                   decltype(in_c_h_w_n_block_desc),
+                                   decltype(in_c_h_w_n_block_desc.GetLengths()),
+                                   InBlockCopyClusterLengths_CHWN,
+                                   InBlockCopyDataPerAccess_N>{};
+#else
         auto blockwise_in_copy =
-            BlockwiseGenericTensorSliceCopy_v2<BlockSize,
+            BlockwiseGenericTensorSliceCopy_v1<BlockSize,
                                                decltype(in_c_h_w_n_global_desc),
                                                decltype(in_c_h_w_n_block_desc),
                                                decltype(in_c_h_w_n_block_desc.GetLengths()),
@@ -146,11 +160,21 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
                                                InBlockCopyDataPerAccess_N,
                                                InBlockCopyDataPerAccess_N>({0, 0, 0, 0},
                                                                            {0, 0, 0, 0});
+#endif
 
+#if 1
         // blockwise wei copy
         //   format is [CPerBlock, X * KPerBlock]
         const auto blockwise_wei_copy =
-            BlockwiseGenericTensorSliceCopy_v2<BlockSize,
+            Blockwise2dTensorCopy3<BlockSize,
+                                   Float,
+                                   decltype(wei_c_k_global_desc),
+                                   decltype(wei_c_k_block_desc),
+                                   decltype(wei_c_k_block_desc.GetLengths()),
+                                   WeiBlockCopyDataPerAccess_K>({0, 0}, {0, 0});
+#else
+        const auto blockwise_wei_copy =
+            BlockwiseGenericTensorSliceCopy_v1<BlockSize,
                                                decltype(wei_c_k_global_desc),
                                                decltype(wei_c_k_block_desc),
                                                decltype(wei_c_k_block_desc.GetLengths()),
@@ -163,6 +187,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
                                                1,
                                                WeiBlockCopyDataPerAccess_K,
                                                WeiBlockCopyDataPerAccess_K>({0, 0}, {0, 0});
+#endif
 
         // a series of blockwise batched GEMM
         // C_matrix += transpose(A_matrix) * B_matrix
@@ -402,7 +427,15 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
                                                 wo_block_data_begin + wo_thread_data_begin,
                                                 n_block_data_begin + n_thread_data_begin);
 
-            ThreadwiseGenericTensorSliceCopy_v2r1<decltype(out_10d_thread_desc),
+#if 1
+            threadwise_tensor_slice_copy(out_10d_thread_desc,
+                                         p_out_thread,
+                                         out_10d_global_desc,
+                                         p_out_thread_on_global,
+                                         out_10d_thread_desc.GetLengths(),
+                                         Number<OutThreadCopyDataPerAccess_N>{});
+#else
+            ThreadwiseGenericTensorSliceCopy_v1r1<decltype(out_10d_thread_desc),
                                                   decltype(out_10d_global_desc),
                                                   decltype(out_10d_thread_desc.GetLengths()),
                                                   arithmetic_sequence_gen<0, 10, 1>::type,
@@ -413,6 +446,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
                                                   OutThreadCopyDataPerAccess_N>(
                 make_zero_array<index_t, 10>(), make_zero_array<index_t, 10>())
                 .Run(p_out_thread, p_out_thread_on_global);
+#endif
         }).Else([&](auto fwd) {
             static_assert(fwd(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock &&
                               GemmNPerThreadSubC % NPerThread == 0,
@@ -460,7 +494,15 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
                                                 wo_block_data_begin + wo_thread_data_begin,
                                                 n_block_data_begin + n_thread_data_begin);
 
-            ThreadwiseGenericTensorSliceCopy_v2r1<decltype(out_10d_thread_desc),
+#if 1
+            threadwise_tensor_slice_copy(out_10d_thread_desc,
+                                         p_out_thread,
+                                         out_10d_global_desc,
+                                         p_out_thread_on_global,
+                                         out_10d_thread_desc.GetLengths(),
+                                         Number<OutThreadCopyDataPerAccess_N>{});
+#else
+            ThreadwiseGenericTensorSliceCopy_v1r1<decltype(out_10d_thread_desc),
                                                   decltype(out_10d_global_desc),
                                                   decltype(out_10d_thread_desc.GetLengths()),
                                                   arithmetic_sequence_gen<0, 10, 1>::type,
@@ -471,6 +513,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
                                                   OutThreadCopyDataPerAccess_N>(
                 make_zero_array<index_t, 10>(), make_zero_array<index_t, 10>())
                 .Run(p_out_thread, p_out_thread_on_global);
+#endif
         });
     }
 };
diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp
index 77ed7c052b..8fa701ccee 100644
--- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp
+++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp
@@ -301,14 +301,14 @@ struct TensorCoordinate
     private:
     template <class... Ts>
     __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
+        MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
     {
         return NormalTensorCoordinate<ConstantTensorDescriptor<Ts...>>();
     }
 
     template <class... Ts>
     __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
+        MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
     {
         return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
     }
diff --git a/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp b/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
index 11908b0816..8fb5c79e07 100644
--- a/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
@@ -143,7 +143,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
     constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
 
     constexpr index_t OutThreadCopyDataPerAccess_N = 2;
-#elif 1
+#elif 0
     // for 3x3, 34x34, v1r3, Pascal
     // for 3x3, 28x28, v1r3, Pascal
     // for 3x3, 14x14, v1r3, Pascal
@@ -266,9 +266,12 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
     constexpr index_t GemmDataPerReadA   = 4;
     constexpr index_t GemmDataPerReadB   = 4;
 
+    using InBlockCopySubLengths_CHWN             = Sequence<1, 1, 1, 4>;
     using InBlockCopyClusterLengths_CHWN         = Sequence<8, 2, 4, 4>;
     constexpr index_t InBlockCopyDataPerAccess_N = 4;
 
+    using WeiBlockCopySubLengths_CK               = Sequence<1, 4>;
+    using WeiBlockCopyClusterLengths_CK           = Sequence<8, 32>;
     constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
 
     constexpr index_t OutThreadCopyDataPerAccess_N = 4;
diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp
index 7ea05e243e..40d6c5b6fe 100644
--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp
@@ -71,7 +71,7 @@ int main(int argc, char* argv[])
 {
     using namespace ck;
 
-#if 1
+#if 0
     constexpr index_t N  = 64;
     constexpr index_t C  = 1536;
     constexpr index_t HI = 8;