From 724e984bfffdbe45b98d31c349d24998ed58b541 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 11 Sep 2019 01:13:13 -0500 Subject: [PATCH] enabling padding for chwn format --- ...plicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp | 371 ++++++++---------- .../multi_index_transform.hpp | 65 ++- .../tensor_coordinate_v2.hpp | 140 +++++-- .../tensor_description/tensor_descriptor.hpp | 78 +++- .../blockwise_generic_tensor_slice_copy.hpp | 161 +++++++- .../threadwise_generic_tensor_slice_copy.hpp | 167 +++++++- composable_kernel/include/utility/array.hpp | 17 +- .../include/utility/array_helper.hpp | 88 ++++- .../include/utility/config_nvidia.hpp.in | 11 +- .../include/utility/sequence.hpp | 6 +- ...implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp | 2 +- driver/src/driver.cpp | 44 +-- 12 files changed, 817 insertions(+), 333 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp index 2c5e1e087b..cfa8e9f02d 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp @@ -18,8 +18,8 @@ template {}; static constexpr auto I11 = Number<11>{}; -#if 0 + static constexpr auto True = integral_constant{}; + static constexpr auto False = integral_constant{}; + +#if 1 __device__ void Run(const Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, Float* const __restrict__ p_out_global) const @@ -73,14 +76,22 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded GemmNPerThreadSubC % NPerThread == 0)), "wrong!"); - constexpr auto True = integral_constant{}; - constexpr auto False = integral_constant{}; + constexpr auto in_c_h_w_n_global_desc_old = InGlobalDesc{}; + constexpr auto wei_c_y_x_k_global_desc_old = WeiGlobalDesc{}; + constexpr auto out_k_h_w_n_global_desc_old = OutGlobalDesc{}; - constexpr auto in_c_h_w_n_global_desc = InGlobalDesc{}; - constexpr auto wei_c_y_x_k_global_desc = WeiGlobalDesc{}; - constexpr auto out_k_h_w_n_global_desc = OutGlobalDesc{}; + constexpr auto in_c_h_w_n_global_desc = make_native_tensor_descriptor( + in_c_h_w_n_global_desc_old.GetLengths(), in_c_h_w_n_global_desc_old.GetStrides()); - constexpr index_t C = in_c_h_w_n_global_desc.GetLength(I0); + constexpr auto wei_c_y_x_k_global_desc = make_native_tensor_descriptor( + wei_c_y_x_k_global_desc_old.GetLengths(), wei_c_y_x_k_global_desc_old.GetStrides()); + + constexpr auto out_k_h_w_n_global_desc = make_native_tensor_descriptor( + out_k_h_w_n_global_desc_old.GetLengths(), out_k_h_w_n_global_desc_old.GetStrides()); + + constexpr index_t C = in_c_h_w_n_global_desc.GetLength(I0); + constexpr index_t Hi = in_c_h_w_n_global_desc.GetLength(I1); + constexpr index_t Wi = in_c_h_w_n_global_desc.GetLength(I2); constexpr index_t K = out_k_h_w_n_global_desc.GetLength(I0); constexpr index_t Ho = out_k_h_w_n_global_desc.GetLength(I1); @@ -111,11 +122,22 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded const index_t wo_block_data_begin = block_work_multi_id[2] * WoPerBlock; const index_t n_block_data_begin = block_work_multi_id[3] * NPerBlock; - const index_t hi_block_data_begin = ho_block_data_begin; - const index_t wi_block_data_begin = wo_block_data_begin; + const index_t hi_block_data_begin = ho_block_data_begin - 
LeftPads{}[0]; + const index_t wi_block_data_begin = wo_block_data_begin - LeftPads{}[1]; + + // input global tensor view + constexpr auto in_c_hp_wp_n_global_desc = transform_tensor_descriptor( + in_c_h_w_n_global_desc, + make_tuple( + PassThrough{}, Pad, LeftPads, RightPads>{}, PassThrough{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); // global tensor view - constexpr auto wei_c_k_global_desc = wei_c_y_x_k_global_desc.Extract(I0, I3); + constexpr auto wei_c_k_global_desc_old = wei_c_y_x_k_global_desc_old.Extract(I0, I3); + + constexpr auto wei_c_k_global_desc = make_native_tensor_descriptor( + wei_c_k_global_desc_old.GetLengths(), wei_c_k_global_desc_old.GetStrides()); // LDS tensor view // be careful of alignment @@ -124,122 +146,81 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded GemmDataPerReadA, GemmDataPerReadB); - constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto in_c_h_w_n_block_desc_old = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); + // hack + constexpr auto in_c_h_w_n_block_desc = make_native_tensor_descriptor( + in_c_h_w_n_block_desc_old.GetLengths(), in_c_h_w_n_block_desc_old.GetStrides()); + // this check is ad-hoc // TODO: need to properly implement tensor descriptor with alignment static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0, "GemmDataPerReadB alignment requirement is not meet"); - constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto wei_c_k_block_desc_old = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); - constexpr auto wei_c_1_1_k_block_desc = make_ConstantTensorDescriptor_aligned( - Sequence{}, Number{}); + constexpr auto wei_c_k_block_desc = make_native_tensor_descriptor( + wei_c_k_block_desc_old.GetLengths(), wei_c_k_block_desc_old.GetStrides()); // LDS: be careful of alignment - constexpr index_t in_block_space = in_c_h_w_n_block_desc.GetElementSpace(); - constexpr index_t wei_block_space = wei_c_k_block_desc.GetElementSpace(); + constexpr index_t in_block_space = in_c_h_w_n_block_desc_old.GetElementSpace(); + constexpr index_t wei_block_space = wei_c_k_block_desc_old.GetElementSpace(); __shared__ Float p_in_block[in_block_space]; __shared__ Float p_wei_block[wei_block_space]; // tensor view of threadwise output in register - constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed( + constexpr auto out_k_h_w_n_thread_desc_old = make_ConstantTensorDescriptor_packed( Sequence{}); -#if 1 + constexpr auto out_k_h_w_n_thread_desc = make_native_tensor_descriptor( + out_k_h_w_n_thread_desc_old.GetLengths(), out_k_h_w_n_thread_desc_old.GetStrides()); + // blockwise input copy // format is [C, Hi, Wi, N] auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v2, - Sequence<0, 1, 2, 3>, - Sequence<0, 1, 2, 3>, - 3, - 3, - InBlockCopyDataPerAccess_N, - InBlockCopyDataPerAccess_N>({0, 0, 0, 0}, - {0, 0, 0, 0}); +#if 0 + BlockwiseGenericTensorSliceCopy_v2 #else - auto in_c_h_w_n_global = make_TensorView(in_c_h_w_n_global_desc, p_in_global); - auto in_c_h_w_n_block = make_TensorView(in_c_h_w_n_block_desc, p_in_block); - - auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v3, - Sequence<0, 1, 2, 3>, - Sequence<0, 1, 2, 3>, - 3, - 3, - InBlockCopyDataPerAccess_N, - InBlockCopyDataPerAccess_N>( - in_c_h_w_n_global, - {0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin}, - in_c_h_w_n_block, - 
{0, 0, 0, 0}); + BlockwiseGenericTensorSliceCopy_v4 #endif + , + Sequence<0, 1, 2, 3>, + Sequence<0, 1, 2, 3>, + 3, + 3, + InBlockCopyDataPerAccess_N, + InBlockCopyDataPerAccess_N>({0, 0, 0, 0}, {0, 0, 0, 0}); -#if 1 // blockwise wei copy // format is [CPerBlock, KPerBlock] const auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v2, - Sequence<0, 1>, - Sequence<0, 1>, - 1, - 1, - WeiBlockCopyDataPerAccess_K, - WeiBlockCopyDataPerAccess_K>({0, 0}, {0, 0}); +#if 0 + BlockwiseGenericTensorSliceCopy_v2 #else - auto wei_c_y_x_k_global = make_TensorView(wei_c_y_x_k_global_desc, p_wei_global); - auto wei_c_1_1_k_block = make_TensorView(wei_c_1_1_k_block_desc, p_wei_block); - - constexpr index_t WeiBlockCopySubLengths_C = WeiBlockCopySubLengths_CK{}[0]; - constexpr index_t WeiBlockCopySubLengths_K = WeiBlockCopySubLengths_CK{}[1]; - - using WeiBlockCopySubLengths_CYXK = - Sequence; - - constexpr index_t WeiBlockCopyClusterLengths_C = WeiBlockCopyClusterLengths_CK{}[0]; - constexpr index_t WeiBlockCopyClusterLengths_K = WeiBlockCopyClusterLengths_CK{}[1]; - - using WeiBlockCopyClusterLengths_CYXK = - Sequence; - - auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v3, - Sequence<0, 1, 2, 3>, - Sequence<0, 1, 2, 3>, - 3, - 3, - WeiBlockCopyDataPerAccess_K, - WeiBlockCopyDataPerAccess_K>( - wei_c_y_x_k_global, {0, 0, 0, k_block_data_begin}, wei_c_1_1_k_block, {0, 0, 0, 0}); + BlockwiseGenericTensorSliceCopy_v4 #endif + , + Sequence<0, 1>, + Sequence<0, 1>, + 1, + 1, + WeiBlockCopyDataPerAccess_K, + WeiBlockCopyDataPerAccess_K>({0, 0}, {0, 0}); // a series of blockwise batched GEMM // C_matrix += transpose(A_matrix) * B_matrix @@ -283,7 +264,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded // register // C++ lambda doesn't capture array, use pointer instead - Float p_out_thread_data[out_k_h_w_n_thread_desc.GetElementSpace()]; + Float p_out_thread_data[out_k_h_w_n_thread_desc_old.GetElementSpace()]; Float* const p_out_thread = p_out_thread_data; // set threadwise output tensor to 0 @@ -296,12 +277,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded { const Float* p_in_global_block_offset = p_in_global + - in_c_h_w_n_global_desc.GetOffsetFromMultiIndex( - 0, hi_block_data_begin + y, wi_block_data_begin + x, n_block_data_begin); + in_c_h_w_n_global_desc.CalculateOffset( + {0, hi_block_data_begin + y, wi_block_data_begin + x, n_block_data_begin}); const Float* p_wei_global_block_offset = p_wei_global + - wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin); + wei_c_y_x_k_global_desc.CalculateOffset({0, y, x, k_block_data_begin}); for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, @@ -390,25 +371,30 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded constexpr index_t K2 = GemmMPerThreadSubC; constexpr index_t K1 = KPerBlock / KPerThread; - constexpr auto out_10d_global_desc = fwd(out_k_h_w_n_global_desc) - .Fold(I3, Number{}, Number{}) - .Fold(I2, Number{}, Number{}) - .Fold(I0, Number{}, Number{}); + constexpr auto out_10d_global_desc_old = fwd(out_k_h_w_n_global_desc_old) + .Fold(I3, Number{}, Number{}) + .Fold(I2, Number{}, Number{}) + .Fold(I0, Number{}, Number{}); - constexpr auto out_10d_thread_desc = fwd(out_k_h_w_n_thread_desc) - .Fold(I3, Number<1>{}, Number{}) - .Fold(I2, Number{}, Number<1>{}) - .Fold(I0, Number<1>{}, Number{}); + constexpr auto out_10d_global_desc = make_native_tensor_descriptor( + out_10d_global_desc_old.GetLengths(), 
out_10d_global_desc_old.GetStrides()); - Float* p_out_thread_on_global = p_out_global + - out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin); + constexpr auto out_10d_thread_desc_old = fwd(out_k_h_w_n_thread_desc_old) + .Fold(I3, Number<1>{}, Number{}) + .Fold(I2, Number{}, Number<1>{}) + .Fold(I0, Number<1>{}, Number{}); -#if 1 - ThreadwiseGenericTensorSliceCopy_v1r2::type, @@ -417,19 +403,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded OutThreadCopyDataPerAccess_N>( make_zero_array(), make_zero_array()) .Run(p_out_thread, p_out_thread_on_global); -#elif 0 - ThreadwiseGenericTensorSliceCopy_v1r1::type, - arithmetic_sequence_gen<0, 10, 1>::type, - 9, - 9, - OutThreadCopyDataPerAccess_N, - OutThreadCopyDataPerAccess_N>( - make_zero_array(), make_zero_array()) - .Run(p_out_thread, p_out_thread_on_global); -#endif }).Else([&](auto fwd) { static_assert(fwd(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && GemmNPerThreadSubC % NPerThread == 0, @@ -445,27 +418,32 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded constexpr index_t K2 = GemmMPerThreadSubC; constexpr index_t K1 = KPerBlock / KPerThread; - constexpr auto out_10d_global_desc = - fwd(out_k_h_w_n_global_desc) + constexpr auto out_10d_global_desc_old = + fwd(out_k_h_w_n_global_desc_old) .Fold(I3, Number{}) .Fold(I2, Number{}, Number{}, Number{}) .Fold(I0, Number{}, Number{}); - constexpr auto out_10d_thread_desc = - fwd(out_k_h_w_n_thread_desc) + constexpr auto out_10d_global_desc = make_native_tensor_descriptor( + out_10d_global_desc_old.GetLengths(), out_10d_global_desc_old.GetStrides()); + + constexpr auto out_10d_thread_desc_old = + fwd(out_k_h_w_n_thread_desc_old) .Fold(I3, Number{}) .Fold(I2, Number{}, Number<1>{}, Number{}) .Fold(I0, Number<1>{}, Number{}); - Float* p_out_thread_on_global = p_out_global + - out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin); + constexpr auto out_10d_thread_desc = make_native_tensor_descriptor( + out_10d_thread_desc_old.GetLengths(0), out_10d_thread_desc_old.GetStrides()); -#if 1 - ThreadwiseGenericTensorSliceCopy_v1r2::type, @@ -474,58 +452,13 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded OutThreadCopyDataPerAccess_N>( make_zero_array(), make_zero_array()) .Run(p_out_thread, p_out_thread_on_global); -#elif 0 - ThreadwiseGenericTensorSliceCopy_v1r1::type, - arithmetic_sequence_gen<0, 10, 1>::type, - 9, - 9, - OutThreadCopyDataPerAccess_N, - OutThreadCopyDataPerAccess_N>( - make_zero_array(), make_zero_array()) - .Run(p_out_thread, p_out_thread_on_global); -#endif }); } -#else +#elif 0 __device__ void Run(const Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, Float* const __restrict__ p_out_global) const { -#if 0 - constexpr auto a = make_tuple(true, Sequence<1>{}, index_t(99)); - - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - printf("[0] %d\n", a.At(I0)); - print_Sequence("[1]", a.At(I1)); - printf("[2] %lu\n", a.At(I2)); - } - - bool flag = true; - - auto b = make_tuple(flag, Sequence<1>{}, 99); - - b.At(I0) = false; - - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - printf("[0] %d\n", b.At(I0)); 
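The padded kernel above shifts each block's input window by the left padding (hi_block_data_begin = ho_block_data_begin - LeftPads{}[0]), so border blocks address negative input coordinates that must be predicated away. A minimal host-side sketch of that access pattern; the sizes and the padded_read helper are illustrative stand-ins for the template parameters, not part of the patch:

#include <cstdio>

// Illustrative only: hypothetical sizes standing in for the template parameters.
constexpr int Hi = 4, Wi = 4;             // unpadded input extent
constexpr int LeftPadH = 1, LeftPadW = 1; // left padding, as in LeftPads{}

// Read in[h][w] at padded coordinates; a read that falls in the padding area
// contributes zero, which is what the in-pad predicate in the copy enforces.
float padded_read(const float* in, int hp, int wp)
{
    const int h = hp - LeftPadH; // same shift as hi_block_data_begin above
    const int w = wp - LeftPadW;
    const bool in_pad = h < 0 || h >= Hi || w < 0 || w >= Wi;
    return in_pad ? 0.0f : in[h * Wi + w];
}

int main()
{
    float in[Hi * Wi];
    for(int i = 0; i < Hi * Wi; ++i)
        in[i] = static_cast<float>(i);
    std::printf("%.1f %.1f\n", padded_read(in, 0, 0), padded_read(in, 2, 2)); // 0.0 5.0
}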
- print_Sequence("[1]", b.At(I1)); - printf("[2] %lu\n", b.At(I2)); - - printf("flag %d\n", flag); - } - - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - printf("[0] %d\n", make_tuple(true, Sequence<1>(), index_t(99)).At(I0)); - print_Sequence("[1]", make_tuple(true, Sequence<1>(), index_t(99)).At(I1)); - printf("[2] %d\n", make_tuple(true, Sequence<1>(), index_t(99)).At(I2)); - } -#elif 1 // create a native tensor descriptor constexpr auto in_c_h_w_n_global_desc = make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides()); @@ -540,11 +473,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded constexpr auto in_n_c_hp_wp_global_desc = transform_tensor_descriptor( in_c_h_w_n_global_desc, make_tuple( - Pad, LowerPads, UpperPads>{}, PassThrough{}, PassThrough{}), + Pad, LeftPads, RightPads>{}, PassThrough{}, PassThrough{}), make_tuple(Sequence<1, 2>{}, Sequence<0>{}, Sequence<3>{}), make_tuple(Sequence<2, 3>{}, Sequence<1>{}, Sequence<0>{})); -#if 1 // transformation: {n, c, hp, wp} --> {c, b} // {n, hp, wp} --> {b}, {c} --> {c} constexpr auto in_c_b_global_desc = transform_tensor_descriptor( @@ -553,9 +485,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded PassThrough{}), make_tuple(Sequence<0, 2, 3>{}, Sequence<1>{}), make_tuple(Sequence<1>{}, Sequence<0>{})); -#endif -#if 1 if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) { // 0 @@ -577,16 +507,55 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded printf("in_c_b_global_desc offset: %lu\n", in_c_b_global_desc.CalculateOffset(idx2)); } + } #else - { - index_t c = static_cast(threadIdx.x); - index_t h = static_cast(threadIdx.y); - index_t w = static_cast(threadIdx.z); + __device__ void Run(const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) const + { + // create a native tensor descriptor + constexpr auto in_c_h_w_n_global_desc = + make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides()); - p_out_global[0] = in_n_c_h_w_padded_global_desc.CalculateOffset({1, c, h, w}); + constexpr index_t C = in_c_h_w_n_global_desc.GetLength(I0); + constexpr index_t Hi = in_c_h_w_n_global_desc.GetLength(I1); + constexpr index_t Wi = in_c_h_w_n_global_desc.GetLength(I2); + constexpr index_t N = in_c_h_w_n_global_desc.GetLength(I3); + + // transformation: {c, h, w, n} --> {n, c, hp, wp} + // {h, w} --> {hp, wp}, {c} --> {c}, {n} --> {n} + constexpr auto in_c_hp_wp_n_global_desc = transform_tensor_descriptor( + in_c_h_w_n_global_desc, + make_tuple( + PassThrough{}, Pad, LeftPads, RightPads>{}, PassThrough{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + + if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) + { + // 0 + print_tensor_descriptor("in_c_h_w_n_global_desc", in_c_h_w_n_global_desc); + + // 1 + print_tensor_descriptor("in_c_hp_wp_n_global_desc", in_c_hp_wp_n_global_desc); + + constexpr auto idx1 = MultiIndex<4>{1, 2, 3, 4}; + auto idx0 = in_c_hp_wp_n_global_desc.CalculateLowerIndex(idx1); + + print_array("idx1: ", idx1); + print_array("idx0: ", idx0); + + auto coord1 = make_tensor_coordinate_v2(in_c_hp_wp_n_global_desc, idx1); + + print_array("1: ", coord1.GetIndex()); + print_array("0: ", coord1.GetLowerCoordinate().GetIndex()); + + printf("in_c_hp_wp_n_global_desc is_in_pad: %d\n", + coord1.IsAnyLevelIndexInPaddingArea()); + + 
printf("in_c_hp_wp_n_global_desc offset: %lu\n", + in_c_hp_wp_n_global_desc.CalculateOffset(idx1)); } -#endif -#endif } #endif }; diff --git a/composable_kernel/include/tensor_description/multi_index_transform.hpp b/composable_kernel/include/tensor_description/multi_index_transform.hpp index a89eb2dfb8..d26b4f1efa 100644 --- a/composable_kernel/include/tensor_description/multi_index_transform.hpp +++ b/composable_kernel/include/tensor_description/multi_index_transform.hpp @@ -22,17 +22,27 @@ struct PassThrough __host__ __device__ static constexpr auto GetUpperLengths() { return Sequence{}; } - __host__ __device__ static constexpr auto CalculateLowerIndex(UpperIndex idx_up) + __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) { return idx_up; } - __host__ __device__ static constexpr auto CalculateLowerIndexDiff(UpperIndex idx_up_diff) + __host__ __device__ static constexpr auto + CalculateLowerIndexDiff(const UpperIndex& idx_up_diff, + const UpperIndex& /* idx_up_old */, + const LowerIndex& /* idx_low_old */) { return idx_up_diff; } __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + // TODO: should this function be here? should it be specific for padding check? + __host__ __device__ static constexpr bool + IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */) + { + return false; + } }; // LowLengths: Sequence<...> @@ -55,17 +65,39 @@ struct Pad return GetLowerLengths() + LeftPads{} + RightPads{}; } - __host__ __device__ static constexpr auto CalculateLowerIndex(UpperIndex idx_up) + __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) { return idx_up - LeftPads{}; } - __host__ __device__ static constexpr auto CalculateLowerIndexDiff(UpperIndex idx_up_diff) + __host__ __device__ static constexpr auto + CalculateLowerIndexDiff(const UpperIndex& idx_up_diff, + const UpperIndex& /* idx_up_old */, + const LowerIndex& /* idx_low_old */) { return idx_up_diff; } __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + // TODO: should this function be here? should it be specific for padding check? + __host__ __device__ constexpr bool IsUpperIndexInPaddingArea(const UpperIndex& idx_up) const + { + bool flag = false; + + static_for<0, nDim, 1>{}([&](auto idim) { + // only check if there is left-padding + static_if<(LeftPads::At(idim) != 0)>{}( + [&](auto) { flag = flag || idx_up[idim] < LeftPads::At(idim); }); + + // only check if there is right-padding + static_if<(RightPads::At(idim) != 0)>{}([&](auto) { + flag = flag || idx_up[idim] >= LeftPads::At(idim) + LowLengths::At(idim); + }); + }); + + return flag; + } }; // LowLengths: Sequence<...> @@ -124,7 +156,7 @@ struct Merge .PushBack(Number<1>{}); // calculate index in each of the dimensions in the order of their dimension -#if 1 +#if 1 // would compile to same ISA? 
static_for<0, nDimLow - 1, 1>{}( lambda_CalculateLowerIndex(itmp, idx_low)); @@ -138,8 +170,10 @@ struct Merge } // idx_low_diff depends on idx_low_old, so idx_low need to be up-to-date - __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff, - const LowerIndex& idx_low_old) + __host__ __device__ static constexpr auto + CalculateLowerIndexDiff(const UpperIndex& idx_up_diff, + const UpperIndex& /* idx_up_old */, + const LowerIndex& idx_low_old) { LowerIndex idx_low_diff; @@ -149,6 +183,13 @@ struct Merge } __host__ __device__ static constexpr bool IsLinearTransform() { return false; } + + // TODO: should this function be here? should it be specific for padding check? + __host__ __device__ static constexpr bool + IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */) + { + return false; + } }; // UpLengths: Sequence<...> @@ -189,7 +230,10 @@ struct Unmerge return idx_low; } - __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff) + __host__ __device__ static constexpr auto + CalculateLowerIndexDiff(const UpperIndex& idx_up_diff, + const UpperIndex& /* idx_up_old */, + const LowerIndex& /* idx_low_old */) { return CalculateLowerIndex(idx_up_diff); } @@ -240,7 +284,10 @@ struct Embed return idx_low; } - __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff) + __host__ __device__ static constexpr auto + CalculateLowerIndexDiff(const UpperIndex& idx_up_diff, + const UpperIndex& /* idx_up_old */, + const LowerIndex& /* idx_low_old */) { LowerIndex idx_low_diff{0}; diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp index 7c8f3a390e..b7191f6c8b 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp @@ -3,26 +3,28 @@ #include "common_header.hpp" #include "dimension.hpp" -#include "dimension_transform.hpp" +#include "multi_index_transform.hpp" #include "tensor_descriptor.hpp" namespace ck { -template +template +struct TensorCoordinate_v2; + +template struct NativeTensorCoordinate { - using type = NativeTensorCoordinate; - using tensor_desc_type = NativeTensorDesc; - using Index = tensor_desc_type::Index; - - static constexpr index_t nDim = Index::GetSize(); + using type = NativeTensorCoordinate; + using tensor_desc_type = NativeTensorDesc; + static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension(); + using Index = MultiIndex; __host__ __device__ constexpr NativeTensorCoordinate(Index idx) - : mOffset{GetTensorDesriptor().GetOffset(idx)} + : mIndex(idx), mOffset(tensor_desc_type::CalculateOffset(idx)) { } - template + template __host__ __device__ constexpr NativeTensorCoordinate(Xs... 
xs) : NativeTensorCoordinate(Index{xs...}) { @@ -36,82 +38,103 @@ struct NativeTensorCoordinate __host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; } - __host__ __device__ constexpr index_t GetOffset() const { return mOffset; } + __host__ __device__ constexpr const Index& GetIndex() const { return mIndex; } - __host__ __device__ type operator+=(Index idx_diff) + __host__ __device__ constexpr const index_t& GetOffset() const { return mOffset; } + + __host__ __device__ constexpr type operator+=(const Index& idx_diff) { - mOffset += tensor_desc_type::GetOffsetDiff(idx_diff); + // mIndex is updated here, but some (or all) of its entries may never be used + mIndex += idx_diff; + + mOffset += tensor_desc_type::CalculateOffset(idx_diff); return *this; } - __host__ __device__ type operator-=(Index idx_diff) + __host__ __device__ constexpr type operator-=(const Index& idx_diff) { - mOffset -= tensor_desc_type::GetOffsetFromMultiIndex(idx_diff); + // mIndex is updated here, but some (or all) of its entries may never be used + mIndex -= idx_diff; + + mOffset -= tensor_desc_type::CalculateOffset(idx_diff); return *this; } - __host__ __device__ constexpr type operator+(Index idx_diff) const + __host__ __device__ constexpr type operator+(const Index& idx_diff) const { type coord = *this; coord += idx_diff; return coord; } - __host__ __device__ constexpr type operator-(Index idx_diff) const + __host__ __device__ constexpr type operator-(const Index& idx_diff) const { type coord = *this; coord -= idx_diff; return coord; } + // TODO: should this function be here? should it be specific for padding check? + __host__ __device__ static constexpr bool IsAnyLevelIndexInPaddingArea() { return false; } + private: + // mIndex may be saved and update, however, the value of some (or all) of its entries may + // never be used. Compiler should be able to remove these entries as well as its calculation + // as dead code. + // TODO: make sure compiler indeed remove these dead code + Index mIndex; index_t mOffset; }; -template +template struct TransformedTensorCoordinate { - using type = TransformedTensorCoordinate; using tensor_desc_type = TransformedTensorDesc; - using Index = tensor_desc_type::UpperIndex; + using LowerCoord = + typename TensorCoordinate_v2::type; + using UpperCoord = TransformedTensorCoordinate; + static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension(); + using UpperIndex = MultiIndex; - using lower_coordinate_type = - TensorCoordiante_v2::type; - - static constexpr index_t nDim = Index::GetSize(); - - __host__ __device__ constexpr TransformedTensorCoordinate(Index idx) - : mIndex{idx}, mCoordLow{GetTensorDescriptor().GetLowerIndex(idx)} + __host__ __device__ constexpr TransformedTensorCoordinate(UpperIndex idx) + : mIndexUp{idx}, mCoordLow{tensor_desc_type::CalculateLowerIndex(idx)} { } - template + template __host__ __device__ constexpr TransformedTensorCoordinate(Xs... 
xs) - : TransformedTensorCoordinate(Index{xs...}) + : TransformedTensorCoordinate(UpperIndex{xs...}) { } template __host__ __device__ constexpr TransformedTensorCoordinate(Sequence) - : TransformedTensorCoordinate(Index{Xs...}) + : TransformedTensorCoordinate(UpperIndex{Xs...}) { } __host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; } - __host__ __device__ constexpr index_t GetOffset() const { return mCoordLow.GetOffset(); } + __host__ __device__ constexpr const LowerCoord& GetLowerCoordinate() const { return mCoordLow; } - __host__ __device__ constexpr Index GetIndex() const { return mIndex; } + __host__ __device__ constexpr const UpperIndex& GetUpperIndex() const { return mIndexUp; } - __host__ __device__ type operator+=(Index idx_up_diff) + __host__ __device__ constexpr const UpperIndex& GetIndex() const { return GetUpperIndex(); } + + __host__ __device__ constexpr const index_t& GetOffset() const + { + return GetLowerCoordinate().GetOffset(); + } + + __host__ __device__ constexpr UpperCoord operator+=(const UpperIndex& idx_up_diff) { // For transformation of multi-index difference, not all transformation functions need to // know the old lower-index or the old upper-index. We pass both of them to the // transformation function. The transformation function itself decides to use them or not. - mCoordLow += - tensor_desc_type::GetLowerIndexDiff(idx_up_diff, mIndexUp, mCoordLow.GetIndex()); + mCoordLow += tensor_desc_type::CalculateLowerIndexDiff( + idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex()); // mIndexUp is updated here, but some (or all) of its entries may never be used mIndexUp += idx_up_diff; @@ -119,11 +142,35 @@ struct TransformedTensorCoordinate return *this; } - __host__ __device__ constexpr type operator+(Index idx_up_diff) const + __host__ __device__ constexpr UpperCoord operator-=(const UpperIndex& idx_up_diff) { - type coord = *this; - coord += idx_diff; - return coord; + mCoordLow -= tensor_desc_type::CalculateLowerIndexDiff( + idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex()); + + mIndexUp -= idx_up_diff; + + return *this; + } + + __host__ __device__ constexpr UpperCoord operator+(const UpperIndex& idx_up_diff) const + { + UpperCoord coord_up = *this; + coord_up += idx_up_diff; + return coord_up; + } + + __host__ __device__ constexpr UpperCoord operator-(const UpperIndex& idx_up_diff) const + { + UpperCoord coord_up = *this; + coord_up -= idx_up_diff; + return coord_up; + } + + // TODO: should this function be here? should it be specific for padding check? + __host__ __device__ constexpr bool IsAnyLevelIndexInPaddingArea() const + { + return tensor_desc_type::IsUpperIndexInPaddingArea(GetIndex()) || + mCoordLow.IsAnyLevelIndexInPaddingArea(); } private: @@ -131,22 +178,22 @@ struct TransformedTensorCoordinate // never be used. Compiler should be able to remove these entries as well as its calculation // as dead code. 
// TODO: make sure the compiler indeed removes this dead code
-    Index mIndexUp;
-    lower_coordinate_type mCoordLow;
+    UpperIndex mIndexUp;
+    LowerCoord mCoordLow;
 };

-template 
+template 
 struct TensorCoordinate_v2
 {
     private:
-    template 
+    template 
    __host__ __device__ static constexpr auto MakeDummyTensorCoordinate(NativeTensorDescriptor)
    {
        return NativeTensorCoordinate>();
    }

-    template 
+    template 
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(TransformedTensorDescriptor)
    {
@@ -156,5 +203,12 @@
    public:
    using type = decltype(MakeDummyTensorCoordinate(TensorDesc{}));
};
+
+template 
+__host__ __device__ constexpr auto
+make_tensor_coordinate_v2(TensorDesc, MultiIndex idx)
+{
+    return typename TensorCoordinate_v2::type(idx);
+}
}
#endif
diff --git a/composable_kernel/include/tensor_description/tensor_descriptor.hpp b/composable_kernel/include/tensor_description/tensor_descriptor.hpp
index 8ecc2cdbcf..7f65912e6d 100644
--- a/composable_kernel/include/tensor_description/tensor_descriptor.hpp
+++ b/composable_kernel/include/tensor_description/tensor_descriptor.hpp
@@ -64,6 +64,18 @@ struct NativeTensorDescriptor
        return GetStrides(typename arithmetic_sequence_gen<0, nDim, 1>::type{});
    }

+    __host__ __device__ static constexpr index_t GetElementSize()
+    {
+        return accumulate_on_sequence(GetLengths(), math::multiplies{}, Number<1>{});
+    }
+
+    __host__ __device__ static constexpr index_t GetElementSpace()
+    {
+        return accumulate_on_sequence(
+            (GetLengths() - Number<1>{}) * GetStrides(), math::plus{}, Number<1>{});
+    }
+
+    // TODO: this cannot be constexpr because of the use of a lambda
    __host__ __device__ static constexpr index_t CalculateOffset(const Index& idx)
    {
        index_t offset = 0;
@@ -73,6 +85,12 @@
        return offset;
    }

+    // TODO: remove this
+    __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(const Index& idx)
+    {
+        return CalculateOffset(idx);
+    }
+
    __host__ __device__ static constexpr index_t CalculateOffsetDiff(const Index& idx_diff)
    {
        index_t offset_diff = 0;
@@ -100,6 +118,12 @@
    {
        return Tuple<>{};
    }
+
+    // TODO: should this function be here? should it be specific for padding check?
+    __host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const Index& /* idx */)
+    {
+        return false;
+    }
};

// LowerTensorDescriptor
@@ -248,6 +272,17 @@ struct TransformedTensorDescriptor
        return GetLengths(Sequence{});
    }

+    __host__ __device__ static constexpr index_t GetElementSize()
+    {
+        return accumulate_on_sequence(GetLengths(), math::multiplies{}, Number<1>{});
+    }
+
+    __host__ __device__ static constexpr index_t GetElementSpace()
+    {
+        // TODO: is this the correct definition for a transformed tensor?
+        return GetLowerTensorDescriptor().GetElementSpace();
+    }
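A quick standalone check of the element-space formula introduced above, 1 + sum((length[i] - 1) * stride[i]); the transformed descriptor simply reuses the lower tensor's space because padding is a virtual view and allocates nothing. The sizes below are made up:

#include <cstdio>

// Illustrative sketch of NativeTensorDescriptor::GetElementSpace(): distance
// from the first element to one past the last addressable element.
int element_space(const int* lengths, const int* strides, int ndim)
{
    int space = 1;
    for(int i = 0; i < ndim; ++i)
        space += (lengths[i] - 1) * strides[i];
    return space;
}

int main()
{
    // e.g. a 4x3 row-major tensor: 1 + 3*3 + 2*1 = 12 elements
    const int lengths[] = {4, 3};
    const int strides[] = {3, 1};
    std::printf("%d\n", element_space(lengths, strides, 2));
}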
+
    // TODO: right now the return value is not constexpr because of the use of a non-constexpr lambda
    __host__ __device__ static constexpr LowerIndex CalculateLowerIndex(const UpperIndex& idx_up)
    {
@@ -256,8 +291,8 @@
        static_for<0, nTransform, 1>{}([&](auto itran) {
            constexpr auto tran = Transforms{}.At(itran);

-            auto idx_low_part = pick_array_element(idx_low, LowDimensionIds{}.At(itran));
            const auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran));
+            auto idx_low_part = pick_array_element(idx_low, LowDimensionIds{}.At(itran));

            // this assumes each lower (single) index is only associated with one transformation,
            // which is required for index transformation, and has been checked during constructor
@@ -269,26 +304,29 @@
    }

    // TODO: right now the return value is not constexpr because of the use of a non-constexpr lambda
-    __host__ __device__ static constexpr LowerIndex
-    CalculateLowerIndexDiff(const UpperIndex& idx_up_diff, const LowerIndex& idx_low_old)
+    __host__ __device__ static constexpr LowerIndex CalculateLowerIndexDiff(
+        const UpperIndex& idx_up_diff, const UpperIndex& idx_up_old, const LowerIndex& idx_low_old)
    {
        LowerIndex idx_low_diff;

        static_for<0, nTransform, 1>{}([&](auto itran) {
-            constexpr auto tran = Transforms::At(itran);
+            constexpr auto tran = Transforms{}.At(itran);

            const auto idx_up_diff_part =
-                pick_array_element(idx_up_diff, UpDimensionIds::At(itran));
+                pick_array_element(idx_up_diff, UpDimensionIds{}.At(itran));

-            auto idx_low_diff_part = pick_array_element(idx_low_diff, LowDimensionIds::At(itran));
+            const auto idx_up_old_part = pick_array_element(idx_up_old, UpDimensionIds{}.At(itran));

            const auto idx_low_old_part =
-                pick_array_element(idx_low_old, LowDimensionIds::At(itran));
+                pick_array_element(idx_low_old, LowDimensionIds{}.At(itran));
+
+            auto idx_low_diff_part = pick_array_element(idx_low_diff, LowDimensionIds{}.At(itran));

            // this assumes each lower (single) index is associated with only one transformation,
            // which is required for index transformation, and has been checked during constructor
            // of TransformedTensorDescriptor
-            idx_low_diff_part = tran.CalculateLowerIndex(idx_up_diff_part, idx_low_old_part);
+            idx_low_diff_part = tran.CalculateLowerIndexDiff(
+                to_array(idx_up_diff_part), to_array(idx_up_old_part), to_array(idx_low_old_part));
        });

        return idx_low_diff;
@@ -299,6 +337,12 @@
        return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
    }

+    // TODO: remove this
+    __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(const UpperIndex& idx_up)
+    {
+        return CalculateOffset(idx_up);
+    }
+
#if 0
    template 
    __host__ __device__ static constexpr bool IsLinearDimension(Number)
    {
@@ -321,6 +365,22 @@
        // not implemented
    }
#endif
+
+    // TODO: should this function be here? should it be specific for padding check?
+ __host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const UpperIndex& idx_up) + { + bool flag = false; + + static_for<0, nTransform, 1>{}([&](auto itran) { + constexpr auto tran = Transforms{}.At(itran); + + const auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran)); + + flag = flag || tran.IsUpperIndexInPaddingArea(to_array(idx_up_part)); + }); + + return flag; + } }; template @@ -337,7 +397,7 @@ __host__ __device__ constexpr auto make_native_tensor_descriptor_packed(Lengths) Lengths::PopFront(), math::multiplies{}, Number<1>{}) .PushBack(Number<1>{}); - return make_NativeTensorDescriptor(Lengths{}, strides); + return make_native_tensor_descriptor(Lengths{}, strides); } template ; - using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v3; + using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v3r1; - using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v3; + using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v3r1; data_type mpBuffer[ThreadBufferDesc::GetElementSpace()]; @@ -667,6 +671,125 @@ struct BlockwiseGenericTensorSliceCopy_v3 ThreadwiseStore mThreadwiseStore; }; +template +struct BlockwiseGenericTensorSliceCopy_v4 +{ + static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); + + using SrcCoord = typename TensorCoordinate_v2::type; + using DstCoord = typename TensorCoordinate_v2::type; + + __device__ constexpr BlockwiseGenericTensorSliceCopy_v4(SrcCoord src_block_slice_origin, + DstCoord dst_block_slice_origin) + { + static_assert(nDim == SrcDesc::GetNumOfDimension() && + nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::Size() && + nDim == SubLengths::Size() && nDim == ThreadClusterLengths::Size() && + nDim == ThreadClusterArrangeOrder::Size() && + nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(), + "wrong! nDim not consistent"); + + static_assert(is_same{}, + "wrong! threads should be mapped to cover entire slicing window"); + + constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed( + ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{})); + + static_assert(BlockSize == thread_cluster_desc.GetElementSize(), + "wrong! 
BlockSize not consistent with ThreadClusterLengths"); + + const auto thread_cluster_id = + thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id()); + + const auto data_cluster_id = + reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{}); + + const auto thread_data_id_begin = data_cluster_id * SubLengths{}; + + mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin); + mThreadwiseLoad.SetDstSliceOrigin(make_zero_array()); + + mThreadwiseStore.SetSrcSliceOrigin(make_zero_array()); + mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin); + } + + __device__ static constexpr index_t GetRegisterBufferSize() + { + return RegisterBufferDesc::GetElementSpace(); + } + + template + __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const + { + mThreadwiseLoad.Run(p_src, p_buffer); + } + + template + __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const + { + mThreadwiseStore.Run(p_buffer, p_dst); + } + + template + __device__ void Run(const TData* p_src, TData* p_dst) const + { + TData p_buffer[GetRegisterBufferSize()]; + + mThreadwiseLoad.Run(p_src, p_buffer); + mThreadwiseStore.Run(p_buffer, p_dst); + } + + template + __device__ void + MoveSrcSlicingWindow(T step_sizes, + integral_constant positive_direction) + { + mThreadwiseLoad.MoveSrcSlicingWindow(step_sizes, positive_direction); + } + + template + __device__ void + MoveDstSlicingWindow(T step_sizes, + integral_constant positive_direction) + { + mThreadwiseStore.MoveDstSlicingWindow(step_sizes, positive_direction); + } + + private: + using RegisterBufferDesc = decltype(make_native_tensor_descriptor_packed(SubLengths{})); + + using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v4r2; + + using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2; + + ThreadwiseLoad mThreadwiseLoad; + ThreadwiseStore mThreadwiseStore; +}; + } // namespace ck #endif diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index c0928c2bd3..3fba453a36 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -6,6 +6,8 @@ #include "ConstantMergedTensorDescriptor.hpp" #include "tensor_coordinate.hpp" #include "tensor_view.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_coordinate_v2.hpp" #ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0 @@ -427,6 +429,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2 Array mDstSliceOrigin; }; +// This version use TensorCoordinate // This threadwise copy allow vector access of src and dst. // It allows the dimensions of vector access to be different on src and dst. // It also allows the vector size to be different on src and dst. 
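The comment block above states the vectorized-copy contract: both sides may use vector access with different widths. The copies implement this by staging each slice through a register buffer in units of lcm(SrcDataPerAccess, DstDataPerAccess) scalars. A simplified, standalone 1-d sketch of that scheme; all names and sizes here are illustrative, and the scalar loops stand in for the reinterpret_cast vector moves in the real code:

#include <cstdio>
#include <numeric>

constexpr int SrcDataPerAccess = 2;
constexpr int DstDataPerAccess = 4;

// Copy n scalars, loading src-vector-wise and storing dst-vector-wise through
// a "long vector" register buffer of lcm(SrcDataPerAccess, DstDataPerAccess).
void copy_1d(const float* src, float* dst, int n)
{
    const int long_vector = std::lcm(SrcDataPerAccess, DstDataPerAccess); // 4
    for(int base = 0; base < n; base += long_vector)
    {
        float buffer[8]; // >= long_vector
        for(int i = 0; i < long_vector; i += SrcDataPerAccess)
            for(int j = 0; j < SrcDataPerAccess; ++j) // stands in for a vector load
                buffer[i + j] = src[base + i + j];
        for(int i = 0; i < long_vector; i += DstDataPerAccess)
            for(int j = 0; j < DstDataPerAccess; ++j) // stands in for a vector store
                dst[base + i + j] = buffer[i + j];
    }
}

int main()
{
    float src[8] = {0, 1, 2, 3, 4, 5, 6, 7}, dst[8] = {};
    copy_1d(src, dst, 8);
    std::printf("%.1f %.1f\n", dst[0], dst[7]); // 0.0 7.0
}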
@@ -774,6 +777,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 DstCoordinate mDstSliceOrigin; }; +// this version use TensorView and TensorCoordinate template -struct ThreadwiseGenericTensorSliceCopy_v3 +struct ThreadwiseGenericTensorSliceCopy_v3r1 { static constexpr index_t nDim = SrcTensor::GetNumOfDimension(); using data_type = remove_cv_t; @@ -791,10 +795,10 @@ struct ThreadwiseGenericTensorSliceCopy_v3 using SrcCoordinate = typename SrcTensor::coordinate_type; using DstCoordinate = typename DstTensor::coordinate_type; - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3(SrcTensor src, - SrcCoordinate src_slice_origin, - DstTensor dst, - DstCoordinate dst_slice_origin) + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3r1(SrcTensor src, + SrcCoordinate src_slice_origin, + DstTensor dst, + DstCoordinate dst_slice_origin) : mSrc{src}, mDst{dst}, mSrcSlice{src.Slice(src_slice_origin, SliceLengths{})}, @@ -821,8 +825,8 @@ struct ThreadwiseGenericTensorSliceCopy_v3 "wrong! vectorized access is not allowed"); } - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3() - : ThreadwiseGenericTensorSliceCopy_v3( + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3r1() + : ThreadwiseGenericTensorSliceCopy_v3r1( SrcTensor{}, SrcCoordinate{}, DstTensor{}, DstCoordinate{}) { } @@ -940,5 +944,154 @@ struct ThreadwiseGenericTensorSliceCopy_v3 DstSlice mDstSlice; }; +// This version use multi-index transformation +// This threadwise copy allow vector access of src and dst. +// It allows the vector size to be different on src and dst. +// The dimensions of vector access should be the same on src and dst. +// The dimension access order should be the same on src and dst. +// It is designed for cases, where one of src and dst is register, and +// the other is device memory or LDS +template +struct ThreadwiseGenericTensorSliceCopy_v4r2 +{ + static constexpr index_t nDim = SliceLengths::Size(); + using Index = MultiIndex; + + using SrcCoord = typename TensorCoordinate_v2::type; + using DstCoord = typename TensorCoordinate_v2::type; + + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2(SrcCoord src_slice_origin, + DstCoord dst_slice_origin) + : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) + { + static_assert(nDim == SrcDesc::GetNumOfDimension() && + nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::Size() && + nDim == DimAccessOrder::Size(), + "wrong! # of dimensions not the same"); + + static_assert(is_valid_sequence_map{}, "wrong! map is not valid"); + + static_assert( + SliceLengths{}[VectorAccessDim] % math::lcm(SrcDataPerAccess, DstDataPerAccess) == 0, + "wrong! 
cannot evenly divide"); + + // TODO:: sanity-check if vectorized memory access is allowed on src and dst + } + + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2() + : ThreadwiseGenericTensorSliceCopy_v4r2(make_zero_array(), + make_zero_array()) + { + } + + __device__ void SetSrcSliceOrigin(SrcCoord src_slice_origin) + { + mSrcSliceOrigin = src_slice_origin; + } + + __device__ void SetDstSliceOrigin(DstCoord dst_slice_origin) + { + mDstSliceOrigin = dst_slice_origin; + } + + template + __device__ void Run(const TData* p_src, TData* p_dst) const + { + using src_vector_t = typename vector_type::MemoryType; + using dst_vector_t = typename vector_type::MemoryType; + + constexpr auto vector_access_dim = Number{}; + + constexpr auto src_data_per_access = Number{}; + constexpr auto dst_data_per_access = Number{}; + + constexpr auto long_vector_size = Number{}; + + constexpr auto long_vector_access_lengths = SliceLengths::Modify( + vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size); + + ford{}([&]( + auto long_vector_access_id) { + + // data id w.r.t slicing-window + auto long_vector_data_begin_id = long_vector_access_id; + long_vector_data_begin_id(vector_access_dim) = + long_vector_size * long_vector_access_id[vector_access_dim]; + + // buffer to hold a long-vector + TData p_long_vector[long_vector_size]; + + // set 0 + for(index_t i = 0; i < long_vector_size; ++i) + { + p_long_vector[i] = 0; + } + + // load data from src to the long-vector buffer + for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i) + { + auto scalar_id = make_zero_array(); + scalar_id(vector_access_dim) = i * src_data_per_access; + + const auto src_coord = mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id); + + // check for padding + // TODO: still kind of messy + if(!src_coord.IsAnyLevelIndexInPaddingArea()) + { + const index_t src_offset = + (mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id)).GetOffset(); + + const index_t buffer_offset = i * src_data_per_access; + + *reinterpret_cast(&p_long_vector[buffer_offset]) = + *reinterpret_cast(&p_src[src_offset]); + } + } + + // store data from the long-vector buffer to dst + for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i) + { + auto scalar_id = make_zero_array(); + scalar_id(vector_access_dim) = i * dst_data_per_access; + + const index_t buffer_offset = i * dst_data_per_access; + + const index_t dst_offset = + (mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)).GetOffset(); + + *reinterpret_cast(&p_dst[dst_offset]) = + *reinterpret_cast(&p_long_vector[buffer_offset]); + } + }); + } + + template + __device__ void MoveSrcSlicingWindow(T step_sizes, integral_constant) + { + static_if{}([&](auto) { + mSrcSliceOrigin += step_sizes; + }).Else([&](auto) { mSrcSliceOrigin -= step_sizes; }); + } + + template + __device__ void MoveDstSlicingWindow(T step_sizes, integral_constant) + { + static_if{}([&](auto) { + mDstSliceOrigin += step_sizes; + }).Else([&](auto) { mDstSliceOrigin -= step_sizes; }); + } + + private: + SrcCoord mSrcSliceOrigin; + DstCoord mDstSliceOrigin; +}; + } // namespace ck #endif diff --git a/composable_kernel/include/utility/array.hpp b/composable_kernel/include/utility/array.hpp index 52e92da3f1..7b0c25d41b 100644 --- a/composable_kernel/include/utility/array.hpp +++ b/composable_kernel/include/utility/array.hpp @@ -23,20 +23,9 @@ struct Array static_assert(sizeof...(Xs) + 1 == NSize, "wrong! 
size"); } -#if 0 - template - __host__ __device__ explicit constexpr Array(const T& x) - { - static_assert(T::Size() == NSize, "wrong! size"); - - static_for<0, NSize, 1>{}([&](auto i){ - mData[i] = x.At(i); - }) - } -#endif - __host__ __device__ static constexpr index_t Size() { return NSize; } + // TODO: remove __host__ __device__ static constexpr index_t GetSize() { return Size(); } template @@ -265,8 +254,8 @@ __host__ __device__ constexpr auto extract_array(const Array& old_ return new_array; } -template // emulate constepxr lambda for array -// math +// emulate constepxr lambda for array +template struct lambda_array_math { const F& f; diff --git a/composable_kernel/include/utility/array_helper.hpp b/composable_kernel/include/utility/array_helper.hpp index a3536309fa..7608a195a3 100644 --- a/composable_kernel/include/utility/array_helper.hpp +++ b/composable_kernel/include/utility/array_helper.hpp @@ -5,8 +5,8 @@ namespace ck { -template -__host__ __device__ void print_array(const char* s, Array a) +template +__host__ __device__ void print_array(const char* s, Array a) { constexpr index_t nsize = a.GetSize(); @@ -89,5 +89,89 @@ __host__ __device__ void print_array(const char* s, Array a) }); } +template +__host__ __device__ void print_array(const char* s, Array a) +{ + constexpr index_t nsize = a.GetSize(); + + static_assert(nsize > 0 && nsize <= 10, "wrong!"); + + static_if{}([&](auto) { printf("%s size %d, {%d}\n", s, nsize, a[0]); }); + + static_if{}([&](auto) { printf("%s size %d, {%d %d}\n", s, nsize, a[0], a[1]); }); + + static_if{}( + [&](auto) { printf("%s size %d, {%d %d %d}\n", s, nsize, a[0], a[1], a[2]); }); + + static_if{}( + [&](auto) { printf("%s size %d, {%d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3]); }); + + static_if{}([&](auto) { + printf("%s size %d, {%d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]); + }); + + static_if{}([&](auto) { + printf("%s size %d, {%d %d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]); + }); + + static_if{}([&](auto) { + printf("%s size %d, {%d %d %d %d %d %d %d}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6]); + }); + + static_if{}([&](auto) { + printf("%s size %d, {%d %d %d %d %d %d %d %d}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6], + a[7]); + }); + + static_if{}([&](auto) { + printf("%s size %d, {%d %d %d %d %d %d %d %d %d}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6], + a[7], + a[8]); + }); + + static_if{}([&](auto) { + printf("%s size %d, {%d %d %d %d %d %d %d %d %d %d}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6], + a[7], + a[8], + a[9]); + }); +} + } // namespace ck #endif diff --git a/composable_kernel/include/utility/config_nvidia.hpp.in b/composable_kernel/include/utility/config_nvidia.hpp.in index 0a4b43d1a6..05be2489bb 100644 --- a/composable_kernel/include/utility/config_nvidia.hpp.in +++ b/composable_kernel/include/utility/config_nvidia.hpp.in @@ -15,6 +15,15 @@ namespace ck { +using unsigned_t = uint32_t; +using signed_t = int; + +#if 0 // debug +using index_t = unsigned_t; +#else +using index_t = signed_t; +#endif + // For some reason, CUDA need this definition, otherwise // compiler won't generate optimal load and store instruction, and // kernel would produce wrong result, indicating the compiler fail to generate correct @@ -22,8 +31,6 @@ namespace ck { using float2_t = float2; using float4_t = float4; -using index_t = uint32_t; - template __device__ void 
fused_multiply_accumulate(T& d, const T& s0, const T& s1) { diff --git a/composable_kernel/include/utility/sequence.hpp b/composable_kernel/include/utility/sequence.hpp index 8a9fff5979..37754cca20 100644 --- a/composable_kernel/include/utility/sequence.hpp +++ b/composable_kernel/include/utility/sequence.hpp @@ -537,11 +537,9 @@ struct sequence_unique_sort }; template -struct is_valid_sequence_map +struct is_valid_sequence_map : is_same::type, + typename sequence_sort>::type> { - static constexpr bool value = - is_same::type, - typename sequence_sort>::type>{}; }; template diff --git a/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp b/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp index 857924db25..c66f5f2bfc 100644 --- a/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp +++ b/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp @@ -115,7 +115,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded(InDesc, constexpr index_t OutThreadCopyDataPerAccess_N = 4; #endif -#if 0 // debug +#if 1 // debug constexpr index_t GridSize = (N / NPerBlock) * (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock); #else diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index b20fc26f78..00ce2079f6 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -73,25 +73,10 @@ int main(int argc, char* argv[]) using namespace ck; #if 1 - constexpr index_t N = 10; - constexpr index_t C = 10; - constexpr index_t HI = 10; - constexpr index_t WI = 10; - constexpr index_t K = 10; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - constexpr index_t HPad = 3; - constexpr index_t WPad = 3; -#elif 1 - // 3x3, 34x34 - constexpr index_t N = 64; - constexpr index_t C = 256; - constexpr index_t HI = 34; - constexpr index_t WI = 34; + constexpr index_t N = 32; + constexpr index_t C = 8; + constexpr index_t HI = 2; + constexpr index_t WI = 2; constexpr index_t K = 128; constexpr index_t Y = 3; constexpr index_t X = 3; @@ -99,8 +84,23 @@ int main(int argc, char* argv[]) using ConvStrides = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>; - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; + constexpr index_t HPad = 1; + constexpr index_t WPad = 1; +#elif 1 + // 3x3, 34x34 + constexpr index_t N = 64; + constexpr index_t C = 256; + constexpr index_t HI = 32; + constexpr index_t WI = 32; + constexpr index_t K = 128; + constexpr index_t Y = 3; + constexpr index_t X = 3; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + constexpr index_t HPad = 1; + constexpr index_t WPad = 1; #elif 0 // 1x1 filter, 8x8 image // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42% @@ -434,7 +434,7 @@ int main(int argc, char* argv[]) if(do_verification) { -#if 1 +#if 0 if(Y == 3 && X == 3 && ConvStrides{}[0] == 1 && ConvStrides{}[1] == 1 && ConvDilations{}[0] == 1 && ConvDilations{}[1] == 1) {
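For reference, both driver configurations above keep the spatial size unchanged (3x3 filter, pad 1, stride 1, dilation 1). A quick host-side check using the standard convolution output-length formula; the helper below is illustrative and not part of the driver:

#include <cstdio>

// Output length of a 1-d convolution; the same formula applies to H and W.
int out_len(int in_len, int pad, int filter, int stride, int dilation)
{
    return (in_len + 2 * pad - dilation * (filter - 1) - 1) / stride + 1;
}

int main()
{
    std::printf("Hi=2:  Ho = %d\n", out_len(2, 1, 3, 1, 1));  // 2, first test case
    std::printf("Hi=32: Ho = %d\n", out_len(32, 1, 3, 1, 1)); // 32, the "3x3, 34x34" case
    return 0;
}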