From 545d9305687c083717274171fdb22c74a5b5615e Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 24 Sep 2019 18:06:05 -0500 Subject: [PATCH] refactor --- ...tion_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp | 2 +- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 2 +- ...plicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp | 2 +- ...chw_kcyx_nkhw_padded_lds_double_buffer.hpp | 40 ++++++----- ...tion_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp | 2 +- ..._v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp | 2 +- ...plicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp | 2 +- ...chw_kcyx_nkhw_padded_lds_double_buffer.hpp | 66 ++++++++++++----- .../ConstantTensorDescriptor.hpp | 11 ++- .../multi_index_transform.hpp | 70 ++++++++++--------- .../tensor_coordinate_helper.hpp | 16 +++++ .../tensor_coordinate_v2.hpp | 19 ++--- .../tensor_description/tensor_descriptor.hpp | 33 ++++++--- .../blockwise_3d_tensor_op.hpp | 2 +- .../blockwise_4d_tensor_op.hpp | 2 +- .../blockwise_generic_tensor_slice_copy.hpp | 52 ++++++-------- .../threadwise_generic_tensor_slice_copy.hpp | 66 +++++++---------- composable_kernel/include/utility/array.hpp | 3 +- .../include/utility/functional.hpp | 6 ++ .../include/utility/sequence.hpp | 36 ++++++++-- ...plicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp | 48 ++++++++++++- ...plicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp | 2 +- driver/src/driver.cpp | 4 +- 23 files changed, 301 insertions(+), 187 deletions(-) create mode 100644 composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp index 9ad3f8148e..379ca76970 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp @@ -100,7 +100,7 @@ struct 
GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw constexpr index_t E = C * Y * X; // sanity-check for vectorized memory load - static_assert((Ho == 1 || ConvStrideW % InBlockCopySrcDataPerRead_B == 0) && + static_assert((Wo == 1 || (ConvStrideW == 1 || InBlockCopySrcDataPerRead_B == 1)) && (X == 1 || ConvDilationW % InBlockCopySrcDataPerRead_B == 0), "wrong! aligment requirement for vectorized global load of input tensor will " "be violated"); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 7e5c727d15..42931a6ae8 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -100,7 +100,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer constexpr index_t E = C * Y * X; // sanity-check for vectorized memory load - static_assert((Ho == 1 || ConvStrideW % InBlockCopySrcDataPerRead_B == 0) && + static_assert((Wo == 1 || (ConvStrideW == 1 || InBlockCopySrcDataPerRead_B == 1)) && (X == 1 || ConvDilationW % InBlockCopySrcDataPerRead_B == 0), "wrong! 
aligment requirement for vectorized global load of input tensor will " "be violated"); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp index 6f92b30617..a124a67e2e 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp @@ -107,7 +107,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded constexpr index_t E = C * Y * X; // sanity-check for vectorized memory load - static_assert((Ho == 1 || ConvStrideW % InBlockCopySrcDataPerRead_B == 0) && + static_assert((Wo == 1 || (ConvStrideW == 1 || InBlockCopySrcDataPerRead_B == 1)) && (X == 1 || ConvDilationW % InBlockCopySrcDataPerRead_B == 0), "wrong! aligment requirement for vectorized global load of input tensor will " "be violated"); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index be747eaca3..4d941b5053 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -107,7 +107,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf constexpr index_t E = C * Y * X; // sanity-check for vectorized memory load - static_assert((Ho == 1 || ConvStrideW % InBlockCopySrcDataPerRead_B == 0) && + static_assert((Wo == 1 || (ConvStrideW == 1 || InBlockCopySrcDataPerRead_B == 1)) && (X == 1 || 
ConvDilationW % InBlockCopySrcDataPerRead_B == 0), "wrong! aligment requirement for vectorized global load of input tensor will " "be violated"); @@ -174,9 +174,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf decltype(in_e_n1_b_n2_global_desc), decltype(in_e_n1_b_n2_block_desc), Sequence<0, 1, 0, 1>, - Sequence<1, 0, 1, 0>, Sequence<1, 1, 1, 1>, - Sequence<0, 0, 0, 0>, decltype(in_e_n1_b_n2_block_desc.GetLengths()), InBlockCopySubLengths_E_N1_B_N2, InBlockCopyClusterLengths_E_N1_B_N2, @@ -219,9 +217,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf decltype(wei_e_k_global_desc), decltype(wei_e_k_block_desc), Sequence<1, 1>, - Sequence<0, 0>, Sequence<1, 1>, - Sequence<0, 0>, decltype(wei_e_k_block_desc.GetLengths()), WeiBlockCopySubLengths_E_K, WeiBlockCopyClusterLengths_E_K, @@ -299,8 +295,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf // LDS double buffer: preload data into LDS { - blockwise_in_copy.Run(p_in_global, p_in_block_double); - blockwise_wei_copy.Run(p_wei_global, p_wei_block_double); + blockwise_in_copy.template Run( + p_in_global, p_in_block_double); + blockwise_wei_copy.template Run( + p_wei_global, p_wei_block_double); } // LDS double buffer: main body @@ -331,15 +329,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer); - blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer); + blockwise_in_copy.template RunLoadRegisterBuffer( + p_in_global, p_in_register_buffer); + blockwise_wei_copy.template RunLoadRegisterBuffer( + p_wei_global, p_wei_register_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); // LDS double buffer: store next data to LDS - 
blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_next); - blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_next); + blockwise_in_copy.template RunStoreRegisterBuffer( + p_in_register_buffer, p_in_block_next); + blockwise_wei_copy.template RunStoreRegisterBuffer( + p_wei_register_buffer, p_wei_block_next); } } @@ -355,17 +357,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer); - blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer); + blockwise_in_copy.template RunLoadRegisterBuffer( + p_in_global, p_in_register_buffer); + blockwise_wei_copy.template RunLoadRegisterBuffer( + p_wei_global, p_wei_register_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); // LDS double buffer: store next data to LDS - blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, - p_in_block_double + in_block_space); - blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, - p_wei_block_double + wei_block_space); + blockwise_in_copy.template RunStoreRegisterBuffer( + p_in_register_buffer, p_in_block_double + in_block_space); + blockwise_wei_copy.template RunStoreRegisterBuffer( + p_wei_register_buffer, p_wei_block_double + wei_block_space); // odd iteration __syncthreads(); @@ -424,9 +428,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf ThreadwiseGenericTensorSliceCopy_v4r2, - Sequence<0, 0, 0, 0, 0>, Sequence<1, 1, 1, 0, 1>, - Sequence<0, 0, 0, 1, 0>, decltype( out_k0_k1_n1_b_n2_thread_desc.GetLengths()), arithmetic_sequence_gen<0, 5, 1>::type, diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp 
b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp index 1ec519aad6..a3ecf64941 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp @@ -84,7 +84,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw constexpr index_t B = N * Ho * Wo; // sanity-check for vectorized memory load - static_assert((Ho == 1 || ConvStrideW % InBlockCopyDataPerAccess_B == 0) && + static_assert((Wo == 1 || (ConvStrideW == 1 || InBlockCopyDataPerAccess_B == 1)) && (X == 1 || ConvDilationW % InBlockCopyDataPerAccess_B == 0), "wrong! aligment requirement for vectorized global load of input tensor will " "be violated"); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp index ad411be6db..1bc814c009 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -84,7 +84,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer constexpr index_t B = N * Ho * Wo; // sanity-check for vectorized memory load - static_assert((Ho == 1 || ConvStrideW % InBlockCopyDataPerAccess_B == 0) && + static_assert((Wo == 1 || (ConvStrideW == 1 || InBlockCopyDataPerAccess_B == 1)) && (X == 1 || ConvDilationW % InBlockCopyDataPerAccess_B == 0), "wrong! 
aligment requirement for vectorized global load of input tensor will " "be violated"); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp index f3976d43bc..fbe387e788 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp @@ -91,7 +91,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded constexpr index_t B = N * Ho * Wo; // sanity-check for vectorized memory load - static_assert((Ho == 1 || ConvStrideW % InBlockCopyDataPerAccess_B == 0) && + static_assert((Wo == 1 || (ConvStrideW == 1 || InBlockCopyDataPerAccess_B == 1)) && (X == 1 || ConvDilationW % InBlockCopyDataPerAccess_B == 0), "wrong! aligment requirement for vectorized global load of input tensor will " "be violated"); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index 2172751e6e..0305a87924 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -90,7 +90,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf constexpr index_t B = N * Ho * Wo; // sanity-check for vectorized memory load - static_assert((Ho == 1 || ConvStrideW % InBlockCopyDataPerAccess_B == 0) && + static_assert((Wo == 1 || (ConvStrideW == 1 || InBlockCopyDataPerAccess_B == 1)) && (X == 1 || 
ConvDilationW % InBlockCopyDataPerAccess_B == 0), "wrong! aligment requirement for vectorized global load of input tensor will " "be violated"); @@ -145,6 +145,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf BlockwiseGenericTensorSliceCopy_v4, + Sequence<1, 1>, decltype(in_e_b_block_desc.GetLengths()), InBlockCopySubLengths_E_B, InBlockCopyClusterLengths_E_B, @@ -157,13 +159,21 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf InBlockCopyDataPerAccess_B>( {0, b_block_data_on_global}, {0, 0}); - // weight tensor - // global mem +// weight tensor +// global mem +#if 0 constexpr auto wei_e_k_global_desc = transform_tensor_descriptor(wei_k_c_y_x_global_desc, make_tuple(Merge>{}, PassThrough{}), make_tuple(Sequence<1, 2, 3>{}, Sequence<0>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); +#else // hack + constexpr auto wei_e_k_global_desc_old = + WeiGlobalDesc::Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{}); + + constexpr auto wei_e_k_global_desc = make_native_tensor_descriptor( + wei_e_k_global_desc_old.GetLengths(), wei_e_k_global_desc_old.GetStrides()); +#endif // LDS // be careful of LDS alignment @@ -176,6 +186,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf BlockwiseGenericTensorSliceCopy_v4, + Sequence<1, 1>, decltype(wei_e_k_block_desc.GetLengths()), WeiBlockCopySubLengths_E_K, WeiBlockCopyClusterLengths_E_K, @@ -253,8 +265,10 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf // LDS double buffer: preload data into LDS { - blockwise_in_copy.Run(p_in_global, p_in_block_double); - blockwise_wei_copy.Run(p_wei_global, p_wei_block_double); + blockwise_in_copy.template Run( + p_in_global, p_in_block_double); + blockwise_wei_copy.template Run( + p_wei_global, p_wei_block_double); } // LDS double buffer: main body @@ -285,15 +299,19 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf 
__syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer); - blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer); + blockwise_in_copy.template RunLoadRegisterBuffer( + p_in_global, p_in_register_buffer); + blockwise_wei_copy.template RunLoadRegisterBuffer( + p_wei_global, p_wei_register_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); // LDS double buffer: store next data to LDS - blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_next); - blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_next); + blockwise_in_copy.template RunStoreRegisterBuffer( + p_in_register_buffer, p_in_block_next); + blockwise_wei_copy.template RunStoreRegisterBuffer( + p_wei_register_buffer, p_wei_block_next); } } @@ -309,17 +327,19 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf __syncthreads(); // LDS doubel buffer: load next data from device mem - blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer); - blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer); + blockwise_in_copy.template RunLoadRegisterBuffer( + p_in_global, p_in_register_buffer); + blockwise_wei_copy.template RunLoadRegisterBuffer( + p_wei_global, p_wei_register_buffer); // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); // LDS double buffer: store next data to LDS - blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, - p_in_block_double + in_block_space); - blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, - p_wei_block_double + wei_block_space); + blockwise_in_copy.template RunStoreRegisterBuffer( + p_in_register_buffer, p_in_block_double + in_block_space); + blockwise_wei_copy.template RunStoreRegisterBuffer( + 
p_wei_register_buffer, p_wei_block_double + wei_block_space); // odd iteration __syncthreads(); @@ -367,9 +387,11 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); // output threadwise copy - auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v4r2< + ThreadwiseGenericTensorSliceCopy_v4r2< decltype(out_k0_k1_b0_b1_thread_desc), decltype(out_k0_k1_b0_b1_global_desc), + Sequence<1, 1, 1, 1>, + Sequence<1, 1, 0, 0>, decltype(out_k0_k1_b0_b1_thread_desc.GetLengths()), arithmetic_sequence_gen<0, 4, 1>::type, 3, @@ -378,9 +400,15 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf {k_thread_data_on_global / K1, k_thread_data_on_global % K1, b_thread_data_on_global / B1, - b_thread_data_on_global % B1}); - - threadwise_out_copy.Run(p_out_thread, p_out_global); + b_thread_data_on_global % B1}) +#if 1 + .template Run_generic +#elif 1 + .template Run_optimized_dst_address_calculation +#endif + (p_out_thread, p_out_global); } } }; diff --git a/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp index 65a7b219f6..6dfbe5f794 100644 --- a/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp +++ b/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp @@ -96,13 +96,12 @@ struct ConstantTensorDescriptor __host__ __device__ static constexpr auto GetElementSize() { - return Number{}, Number<1>{})>{}; + return Number{}, Number<1>{})>{}; } __host__ __device__ static constexpr auto GetElementSpace() { - constexpr index_t element_space_unaligned = accumulate_on_sequence( + constexpr index_t element_space_unaligned = reduce_on_sequence( (GetLengths() - Number<1>{}) * GetStrides(), math::plus{}, Number<1>{}); return Number{}; @@ -155,7 +154,7 @@ struct ConstantTensorDescriptor constexpr auto multi_id = Sequence{}; - return Number{}, 
Number<0>{})>{}; } @@ -389,7 +388,7 @@ struct ConstantTensorDescriptor constexpr auto fold_intervals = Sequence{}; constexpr index_t fold_intervals_product = - accumulate_on_sequence(fold_intervals, math::multiplies{}, Number<1>{}); + reduce_on_sequence(fold_intervals, math::multiplies{}, Number<1>{}); constexpr auto unfold_length = GetLength(Number{}); constexpr auto unfold_stride = GetStride(Number{}); @@ -447,7 +446,7 @@ struct ConstantTensorDescriptor static_assert(Type::Extract(middle).AreDimensionsContinuous(), "wrong! not unfoldable"); // unfolded length, stride - constexpr index_t unfold_length = accumulate_on_sequence( + constexpr index_t unfold_length = reduce_on_sequence( GetLengths().Extract(middle), math::multiplies{}, Number<1>{}); constexpr index_t unfold_stride = GetStride(Number{}); diff --git a/composable_kernel/include/tensor_description/multi_index_transform.hpp b/composable_kernel/include/tensor_description/multi_index_transform.hpp index 47f2d97089..6acbe55233 100644 --- a/composable_kernel/include/tensor_description/multi_index_transform.hpp +++ b/composable_kernel/include/tensor_description/multi_index_transform.hpp @@ -41,11 +41,10 @@ struct PassThrough __host__ __device__ static constexpr bool IsLinearTransform() { return true; } - // TODO: should this function be here? should it be specific for padding check? __host__ __device__ static constexpr bool - IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */) + IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */) { - return false; + return true; } }; @@ -82,23 +81,38 @@ struct Pad __host__ __device__ static constexpr bool IsLinearTransform() { return true; } - // TODO: should this function be here? should it be specific for padding check? 
- __host__ __device__ constexpr bool IsUpperIndexInPaddingArea(const UpperIndex& idx_up) const + __host__ __device__ constexpr bool + IsUpperIndexMappedToValidLowerIndex(const UpperIndex& idx_up) const { - bool flag = false; +#if 0 + struct lambda_no_pad + { + __host__ __device__ constexpr bool operator()(index_t x) const { return x == 0; } + }; - static_for<0, nDim, 1>{}([&](auto idim) { - // only check if there is left-padding - static_if<(LeftPads::At(idim) != 0)>{}( - [&](auto) { flag = flag || idx_up[idim] < LeftPads::At(idim); }); + if(sequence_all_of(LeftPads{}, lambda_no_pad{}) && + sequence_all_of(RightPads{}, lambda_no_pad{})) + { + return true; + } + else +#endif + { + bool flag = true; - // only check if there is right-padding - static_if<(RightPads::At(idim) != 0)>{}([&](auto) { - flag = flag || idx_up[idim] >= LeftPads::At(idim) + LowerLengths::At(idim); + static_for<0, nDim, 1>{}([&](auto idim) { + // only check if there is left-padding + static_if<(LeftPads::At(idim) != 0)>{}( + [&](auto) { flag = flag && idx_up[idim] >= LeftPads::At(idim); }); + + // only check if there is right-padding + static_if<(RightPads::At(idim) != 0)>{}([&](auto) { + flag = flag && (idx_up[idim] < LeftPads::At(idim) + LowerLengths::At(idim)); + }); }); - }); - return flag; + return flag; + } } }; @@ -155,16 +169,10 @@ struct Merge LowerLengths::PopFront(), math::multiplies{}, Number<1>{}) .PushBack(Number<1>{}); -#if 1 // would these 2 versions be compiled to same ISA? 
- // calculate index in each of the dimensions in the order of their dimension static_for<0, nDimLow - 1, 1>{}( lambda_CalculateLowerIndex(itmp, idx_low)); idx_low(nDimLow - 1) = itmp / pseudo_low_strides[nDimLow - 1]; -#else - static_for<0, nDimLow, 1>{}( - lambda_CalculateLowerIndex(itmp, idx_low)); -#endif return idx_low; } @@ -244,6 +252,7 @@ struct Merge }); // highest dimension, no out-of-bound check + if(borrow) { --idx_low_new(0); @@ -255,11 +264,10 @@ struct Merge __host__ __device__ static constexpr bool IsLinearTransform() { return false; } - // TODO: should this function be here? should it be specific for padding check? __host__ __device__ static constexpr bool - IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */) + IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */) { - return false; + return true; } }; @@ -304,11 +312,10 @@ struct Unmerge __host__ __device__ static constexpr bool IsLinearTransform() { return true; } - // TODO: should this function be here? should it be specific for padding check? __host__ __device__ static constexpr bool - IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */) + IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */) { - return false; + return true; } }; @@ -362,9 +369,9 @@ struct Embed __host__ __device__ static constexpr bool IsLinearTransform() { return true; } __host__ __device__ static constexpr bool - IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */) + IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */) { - return false; + return true; } }; @@ -404,11 +411,10 @@ struct Vectorize __host__ __device__ static constexpr bool IsLinearTransform() { return true; } - // TODO: should this function be here? should it be specific for padding check? 
__host__ __device__ static constexpr bool - IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */) + IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */) { - return false; + return true; } }; diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp new file mode 100644 index 0000000000..2b0550f8ab --- /dev/null +++ b/composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp @@ -0,0 +1,16 @@ +#ifndef CK_TENSOR_COORDINATE_HELPER_HPP +#define CK_TENSOR_COORDINATE_HELPER_HPP + +#include "tensor_coordinate_v2.hpp" + +namespace ck { + +template +__host__ __device__ constexpr auto +make_tensor_coordinate_v2(TensorDesc, MultiIndex idx) +{ + return typename TensorCoordinate_v2::type(idx); +} + +} // namespace ck +#endif diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp index 62dc8b4c9a..b88b52671a 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp @@ -76,8 +76,7 @@ struct NativeTensorCoordinate return coord; } - // TODO: should this function be here? should it be specific for padding check? - __host__ __device__ static constexpr bool IsAnyLevelIndexInPaddingArea() { return false; } + __host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; } private: // mIndex may be saved and update, however, the value of some (or all) of its entries may @@ -166,11 +165,11 @@ struct TransformedTensorCoordinate return coord_up; } - // TODO: should this function be here? should it be specific for padding check?
- __host__ __device__ constexpr bool IsAnyLevelIndexInPaddingArea() const + // this function should be inexpensive, because there is no upper-to-lower index transformation + __host__ __device__ constexpr bool IsUpperIndexMappedToValidOffset() const { - return tensor_desc_type::IsUpperIndexInPaddingArea(GetIndex()) || - mCoordLow.IsAnyLevelIndexInPaddingArea(); + return tensor_desc_type::IsUpperIndexMappedToValidLowerIndex(GetIndex()) && + mCoordLow.IsUpperIndexMappedToValidOffset(); } private: @@ -206,11 +205,5 @@ struct TensorCoordinate_v2 using type = decltype(MakeDummyTensorCoordinate(TensorDesc{})); }; -template -__host__ __device__ constexpr auto -make_tensor_coordinate_v2(TensorDesc, MultiIndex idx) -{ - return typename TensorCoordinate_v2::type(idx); -} -} +} // namespace ck #endif diff --git a/composable_kernel/include/tensor_description/tensor_descriptor.hpp b/composable_kernel/include/tensor_description/tensor_descriptor.hpp index f307286f70..dc2039355c 100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor.hpp @@ -66,12 +66,12 @@ struct NativeTensorDescriptor __host__ __device__ static constexpr index_t GetElementSize() { - return accumulate_on_sequence(GetLengths(), math::multiplies{}, Number<1>{}); + return reduce_on_sequence(GetLengths(), math::multiplies{}, Number<1>{}); } __host__ __device__ static constexpr index_t GetElementSpace() { - return accumulate_on_sequence( + return reduce_on_sequence( (GetLengths() - Number<1>{}) * GetStrides(), math::plus{}, Number<1>{}); } @@ -120,10 +120,10 @@ struct NativeTensorDescriptor } #endif - // TODO: should this function be here? should it be specific for padding check? 
- __host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const Index& /* idx */) + __host__ __device__ static constexpr bool + IsUpperIndexMappedToValidOffset(const Index& /* idx */) { - return false; + return true; } }; @@ -290,7 +290,7 @@ struct TransformedTensorDescriptor __host__ __device__ static constexpr index_t GetElementSize() { - return accumulate_on_sequence(GetLengths(), math::multiplies{}, Number<1>{}); + return reduce_on_sequence(GetLengths(), math::multiplies{}, Number<1>{}); } __host__ __device__ static constexpr index_t GetElementSpace() @@ -375,7 +375,7 @@ struct TransformedTensorDescriptor constexpr bool is_linear_transform = tran.IsLinearTransform(); // judge if all lower dimension are linear - constexpr bool is_all_low_dim_linear = math::accumulate_on_sequence( + constexpr bool is_all_low_dim_linear = math::reduce_on_sequence( pick_sequence_elements_by_mask( GetLowerTensorDescriptor().GetMaskOfLinearDimensions(), LowDimensionId{}), math::logic_and{}, @@ -441,21 +441,32 @@ struct TransformedTensorDescriptor } #endif - // TODO: should this function be here? should it be specific for padding check? 
- __host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const UpperIndex& idx_up) + __host__ __device__ static constexpr bool + IsUpperIndexMappedToValidLowerIndex(const UpperIndex& idx_up) { - bool flag = false; + bool flag = true; static_for<0, nTransform, 1>{}([&](auto itran) { constexpr auto tran = Transforms{}.At(itran); const auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran)); - flag = flag || tran.IsUpperIndexInPaddingArea(to_array(idx_up_part)); + flag = flag && tran.IsUpperIndexMappedToValidLowerIndex(to_array(idx_up_part)); }); return flag; } + + // Whenever this function is called, it will call CalculateLowerIndex() recursively + // If you have created a tensor coordinate already, instead of calling this function, + // you should call TransformedTensorCoordinate::IsUpperIndexMappedToValidOffset() + __host__ __device__ static constexpr bool + IsUpperIndexMappedToValidOffset(const UpperIndex& idx_up) + { + return IsUpperIndexMappedToValidLowerIndex(idx_up) && + GetLowerTensorDescriptor().IsUpperIndexMappedToValidOffset( + CalculateLowerIndex(idx_up)); + } }; } // namespace ck diff --git a/composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp b/composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp index 0c4ed9d5c1..9ba37acd6d 100644 --- a/composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_3d_tensor_op.hpp @@ -162,7 +162,7 @@ struct Blockwise3dTensorCopy3 "wrrong! 
BlockSize is not big enough for ThreadPerDims!"); constexpr index_t num_active_thread = - accumulate_on_sequence(ThreadPerDims{}, math::multiplies{}, Number<1>{}); + reduce_on_sequence(ThreadPerDims{}, math::multiplies{}, Number<1>{}); if(BlockSize > num_active_thread) { diff --git a/composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp b/composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp index 4185e066fb..93e859469e 100644 --- a/composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_4d_tensor_op.hpp @@ -505,7 +505,7 @@ struct Blockwise4dTensorCopy3 "wrrong! BlockSize is not big enough for ThreadPerDims!"); constexpr index_t num_active_thread = - accumulate_on_sequence(ThreadPerDims{}, math::multiplies{}, Number<1>{}); + reduce_on_sequence(ThreadPerDims{}, math::multiplies{}, Number<1>{}); if(BlockSize > num_active_thread) { diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index e4a9af67be..7076e26d5b 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -681,9 +681,7 @@ template + template __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const { -#if 0 - mThreadwiseLoad.Run_generic(p_src, p_buffer); -#elif 1 - // hardcoded: src is global memory - mThreadwiseLoad.template Run_generic(p_src, p_buffer); -#elif 1 - // hardcoded: src is global memory - mThreadwiseLoad - .template Run_optimized_src_address_calculation( - p_src, p_buffer); +#if 1 + mThreadwiseLoad.template Run_generic( + p_src, p_buffer); +#else + mThreadwiseLoad.template Run_optimized_src_address_calculation( + p_src, p_buffer); #endif } - template + template __device__ void RunStoreRegisterBuffer(const TData* 
p_buffer, TData* p_dst) const { -#if 0 - mThreadwiseStore.Run_generic(p_buffer, p_dst); -#elif 1 - // hardcoded: dst is lds - mThreadwiseStore.template Run_generic(p_buffer, p_dst); -#elif 1 - // hardcoded: dst is lds - mThreadwiseStore - .template Run_optimized_dst_address_calculation(p_buffer, +#if 1 + mThreadwiseStore.template Run_generic( + p_buffer, p_dst); +#else + mThreadwiseStore.template Run_optimized_dst_address_calculation(p_buffer, p_dst); #endif } - template + template __device__ void Run(const TData* p_src, TData* p_dst) const { TData p_buffer[GetRegisterBufferSize()]; - RunLoadRegisterBuffer(p_src, p_buffer); - RunStoreRegisterBuffer(p_buffer, p_dst); + RunLoadRegisterBuffer(p_src, p_buffer); + RunStoreRegisterBuffer(p_buffer, p_dst); } template @@ -802,9 +798,7 @@ struct BlockwiseGenericTensorSliceCopy_v4 ThreadwiseGenericTensorSliceCopy_v4r2::type, - typename uniform_sequence_gen::type, SubLengths, SrcDimAccessOrder, SrcVectorAccessDim, @@ -815,9 +809,7 @@ struct BlockwiseGenericTensorSliceCopy_v4 ThreadwiseGenericTensorSliceCopy_v4r2::type, - typename uniform_sequence_gen::type, DstLinearDimensionMask, - DstNonLinearDimensionMask, SubLengths, DstDimAccessOrder, DstVectorAccessDim, diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 99148042f2..ef43b3d380 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -1131,9 +1131,7 @@ struct ThreadwiseGenericTensorSliceCopy_v3r1 template {}([&](auto) { #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE @@ -1260,13 +1257,10 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 const auto dst_coord = mDstSliceOrigin + (long_vector_data_begin_id + scalar_id); -// Check dst vector's padding situation, only check the first data in 
this dst -// vector. It's user's responsiblity to make sure all data in the dst vector has -// the same padding situation -// TODO: not sure a dedicated IsAnyLevelIndexInPaddingArea() function is neccessary -#if 0 // tuning - if(!dst_coord.IsAnyLevelIndexInPaddingArea()) -#endif + // Check dst vector's padding situation, only check the first data in this dst + // vector. It's user's responsiblity to make sure all data in the dst vector has + // the same padding situation + if(dst_coord.IsUpperIndexMappedToValidOffset()) { static_if{}([&](auto) { #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE @@ -1303,7 +1297,9 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Will do padding check on src data: Read 0 if src data is in padding area. // Will do padding check on dst data: No write if dst data is in paddin area. // This version is optimized for address calculation of src tensor - template + template __device__ void Run_optimized_src_address_calculation(const TData* p_src, TData* p_dst) const { using src_vector_t = typename vector_type::MemoryType; @@ -1321,8 +1317,9 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // TODO:: stop using this hack, once TransformedTensorDescriptor::GetLinearDimensionMask() // is implemented - constexpr auto src_linear_dim_mask = SrcLinearDimensionMask{}; - constexpr auto src_nonlinear_dim_mask = SrcNonLinearDimensionMask{}; + constexpr auto src_linear_dim_mask = SrcLinearDimensionMask{}; + constexpr auto src_nonlinear_dim_mask = + SrcLinearDimensionMask::Transform(logical_not{}); static_assert( src_linear_dim_mask.At(VectorAccessDim) || long_vector_size == SrcDataPerAccess, @@ -1392,9 +1389,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Check src vector's padding situation, only check the first data in // this src vector. 
It's user's responsiblity to make sure all data in // the src vector has the same padding situation - // TODO: not sure a dedicated IsAnyLevelIndexInPaddingArea() function is - // neccessary - if(!src_coord.IsAnyLevelIndexInPaddingArea()) + if(src_coord.IsUpperIndexMappedToValidOffset()) { static_if{}([&](auto) { #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE @@ -1427,14 +1422,10 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 const auto dst_coord = mDstSliceOrigin + (nonlinear_dim_data_steps + linear_dim_data_steps + scalar_id); -// Check dst vector's padding situation, only check the first data in -// this dst vector. It's user's responsiblity to make sure all data in -// the dst vector has the same padding situation -// TODO: not sure a dedicated IsAnyLevelIndexInPaddingArea() function is -// neccessary -#if 0 // tuning - if(!dst_coord.IsAnyLevelIndexInPaddingArea()) -#endif + // Check dst vector's padding situation, only check the first data in + // this dst vector. It's user's responsiblity to make sure all data in + // the dst vector has the same padding situation + if(dst_coord.IsUpperIndexMappedToValidOffset()) { *reinterpret_cast(&p_dst[dst_coord.GetOffset()]) = *reinterpret_cast(&p_long_vector[buffer_offset]); @@ -1450,7 +1441,9 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Will do padding check on src data: Read 0 if src data is in padding area. // Will do padding check on dst data: No write if dst data is in paddin area. 
// This version is optimized for address calculation of dst tensor - template + template __device__ void Run_optimized_dst_address_calculation(const TData* p_src, TData* p_dst) const { using src_vector_t = typename vector_type::MemoryType; @@ -1468,8 +1461,9 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // TODO:: stop using this hack, once TransformedTensorDescriptor::GetLinearDimensionMask() // is implemented - constexpr auto dst_linear_dim_mask = DstLinearDimensionMask{}; - constexpr auto dst_nonlinear_dim_mask = DstNonLinearDimensionMask{}; + constexpr auto dst_linear_dim_mask = DstLinearDimensionMask{}; + constexpr auto dst_nonlinear_dim_mask = + DstLinearDimensionMask::Transform(logical_not{}); static_assert( dst_linear_dim_mask.At(VectorAccessDim) || long_vector_size == DstDataPerAccess, @@ -1535,9 +1529,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Check src vector's padding situation, only check the first data in // this src vector. It's user's responsiblity to make sure all data in // the src vector has the same padding situation - // TODO: not sure a dedicated IsAnyLevelIndexInPaddingArea() function is - // neccessary - if(!src_coord.IsAnyLevelIndexInPaddingArea()) + if(src_coord.IsUpperIndexMappedToValidOffset()) { *reinterpret_cast(&p_long_vector[buffer_offset]) = *reinterpret_cast(&p_src[src_coord.GetOffset()]); @@ -1561,14 +1553,10 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 const index_t dst_linear_offset = dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset(); -// Check dst vector's padding situation, only check the first data in -// this dst vector. It's user's responsiblity to make sure all data in -// the dst vector has the same padding situation -// TODO: not sure a dedicated IsAnyLevelIndexInPaddingArea() function is -// neccessary -#if 0 // tuning - if(!dst_coord.IsAnyLevelIndexInPaddingArea()) -#endif + // Check dst vector's padding situation, only check the first data in + // this dst vector. 
It's user's responsiblity to make sure all data in + // the dst vector has the same padding situation + if(dst_coord.IsUpperIndexMappedToValidOffset()) { static_if{}([&](auto) { #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE diff --git a/composable_kernel/include/utility/array.hpp b/composable_kernel/include/utility/array.hpp index b0ffa86785..213b20530d 100644 --- a/composable_kernel/include/utility/array.hpp +++ b/composable_kernel/include/utility/array.hpp @@ -110,8 +110,7 @@ struct ArrayElementPicker __host__ __device__ explicit constexpr ArrayElementPicker(Arr& array) : mArray{array} { - constexpr index_t imax = - accumulate_on_sequence(Picks{}, math::maxer{}, Number<0>{}); + constexpr index_t imax = reduce_on_sequence(Picks{}, math::maxer{}, Number<0>{}); static_assert(imax < Arr::Size(), "wrong! exceeding # array element"); } diff --git a/composable_kernel/include/utility/functional.hpp b/composable_kernel/include/utility/functional.hpp index 3dd469c8bc..6232e2243f 100644 --- a/composable_kernel/include/utility/functional.hpp +++ b/composable_kernel/include/utility/functional.hpp @@ -25,6 +25,12 @@ struct swallow } }; +template +struct logical_not +{ + constexpr bool operator()(const T& x) const { return !x; } +}; + // Emulate if constexpr template struct static_if; diff --git a/composable_kernel/include/utility/sequence.hpp b/composable_kernel/include/utility/sequence.hpp index 55bda8aede..3ccc8c5f65 100644 --- a/composable_kernel/include/utility/sequence.hpp +++ b/composable_kernel/include/utility/sequence.hpp @@ -764,12 +764,12 @@ __host__ __device__ constexpr auto pick_sequence_elements_by_mask(Seq, Mask) #endif template -struct lambda_accumulate_on_sequence +struct lambda_reduce_on_sequence { const Reduce& f; index_t& result; - __host__ __device__ constexpr lambda_accumulate_on_sequence(const Reduce& f_, index_t& result_) + __host__ __device__ constexpr lambda_reduce_on_sequence(const Reduce& f_, index_t& result_) : f(f_), 
result(result_) { } @@ -783,14 +783,42 @@ struct lambda_accumulate_on_sequence template __host__ __device__ constexpr index_t -accumulate_on_sequence(Seq, Reduce f, Number /*initial_value*/) +reduce_on_sequence(Seq, Reduce f, Number /*initial_value*/) { index_t result = Init; - static_for<0, Seq::mSize, 1>{}(lambda_accumulate_on_sequence(f, result)); + static_for<0, Seq::Size(), 1>{}(lambda_reduce_on_sequence(f, result)); return result; } +// TODO: a generic any_of for any container +template +__host__ __device__ constexpr bool sequence_any_of(Seq, F f /*initial_value*/) +{ + bool flag = false; + + for(index_t i = 0; i < Seq::Size(); ++i) + { + flag = flag || f(Seq::At(i)); + } + + return flag; +} + +// TODO: a generic all_of for any container +template +__host__ __device__ constexpr bool sequence_all_of(Seq, F f /*initial_value*/) +{ + bool flag = true; + + for(index_t i = 0; i < Seq::Size(); ++i) + { + flag = flag && f(Seq::At(i)); + } + + return flag; +} + } // namespace ck #endif diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp index 17813f09a4..679009680c 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp @@ -33,9 +33,18 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; +#if 1 constexpr auto in_nchw_desc = InDesc{}; constexpr auto wei_kcyx_desc = WeiDesc{}; constexpr auto out_nkhw_desc = OutDesc{}; +#else + constexpr auto in_nchw_desc = + make_native_tensor_descriptor(InDesc::GetLengths(), InDesc::GetStrides()); + constexpr auto wei_kcyx_desc = + make_native_tensor_descriptor(WeiDesc::GetLengths(), WeiDesc::GetStrides()); + constexpr auto out_nkhw_desc = + make_native_tensor_descriptor(OutDesc::GetLengths(),
OutDesc::GetStrides()); +#endif constexpr index_t N = out_nkhw_desc.GetLength(I0); constexpr index_t K = out_nkhw_desc.GetLength(I1); @@ -88,7 +97,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; -#elif 1 +#elif 0 // BlockSize = 64, each thread hold 64 data constexpr index_t BlockSize = 64; @@ -125,6 +134,43 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; +#elif 0 + // BlockSize = 256, blockwise-GEMM 64x128, each thread hold 32 data + constexpr index_t BlockSize = 256; + + constexpr index_t BPerBlock = 16; + constexpr index_t KPerBlock = 64; + constexpr index_t EPerBlock = 8; + + constexpr index_t GemmNRepeat = 2; + + constexpr index_t GemmMPerThreadSubC = 2; + constexpr index_t GemmNPerThreadSubC = 4; + constexpr index_t GemmMLevel0Cluster = 4; + constexpr index_t GemmNLevel0Cluster = 4; + constexpr index_t GemmMLevel1Cluster = 4; + constexpr index_t GemmNLevel1Cluster = 4; + constexpr index_t GemmKPerThreadLoop = 1; + constexpr index_t GemmDataPerReadA = 2; + constexpr index_t GemmDataPerReadB = 4; + + using InBlockCopySubLengths_E_N1_B_N2 = Sequence<1, 1, 1, 4>; + using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 2, 16, 1>; + using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B] + using InBlockCopySrcAccessOrder = Sequence<0, 2, 1, 3>; // [E, B, N1, N2] + using InBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [E, N1, B, N2] + + constexpr index_t InBlockCopySrcDataPerRead_B = 1; + constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4; + + using WeiBlockCopySubLengths_E_K = Sequence<2, 1>; + using WeiBlockCopyClusterLengths_E_K = Sequence<4, 64>; + using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E] + using WeiBlockCopySrcAccessOrder = 
Sequence<1, 0>; // [K, E] + using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K] + + constexpr index_t WeiBlockCopySrcDataPerRead_E = 2; + constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; #endif constexpr index_t N1 = GemmNRepeat; diff --git a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp index 304fbe211d..974b89e0c9 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp @@ -3,7 +3,7 @@ #include "device.hpp" #include "tensor.hpp" #include "gridwise_convolution_kernel_wrapper.hpp" -#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp" +//#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp" #include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp" template