From 51884fc21412b1800bb85b28c0f5a0b651d23cef Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sat, 21 Sep 2019 22:53:03 -0500 Subject: [PATCH] WIP: explicitly separate offset component into compile-time, block-invariant and per-thread components --- ...chw_kcyx_nkhw_padded_lds_double_buffer.hpp | 15 ++- .../multi_index_transform.hpp | 44 +++++++ .../tensor_description/tensor_descriptor.hpp | 120 +++++++++++------- .../tensor_descriptor_helper.hpp | 13 ++ .../blockwise_generic_tensor_slice_copy.hpp | 48 ++++--- .../threadwise_generic_tensor_slice_copy.hpp | 115 +++++++++++++++++ composable_kernel/include/utility/array.hpp | 14 ++ .../include/utility/sequence.hpp | 12 +- composable_kernel/include/utility/tuple.hpp | 20 ++- ...plicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp | 2 +- driver/src/driver.cpp | 10 +- 11 files changed, 337 insertions(+), 76 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index 0a5b4a3c34..2a934fb9cb 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -3,7 +3,8 @@ #include "common_header.hpp" #include "ConstantTensorDescriptor.hpp" -#include "ConstantMergedTensorDescriptor.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" #include "ConstantMatrixDescriptor.hpp" #include "blockwise_generic_tensor_slice_copy.hpp" #include "blockwise_gemm.hpp" @@ -172,6 +173,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf BlockwiseGenericTensorSliceCopy_v4, + Sequence<1, 0, 1, 0>, + Sequence<1, 1, 1, 1>, + Sequence<0, 0, 0, 0>, decltype(in_e_n1_b_n2_block_desc.GetLengths()), InBlockCopySubLengths_E_N1_B_N2, InBlockCopyClusterLengths_E_N1_B_N2, @@ -213,6 +218,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf BlockwiseGenericTensorSliceCopy_v4, + Sequence<0, 0>, + Sequence<1, 1>, + Sequence<0, 0>, decltype(wei_e_k_block_desc.GetLengths()), WeiBlockCopySubLengths_E_K, WeiBlockCopyClusterLengths_E_K, @@ -414,6 +423,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf ThreadwiseGenericTensorSliceCopy_v4r2, + Sequence<0, 0, 0, 0, 0>, + Sequence<1, 1, 1, 0, 1>, + Sequence<0, 0, 0, 1, 0>, decltype( out_k0_k1_n1_b_n2_thread_desc.GetLengths()), arithmetic_sequence_gen<0, 5, 1>::type, diff --git a/composable_kernel/include/tensor_description/multi_index_transform.hpp b/composable_kernel/include/tensor_description/multi_index_transform.hpp index bf56678b63..47f2d97089 100644 --- a/composable_kernel/include/tensor_description/multi_index_transform.hpp +++ b/composable_kernel/include/tensor_description/multi_index_transform.hpp @@ -368,5 +368,49 @@ struct Embed } }; +template +struct Vectorize +{ + using LowerIndex = MultiIndex<1>; + using UpperIndex = MultiIndex<1>; + + __host__ __device__ constexpr Vectorize() + { + static_assert(VectorSize > 0 && LowerLength % VectorSize == 0, + "wrong! 
cannot evenly divide"); + } + + __host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<1>{}; } + + __host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<1>{}; } + + __host__ __device__ static constexpr auto GetUpperLengths() + { + return Sequence{}; + } + + __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) + { + return VectorSize * idx_up; + } + + __host__ __device__ static constexpr auto + CalculateLowerIndexDiff(const UpperIndex& idx_up_diff, + const UpperIndex& /* idx_up_old */, + const LowerIndex& /* idx_low_old */) + { + return VectorSize * idx_up_diff; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + // TODO: should this function be here? should it be specific for padding check? + __host__ __device__ static constexpr bool + IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */) + { + return false; + } +}; + } // namespace ck #endif diff --git a/composable_kernel/include/tensor_description/tensor_descriptor.hpp b/composable_kernel/include/tensor_description/tensor_descriptor.hpp index 4d0acd7993..f307286f70 100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor.hpp @@ -101,17 +101,24 @@ struct NativeTensorDescriptor return true; } - __host__ __device__ static constexpr auto GetLinearDimensions() + __host__ __device__ static constexpr auto GetMaskOfLinearDimensions() { - return typename arithmetic_sequence_gen<0, nDim, 1>::type{}; + return typename uniform_sequence_gen::type{}; + } + + __host__ __device__ static constexpr auto GetMaskOfNonLinearDimensions() + { + return typename uniform_sequence_gen::type{}; } __host__ __device__ static constexpr auto GetNonLinearDimensions() { return Sequence<>{}; } +#if 0 __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups() { return Tuple<>{}; } +#endif // TODO: should this function be here? should it be specific for padding check? __host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const Index& /* idx */) @@ -233,7 +240,7 @@ struct TransformedTensorDescriptor __host__ __device__ static constexpr auto GetUpperLengths() { constexpr auto tuple_of_up_lengths = - transform_tuple(lambda_GetUpperLengths{}, Transforms{}); + transform_tuples(lambda_GetUpperLengths{}, Transforms{}); constexpr auto mingled_up_lengths = unpack(lambda_merge_sequences{}, tuple_of_up_lengths); @@ -346,67 +353,92 @@ struct TransformedTensorDescriptor return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up)); } -#if 1 +#if 0 struct lambda_sequence_logic_or { template __host__ __device__ constexpr auto operator()(Seqs... 
seqs) const { // TODO: should use math::logic_or, after Sequence can take bool - return typename sequence_reduce, Seqs...>::type{}; + return typename sequence_reduce, Seqs...>::type{}; } }; struct lambda_1 { - template - __host__ __device__ constexpr auto operator()(const Transform& tran) const + // check only one transform at a time + template + __host__ __device__ constexpr auto + operator()(const Transform& tran, LowDimensionId, UpDimensionId) const { - return tran.GetUpperLengths(); + // judge if transformation is linear + constexpr bool is_linear_transform = tran.IsLinearTransform(); + + // judge if all lower dimension are linear + constexpr bool is_all_low_dim_linear = math::accumulate_on_sequence( + pick_sequence_elements_by_mask( + GetLowerTensorDescriptor().GetMaskOfLinearDimensions(), LowDimensionId{}), + math::logic_and{}, + integral_constant{}); + + // judge if upper dimenisons are linear + constexpr bool is_up_dim_nonlinear = !(is_linear_transform && is_all_low_dim_linear); + + constexpr auto value_sequence = + typename uniform_sequence_gen::type{}; + + constexpr auto mask_of_up_nonlinear_dims = modifiy_sequence( + typename uniform_sequence_gen::type{}, value_sequence, UpDimensionId{}); + + return mask_of_up_nonlinear_dims; + }; + + __host__ __device__ static constexpr bool GetMaskOfNonLinearDimensions() + { + // create tuple of linear dimension masks, for all transformations + constexpr auto tuple_of_nonlinear_dimension_mask = + transform_tuples(lambda_1{}, Transforms{}, LowDimensionIds{}, UpDimensionIds{}); + + // reduce tuple of masks into one mask + constexpr auto nonlinear_dimension_mask = + unpack(lambda_sequence_logic_or{}, tuple_of_nonlinear_dimension_mask); + + return nonlinear_dimension_mask; } - }; - template - __host__ __device__ static constexpr bool GetMaskOfLinearDimensions() - { - // create tuple of linear dimension masks, for all transformations - constexpr auto tuple_of_linear_dimension_mask = - transform_tuple(lambda_1, Transforms{}); + __host__ __device__ static constexpr bool GetMaskOfLinearDimensions() + { + return GetMaskOfNonLinearDimensions().Transform(math::logic_not{}); + } - // reduce tuple of masks into one mask - constexpr auto linear_dimension_mask = - unpack(lambda_sequence_logic_or{}, tuple_of_linear_dimension_mask); + template + __host__ __device__ static constexpr bool IsLinearDimension(Number) + { + return GetMaskOfLinearDimensions().At(Number{}); + } - return linear_dimension_mask; - } + __host__ __device__ static constexpr auto GetLinearDimensions() + { + constexpr auto linear_dimension_mask = GetMaskOfLienarDimensions(); - template - __host__ __device__ static constexpr bool IsLinearDimension(Number) - { - return GetMaskOfLinearDimensions().At(Number{}); - } + return pick_sequence_elements_by_mask( + typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, linear_dimension_mask); + } - __host__ __device__ static constexpr auto GetLinearDimensions() - { - constexpr auto linear_dimension_mask = GetMaskOfLienarDimensions(); + __host__ __device__ static constexpr auto GetNonLinearDimensions() + { + constexpr auto nonlinear_dimension_mask = + GetMaskOfLienarDimensions().Transform(math::logic_not{}); - return pick_sequence_elements_by_mask( - typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, linear_dimension_mask); - } + return pick_sequence_elements_by_mask( + typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, nonlinear_dimension_mask); + } - __host__ __device__ static constexpr auto GetNonLinearDimensions() - { - constexpr auto 
nonlinear_dimension_mask = - GetMaskOfLienarDimensions().Transform(math::logic_not{}); - - return pick_sequence_elements_by_mask( - typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, nonlinear_dimension_mask); - } - - __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups() - { - // not implemented - } + __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups() + { + // not implemented + } #endif // TODO: should this function be here? should it be specific for padding check? diff --git a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp index 52dc1642e6..12517071ec 100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp @@ -96,6 +96,19 @@ __host__ __device__ constexpr auto LowerTensorDescriptor{}, typename sequence_map_inverse::type{}); } +template +__host__ __device__ constexpr auto +vectorize_tensor_descriptor(LowerTensorDescriptor, Number vector_dim, Number) +{ + constexpr index_t nDim = LowerTensorDescriptor::GetNumOfDimension(); + + return transform_tensor_descriptor( + LowerTensorDescriptor{}, + Vectorize{}, + typename arithmetic_sequence_gen<0, nDim, 1>::type{}, + typename arithmetic_sequence_gen<0, nDim, 1>::type{}); +} + template __host__ __device__ void print_tensor_descriptor(const char* s, const NativeTensorDescriptor& desc) diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index 25cd5a819c..25349dc9f9 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -680,6 +680,10 @@ struct BlockwiseGenericTensorSliceCopy_v3 template (p_src, p_buffer); #endif @@ -750,7 +756,7 @@ struct BlockwiseGenericTensorSliceCopy_v4 { #if 0 mThreadwiseStore.Run(p_buffer, p_dst); -#else +#elif 1 // hardcoded: register to LDS mThreadwiseStore.template Run_amd_experiment(p_buffer, p_dst); #endif @@ -784,21 +790,31 @@ struct BlockwiseGenericTensorSliceCopy_v4 private: using RegisterBufferDesc = decltype(make_native_tensor_descriptor_packed(SubLengths{})); - using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v4r2; + using ThreadwiseLoad = + ThreadwiseGenericTensorSliceCopy_v4r2::type, + typename uniform_sequence_gen::type, + SubLengths, + SrcDimAccessOrder, + SrcVectorAccessDim, + SrcDataPerAccess, + 1>; - using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2; + using ThreadwiseStore = + ThreadwiseGenericTensorSliceCopy_v4r2::type, + typename uniform_sequence_gen::type, + DstLinearDimensionMask, + DstNonLinearDimensionMask, + SubLengths, + DstDimAccessOrder, + DstVectorAccessDim, + 1, + DstDataPerAccess>; ThreadwiseLoad mThreadwiseLoad; ThreadwiseStore mThreadwiseStore; diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 67b0969a0e..d8ecf6508b 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -1136,6 +1136,10 @@ struct ThreadwiseGenericTensorSliceCopy_v3r1 // the other is device memory or LDS 
template + __device__ static constexpr auto mask_lengths(Sequence, Sequence) + { + return Sequence<(Mask ? Lengths : 1)...>{}; + } + + template + __device__ void Run_access_order_optimized_for_source_index_calculation(const TData* p_src, + TData* p_dst) const + { + using src_vector_t = typename vector_type::MemoryType; + using dst_vector_t = typename vector_type::MemoryType; + + constexpr auto vector_access_dim = Number{}; + + constexpr auto src_data_per_access = Number{}; + constexpr auto dst_data_per_access = Number{}; + + constexpr auto long_vector_size = Number{}; + + constexpr auto long_vector_access_lengths = SliceLengths::Modify( + vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size); + + // TODO:: don't use hack + constexpr auto src_linear_dim_mask = SrcLinearDimensionMask{}; + constexpr auto src_nonlinear_dim_mask = SrcNonLinearDimensionMask{}; + + // separate steps into linear and non-linear components + constexpr auto linear_long_vector_access_lengths = + mask_lengths(long_vector_access_lengths, src_linear_dim_mask); + + constexpr auto nonlinear_long_vector_access_lengths = + mask_lengths(long_vector_access_lengths, src_nonlinear_dim_mask); + + // loop over src's non-linear dimensions + ford{}( + [&](auto nonlinear_dim_long_vector_access_id) { + + // step-sizes along src's nonlinear dimensions + auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id; + nonlinear_dim_data_steps(vector_access_dim) = + long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim]; + + // move src cooridnate along nonlinear dimensions + const auto src_nonlinear_coord = mSrcSliceOrigin + nonlinear_dim_data_steps; + + // loop over src's linear dimensions + ford{}( + [&](auto linear_dim_long_vector_access_id) { + + // step-sizes along src's linear dimensions + auto linear_dim_data_steps = linear_dim_long_vector_access_id; + linear_dim_data_steps(vector_access_dim) = + long_vector_size * linear_dim_long_vector_access_id[vector_access_dim]; + + // buffer to hold a long-vector + TData p_long_vector[long_vector_size]; + + // set 0 + for(index_t i = 0; i < long_vector_size; ++i) + { + p_long_vector[i] = 0; + } + + // load data from src to the long-vector buffer + for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i) + { + auto scalar_id = make_zero_array(); + scalar_id(vector_access_dim) = i * src_data_per_access; + + // move src cooridnate along linear dimensions + const auto src_coord = + src_nonlinear_coord + (linear_dim_data_steps + scalar_id); + + // TODO: good implementation? 
+ const index_t src_linear_offset_diff = + src_coord.GetOffset() - src_nonlinear_coord.GetOffset(); + + // check for padding + // TODO: still kind of messy + if(!src_coord.IsAnyLevelIndexInPaddingArea()) + { + const index_t src_offset = src_coord.GetOffset(); + + const index_t buffer_offset = i * src_data_per_access; + + *reinterpret_cast(&p_long_vector[buffer_offset]) = + *reinterpret_cast(&p_src[src_offset]); + } + } + + // store data from the long-vector buffer to dst + for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i) + { + auto scalar_id = make_zero_array(); + scalar_id(vector_access_dim) = i * dst_data_per_access; + + const index_t buffer_offset = i * dst_data_per_access; + + const index_t dst_offset = + (mDstSliceOrigin + + (nonlinear_dim_data_steps + linear_dim_data_steps + scalar_id)) + .GetOffset(); + + *reinterpret_cast(&p_dst[dst_offset]) = + *reinterpret_cast(&p_long_vector[buffer_offset]); + } + }); + }); + } + // memory-space // 0: VGPR // 1: LDS diff --git a/composable_kernel/include/utility/array.hpp b/composable_kernel/include/utility/array.hpp index 7b0c25d41b..b0ffa86785 100644 --- a/composable_kernel/include/utility/array.hpp +++ b/composable_kernel/include/utility/array.hpp @@ -389,6 +389,20 @@ __host__ __device__ constexpr auto operator-(Sequence a, Array +__host__ __device__ constexpr auto operator*(TData v, Array a) +{ + Array result; + + for(index_t i = 0; i < NSize; ++i) + { + result(i) = a[i] * v; + } + + return result; +} + template __host__ __device__ constexpr TData accumulate_on_array(const Array& a, Reduce f, TData init) diff --git a/composable_kernel/include/utility/sequence.hpp b/composable_kernel/include/utility/sequence.hpp index c351754140..55bda8aede 100644 --- a/composable_kernel/include/utility/sequence.hpp +++ b/composable_kernel/include/utility/sequence.hpp @@ -706,18 +706,18 @@ __host__ __device__ constexpr auto sequence_pop_back(Seq) return sequence_pop_front(Seq::Reverse()).Reverse(); } -template -__host__ __device__ constexpr auto transform_sequences(F f, Sequence) -{ - return Sequence{}; -} - template __host__ __device__ constexpr auto merge_sequences(Seqs...) { return typename sequence_merge::type{}; } +template +__host__ __device__ constexpr auto transform_sequences(F f, Sequence) +{ + return Sequence{}; +} + template __host__ __device__ constexpr auto transform_sequences(F f, Sequence, Sequence) { diff --git a/composable_kernel/include/utility/tuple.hpp b/composable_kernel/include/utility/tuple.hpp index 27175fe625..815ca8466e 100644 --- a/composable_kernel/include/utility/tuple.hpp +++ b/composable_kernel/include/utility/tuple.hpp @@ -113,19 +113,33 @@ __host__ __device__ constexpr auto make_tuple(Xs&&... 
xs) namespace detail { template -__host__ __device__ constexpr auto transform_tuple_impl(F f, const X& x, Sequence) +__host__ __device__ constexpr auto transform_tuples_impl(F f, const X& x, Sequence) { return make_tuple(f(x.At(Number{}))...); } +template +__host__ __device__ constexpr auto +transform_tuples_impl(F f, const X& x, const Y& y, Sequence) +{ + return make_tuple(f(x.At(Number{}), y.At(Number{}))...); +} + } // namespace detail template -__host__ __device__ constexpr auto transform_tuple(F f, const X& x) +__host__ __device__ constexpr auto transform_tuples(F f, const X& x) { - return detail::transform_tuple_impl( + return detail::transform_tuples_impl( f, x, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{}); } +template +__host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y) +{ + return detail::transform_tuples_impl( + f, x, y, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{}); +} + } // namespace ck #endif diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp index ac95e09d7e..17813f09a4 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp @@ -3,7 +3,7 @@ #include "device.hpp" #include "tensor.hpp" #include "gridwise_convolution_kernel_wrapper.hpp" -#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp" +//#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp" #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp" template
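
A standalone sketch (plain C++17, no composable_kernel headers) of the idea behind the mask_lengths() helper and the nested ford loops added in threadwise_generic_tensor_slice_copy.hpp above: dimensions are split by a linear / non-linear mask, lengths are kept where the mask is set and collapsed to 1 elsewhere, so the expensive (non-linear) part of the offset is computed once per outer step and only the cheap linear part is recomputed per access. All lengths, strides, and mask values below are made up for illustration, and the helper name mirrors the patch but is otherwise hypothetical.

// sketch_linear_nonlinear_split.cpp -- illustrative only; not part of the patch
#include <array>
#include <cstdio>

template <std::size_t N>
constexpr std::array<int, N> mask_lengths(const std::array<int, N>& lengths,
                                          const std::array<int, N>& mask)
{
    std::array<int, N> r{};
    for(std::size_t i = 0; i < N; ++i)
        r[i] = mask[i] ? lengths[i] : 1; // masked-out dimensions collapse to length 1
    return r;
}

int main()
{
    // a 3-d slice: lengths, strides, and a made-up mask marking dims 1 and 2 as "linear"
    constexpr std::array<int, 3> lengths{2, 4, 8};
    constexpr std::array<int, 3> strides{64, 8, 1};
    constexpr std::array<int, 3> linear_mask{0, 1, 1};
    constexpr std::array<int, 3> nonlinear_mask{1, 0, 0};

    constexpr auto linear_lengths    = mask_lengths(lengths, linear_mask);    // {1, 4, 8}
    constexpr auto nonlinear_lengths = mask_lengths(lengths, nonlinear_mask); // {2, 1, 1}

    // outer walk over the non-linear dimensions: offset computed once per step
    for(int n0 = 0; n0 < nonlinear_lengths[0]; ++n0)
    for(int n1 = 0; n1 < nonlinear_lengths[1]; ++n1)
    for(int n2 = 0; n2 < nonlinear_lengths[2]; ++n2)
    {
        const int nonlinear_offset = n0 * strides[0] + n1 * strides[1] + n2 * strides[2];

        // inner walk over the linear dimensions: only a cheap additive diff per access
        for(int l0 = 0; l0 < linear_lengths[0]; ++l0)
        for(int l1 = 0; l1 < linear_lengths[1]; ++l1)
        for(int l2 = 0; l2 < linear_lengths[2]; ++l2)
        {
            const int linear_offset = l0 * strides[0] + l1 * strides[1] + l2 * strides[2];
            std::printf("%d ", nonlinear_offset + linear_offset);
        }
    }
    std::printf("\n");
}

In the actual patch, the outer walk corresponds to advancing src_nonlinear_coord (whose GetOffset() is evaluated once per non-linear step), while the inner walk corresponds to linear_dim_data_steps plus scalar_id, whose contribution to the offset is a simple difference against the non-linear base.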