From 701b7341f0e3b903a561cf1f287633504fbeb0ae Mon Sep 17 00:00:00 2001
From: Chao Liu <lc.roy86@gmail.com>
Date: Wed, 7 Aug 2019 19:25:54 -0500
Subject: [PATCH] clean up

---
 ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp |  24 +---
 .../blockwise_generic_tensor_slice_copy.hpp   | 113 +++++-------------
 .../threadwise_generic_tensor_slice_copy.hpp  |  98 ---------------
 driver/src/driver.cpp                         |   2 +-
 4 files changed, 38 insertions(+), 199 deletions(-)
diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
index 66fe01c71a..9747bf5e55 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
@@ -155,13 +155,12 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
         static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                       "GemmDataPerReadB alignment requirement is not satisfied");
 
-#if 0
+#if 1
         // input blockwise copy
         //     slice a merged tensor, reorder and copy to a normal tensor
         //     this copy operator already has blockwise offset built-in
         auto blockwise_in_copy =
             BlockwiseGenericTensorSliceCopy_v1<BlockSize,
-                                               Float,
                                                decltype(in_e_n1_b_n2_global_merged_desc),
                                                decltype(in_e_n1_b_n2_block_desc),
                                                decltype(in_e_n1_b_n2_block_desc.GetLengths()),
@@ -199,13 +198,12 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             Sequence<EPerBlock, KPerBlock>{},
             Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
 
-#if 0
+#if 1
         // operator for blockwise copy of weight into LDS
         //     slice a tensor, and copy it into another tensor
         //     this copy operator already have blockwise offset built-in
         auto blockwise_wei_copy =
             BlockwiseGenericTensorSliceCopy_v1<BlockSize,
-                                               Float,
                                                decltype(wei_e_k_global_desc),
                                                decltype(wei_e_k_block_desc),
                                                decltype(wei_e_k_block_desc.GetLengths()),
@@ -324,7 +322,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
                 Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
 
-#if 0
+#if 1
                 blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
                 // blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{},
                 // True);
@@ -356,7 +354,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
             Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
 
-#if 0
+#if 1
             blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
             // blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
             p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
@@ -436,19 +434,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                     k_thread_data_on_global, 0, b_thread_data_on_global, 0);
 
-#if 0
-            ThreadwiseGenericTensorSliceCopy_v1r1<
-                decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
-                decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
-                decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths()),
-                arithmetic_sequence_gen<0, 8, 1>::type,
-                arithmetic_sequence_gen<0, 8, 1>::type,
-                7,
-                7,
-                1,
-                1>(make_zero_array<index_t, 8>(), make_zero_array<index_t, 8>())
-                .Run(p_out_thread, p_out_thread_on_global);
-#elif 0
+#if 1
             ThreadwiseGenericTensorSliceCopy_v1r2<
                 decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
                 decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
index 7473194177..b20d2fd4d4 100644
--- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -237,47 +237,23 @@ struct BlockwiseGenericTensorSliceCopy_v1
                 thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
 #endif
 
-// By position the origin of the per-thread window at the point, where multi-index
-// of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy
-// is assuming each thread is copy a noraml (not merged) tensor.
-// To satisfy this assumption, the user need to make sure that, on a merged dimension
-// that constains multiple original dimensions, the length of the last original
-// dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on
-// the merged dimension need to be 1. These sanity checks are performed in constructor
-// of BlockwiseGenericTensorSliceCopy_v1
-#if 0
-            threadwise_generic_tensor_slice_copy_v1(SrcDesc{},
-                                                    p_src + src_offset + mThreadSrcOffset,
-                                                    make_zero_array<index_t, nDim>(),
-                                                    thread_buffer_desc,
-                                                    p_buffer + buffer_offset,
-                                                    make_zero_array<index_t, nDim>(),
-                                                    thread_sub_tensor_lengths,
-                                                    SrcDimAccessOrder{},
-                                                    Number<SrcDataPerAccess>{});
-#elif 1
-            ThreadwiseGenericTensorSliceCopy_v1r1<
-                SrcDesc,
-                decltype(thread_buffer_desc),
-                SubLengths,
-                SrcDimAccessOrder,
-                typename arithmetic_sequence_gen<0, nDim, 1>::type,
-                SrcVectorAccessDim,
-                0,
-                SrcDataPerAccess,
-                1>(make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>())
+            // By position the origin of the per-thread window at the point, where multi-index
+            // of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy
+            // is assuming each thread is copy a noraml (not merged) tensor.
+            // To satisfy this assumption, the user need to make sure that, on a merged dimension
+            // that constains multiple original dimensions, the length of the last original
+            // dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on
+            // the merged dimension need to be 1. These sanity checks are performed in constructor
+            // of BlockwiseGenericTensorSliceCopy_v1
+            ThreadwiseGenericTensorSliceCopy_v1r2<SrcDesc,
+                                                  decltype(thread_buffer_desc),
+                                                  SubLengths,
+                                                  SrcDimAccessOrder,
+                                                  SrcVectorAccessDim,
+                                                  SrcDataPerAccess,
+                                                  1>(make_zero_array<index_t, nDim>(),
+                                                     make_zero_array<index_t, nDim>())
                 .Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
-#elif 1
-        ThreadwiseGenericTensorSliceCopy_v1r2<SrcDesc,
-                                              decltype(thread_buffer_desc),
-                                              SubLengths,
-                                              SrcDimAccessOrder,
-                                              SrcVectorAccessDim,
-                                              SrcDataPerAccess,
-                                              1>(make_zero_array<index_t, nDim>(),
-                                                 make_zero_array<index_t, nDim>())
-            .Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
-#endif
         });
     }
 
@@ -316,48 +292,23 @@ struct BlockwiseGenericTensorSliceCopy_v1
             const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin);
 #endif
 
-// By position the origin of the per-thread window at the point, where multi-index
-// of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy
-// is assuming each thread is copy a noraml (not merged) tensor.
-// To satisfy this assumption, the user need to make sure that, on a merged dimension
-// that constains multiple original dimensions, the length of the last original
-// dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on
-// the merged dimension need to be 1. These sanity checks are performed in constructor
-// of BlockwiseGenericTensorSliceCopy_v1
-#if 0
-            threadwise_generic_tensor_slice_copy_v1(thread_buffer_desc,
-                                                    p_buffer + buffer_offset,
-                                                    make_zero_array<index_t, nDim>(),
-                                                    DstDesc{},
-                                                    p_dst + dst_offset + mThreadDstOffset,
-                                                    make_zero_array<index_t, nDim>(),
-                                                    thread_sub_tensor_lengths,
-                                                    DstDimAccessOrder{},
-                                                    Number<DstDataPerAccess>{});
-#elif 1
-            ThreadwiseGenericTensorSliceCopy_v1r1<
-                decltype(thread_buffer_desc),
-                DstDesc,
-                SubLengths,
-                typename arithmetic_sequence_gen<0, nDim, 1>::type,
-                DstDimAccessOrder,
-                0,
-                DstVectorAccessDim,
-                1,
-                DstDataPerAccess>(make_zero_array<index_t, nDim>(),
-                                  make_zero_array<index_t, nDim>())
+            // By position the origin of the per-thread window at the point, where multi-index
+            // of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy
+            // is assuming each thread is copy a noraml (not merged) tensor.
+            // To satisfy this assumption, the user need to make sure that, on a merged dimension
+            // that constains multiple original dimensions, the length of the last original
+            // dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on
+            // the merged dimension need to be 1. These sanity checks are performed in constructor
+            // of BlockwiseGenericTensorSliceCopy_v1
+            ThreadwiseGenericTensorSliceCopy_v1r2<decltype(thread_buffer_desc),
+                                                  DstDesc,
+                                                  SubLengths,
+                                                  DstDimAccessOrder,
+                                                  DstVectorAccessDim,
+                                                  1,
+                                                  DstDataPerAccess>(
+                make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>())
                 .Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
-#elif 1
-    ThreadwiseGenericTensorSliceCopy_v1r2<decltype(thread_buffer_desc),
-                                          DstDesc,
-                                          SubLengths,
-                                          DstDimAccessOrder,
-                                          DstVectorAccessDim,
-                                          1,
-                                          DstDataPerAccess>(make_zero_array<index_t, nDim>(),
-                                                            make_zero_array<index_t, nDim>())
-        .Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
-#endif
         });
     }
 
diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
index 5354cdd61c..9ab18f4f3a 100644
--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -6,10 +6,6 @@
 #include "ConstantMergedTensorDescriptor.hpp"
 #include "tensor_coordinate.hpp"
 
-#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1
-#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
-#endif
-
 #ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #endif
@@ -24,100 +20,6 @@
 
 namespace ck {
 
-// user need to make sure alignment requirement is satisfied when setting DataPerAccesss > 1
-template <class Float,
-          class SrcDesc,
-          class DstDesc,
-          class SliceLengths,
-          class DimAccessOrder,
-          index_t DataPerAccess>
-__device__ void threadwise_generic_tensor_slice_copy_v1(
-    SrcDesc,
-    const Float* __restrict__ p_src,
-    Array<index_t, SrcDesc::GetNumOfDimension()> src_multi_id_begin,
-    DstDesc,
-    Float* __restrict__ p_dst,
-    Array<index_t, DstDesc::GetNumOfDimension()> dst_multi_id_begin,
-    SliceLengths,
-    DimAccessOrder,
-    Number<DataPerAccess>)
-{
-    constexpr index_t nDim = SrcDesc::GetNumOfDimension();
-
-    static_assert(nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
-                      nDim == SliceLengths::GetSize() && nDim == DimAccessOrder::GetSize(),
-                  "wrong! # of dimensions not the same");
-
-    static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid");
-
-    // TODO: do more sanity-check here, something like:
-    // constexpr auto src_strides_in_access_order =
-    //     SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number<nDim-1>{});
-
-    // constexpr auto dst_strides_in_access_order =
-    //     SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number<nDim-1>{});
-
-    // // check src/dst stride on the lowest access dimension
-    // static_assert((DataPerAccess == 1 || src_strides_in_access_order.Back() == 1) &&
-    //                   (DataPerAccess == 1 || dst_strides_in_access_order.Back() == 1),
-    //               "wrong! src/dst stride on the lowest access dimension needs to be 1 for "
-    //               "vectorized read/write");
-
-    constexpr auto slice_lengths_in_access_order =
-        SliceLengths::ReorderGivenNew2Old(DimAccessOrder{});
-
-    // check slice length on the lowest access dimension
-    static_assert(slice_lengths_in_access_order.Back() % DataPerAccess == 0,
-                  "wrong! slice length on the lowest access dimension should be evenly divided by "
-                  "DataPerAccess");
-
-    constexpr index_t num_access_on_lowest_access_dimension =
-        slice_lengths_in_access_order.Back() / DataPerAccess;
-
-    constexpr auto access_lengths = slice_lengths_in_access_order.Modify(
-        Number<nDim - 1>{}, Number<num_access_on_lowest_access_dimension>{});
-
-    using vector_t = typename vector_type<Float, DataPerAccess>::MemoryType;
-
-#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1
-    static_ford<decltype(access_lengths)>{}([&](auto access_multi_id) {
-        constexpr index_t itmp = access_multi_id.Back() * DataPerAccess;
-
-        constexpr auto data_multi_id_in_access_order =
-            access_multi_id.Modify(Number<nDim - 1>{}, Number<itmp>{});
-
-        constexpr auto data_multi_id =
-            data_multi_id_in_access_order.ReorderGivenOld2New(DimAccessOrder{});
-
-        const index_t src_index =
-            SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
-
-        const index_t dst_index =
-            DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
-
-        *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
-            *reinterpret_cast<const vector_t*>(&p_src[src_index]);
-    });
-#else
-    ford<decltype(access_lengths)>{}([&](auto access_multi_id) {
-        auto data_multi_id_in_access_order      = access_multi_id;
-        data_multi_id_in_access_order(nDim - 1) = access_multi_id[nDim - 1] * DataPerAccess;
-
-        const auto data_multi_id =
-            reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
-
-        const index_t src_index =
-            SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
-
-        const index_t dst_index =
-            DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
-
-        *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
-            *reinterpret_cast<const vector_t*>(&p_src[src_index]);
-    });
-#endif
-}
-
 // This threadwise copy allow vector access of src and dst.
 // It allows the dimensions of vector access to be different on src and dst.
 // It also allows the vector size to be different on src and dst.
diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp
index ea9f29b2e5..2f9a2c3e76 100644
--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp
@@ -379,7 +379,7 @@ int main(int argc, char* argv[])
 #elif 0
     device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
         (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
+#elif 1
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                          in_nchw,
                                                          wei_kcyx_desc,