From 5636576f9b297b6645677aae16d02a2625a8ff01 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 7 Aug 2019 18:27:10 -0500 Subject: [PATCH] bug fix in ford, forgot to reorder lengths --- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 2 +- .../blockwise_generic_tensor_slice_copy.hpp | 13 ++++----- .../threadwise_generic_tensor_slice_copy.hpp | 27 +++++++------------ .../include/utility/config_amd.hpp.in | 1 + .../include/utility/config_nvidia.hpp.in | 1 + .../include/utility/functional3.hpp | 6 +++-- ...tion_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp | 2 +- 7 files changed, 24 insertions(+), 28 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index 555e68e594..938d203755 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -470,7 +470,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer 7, 1, 1>(make_zero_array(), make_zero_array()) - .Run_non_static(p_out_thread, p_out_thread_on_global); + .Run(p_out_thread, p_out_thread_on_global); #elif 0 ThreadwiseGenericTensorSliceCopy_v2< decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc), diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index d2161cb4ff..8b82c3978b 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -276,7 +276,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 SrcDataPerAccess, 1>(make_zero_array(), 
make_zero_array()) - .Run_non_static(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset); + .Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset); #endif }); } @@ -318,10 +318,11 @@ struct BlockwiseGenericTensorSliceCopy_v1 // By position the origin of the per-thread window at the point, where multi-index // of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy // is assuming each thread is copy a noraml (not merged) tensor. -// User need to guarantee this is true. -// By setting SubLengths = 1 at the merged dimension, this is always true; -// If in the future, you want to enable SubLengths > 1 at the merged dimension, -// special care in implementation is needed +// To satisfy this assumption, the user needs to make sure that, on a merged dimension +// that contains multiple original dimensions, the length of the last original +// dimension needs to be evenly divisible by its sub-lengths. Also, the repeat-length on +// the merged dimension needs to be 1. 
These sanity checks are performed in constructor +// of BlockwiseGenericTensorSliceCopy_v1 #if 0 threadwise_generic_tensor_slice_copy_v1(thread_buffer_desc, p_buffer + buffer_offset, @@ -354,7 +355,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 1, DstDataPerAccess>(make_zero_array(), make_zero_array()) - .Run_non_static(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset); + .Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset); #endif }); } diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 31de9f2fa0..3fd9b0d37a 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -10,6 +10,10 @@ #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0 #endif +#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 +#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 +#endif + namespace ck { // user need to make sure alignment requirement is satisfied when setting DataPerAccesss > 1 @@ -369,8 +373,10 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2 constexpr auto long_vector_access_lengths = SliceLengths::Modify( vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size); +#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 static_ford{}([&]( auto long_vector_access_id) { + // data id w.r.t slicing-window constexpr auto long_vector_data_begin_id = long_vector_access_id.Modify( vector_access_dim, long_vector_access_id[vector_access_dim] * long_vector_size); @@ -406,26 +412,10 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2 *reinterpret_cast(&p_long_vector[buffer_offset]); }); }); - } - - template - __device__ void 
Run_non_static(const TData* p_src, TData* p_dst) const - { - using src_vector_t = typename vector_type::MemoryType; - using dst_vector_t = typename vector_type::MemoryType; - - constexpr auto vector_access_dim = Number{}; - - constexpr auto src_data_per_access = Number{}; - constexpr auto dst_data_per_access = Number{}; - - constexpr auto long_vector_size = Number{}; - - constexpr auto long_vector_access_lengths = SliceLengths::Modify( - vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size); - +#else ford{}( [&](auto long_vector_access_id) { + // data id w.r.t slicing-window auto long_vector_data_begin_id = long_vector_access_id; long_vector_data_begin_id(vector_access_dim) = @@ -464,6 +454,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2 *reinterpret_cast(&p_long_vector[buffer_offset]); } }); +#endif } private: diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in index e11ac1b70e..c0484a1f18 100644 --- a/composable_kernel/include/utility/config_amd.hpp.in +++ b/composable_kernel/include/utility/config_amd.hpp.in @@ -8,6 +8,7 @@ #define CK_USE_AMD_INLINE_ASM 1 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0 +#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 namespace ck { diff --git a/composable_kernel/include/utility/config_nvidia.hpp.in b/composable_kernel/include/utility/config_nvidia.hpp.in index 673c2778b1..2287868410 100644 --- a/composable_kernel/include/utility/config_nvidia.hpp.in +++ b/composable_kernel/include/utility/config_nvidia.hpp.in @@ -10,6 +10,7 @@ #define CK_USE_AMD_INLINE_ASM 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0 +#define 
CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 namespace ck { diff --git a/composable_kernel/include/utility/functional3.hpp b/composable_kernel/include/utility/functional3.hpp index 7bd2b8d0cd..f1c21d7f59 100644 --- a/composable_kernel/include/utility/functional3.hpp +++ b/composable_kernel/include/utility/functional3.hpp @@ -135,9 +135,11 @@ struct ford template __host__ __device__ constexpr void operator()(F f) const { - for(index_t i = 0; i < Lengths::Front(); ++i) + constexpr auto ordered_lengths = Lengths::ReorderGivenNew2Old(Orders{}); + + for(index_t i = 0; i < ordered_lengths.Front(); ++i) { - ford_impl{}(f, Array{i}); + ford_impl{}(f, Array{i}); } } }; diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp index f5efc83fdf..58827649a8 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp @@ -59,7 +59,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, constexpr index_t B = (N * Ho * Wo) / (N1 * N2); -#if 0 +#if 1 // each thread hold 64 data constexpr index_t BlockSize = 256;