From 14315b72f33deeab492278476a88103ae78ff3cb Mon Sep 17 00:00:00 2001
From: Chao Liu <lc.roy86@gmail.com>
Date: Fri, 27 Sep 2019 15:24:27 -0500
Subject: [PATCH] tweaking

---
 ...chw_kcyx_nkhw_padded_lds_double_buffer.hpp |  2 +-
 .../tensor_description/tensor_coordinate.hpp  | 22 ++++++++++--
 .../tensor_coordinate_deprecated.hpp          |  4 +--
 .../threadwise_generic_tensor_slice_copy.hpp  | 34 ++++++++++++++++---
 driver/src/driver.cpp                         | 18 +++++-----
 5 files changed, 60 insertions(+), 20 deletions(-)

diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
index faf8764507..1fc948bfe8 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
@@ -426,7 +426,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
                                                       0,
                                                       b_thread_data_on_global,
                                                       0})
-#if 1
+#if 0
                 .template Run<Float, Float, address_space_t::generic, address_space_t::global>
 #else // tweaking
                 .template Run_optimized_dst_address_calculation<Float,
diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp
index 5114b2ce99..dfcce194d1 100644
--- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp
+++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp
@@ -78,6 +78,11 @@ struct NativeTensorCoordinate
         return coord;
     }
 
+    __host__ __device__ static constexpr index_t CalculateOffsetDiff(const Index& idx_diff)
+    {
+        return tensor_desc_type::CalculateOffsetDiff(idx_diff);
+    }
+
     __host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; }
 
     private:
@@ -170,7 +175,18 @@ struct TransformedTensorCoordinate
         return coord_up;
     }
 
-    // this function should be inexpensive, because there is no upper-to-lower index transformation
+    // Calculate offset diff without updating tensor-coordinate
+    // If idx_up_diff is know at compile time, and has only non-zero entries on linear dimensions,
+    //   then all calculation can be done at compile-time.
+    __host__ __device__ constexpr index_t CalculateOffsetDiff(const UpperIndex& idx_up_diff) const
+    {
+        // For transformation of multi-index difference, not all transformation functions need to
+        //   know the old lower-index or the old upper-index. We pass both of them to the
+        //   transformation function. The transformation function itself decides to use them or not.
+        return GetLowerCoordinate().CalculateOffsetDiff(tensor_desc_type::CalculateLowerIndexDiff(
+            idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex()));
+    }
+
     __host__ __device__ constexpr bool IsUpperIndexMappedToValidOffset() const
     {
         return tensor_desc_type::IsUpperIndexMappedToValidLowerIndex(GetIndex()) &&
@@ -193,7 +209,7 @@ struct TensorCoordinate
     private:
     template <typename... Ts>
     __host__ __device__ static constexpr auto
-        MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
+    MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
     {
         return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
             make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
@@ -201,7 +217,7 @@ struct TensorCoordinate
 
     template <typename... Ts>
     __host__ __device__ static constexpr auto
-        MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
+    MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
     {
         return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
             make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp
index 46e551ddd4..db48b1a906 100644
--- a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp
+++ b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp
@@ -326,14 +326,14 @@ struct TensorCoordinate_deprecated
     private:
     template <class... Ts>
     __host__ __device__ static constexpr auto
-        MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
+    MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
     {
         return NormalTensorCoordinate_deprecated<ConstantTensorDescriptor<Ts...>>();
     }
 
     template <class... Ts>
     __host__ __device__ static constexpr auto
-        MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
+    MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
     {
         return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
     }
diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
index ae50d65c3e..c21cf737c1 100644
--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -226,6 +226,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
         constexpr auto src_linear_dim_mask    = SrcDesc::GetLinearDimensionMask();
         constexpr auto src_nonlinear_dim_mask = SrcDesc::GetNonLinearDimensionMask();
 
+#if 0 // debug
+        if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
+        {
+            print_sequence("src_linear_dim_mask", src_linear_dim_mask);
+            print_sequence("src_nonlinear_dim_mask", src_nonlinear_dim_mask);
+        }
+#endif
+
         static_assert(src_linear_dim_mask.At(VectorAccessDim) ||
                           long_vector_size == SrcDataPerAccess,
                       "Warning! VectorAccessDim is not SrcDesc's linear dimension, performance "
@@ -292,9 +300,13 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                     // TODO: is this good implementation?
                     const index_t src_linear_offset =
                         src_coord.GetOffset() - src_nonlinear_coord.GetOffset();
-#else
+#elif 0
                     const index_t src_linear_offset =
-                        SrcDesc::CalculateOffset(linear_dim_data_steps + scalar_id);
+                        SrcDesc::CalculateOffset(linear_dim_data_steps + scalar_id) -
+                        SrcDesc::CalculateOffset(make_zero_array<index_t, nDim>());
+#elif 1
+                    const index_t src_linear_offset =
+                        src_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
 #endif
 
                     // Check src vector's padding situation, only check the first data in
@@ -384,6 +396,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
         constexpr auto dst_linear_dim_mask    = DstDesc::GetLinearDimensionMask();
         constexpr auto dst_nonlinear_dim_mask = DstDesc::GetNonLinearDimensionMask();
 
+#if 0 // debug
+        if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
+        {
+            print_sequence("dst_linear_dim_mask", dst_linear_dim_mask);
+            print_sequence("dst_nonlinear_dim_mask", dst_nonlinear_dim_mask);
+        }
+#endif
+
         static_assert(dst_linear_dim_mask.At(VectorAccessDim) ||
                           long_vector_size == DstDataPerAccess,
                       "Warning! VectorAccessDim is not DstDesc's linear dimension, performance "
@@ -477,13 +497,17 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                         dst_nonlinear_coord + (linear_dim_data_steps + scalar_id);
 
 // this is dst compile-time offset
-#if 1
+#if 0
                     // TODO: is this good implementation?
                     const index_t dst_linear_offset =
                         dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset();
-#else
+#elif 0
                     const index_t dst_linear_offset =
-                        DstDesc::CalculateOffset(linear_dim_data_steps + scalar_id);
+                        DstDesc::CalculateOffset(linear_dim_data_steps + scalar_id) -
+                        DstDesc::CalculateOffset(make_zero_array<index_t, nDim>());
+#elif 1
+                    const index_t dst_linear_offset =
+                        dst_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
 #endif
 
                     // Check dst vector's padding situation, only check the first data in
diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp
index 9fe0fb5dbc..85298074b7 100644
--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp
@@ -74,20 +74,20 @@ int main(int argc, char* argv[])
 {
     using namespace ck;
 
-#if 0
-    constexpr index_t N  = 64;
-    constexpr index_t C  = 256;
-    constexpr index_t HI = 56;
-    constexpr index_t WI = 56;
-    constexpr index_t K  = 256;
+#if 1
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 128;
+    constexpr index_t HI = 17;
+    constexpr index_t WI = 17;
+    constexpr index_t K  = 128;
     constexpr index_t Y  = 1;
-    constexpr index_t X  = 1;
+    constexpr index_t X  = 7;
 
     using ConvStrides   = Sequence<1, 1>;
     using ConvDilations = Sequence<1, 1>;
 
-    using LeftPads  = Sequence<0, 0>;
-    using RightPads = Sequence<0, 0>;
+    using LeftPads  = Sequence<0, 3>;
+    using RightPads = Sequence<0, 3>;
 #elif 0
     // 3x3, 34x34
     constexpr index_t N  = 64;