From 2a48812edb1a7c3e280159637fa89b7a0bbfb86b Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Tue, 21 May 2019 16:43:56 -0500
Subject: [PATCH] behavior has changed (better and worse), figuring out why

---
 ...e_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp |  4 ++--
 src/include/ConstantTensorDescriptor.hip.hpp         |  7 +++----
 src/include/blockwise_tensor_slice_op.hip.hpp        | 12 ++++++++----
 ...emm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp |  4 ++--
 ...emm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp |  2 +-
 5 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
index 6a005c5dd6..7ab9fd7c75 100644
--- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
+++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
@@ -57,7 +57,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
     wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
 
-#if 1
+#if 0
     // for 3x3, 34x34, v1r3, Pascal
     constexpr index_t BlockSize = 128;
 
@@ -162,7 +162,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
     constexpr index_t WeiBlockCopyDataPerRead_K = 4;
     constexpr index_t OutThreadCopyDataPerWrite_W = 2;
 
-#elif 0
+#elif 1
     // for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 8
     constexpr index_t BlockSize = 256;
 
diff --git a/src/include/ConstantTensorDescriptor.hip.hpp b/src/include/ConstantTensorDescriptor.hip.hpp
index 880ea5038f..ff422de6e5 100644
--- a/src/include/ConstantTensorDescriptor.hip.hpp
+++ b/src/include/ConstantTensorDescriptor.hip.hpp
@@ -286,10 +286,9 @@ struct ConstantTensorDescriptor
                           "wrong! dimensions to be unfolded need to be packed");
 
             // checkt ranks
-            static_assert(GetMemoryRank(IDim_p1) = GetMemoryRank(IDim) + 1,
-                          "wrong! ranks of dimensions to be "
-                          "unfolded need to be in increasing "
-                          "and continuous ranks");
+            static_assert(GetMemoryRank(IDim_p1) == GetMemoryRank(IDim) + 1,
+                          "wrong! ranks of dimensions to be unfolded need to be in increasing and "
+                          "continuous ranks");
         });
 
         // left and right
diff --git a/src/include/blockwise_tensor_slice_op.hip.hpp b/src/include/blockwise_tensor_slice_op.hip.hpp
index 6a32754a28..0285528648 100644
--- a/src/include/blockwise_tensor_slice_op.hip.hpp
+++ b/src/include/blockwise_tensor_slice_op.hip.hpp
@@ -39,7 +39,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
         constexpr auto thread_cluster_lengths =
             src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster);
 
-        constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(thread_cluster_lengths);
+        constexpr auto thread_cluster_desc =
+            make_packed_ConstantTensorDescriptor(thread_cluster_lengths);
 
         // sanity check: data type
         static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");
@@ -147,7 +148,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
 
         constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
 
-        constexpr auto thread_tensor_desc = make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
+        constexpr auto thread_tensor_desc =
+            make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
 
         return thread_tensor_desc.GetElementSpace();
     }
@@ -167,7 +169,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
 
         constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
 
-        constexpr auto thread_tensor_desc = make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
+        constexpr auto thread_tensor_desc =
+            make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
 
         static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
             constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
@@ -204,7 +207,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
 
         constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
 
-        constexpr auto thread_tensor_desc = make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
+        constexpr auto thread_tensor_desc =
+            make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
 
         static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
             constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp
index 7e1f08c42f..f721b92af1 100644
--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp
@@ -362,8 +362,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
         const index_t n_thread_data_begin = c_thread_mtx_begin.col % NPerBlock;
 
         static_if{}([&](auto fwd) { // fwd do nothing but
-            // perfect forwarding. 
-            // Using this trick to 
+            // perfect forwarding.
+            // Using this trick to
             // make this lambda a generic lambda, so it won't be compiled until
             // instantiated
             static_assert(
diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
index b8689b9257..8549f30a01 100644
--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
@@ -196,7 +196,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
 
         // choose GEMM implementation here
         const auto run_blockwise_batch_gemm = [&](auto... Xs) {
-#if 1
+#if 0
             return blockwise_batch_gemm.Run(Xs...);
 #elif 0
             return blockwise_batch_gemm.Run_asm(Xs...);