From a9a392b44dd53a4c29561b601c4ac194880ab45a Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Thu, 1 Aug 2019 15:32:40 -0500
Subject: [PATCH] experimenting with TensorCoordinate and new merged tensor
 copy operator

---
 ...tion_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp |  45 ++++++
 ...tion_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp |  34 -----
 .../tensor_description/tensor_coordinate.hpp  | 142 ++++++------------
 ...tion_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp |   2 +-
 ...tion_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp | 104 ++++++-----
 driver/src/driver.cpp                         |   2 +-
 6 files changed, 158 insertions(+), 171 deletions(-)

diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
index fecd7c5ca1..46230c8cb7 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
@@ -155,6 +155,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
         static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                       "GemmDataPerReadB alignment requirement is not satisfied");
 
+#if 0 // debug
         // input blockwise copy
         // slice a merged tensor, reorder and copy to a normal tensor
         // this copy operator already has blockwise offset built-in
@@ -172,6 +173,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
             InBlockCopySrcDataPerRead_B,
             InBlockCopyDstDataPerWrite_N2>(
             {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
+#else
+        auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v2<
+            BlockSize,
+            Float,
+            decltype(in_e_n1_b_n2_global_merged_desc),
+            decltype(in_e_n1_b_n2_block_desc),
+            MergedTensorCoordinate,
+            NormalTensorCoordinate,
+            decltype(in_e_n1_b_n2_block_desc.GetLengths()),
+            InBlockCopySubLengths_E_N1_B_N2,
+            InBlockCopyClusterLengths_E_N1_B_N2,
+            InBlockCopyThreadClusterArrangeOrder>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
+#endif
 
         // weight tensor
         //   tensor descriptor in device memory, src of blockwise copy
@@ -184,6 +198,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
             Sequence{}, Number{});
 
+#if 0 // debug
         // operator for blockwise copy of weight into LDS
         // slice a tensor, and copy it into another tensor
        // this copy operator already has blockwise offset built-in
@@ -201,6 +216,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
             WeiBlockCopySrcDataPerRead_E,
             WeiBlockCopyDstDataPerWrite_K>(
             {0, k_block_data_on_global}, {0, 0});
+#else
+        auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v2<
+            BlockSize,
+            Float,
+            decltype(wei_e_k_global_desc),
+            decltype(wei_e_k_block_desc),
+            NormalTensorCoordinate,
+            NormalTensorCoordinate,
+            decltype(wei_e_k_block_desc.GetLengths()),
+            WeiBlockCopySubLengths_E_K,
+            WeiBlockCopyClusterLengths_E_K,
+            WeiBlockCopyThreadClusterArrangeOrder>({0, k_block_data_on_global}, {0, 0});
+#endif
 
         // GEMM definition
         //   c_mtx += transpose(a_mtx) * b_mtx
@@ -277,8 +305,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
 
             __syncthreads();
 
+#if 0
             blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
             blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
+#else
+            blockwise_in_copy.MoveSrcSlicingWindow({EPerBlock, 0, 0, 0}, true);
+            blockwise_wei_copy.MoveSrcSlicingWindow({EPerBlock, 0}, true);
+#endif
         }
 
         // copy output: register to global memory
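The hunk above replaces the old per-dimension interface (MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True)) with a single multi-index step plus a direction flag. A minimal host-side sketch of that semantics, using a simplified stand-in coordinate (illustration only; SimpleCoordinate and its made-up strides are not part of this patch):

// illustration only: simplified stand-in for the coordinate kept by a copy
// operator; the real implementation is templated over a tensor descriptor
#include <array>
#include <cstddef>

using index_t = std::size_t;

template <index_t NDim>
struct SimpleCoordinate
{
    std::array<index_t, NDim> strides;
    index_t offset = 0;

    // move the slicing window by one multi-index step;
    // mirrors MoveSrcSlicingWindow({EPerBlock, 0, 0, 0}, true)
    void Move(const std::array<index_t, NDim>& step, bool positive_direction)
    {
        index_t diff = 0;
        for(index_t i = 0; i < NDim; ++i)
            diff += step[i] * strides[i];

        if(positive_direction)
            offset += diff;
        else
            offset -= diff;
    }
};

int main()
{
    // a 4-d [E, N1, B, N2] view with known strides (made-up numbers)
    SimpleCoordinate<4> window{{1024, 256, 2, 1}, 0};
    window.Move({8, 0, 0, 0}, true); // advance E by EPerBlock == 8
    return window.offset == 8 * 1024 ? 0 : 1;
}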
@@ -328,6 +361,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
             out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                 k_thread_data_on_global, 0, b_thread_data_on_global, 0);
 
+#if 0 // debug
         threadwise_generic_tensor_slice_copy_v1(
             out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
             p_out_thread,
@@ -338,6 +372,17 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
             out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
             arithmetic_sequence_gen<0, 8, 1>::type{},
             Number<1>{});
+#else
+        ThreadwiseGenericTensorSliceCopy_v2<
+            Float,
+            decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
+            decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
+            NormalTensorCoordinate,
+            MergedTensorCoordinate,
+            decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths())>(
+            {0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
+            .Run(p_out_thread, p_out_thread_on_global);
+#endif
     }
 }
 };
diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
index 2fd7c9c6bd..918f71073d 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
@@ -301,40 +301,6 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
                                      b_thread_data_on_global % B1});
 
         threadwise_out_copy.Run(p_out_thread, p_out_thread_on_global);
-#elif 0
-        // This is a hack, because slicing a merged dimension is not supported yet.
-        // This should be replaced with logic above, once slicing a merged dimension support
-        // become available
-
-        // dst descriptor
-        constexpr auto out_k0_k1_b_global_desc =
-            make_ConstantMergedTensorDescriptor(out_n_k_h_w_global_desc.Fold(I1, Number<K1>{}),
-                                                Sequence<1>{},
-                                                Sequence<2>{},
-                                                Sequence<0, 3, 4>{});
-
-        // src descriptor
-        constexpr auto out_k0_k1_b_thread_desc = make_ConstantTensorDescriptor_packed(
-            Sequence{});
-
-        auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2<
-            Float,
-            decltype(out_k0_k1_b_thread_desc),
-            decltype(out_k0_k1_b_global_desc),
-            NormalTensorCoordinate,
-            MergedTensorCoordinate,
-            Sequence>(
-            {0, 0, 0},
-            {k_thread_data_on_global / K1,
-             k_thread_data_on_global % K1,
-             b_thread_data_on_global});
-
-        for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
-        {
-            threadwise_out_copy.Run(p_out_thread, p_out_global);
-
-            threadwise_out_copy.MoveSrcSlicingWindow({0, 0, GemmNPerThreadSubC}, true);
-            threadwise_out_copy.MoveDstSlicingWindow({0, 0, B1}, true);
-        }
 #elif 1
         // This is a hack, because slicing a merged dimension is not supported yet.
         // This should be replaced with logic above, once slicing a merged dimension support
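The deleted #elif 0 fallback above addressed the output through a folded K = K0 * K1 view, splitting the flat K index as (k / K1, k % K1). A self-contained sketch of that index arithmetic (fold() and its constants are hypothetical, for illustration only):

#include <cassert>

using index_t = unsigned;

// split a flat index k into (k0, k1) for a K = K0 * K1 folded view, as in
// {k_thread_data_on_global / K1, k_thread_data_on_global % K1, ...}
struct FoldedIndex
{
    index_t k0;
    index_t k1;
};

constexpr FoldedIndex fold(index_t k, index_t K1) { return {k / K1, k % K1}; }

int main()
{
    constexpr index_t K1 = 4;
    constexpr auto f     = fold(14, K1);
    static_assert(f.k0 == 3 && f.k1 == 2, "14 == 3 * 4 + 2");
    assert(f.k0 * K1 + f.k1 == 14); // the fold is invertible
    return 0;
}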
diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp
index 98de85bf6a..709beef171 100644
--- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp
+++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp
@@ -16,7 +16,7 @@ struct NormalTensorCoordinate
     static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
 
     __host__ __device__ constexpr NormalTensorCoordinate(Array<index_t, nDim> tensor_index)
-        : mIndex{tensor_index}, mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)}
+        : mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)}
     {
     }
 
@@ -26,38 +26,15 @@ struct NormalTensorCoordinate
     {
     }
 
-    __host__ __device__ constexpr Array<index_t, nDim> GetIndex() const { return mIndex; }
-
     __host__ __device__ constexpr index_t GetOffset() const { return mOffset; }
 
-    template <class IDim, bool PositiveDirection>
-    __host__ __device__ void
-    MoveOnDimension(IDim idim, index_t step_size, integral_constant<bool, PositiveDirection>)
-    {
-        if(PositiveDirection)
-        {
-            mIndex(idim) += step_size;
-            mOffset += step_size * tensor_desc_type::GetStride(idim);
-        }
-        else
-        {
-            mIndex(idim) -= step_size;
-            mOffset -= step_size * tensor_desc_type::GetStride(idim);
-        }
-    }
-
     // T is Array or Sequence
     template <class T>
     __host__ __device__ type operator+=(T step_sizes)
     {
-#if 0
-        static_assert(is_same, "wrong!");
-#endif
-        static_assert(T::GetSize() == nDim, "wrong!");
+        static_assert(is_same{} && T::GetSize() == nDim, "wrong!");
 
-        static_for<0, nDim, 1>{}([&](auto idim) {
-            this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, true>{});
-        });
+        mOffset += tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);
 
         return *this;
     }
@@ -65,14 +42,9 @@ struct NormalTensorCoordinate
     template <class T>
     __host__ __device__ type operator-=(T step_sizes)
    {
-#if 0
-        static_assert(is_same, "wrong!");
-#endif
-        static_assert(T::GetSize() == nDim, "wrong!");
+        static_assert(is_same{} && T::GetSize() == nDim, "wrong!");
 
-        static_for<0, nDim, 1>{}([&](auto idim) {
-            this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, false>{});
-        });
+        mOffset -= tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);
 
         return *this;
     }
@@ -93,19 +65,25 @@ struct NormalTensorCoordinate
         return coord;
     }
 
-    // reposition point of origin, and return compensated offset
+    // reposition point of origin, and return compensated offset.
+    // This is a hack to reduce index calculation while looping over
+    // a tensor whose origin is this TensorCoordinate. It does so by spitting
+    // out the run-time offset to the pointer (to the tensor data) held by this
+    // TensorCoordinate, so the caller can add the offset into the run-time pointer
+    // to the data; that way only 1 run-time variable (the updated pointer) is
+    // needed, instead of 2 run-time variables (the old pointer and this offset).
+    // TODO: after introducing the concept of a "run-time tensor view", which contains
+    // the run-time pointer to the data, always keep track of the pointer, instead of
+    // both the offset and the pointer. This also brings the additional benefit that
+    // we don't need to worry that the offset might underflow (because the offset is
+    // an unsigned integer) when updating it.
     __host__ __device__ constexpr index_t RepositionOrigin()
     {
         index_t offset_diff = mOffset;
-
-        mIndex = make_zero_array<index_t, nDim>();
-        mOffset = 0;
-
+        mOffset             = 0;
         return offset_diff;
     }
 
-    // private:
-    Array<index_t, nDim> mIndex;
+    private:
     index_t mOffset;
 };
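The comment above describes the intended calling pattern: the coordinate spits out its accumulated offset once, the caller folds it into the data pointer, and from then on only the pointer is updated. A minimal sketch under that reading (Coordinate here is a simplified stand-in, not the class in this patch):

#include <cstddef>

using index_t = std::size_t;

// simplified stand-in for NormalTensorCoordinate after this patch:
// it keeps only a run-time offset
struct Coordinate
{
    index_t offset;

    // hand the accumulated offset to the caller and reset to zero,
    // so the caller can fold it into the data pointer
    index_t RepositionOrigin()
    {
        index_t diff = offset;
        offset       = 0;
        return diff;
    }
};

int main()
{
    float data[16] = {};
    float* p       = data;

    Coordinate coord{5};
    p += coord.RepositionOrigin(); // one run-time variable (p) instead of two (p, offset)

    *p = 1.0f; // writes the same element as data[5]
    return data[5] == 1.0f ? 0 : 1;
}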
@@ -120,8 +98,7 @@ struct MergedTensorCoordinate
         tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension();
 
     __host__ __device__ constexpr MergedTensorCoordinate(Array<index_t, nDim> tensor_index)
-        : mIndex{tensor_index},
-          mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)}
+        : mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)}
     {
         // partial offset on each dimension
         static_for<0, nDim, 1>{}([&](auto idim) {
@@ -146,8 +123,6 @@ struct MergedTensorCoordinate
     {
     }
 
-    __host__ __device__ constexpr Array<index_t, nDim> GetIndex() const { return mIndex; }
-
     __host__ __device__ constexpr index_t GetOffset() const { return mOffset; }
 
     // step_size should be known at compile time
@@ -157,17 +132,7 @@ struct MergedTensorCoordinate
     {
         constexpr auto idim = IDim{};
 
-        // update multi-index
-        if(PositiveDirection)
-        {
-            mIndex(idim) += step_size;
-        }
-        else
-        {
-            mIndex(idim) -= step_size;
-        }
-
-        // update rest
+        // update original index
         static_if{}([&](auto) {
             constexpr auto partial_original_dims =
                 tensor_desc_type::GetContainedOriginalDimensions(idim);
@@ -253,19 +218,10 @@ struct MergedTensorCoordinate
 
             // update "mThreadSrcOffset", do "+" before "-" to avoid underflow
             mOffset = (mOffset + mPartialOffsets[idim]) - old_partial_offset;
-        }).Else([&](auto) {
-            constexpr auto idim_original =
-                tensor_desc_type::GetContainedOriginalDimensions(idim).Front();
-
-            static_if{}([&](auto fwd) {
-                mOriginalIndex(idim_original) += step_size;
-                mPartialOffsets(idim) += step_size * fwd(tensor_desc_type{}).GetStride(idim);
+        }).Else([&](auto fwd) {
+            static_if{}([&](auto) {
                 mOffset += step_size * fwd(tensor_desc_type{}).GetStride(idim);
-            }).Else([&](auto fwd) {
-                mOriginalIndex(idim_original) -= step_size;
-                mPartialOffsets(idim) -= step_size * fwd(tensor_desc_type{}).GetStride(idim);
-                mOffset -= step_size * fwd(tensor_desc_type{}).GetStride(idim);
-            });
+            }).Else([&](auto) { mOffset -= step_size * fwd(tensor_desc_type{}).GetStride(idim); });
         });
     }
 
@@ -273,10 +229,9 @@ struct MergedTensorCoordinate
     template <class T>
     __host__ __device__ type operator+=(T step_sizes)
     {
-#if 0
-        static_assert(is_same, "wrong!");
-#endif
-        static_assert(T::GetSize() == nDim, "wrong!");
+        static_assert(is_same{} && T::GetSize() == nDim, "wrong!");
+
+        index_t normal_offset_diff = 0;
 
         static_for<0, nDim, 1>{}([&](auto idim) {
             this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, true>{});
@@ -288,10 +243,7 @@ struct MergedTensorCoordinate
     template <class T>
     __host__ __device__ type operator-=(T step_sizes)
     {
-#if 0
-        static_assert(is_same, "wrong!");
-#endif
-        static_assert(T::GetSize() == nDim, "wrong!");
+        static_assert(is_same{} && T::GetSize() == nDim, "wrong!");
 
         static_for<0, nDim, 1>{}([&](auto idim) {
             this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, false>{});
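MoveOnDimension above keeps, for each merged dimension, the original multi-index plus a per-dimension partial offset, and combines them into the total offset with "+" before "-" so an unsigned offset cannot underflow. A compact stand-in model of that bookkeeping for one merged dimension over two original dimensions (all types and constants here are made up for illustration):

#include <array>
#include <cstddef>

using index_t = std::size_t;

// illustration: one merged dimension covering original dims (h, w) of an
// H x W tensor; lengths/strides are made-up compile-time constants
constexpr index_t H = 4, W = 6;
constexpr index_t stride_h = W, stride_w = 1;

struct MergedCoordinate
{
    std::array<index_t, 2> original_index; // (h, w)
    index_t partial_offset;                // offset contributed by this merged dim
    index_t offset;                        // complete offset

    void MoveOnMergedDim(index_t step)
    {
        // carry-propagating update of the original multi-index
        index_t flat   = original_index[0] * W + original_index[1] + step;
        original_index = {flat / W, flat % W};

        index_t old_partial = partial_offset;
        partial_offset = original_index[0] * stride_h + original_index[1] * stride_w;

        // do "+" before "-" so an unsigned offset cannot underflow
        offset = (offset + partial_offset) - old_partial;
    }
};

int main()
{
    MergedCoordinate c{{0, 4}, 4, 4};
    c.MoveOnMergedDim(3); // crosses the w boundary: (0, 4) -> (1, 1)
    return (c.original_index[0] == 1 && c.offset == stride_h + 1) ? 0 : 1;
}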
@@ -316,33 +268,23 @@ struct MergedTensorCoordinate
         return coord;
     }
 
-    // reposition point of origin, and return compensated offset
-    __host__ __device__ constexpr index_t RepositionOrigin()
-    {
-        index_t offset_diff = 0;
+    __host__ __device__ static constexpr index_t RepositionOrigin() { return 0; }
 
-        static_for<0, nDim, 1>{}([&](auto idim_) {
-            constexpr auto idim = decltype(idim_){};
-
-            static_if{}([&](auto) {
-                constexpr auto idim_original =
-                    tensor_desc_type::GetContainedOriginalDimensions(idim).Front();
-
-                mIndex(idim) = 0;
-                mOriginalIndex(idim_original) = 0;
-                mOffset -= mPartialOffsets[idim];
-                offset_diff += mPartialOffsets[idim];
-                mPartialOffsets(idim) = 0;
-            });
-        });
-
-        return offset_diff;
-    }
-
-    // private:
-    Array<index_t, nDim> mIndex;
+    private:
+    // Allocate register memory for all merged dimensions and normal dimensions.
+    // However, only those merged dimensions whose index will be involved in arithmetic
+    // after the construction of this TensorCoordinate (e.g. when the user moves a
+    // slicing window on the merged dimension) will actually use these registers.
+    // Let's hope the compiler will optimize away the registers allocated for normal
+    // dimensions, and for those merged dimensions that are never involved in index
+    // arithmetic after construction of the TensorCoordinate.
+    // TODO: refactor TensorCoordinate, after introducing the concept of "dimensions",
+    // and simplify the implementation of ConstantMergedTensorDescriptor, so we don't
+    // need to count on the compiler to optimize away those registers for us
     Array<index_t, nOriginalDim> mOriginalIndex;
-    Array<index_t, nDim> mPartialOffsets; // mPartialOffsets is needed for unsigned index type
+    Array<index_t, nDim> mPartialOffsets;
+
+    // complete offset
     index_t mOffset;
 };
diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
index 67395b978d..f79b17ae6d 100644
--- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
@@ -139,7 +139,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     for(index_t i = 0; i < nrepeat; ++i)
     {
         constexpr auto gridwise_conv =
-#if 0
+#if 1
             GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
 #else
             GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
diff --git a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
index 1788af0cf1..f6e9560385 100644
--- a/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
@@ -85,6 +85,40 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
     using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]
 
+    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 1;
+    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+#elif 1
+    // 1x1 filter, 8x8 image
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t BPerBlock = 128;
+    constexpr index_t KPerBlock = 128;
+    constexpr index_t EPerBlock = 8;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockCopySubLengths_E_B            = Sequence<2, 2>;
+    using InBlockCopyClusterLengths_E_B        = Sequence<4, 64>;
+    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
+    using InBlockCopySrcAccessOrder            = Sequence<0, 1>; // [E, B]
+    using InBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, B]
+
+    constexpr index_t InBlockCopyDataPerAccess_B = 1;
+
+    using WeiBlockCopySubLengths_E_K            = Sequence<4, 1>;
+    using WeiBlockCopyClusterLengths_E_K        = Sequence<2, 128>;
+    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]
+
     constexpr index_t WeiBlockCopySrcDataPerRead_E  = 1;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
 #endif
@@ -96,43 +130,43 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
 
     printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
 
+    constexpr auto gridwise_conv =
+        GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw{};
+
     for(index_t i = 0; i < nrepeat; ++i)
     {
-        constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw<
-            GridSize,
-            BlockSize,
-            T,
-            decltype(in_nchw_desc),
-            decltype(wei_kcyx_desc),
-            decltype(out_nkhw_desc),
-            ConvStrides,
-            ConvDilations,
-            BPerBlock,
-            KPerBlock,
-            EPerBlock,
-            GemmMPerThreadSubC,
-            GemmNPerThreadSubC,
-            GemmMLevel0Cluster,
-            GemmNLevel0Cluster,
-            GemmMLevel1Cluster,
-            GemmNLevel1Cluster,
-            GemmKPerThreadLoop,
-            GemmDataPerReadA,
-            GemmDataPerReadB,
-            InBlockCopySubLengths_E_B,
-            InBlockCopyClusterLengths_E_B,
-            InBlockCopyThreadClusterArrangeOrder,
-            InBlockCopySrcAccessOrder,
-            InBlockCopyDstAccessOrder,
-            InBlockCopyDataPerAccess_B,
-            WeiBlockCopySubLengths_E_K,
-            WeiBlockCopyClusterLengths_E_K,
-            WeiBlockCopyThreadClusterArrangeOrder,
-            WeiBlockCopySrcAccessOrder,
-            WeiBlockCopyDstAccessOrder,
-            WeiBlockCopySrcDataPerRead_E,
-            WeiBlockCopyDstDataPerWrite_K>{};
-
         float time = launch_kernel(run_gridwise_convolution_kernel,
                                    dim3(GridSize),
                                    dim3(BlockSize),
diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp
index b118a55ae7..c9488b211a 100644
--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp
@@ -71,7 +71,7 @@ int main(int argc, char* argv[])
 {
     using namespace ck;
 
-#if 0
+#if 1
     constexpr index_t N  = 64;
     constexpr index_t C  = 1536;
     constexpr index_t HI = 8;
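For the new "1x1 filter, 8x8 image" tuning block above, the usual consistency conditions for these copy parameters can be checked at compile time: per-thread sub-lengths times thread-cluster lengths should tile the block slice, and the cluster sizes should multiply to BlockSize. A standalone sketch of those checks (the invariants are the expected ones for this kind of blockwise copy, stated here as an assumption; the constants are copied from the block):

// illustration only: compile-time sanity checks, not part of this patch
using index_t = unsigned;

// constants copied from the "#elif 1" tuning block above
constexpr index_t BlockSize = 256;
constexpr index_t BPerBlock = 128, KPerBlock = 128, EPerBlock = 8;

constexpr index_t InSub_E = 2, InSub_B = 2;     // InBlockCopySubLengths_E_B
constexpr index_t InClu_E = 4, InClu_B = 64;    // InBlockCopyClusterLengths_E_B
constexpr index_t WeiSub_E = 4, WeiSub_K = 1;   // WeiBlockCopySubLengths_E_K
constexpr index_t WeiClu_E = 2, WeiClu_K = 128; // WeiBlockCopyClusterLengths_E_K

// each thread copies SubLengths; the thread cluster tiles the whole block slice
static_assert(InSub_E * InClu_E == EPerBlock, "input copy must tile E");
static_assert(InSub_B * InClu_B == BPerBlock, "input copy must tile B");
static_assert(InClu_E * InClu_B == BlockSize, "one sub-tensor per thread");

static_assert(WeiSub_E * WeiClu_E == EPerBlock, "weight copy must tile E");
static_assert(WeiSub_K * WeiClu_K == KPerBlock, "weight copy must tile K");
static_assert(WeiClu_E * WeiClu_K == BlockSize, "one sub-tensor per thread");

int main() { return 0; }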