From 61ac08661d47ecf84f4e0afc59d3261c035d4226 Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Mon, 11 Feb 2019 22:36:17 -0600
Subject: [PATCH] tune for 1x1

---
 driver/conv.cu                                |  10 +-
 ...icit_gemm_convolution_2_cnhw_csrk_knhw.cuh |  48 +++-
 src/include/blockwise_2d_tensor_op.cuh        |  34 +--
 src/include/blockwise_gemm.cuh                | 240 ++++++++++++++----
 ...icit_gemm_convolution_1_chwn_csrk_khwn.cuh |   8 +-
 ...icit_gemm_convolution_2_cnhw_csrk_knhw.cuh |  14 +-
 ...ion_2_cnhw_csrk_knhw_lds_double_buffer.cuh |  18 +-
 src/include/threadwise_gemm.cuh               |  29 ++-
 8 files changed, 284 insertions(+), 117 deletions(-)

diff --git a/driver/conv.cu b/driver/conv.cu
index 249e9d0562..5d039a4f86 100644
--- a/driver/conv.cu
+++ b/driver/conv.cu
@@ -391,7 +391,7 @@ int main()
     constexpr unsigned HPad = 0;
     constexpr unsigned WPad = 0;
 
-#elif 1
+#elif 0
     // 3x3, 34x34
     constexpr unsigned N = 64;
     constexpr unsigned C = 256;
@@ -490,7 +490,7 @@ int main()
     constexpr unsigned HPad = 1;
     constexpr unsigned WPad = 1;
 
-#elif 0
+#elif 1
     // 1x1 filter, 28x28 image
     constexpr unsigned N = 16;
     constexpr unsigned C = 256;
@@ -582,7 +582,7 @@ int main()
     wei_kcsr.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #endif
 
-    unsigned nrepeat = 100;
+    unsigned nrepeat = 200;
 
 #if 1
 #if 0
@@ -593,11 +593,11 @@ int main()
     device_implicit_gemm_convolution_1_nchw_kcsr
 #elif 0
     device_implicit_gemm_convolution_1_nchw_srck_nkhw
-#elif 1
+#elif 0
    device_implicit_gemm_convolution_1_chwn_csrk_khwn
 #elif 0
     device_implicit_gemm_convolution_2_cnhw_srck_knhw
-#elif 0
+#elif 1
     device_implicit_gemm_convolution_2_cnhw_csrk_knhw
 #endif
     (in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device, nrepeat);
diff --git a/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh b/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
index de439b54bf..e801e15b2b 100644
--- a/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
+++ b/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
@@ -67,7 +67,7 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
 
     Tensor out_knhw(make_TensorDescriptor(out_knhw_desc));
 
-#if 1
+#if 0
     // 3x3, 34x34
     constexpr unsigned BPerBlock = 128;
     constexpr unsigned KPerBlock = 64;
@@ -78,9 +78,9 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
 
     constexpr unsigned GemmMPerThreadSubC = 4;
     constexpr unsigned GemmNPerThreadSubC = 4;
-    constexpr unsigned GemmMLevel0Cluster = 8;
+    constexpr unsigned GemmMLevel0Cluster = 4;
     constexpr unsigned GemmNLevel0Cluster = 2;
-    constexpr unsigned GemmMLevel1Cluster = 1;
+    constexpr unsigned GemmMLevel1Cluster = 2;
     constexpr unsigned GemmNLevel1Cluster = 8;
     constexpr unsigned GemmKPerThreadLoop = 1;
 
@@ -98,7 +98,7 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
     constexpr unsigned BlockSize = 128;
 
 #elif 0
-    // 1x1, 28x28
+    // 1x1, 28x28, 64 threads
     constexpr unsigned BPerBlock = 64;
     constexpr unsigned KPerBlock = 64;
     constexpr unsigned CPerBlock = 8;
@@ -108,9 +108,9 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
 
     constexpr unsigned GemmMPerThreadSubC = 4;
     constexpr unsigned GemmNPerThreadSubC = 4;
-    constexpr unsigned GemmMLevel0Cluster = 8;
+    constexpr unsigned GemmMLevel0Cluster = 4;
     constexpr unsigned GemmNLevel0Cluster = 2;
-    constexpr unsigned GemmMLevel1Cluster = 1;
+    constexpr unsigned GemmMLevel1Cluster = 2;
     constexpr unsigned GemmNLevel1Cluster = 4;
     constexpr unsigned GemmKPerThreadLoop = 1;
 
@@ -128,7 +128,37 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
     constexpr unsigned BlockSize = 64;
 
 #elif 1
-    // 1x1, 28x28 try
+    // 1x1, 28x28, 128 threads
+    constexpr unsigned BPerBlock = 64;
+    constexpr unsigned KPerBlock = 128;
+    constexpr unsigned CPerBlock = 8;
+
+    constexpr unsigned BPerThread = 8;
+    constexpr unsigned KPerThread = 8;
+
+    constexpr unsigned GemmMPerThreadSubC = 4;
+    constexpr unsigned GemmNPerThreadSubC = 4;
+    constexpr unsigned GemmMLevel0Cluster = 4;
+    constexpr unsigned GemmNLevel0Cluster = 2;
+    constexpr unsigned GemmMLevel1Cluster = 4;
+    constexpr unsigned GemmNLevel1Cluster = 4;
+    constexpr unsigned GemmKPerThreadLoop = 1;
+
+    constexpr unsigned GemmThreadPerColumnPerCluster = 8;
+    constexpr unsigned GemmThreadPerRowPerCluster = 8;
+
+    constexpr unsigned InBlockCopyThreadPerDim0 = 4;
+    constexpr unsigned InBlockCopyThreadPerDim1 = 16;
+
+    constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
+    constexpr unsigned WeiBlockCopyThreadPerDim1 = 16;
+
+    constexpr unsigned InBlockCopyDataPerRead = 4;
+    constexpr unsigned WeiBlockCopyDataPerRead = 4;
+
+    constexpr unsigned BlockSize = 128;
+#elif 1
+    // 1x1, 28x28, 256 threads
     constexpr unsigned BPerBlock = 128;
     constexpr unsigned KPerBlock = 128;
     constexpr unsigned CPerBlock = 8;
@@ -138,9 +168,9 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
 
     constexpr unsigned GemmMPerThreadSubC = 4;
     constexpr unsigned GemmNPerThreadSubC = 4;
-    constexpr unsigned GemmMLevel0Cluster = 8;
+    constexpr unsigned GemmMLevel0Cluster = 4;
     constexpr unsigned GemmNLevel0Cluster = 4;
-    constexpr unsigned GemmMLevel1Cluster = 2;
+    constexpr unsigned GemmMLevel1Cluster = 4;
     constexpr unsigned GemmNLevel1Cluster = 4;
     constexpr unsigned GemmKPerThreadLoop = 1;
 
diff --git a/src/include/blockwise_2d_tensor_op.cuh b/src/include/blockwise_2d_tensor_op.cuh
index 593b761e43..1231a3863b 100644
--- a/src/include/blockwise_2d_tensor_op.cuh
+++ b/src/include/blockwise_2d_tensor_op.cuh
@@ -70,7 +70,7 @@ template
 
 __device__ void blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src(
     SrcDesc,
-    Float* const __restrict__ p_src,
+    const Float* __restrict__ p_src,
     DstDesc,
     Float* __restrict__ p_dst,
     SrcOpLengths,
@@ -149,7 +149,7 @@ template
 
 __device__ void
 blockwise_2d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
-                                                     Float* const __restrict__ p_src,
+                                                     const Float* __restrict__ p_src,
                                                      DstDesc,
                                                      Float* __restrict__ p_dst,
                                                      SrcOpLengths,
@@ -164,7 +164,7 @@ blockwise_2d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
 
 template
 struct Blockwise2dTensorCopy1
 {
-    __device__ void Run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const
+    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
     {
         constexpr auto dst_from_src_reorder = Sequence<0, 1>{};
@@ -199,7 +199,7 @@ struct Blockwise2dTensorCopy2
         mThreadId1 = get_thread_local_1d_id() - mThreadId0 * ThreadPerDim1;
     }
 
-    __device__ void Run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const
+    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
     {
         static_assert(is_same<Float, float>::value,
                      "wrong! only support float!\n");
@@ -253,7 +253,7 @@ struct Blockwise2dTensorCopy2
             const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
 
             *(reinterpret_cast<float2*>(p_dst + dindex)) =
-                *(reinterpret_cast<float2*>(p_src + sindex));
+                *(reinterpret_cast<const float2*>(p_src + sindex));
         }
 
         // v2
@@ -266,7 +266,7 @@ struct Blockwise2dTensorCopy2
             const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
 
             *(reinterpret_cast<float2*>(p_dst + dindex)) =
-                *(reinterpret_cast<float2*>(p_src + sindex));
+                *(reinterpret_cast<const float2*>(p_src + sindex));
         }
 
         // v1
@@ -314,7 +314,7 @@ struct Blockwise2dTensorCopy2
             const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
 
             *(reinterpret_cast<float4*>(p_dst + dindex)) =
-                *(reinterpret_cast<float4*>(p_src + sindex));
+                *(reinterpret_cast<const float4*>(p_src + sindex));
         }
 
         // v2
@@ -327,7 +327,7 @@ struct Blockwise2dTensorCopy2
             const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
 
             *(reinterpret_cast<float4*>(p_dst + dindex)) =
-                *(reinterpret_cast<float4*>(p_src + sindex));
+                *(reinterpret_cast<const float4*>(p_src + sindex));
         }
 
         // v1
@@ -422,7 +422,7 @@ struct Blockwise2dTensorCopy3
         mDstMyThreadOffset = DstDesc{}.Get1dIndex(thread_id_d0, thread_id_d1 * DataPerRead);
     }
 
-    __device__ void Run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const
+    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
     {
         static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
 
@@ -463,14 +463,14 @@ struct Blockwise2dTensorCopy3
             else if(DataPerRead == 2)
             {
                 *(reinterpret_cast<float2*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
-                    *(reinterpret_cast<float2*>(p_src + mSrcMyThreadOffset +
-                                                iloop * src_loop_stride));
+                    *(reinterpret_cast<const float2*>(p_src + mSrcMyThreadOffset +
+                                                      iloop * src_loop_stride));
             }
             else if(DataPerRead == 4)
             {
                 *(reinterpret_cast<float4*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
-                    *(reinterpret_cast<float4*>(p_src + mSrcMyThreadOffset +
-                                                iloop * src_loop_stride));
+                    *(reinterpret_cast<const float4*>(p_src + mSrcMyThreadOffset +
+                                                      iloop * src_loop_stride));
             }
             else
             {
@@ -495,15 +495,15 @@ struct Blockwise2dTensorCopy3
                 {
                     *(reinterpret_cast<float2*>(p_dst + mDstMyThreadOffset +
                                                 nloop_d0 * dst_loop_stride)) =
-                        *(reinterpret_cast<float2*>(p_src + mSrcMyThreadOffset +
-                                                    nloop_d0 * src_loop_stride));
+                        *(reinterpret_cast<const float2*>(p_src + mSrcMyThreadOffset +
+                                                          nloop_d0 * src_loop_stride));
                 }
                 else if(DataPerRead == 4)
                 {
                     *(reinterpret_cast<float4*>(p_dst + mDstMyThreadOffset +
                                                 nloop_d0 * dst_loop_stride)) =
-                        *(reinterpret_cast<float4*>(p_src + mSrcMyThreadOffset +
-                                                    nloop_d0 * src_loop_stride));
+                        *(reinterpret_cast<const float4*>(p_src + mSrcMyThreadOffset +
+                                                          nloop_d0 * src_loop_stride));
                 }
                 else
                 {
diff --git a/src/include/blockwise_gemm.cuh b/src/include/blockwise_gemm.cuh
index ef5c092de4..802bade82c 100644
--- a/src/include/blockwise_gemm.cuh
+++ b/src/include/blockwise_gemm.cuh
@@ -29,8 +29,8 @@ struct Blockwise1dStridedBatchedGemmBlockABlockBThreadC
 
     __device__ Blockwise1dStridedBatchedGemmBlockABlockBThreadC()
     {
-        const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
-        const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
+        constexpr auto a_block_mtx = BlockMatrixA{};
+        constexpr auto b_block_mtx = BlockMatrixB{};
 
         const auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
 
@@ -66,8 +66,8 @@ struct Blockwise1dStridedBatchedGemmBlockABlockBThreadC
 
         if(TransA && (!TransB) && (!TransC))
         {
-            const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
-            const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
+            constexpr auto a_block_mtx = BlockMatrixA{};
+            constexpr auto b_block_mtx = BlockMatrixB{};
 
             static_assert(a_block_mtx.NRow() == b_block_mtx.NRow(),
                          "wrong! k dimension not consistent!");
@@ -75,7 +75,7 @@ struct Blockwise1dStridedBatchedGemmBlockABlockBThreadC
             constexpr unsigned MPerBlock = a_block_mtx.NCol();
             constexpr unsigned NPerBlock = b_block_mtx.NCol();
 
-            const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
+            constexpr auto c_thread_mtx = ThreadMatrixC{};
 
             // divide thread work
             constexpr unsigned MPerThread = c_thread_mtx.NRow();
@@ -117,9 +117,9 @@ struct Blockwise1dStridedBatchedGemmBlockABlockBThreadC
     }
 
     template <class FloatA, class FloatB, class FloatC, class Accumulator>
-    __device__ void Run(FloatA* const p_a_block,
-                        FloatB* const p_b_block,
-                        FloatC* p_c_thread,
+    __device__ void Run(const FloatA* __restrict__ p_a_block,
+                        const FloatB* __restrict__ p_b_block,
+                        FloatC* __restrict__ p_c_thread,
                         Accumulator f_accum) const
     {
         if(TransA && (!TransB) && (!TransC))
@@ -243,8 +243,8 @@ struct BlockwiseGemmBlockABlockBThreadC
 
     __device__ BlockwiseGemmBlockABlockBThreadC()
     {
-        const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
-        const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
+        constexpr auto a_block_mtx = BlockMatrixA{};
+        constexpr auto b_block_mtx = BlockMatrixB{};
 
         const auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
 
@@ -278,8 +278,8 @@ struct BlockwiseGemmBlockABlockBThreadC
 
         if(TransA && (!TransB) && (!TransC))
        {
-            constexpr auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
-            constexpr auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
+            constexpr auto a_block_mtx = BlockMatrixA{};
+            constexpr auto b_block_mtx = BlockMatrixB{};
 
             static_assert(a_block_mtx.NRow() == b_block_mtx.NRow(),
                           "wrong! k dimension not consistent!");
@@ -287,7 +287,7 @@ struct BlockwiseGemmBlockABlockBThreadC
             constexpr unsigned MPerBlock = a_block_mtx.NCol();
             constexpr unsigned NPerBlock = b_block_mtx.NCol();
 
-            constexpr auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
+            constexpr auto c_thread_mtx = ThreadMatrixC{};
 
             // divide thread work
             constexpr unsigned MPerThread = c_thread_mtx.NRow();
@@ -367,9 +367,9 @@ struct BlockwiseGemmBlockABlockBThreadC
     }
 
     template <class FloatA, class FloatB, class FloatC, class Accumulator>
-    __device__ void Run(FloatA* const p_a_block,
-                        FloatB* const p_b_block,
-                        FloatC* p_c_thread,
+    __device__ void Run(const FloatA* __restrict__ p_a_block,
+                        const FloatB* __restrict__ p_b_block,
+                        FloatC* __restrict__ p_c_thread,
                         Accumulator f_accum) const
     {
         if(TransA && (!TransB) && (!TransC))
@@ -459,9 +459,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
 
         static_assert(BlockSize == ThreadPerLevel1Cluster, "wrong! wrong blocksize\n");
 
-        const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
-        const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
-        const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
+        constexpr auto a_block_mtx = BlockMatrixA{};
+        constexpr auto b_block_mtx = BlockMatrixB{};
+        constexpr auto c_thread_mtx = ThreadMatrixC{};
 
         static_assert(a_block_mtx.NRow() == b_block_mtx.NRow(),
                      "wrong! K dimension not consistent\n");
@@ -529,7 +529,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
     __device__ static MatrixIndex GetDistanceFromBeginOfThreadMatrixC(unsigned m_in_c,
                                                                       unsigned n_in_c)
     {
-        const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
+        constexpr auto c_thread_mtx = ThreadMatrixC{};
 
         constexpr unsigned MPerThread = c_thread_mtx.NRow();
         constexpr unsigned NPerThread = c_thread_mtx.NCol();
@@ -551,17 +551,17 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
     }
 
     template <class FloatA, class FloatB, class FloatC, class Accumulator>
-    __device__ void Run(FloatA* const p_a_block,
-                        FloatB* const p_b_block,
-                        FloatC* p_c_thread,
+    __device__ void Run(const FloatA* __restrict__ p_a_block,
+                        const FloatB* __restrict__ p_b_block,
+                        FloatC* __restrict__ p_c_thread,
                         Accumulator f_accum) const
     {
         constexpr auto True = integral_constant<bool, true>{};
         constexpr auto False = integral_constant<bool, false>{};
 
-        const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
-        const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
-        const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
+        constexpr auto a_block_mtx = BlockMatrixA{};
+        constexpr auto b_block_mtx = BlockMatrixB{};
+        constexpr auto c_thread_mtx = ThreadMatrixC{};
 
         constexpr unsigned M = a_block_mtx.NCol();
         constexpr unsigned N = b_block_mtx.NCol();
@@ -571,22 +571,18 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr unsigned NPerThread = c_thread_mtx.NCol();
 
         // thread A, B for GEMM
-        const auto a_thread_mtx = make_ConstantMatrixDescriptor(
-            Number<KPerThreadLoop>{}, Number<MPerThread>{}); // constexpr doesn't compile
+        constexpr auto a_thread_mtx =
+            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<MPerThread>{});
 
-        const auto b_thread_mtx = make_ConstantMatrixDescriptor(
-            Number<KPerThreadLoop>{}, Number<NPerThread>{}); // constexpr doesn't compile
+        constexpr auto b_thread_mtx =
+            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});
 
         // thread A-sub, B-sub for copy
-        const auto a_thread_sub_mtx =
-            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{},
-                                          Number<MPerThreadSubC>{},
-                                          Number<MPerThread>{}); // constexpr doesn't compile
+        constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor(
+            Number<KPerThreadLoop>{}, Number<MPerThreadSubC>{}, Number<MPerThread>{});
 
-        const auto b_thread_sub_mtx =
-            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{},
-                                          Number<NPerThreadSubC>{},
-                                          Number<NPerThread>{}); // constexpr doesn't compile
+        constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor(
+            Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}, Number<NPerThread>{});
 
         FloatA p_a_thread[a_thread_mtx.GetElementSpace()];
         FloatB p_b_thread[b_thread_mtx.GetElementSpace()];
@@ -606,26 +602,26 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
             // copy A-sub to form A
             for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
            {
-                threadwise_matrix_copy(a_block_mtx,
-                                       p_a_block + mMyThreadOffsetA +
-                                           k_begin * a_block_mtx.RowStride() +
-                                           m_repeat * MPerLevel1Cluster,
-                                       a_thread_sub_mtx,
-                                       p_a_thread + m_repeat * MPerThreadSubC,
-                                       a_thread_sub_mtx.GetLengths());
+                threadwise_matrix_copy(
+                    a_block_mtx,
+                    p_a_block + a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
+                        mMyThreadOffsetA,
+                    a_thread_mtx,
+                    p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
+                    a_thread_sub_mtx.GetLengths());
             }
 
 #pragma unroll
             // copy B-sub to form B
             for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
             {
-                threadwise_matrix_copy(b_block_mtx,
-                                       p_b_block + mMyThreadOffsetB +
-                                           k_begin * b_block_mtx.RowStride() +
-                                           n_repeat * NPerLevel1Cluster,
-                                       b_thread_sub_mtx,
-                                       p_b_thread + n_repeat * NPerThreadSubC,
-                                       b_thread_sub_mtx.GetLengths());
+                threadwise_matrix_copy(
+                    b_block_mtx,
+                    p_b_block + b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
+                        mMyThreadOffsetB,
+                    b_thread_mtx,
+                    p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
+                    b_thread_sub_mtx.GetLengths());
             }
 
             // C = A * B
@@ -778,4 +774,144 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                 f_accum);
         }
     }
+
+    template <class FloatA, class FloatB, class FloatC, class Accumulator>
+    __device__ void Run_v2(const FloatA* __restrict__ p_a_block,
+                           const FloatB* __restrict__ p_b_block,
+                           FloatC* __restrict__ p_c_thread,
+                           Accumulator f_accum) const
+    {
+        constexpr auto True = integral_constant<bool, true>{};
+        constexpr auto False = integral_constant<bool, false>{};
+
+        constexpr auto a_block_mtx = BlockMatrixA{};
+        constexpr auto b_block_mtx = BlockMatrixB{};
+        constexpr auto c_thread_mtx = ThreadMatrixC{};
+
+        constexpr unsigned M = a_block_mtx.NCol();
+        constexpr unsigned N = b_block_mtx.NCol();
+        constexpr unsigned K = a_block_mtx.NRow();
+
+        constexpr unsigned MPerThread = c_thread_mtx.NRow();
+        constexpr unsigned NPerThread = c_thread_mtx.NCol();
+
+        // thread A-sub, B-sub, C-sub
+        constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor(
+            Number<KPerThreadLoop>{}, Number<MPerThreadSubC>{}, Number<MPerThread>{});
+
+        constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor(
+            Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}, Number<NPerThread>{});
+
+        constexpr auto c_thread_sub_mtx = make_ConstantMatrixDescriptor(
+            Number<MPerThreadSubC>{}, Number<NPerThreadSubC>{}, Number<NPerThread>{});
+
+        // thread A, B
+        constexpr auto a_thread_mtx =
+            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<MPerThread>{});
+
+        constexpr auto b_thread_mtx =
+            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});
+
+        FloatA p_a_thread[a_thread_mtx.GetElementSpace()];
+        FloatB p_b_thread[b_thread_mtx.GetElementSpace()];
+
+        constexpr unsigned MPerLevel1Cluster = MPerThreadSubC * MLevel0Cluster * MLevel1Cluster;
+        constexpr unsigned NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster;
+
+        constexpr unsigned MRepeat = MPerThread / MPerThreadSubC;
+        constexpr unsigned NRepeat = NPerThread / NPerThreadSubC;
+
+#pragma unroll
+        // loop over k
+        for(unsigned k_begin = 0; k_begin < K; k_begin += KPerThreadLoop)
+        {
+            // C-sub(s) in first row-wise subblock of C
+            {
+                // copy first A-sub
+                threadwise_matrix_copy(a_block_mtx,
+                                       p_a_block + a_block_mtx.Get1dIndex(k_begin, 0) +
+                                           mMyThreadOffsetA,
+                                       a_thread_mtx,
+                                       p_a_thread,
+                                       a_thread_sub_mtx.GetLengths());
+
+                // copy first B-sub
+                threadwise_matrix_copy(b_block_mtx,
+                                       p_b_block + b_block_mtx.Get1dIndex(k_begin, 0) +
+                                           mMyThreadOffsetB,
+                                       b_thread_mtx,
+                                       p_b_thread,
+                                       b_thread_sub_mtx.GetLengths());
+
+                // do first sub GEMM
+                threadwise_gemm(a_thread_sub_mtx,
+                                True,
+                                p_a_thread,
+                                b_thread_sub_mtx,
+                                False,
+                                p_b_thread,
+                                c_thread_sub_mtx,
+                                False,
+                                p_c_thread,
+                                f_accum);
+
+#pragma unroll
+                // copy next B-sub, and do GEMM
+                for(unsigned n_repeat = 1; n_repeat < NRepeat; ++n_repeat)
+                {
+                    threadwise_matrix_copy(
+                        b_block_mtx,
+                        p_b_block + b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
+                            mMyThreadOffsetB,
+                        b_thread_mtx,
+                        p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
+                        b_thread_sub_mtx.GetLengths());
+
+                    threadwise_gemm(
+                        a_thread_sub_mtx,
+                        True,
+                        p_a_thread,
+                        b_thread_sub_mtx,
+                        False,
+                        p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
+                        c_thread_sub_mtx,
+                        False,
+                        p_c_thread + c_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
+                        f_accum);
+                }
+
+#pragma unroll
+                // loop over the rest of the row-wise subblocks;
+                // all B-subs have been copied, so only A-subs need to be copied
+                for(unsigned m_repeat = 1; m_repeat < MRepeat; ++m_repeat)
+                {
+                    // copy an A-sub
+                    threadwise_matrix_copy(
+                        a_block_mtx,
+                        p_a_block + a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
+                            mMyThreadOffsetA,
+                        a_thread_mtx,
+                        p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
+                        a_thread_sub_mtx.GetLengths());
+
+                    // do some GEMMs
+                    for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
+                    {
+                        threadwise_gemm(
+                            a_thread_sub_mtx,
+                            True,
+                            p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
+                            b_thread_sub_mtx,
+                            False,
+                            p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
+                            c_thread_sub_mtx,
+                            False,
+                            p_c_thread + c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC,
+                                                                 n_repeat * NPerThreadSubC),
+                            f_accum);
+                    }
+                }
+            }
+        }
+    }
 };
diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh b/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
index 326e5939ef..bc58bf77ad 100644
--- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
+++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
@@ -201,11 +201,11 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(InGlobalDesc,
     // set threadwise output tensor to 0
     threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread);
 
-    Float* p_in_global_block_begin =
+    const Float* p_in_global_block_begin =
         p_in_global + in_chwn_global_desc.Get1dIndex(
                           0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);
 
-    Float* p_wei_global_block_begin =
+    const Float* p_wei_global_block_begin =
         p_wei_global + wei_csrk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
 
     for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
@@ -213,15 +213,11 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(InGlobalDesc,
         p_wei_global_block_begin += CPerBlock * wei_csrk_global_desc.GetStride(I0),
         __syncthreads())
     {
-#if 1
         // input: global mem to LDS,
         blockwise_in_copy.Run(p_in_global_block_begin, p_in_block);
-#endif
 
-#if 1
         // weight: global mem to LDS,
         blockwise_wei_copy.Run(p_wei_global_block_begin, p_wei_block);
-#endif
 
         __syncthreads();
 
diff --git a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
index 5e9bbd7a29..b2533448f6 100644
--- a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
+++ b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
@@ -36,11 +36,11 @@ template
 
 __global__ void
 gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
-                                                    Float* const __restrict__ p_in_global,
+                                                    const Float* const __restrict__ p_in_global,
                                                     WeiGlobalDesc,
-                                                    Float* const __restrict__ p_wei_global,
+                                                    const Float* const __restrict__ p_wei_global,
                                                     OutGlobalDesc,
-                                                    Float* __restrict__ p_out_global)
+                                                    Float* const __restrict__ p_out_global)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -228,10 +228,10 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
     // set threadwise output tensor to 0
     threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread);
 
-    Float* p_in_global_block_offset =
+    const Float* p_in_global_block_offset =
         p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
 
-    Float* p_wei_global_block_offset =
+    const Float* p_wei_global_block_offset =
         p_wei_global + wei_csrk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
 
     for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
@@ -256,7 +256,9 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
 
 #if 1
                 blockwise_gemm.Run
-#else
+#elif 0
+                blockwise_gemm.Run_v2
+#elif 0
                 blockwise_gemm.Run_RegisterDoubleBuffer
 #endif
                     (p_wei_block + wei_csrk_block_desc.Get1dIndex(0, s, r, 0),
diff --git a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh
index fd021957dc..7e10a62faf 100644
--- a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh
+++ b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh
@@ -123,7 +123,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
                                               decltype(in_cb_global_desc),
                                               decltype(in_cb_block_desc),
                                               decltype(in_cb_block_desc.GetLengths())>{};
-#elif 1
+#elif 0
     const auto blockwise_in_copy = Blockwise2dTensorCopy2{};
-#elif 1
+#elif 0
     const auto blockwise_wei_copy = Blockwise2dTensorCopy2
diff --git a/src/include/threadwise_gemm.cuh b/src/include/threadwise_gemm.cuh
--- a/src/include/threadwise_gemm.cuh
+++ b/src/include/threadwise_gemm.cuh
-__device__ void
-threadwise_matrix_copy(SrcMatrix, Float* const p_src, DstMatrix, Float* p_dst, Sequence)
+__device__ void threadwise_matrix_copy(SrcMatrix,
+                                       const Float* __restrict__ p_src,
+                                       DstMatrix,
+                                       Float* __restrict__ p_dst,
+                                       Sequence)
 {
-    const auto src_mtx = SrcMatrix{}; // constexpr doesn't compile
-    const auto dst_mtx = DstMatrix{}; // constexpr doesn't compile
+    constexpr auto src_mtx = SrcMatrix{};
+    constexpr auto dst_mtx = DstMatrix{};
 
     for(unsigned i = 0; i < NRow; ++i)
     {
@@ -31,30 +34,30 @@ template
 __device__ void threadwise_gemm(MatrixA,
                                 integral_constant<bool, TransA>,
-                                FloatA* const p_a_thread,
+                                const FloatA* __restrict__ p_a_thread,
                                 MatrixB,
                                 integral_constant<bool, TransB>,
-                                FloatB* const p_b_thread,
+                                const FloatB* __restrict__ p_b_thread,
                                 MatrixC,
                                 integral_constant<bool, TransC>,
-                                FloatC* p_c_thread,
+                                FloatC* __restrict__ p_c_thread,
                                 Accumulator f_accum)
 {
     if(TransA && (!TransB) && (!TransC))
     {
-        const auto a_mtx = MatrixA{}; // constexpr doesn't compile
-        const auto b_mtx = MatrixB{}; // constexpr doesn't compile
-        const auto c_mtx = MatrixC{}; // constexpr doesn't compile
+        constexpr auto a_mtx = MatrixA{};
+        constexpr auto b_mtx = MatrixB{};
+        constexpr auto c_mtx = MatrixC{};
 
         constexpr unsigned M = c_mtx.NRow();
         constexpr unsigned N = c_mtx.NCol();
         constexpr unsigned K = a_mtx.NRow();
 
         // A is transposed
-        for(unsigned i = 0; i < M; ++i)
+        for(unsigned k = 0; k < K; ++k)
         {
-            for(unsigned j = 0; j < N; ++j)
+            for(unsigned i = 0; i < M; ++i)
             {
-                for(unsigned k = 0; k < K; ++k)
+                for(unsigned j = 0; j < N; ++j)
                 {
                     const unsigned aindex = a_mtx.Get1dIndex(k, i); // A is transposed
                     const unsigned bindex = b_mtx.Get1dIndex(k, j);
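
Reviewer note (not part of the patch): the new 1x1 configs all preserve the constraint that the blockwise GEMM asserts via `BlockSize == ThreadPerLevel1Cluster`. Below is a minimal standalone sketch of that arithmetic for the "1x1, 28x28, 128 threads" config added above. It assumes, as the kernel naming suggests, that `KPerBlock`/`KPerThread` play the GEMM M role and `BPerBlock`/`BPerThread` the N role; the `MPerLevel1Cluster`/`NPerLevel1Cluster` formulas are copied from `Run`/`Run_v2`.

```cpp
// Sanity-check sketch (illustrative only, not from the repository).
#include <cstdio>

int main()
{
    // Values from the 128-thread 1x1 config in this patch.
    constexpr unsigned BPerBlock = 64, KPerBlock = 128; // GEMM N / M per block (assumed mapping)
    constexpr unsigned BPerThread = 8, KPerThread = 8;  // GEMM N / M per thread
    constexpr unsigned MPerThreadSubC = 4, NPerThreadSubC = 4;
    constexpr unsigned MLevel0Cluster = 4, NLevel0Cluster = 2;
    constexpr unsigned MLevel1Cluster = 4, NLevel1Cluster = 4;
    constexpr unsigned BlockSize = 128;

    // Mirrors the "wrong! wrong blocksize" static_assert in the blockwise GEMM.
    constexpr unsigned ThreadPerLevel1Cluster =
        MLevel0Cluster * NLevel0Cluster * MLevel1Cluster * NLevel1Cluster;
    static_assert(BlockSize == ThreadPerLevel1Cluster, "wrong! wrong blocksize");

    // Cluster footprints, exactly as computed in Run/Run_v2.
    constexpr unsigned MPerLevel1Cluster = MPerThreadSubC * MLevel0Cluster * MLevel1Cluster; // 64
    constexpr unsigned NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster; // 32

    // Each thread owns MRepeat x NRepeat sub-tiles; together they must tile the block tile.
    constexpr unsigned MRepeat = KPerThread / MPerThreadSubC; // 2
    constexpr unsigned NRepeat = BPerThread / NPerThreadSubC; // 2
    static_assert(MRepeat * MPerLevel1Cluster == KPerBlock, "M tiling mismatch");
    static_assert(NRepeat * NPerLevel1Cluster == BPerBlock, "N tiling mismatch");

    std::printf("OK: %u threads tile a %ux%u block GEMM tile\n", BlockSize, KPerBlock, BPerBlock);
    return 0;
}
```

The same product rule holds for the other configs touched here (3x3: 4 * 2 * 2 * 8 = 128; 64-thread 1x1: 4 * 2 * 2 * 4 = 64; 256-thread 1x1: 4 * 4 * 4 * 4 = 256). Separately, the new `Run_v2` does not change how much data moves per k-step; it interleaves the per-sub-tile `threadwise_matrix_copy` calls with the sub-GEMMs (the B-sub row first, then one A-sub per remaining row) instead of copying the whole A/B thread tiles up front as `Run` does, presumably to overlap LDS reads with the math.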