Mirror of https://github.com/ROCm/composable_kernel.git

Commit: tune for 1x1
@@ -391,7 +391,7 @@ int main()
 
     constexpr unsigned HPad = 0;
     constexpr unsigned WPad = 0;
-#elif 1
+#elif 0
     // 3x3, 34x34
     constexpr unsigned N = 64;
    constexpr unsigned C = 256;
@@ -490,7 +490,7 @@ int main()
 
     constexpr unsigned HPad = 1;
     constexpr unsigned WPad = 1;
-#elif 0
+#elif 1
     // 1x1 filter, 28x28 image
     constexpr unsigned N = 16;
     constexpr unsigned C = 256;
@@ -582,7 +582,7 @@ int main()
     wei_kcsr.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #endif
 
-    unsigned nrepeat = 100;
+    unsigned nrepeat = 200;
 
-#if 1
+#if 0
@@ -593,11 +593,11 @@ int main()
     device_implicit_gemm_convolution_1_nchw_kcsr
 #elif 0
     device_implicit_gemm_convolution_1_nchw_srck_nkhw
-#elif 1
+#elif 0
     device_implicit_gemm_convolution_1_chwn_csrk_khwn
 #elif 0
     device_implicit_gemm_convolution_2_cnhw_srck_knhw
-#elif 0
+#elif 1
     device_implicit_gemm_convolution_2_cnhw_csrk_knhw
 #endif
     (in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device, nrepeat);
@@ -67,7 +67,7 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
 
     Tensor<T> out_knhw(make_TensorDescriptor(out_knhw_desc));
 
-#if 1
+#if 0
     // 3x3, 34x34
     constexpr unsigned BPerBlock = 128;
     constexpr unsigned KPerBlock = 64;
@@ -78,9 +78,9 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
 
     constexpr unsigned GemmMPerThreadSubC = 4;
     constexpr unsigned GemmNPerThreadSubC = 4;
-    constexpr unsigned GemmMLevel0Cluster = 8;
+    constexpr unsigned GemmMLevel0Cluster = 4;
     constexpr unsigned GemmNLevel0Cluster = 2;
-    constexpr unsigned GemmMLevel1Cluster = 1;
+    constexpr unsigned GemmMLevel1Cluster = 2;
     constexpr unsigned GemmNLevel1Cluster = 8;
     constexpr unsigned GemmKPerThreadLoop = 1;
 
@@ -98,7 +98,7 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
 
     constexpr unsigned BlockSize = 128;
 #elif 0
-    // 1x1, 28x28
+    // 1x1, 28x28, 64 threads
     constexpr unsigned BPerBlock = 64;
     constexpr unsigned KPerBlock = 64;
     constexpr unsigned CPerBlock = 8;
@@ -108,9 +108,9 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
 
     constexpr unsigned GemmMPerThreadSubC = 4;
     constexpr unsigned GemmNPerThreadSubC = 4;
-    constexpr unsigned GemmMLevel0Cluster = 8;
+    constexpr unsigned GemmMLevel0Cluster = 4;
     constexpr unsigned GemmNLevel0Cluster = 2;
-    constexpr unsigned GemmMLevel1Cluster = 1;
+    constexpr unsigned GemmMLevel1Cluster = 2;
     constexpr unsigned GemmNLevel1Cluster = 4;
     constexpr unsigned GemmKPerThreadLoop = 1;
 
@@ -128,7 +128,37 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
 
     constexpr unsigned BlockSize = 64;
 #elif 1
-    // 1x1, 28x28 try
+    // 1x1, 28x28, 128 threads
+    constexpr unsigned BPerBlock = 64;
+    constexpr unsigned KPerBlock = 128;
+    constexpr unsigned CPerBlock = 8;
+
+    constexpr unsigned BPerThread = 8;
+    constexpr unsigned KPerThread = 8;
+
+    constexpr unsigned GemmMPerThreadSubC = 4;
+    constexpr unsigned GemmNPerThreadSubC = 4;
+    constexpr unsigned GemmMLevel0Cluster = 4;
+    constexpr unsigned GemmNLevel0Cluster = 2;
+    constexpr unsigned GemmMLevel1Cluster = 4;
+    constexpr unsigned GemmNLevel1Cluster = 4;
+    constexpr unsigned GemmKPerThreadLoop = 1;
+
+    constexpr unsigned GemmThreadPerColumnPerCluster = 8;
+    constexpr unsigned GemmThreadPerRowPerCluster = 8;
+
+    constexpr unsigned InBlockCopyThreadPerDim0 = 4;
+    constexpr unsigned InBlockCopyThreadPerDim1 = 16;
+
+    constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
+    constexpr unsigned WeiBlockCopyThreadPerDim1 = 16;
+
+    constexpr unsigned InBlockCopyDataPerRead = 4;
+    constexpr unsigned WeiBlockCopyDataPerRead = 4;
+
+    constexpr unsigned BlockSize = 128;
+#elif 1
+    // 1x1, 28x28, 256 thread
     constexpr unsigned BPerBlock = 128;
     constexpr unsigned KPerBlock = 128;
     constexpr unsigned CPerBlock = 8;
@@ -138,9 +168,9 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
 
     constexpr unsigned GemmMPerThreadSubC = 4;
     constexpr unsigned GemmNPerThreadSubC = 4;
-    constexpr unsigned GemmMLevel0Cluster = 8;
+    constexpr unsigned GemmMLevel0Cluster = 4;
     constexpr unsigned GemmNLevel0Cluster = 4;
-    constexpr unsigned GemmMLevel1Cluster = 2;
+    constexpr unsigned GemmMLevel1Cluster = 4;
     constexpr unsigned GemmNLevel1Cluster = 4;
     constexpr unsigned GemmKPerThreadLoop = 1;
 
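Note on the tuning values above: the level-0/level-1 cluster shape determines the block's thread count, so any retuned combination has to keep the product of the four cluster constants equal to BlockSize (the kernel's own static_assert(BlockSize == ThreadPerLevel1Cluster), visible further down, enforces this). A sanity-check sketch for the "1x1, 28x28, 128 threads" section; the asserts here are illustrative, not part of the commit:

    // Illustrative check only, mirroring the constants of the 128-thread section.
    constexpr unsigned GemmMPerThreadSubC = 4;
    constexpr unsigned GemmNPerThreadSubC = 4;
    constexpr unsigned GemmMLevel0Cluster = 4;
    constexpr unsigned GemmNLevel0Cluster = 2;
    constexpr unsigned GemmMLevel1Cluster = 4;
    constexpr unsigned GemmNLevel1Cluster = 4;

    // one thread per level-1 cluster slot: 4 * 2 * 4 * 4 = 128 = BlockSize
    constexpr unsigned ThreadPerLevel1Cluster =
        GemmMLevel0Cluster * GemmNLevel0Cluster * GemmMLevel1Cluster * GemmNLevel1Cluster;
    static_assert(ThreadPerLevel1Cluster == 128, "cluster shape must match BlockSize");

    // each thread then owns an 8x8 C tile: KPerBlock=128 over 4*4*4=64 rows per level-1
    // cluster gives 2 sub-tiles of 4 in M; BPerBlock=64 over 4*2*4=32 gives 2 of 4 in N
    static_assert(128 % (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster) == 0 &&
                      64 % (GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster) == 0,
                  "block tile must be covered by whole level-1 clusters");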
@@ -70,7 +70,7 @@ template <unsigned BlockSize,
           class F>
 __device__ void blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src(
     SrcDesc,
-    Float* const __restrict__ p_src,
+    const Float* __restrict__ p_src,
     DstDesc,
     Float* __restrict__ p_dst,
     SrcOpLengths,
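The parameter change here is not cosmetic: Float* const __restrict__ declares a const pointer to mutable data, so the copy source could still be written through it and callers holding a const Float* would need a cast, while const Float* __restrict__ makes the pointee itself read-only. A minimal illustration (function names hypothetical):

    // Hypothetical names; shows what each form allows.
    void copy_old(float* const p_src) { p_src[0] = 1.0f; } // compiles: only the pointer is const
    void copy_new(const float* p_src, float* p_dst)
    {
        p_dst[0] = p_src[0]; // reads are fine
        // p_src[0] = 1.0f;  // error: pointee is const
    }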
@@ -149,7 +149,7 @@ template <unsigned BlockSize,
           class DstFromSrcReorder>
 __device__ void
 blockwise_2d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
-                                                     Float* const __restrict__ p_src,
+                                                     const Float* __restrict__ p_src,
                                                      DstDesc,
                                                      Float* __restrict__ p_dst,
                                                      SrcOpLengths,
@@ -164,7 +164,7 @@ blockwise_2d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
 template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
 struct Blockwise2dTensorCopy1
 {
-    __device__ void Run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const
+    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
     {
         constexpr auto dst_from_src_reorder = Sequence<0, 1>{};
 
@@ -199,7 +199,7 @@ struct Blockwise2dTensorCopy2
         mThreadId1 = get_thread_local_1d_id() - mThreadId0 * ThreadPerDim1;
     }
 
-    __device__ void Run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const
+    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
     {
         static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
 
@@ -253,7 +253,7 @@ struct Blockwise2dTensorCopy2
             const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
 
             *(reinterpret_cast<Float4*>(p_dst + dindex)) =
-                *(reinterpret_cast<Float4*>(p_src + sindex));
+                *(reinterpret_cast<const Float4*>(p_src + sindex));
         }
 
         // v2
@@ -266,7 +266,7 @@ struct Blockwise2dTensorCopy2
             const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
 
             *(reinterpret_cast<Float2*>(p_dst + dindex)) =
-                *(reinterpret_cast<Float2*>(p_src + sindex));
+                *(reinterpret_cast<const Float2*>(p_src + sindex));
         }
 
         // v1
@@ -314,7 +314,7 @@ struct Blockwise2dTensorCopy2
             const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
 
             *(reinterpret_cast<Float4*>(p_dst + dindex)) =
-                *(reinterpret_cast<Float4*>(p_src + sindex));
+                *(reinterpret_cast<const Float4*>(p_src + sindex));
         }
 
         // v2
@@ -327,7 +327,7 @@ struct Blockwise2dTensorCopy2
             const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
 
             *(reinterpret_cast<Float2*>(p_dst + dindex)) =
-                *(reinterpret_cast<Float2*>(p_src + sindex));
+                *(reinterpret_cast<const Float2*>(p_src + sindex));
         }
 
         // v1
@@ -422,7 +422,7 @@ struct Blockwise2dTensorCopy3
         mDstMyThreadOffset = DstDesc{}.Get1dIndex(thread_id_d0, thread_id_d1 * DataPerRead);
     }
 
-    __device__ void Run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const
+    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
     {
         static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
 
@@ -463,14 +463,14 @@ struct Blockwise2dTensorCopy3
         else if(DataPerRead == 2)
         {
             *(reinterpret_cast<Float2*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
-                *(reinterpret_cast<Float2*>(p_src + mSrcMyThreadOffset +
-                                            iloop * src_loop_stride));
+                *(reinterpret_cast<const Float2*>(p_src + mSrcMyThreadOffset +
+                                                  iloop * src_loop_stride));
         }
         else if(DataPerRead == 4)
         {
             *(reinterpret_cast<Float4*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
-                *(reinterpret_cast<Float4*>(p_src + mSrcMyThreadOffset +
-                                            iloop * src_loop_stride));
+                *(reinterpret_cast<const Float4*>(p_src + mSrcMyThreadOffset +
+                                                  iloop * src_loop_stride));
         }
         else
         {
@@ -495,15 +495,15 @@ struct Blockwise2dTensorCopy3
             {
                 *(reinterpret_cast<Float2*>(p_dst + mDstMyThreadOffset +
                                             nloop_d0 * dst_loop_stride)) =
-                    *(reinterpret_cast<Float2*>(p_src + mSrcMyThreadOffset +
-                                                nloop_d0 * src_loop_stride));
+                    *(reinterpret_cast<const Float2*>(p_src + mSrcMyThreadOffset +
+                                                      nloop_d0 * src_loop_stride));
             }
             else if(DataPerRead == 4)
            {
                 *(reinterpret_cast<Float4*>(p_dst + mDstMyThreadOffset +
                                             nloop_d0 * dst_loop_stride)) =
-                    *(reinterpret_cast<Float4*>(p_src + mSrcMyThreadOffset +
-                                                nloop_d0 * src_loop_stride));
+                    *(reinterpret_cast<const Float4*>(p_src + mSrcMyThreadOffset +
+                                                      nloop_d0 * src_loop_stride));
             }
             else
             {
@@ -29,8 +29,8 @@ struct Blockwise1dStridedBatchedGemmBlockABlockBThreadC
 
     __device__ Blockwise1dStridedBatchedGemmBlockABlockBThreadC()
     {
-        const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
-        const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
+        constexpr auto a_block_mtx = BlockMatrixA{};
+        constexpr auto b_block_mtx = BlockMatrixB{};
 
         const auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
 
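The dropped "// constexpr doesn't compile" comments suggest the const-auto workaround dated from an older compiler; since these descriptor types are empty tag types with constexpr members, value-initializing them as constexpr is plain C++14, which in turn lets NRow()/NCol() feed static_asserts and array bounds. A sketch with a hypothetical stand-in descriptor:

    // Hypothetical stand-in for the BlockMatrixA-style descriptor types.
    template <unsigned NRow_, unsigned NCol_>
    struct MatrixDescriptorSketch
    {
        constexpr unsigned NRow() const { return NRow_; }
        constexpr unsigned NCol() const { return NCol_; }
    };

    constexpr auto a_block_mtx = MatrixDescriptorSketch<8, 128>{}; // empty type: fine as constexpr
    static_assert(a_block_mtx.NRow() == 8, "usable in constant expressions and array bounds");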
@@ -66,8 +66,8 @@ struct Blockwise1dStridedBatchedGemmBlockABlockBThreadC
 
         if(TransA && (!TransB) && (!TransC))
         {
-            const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
-            const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
+            constexpr auto a_block_mtx = BlockMatrixA{};
+            constexpr auto b_block_mtx = BlockMatrixB{};
 
             static_assert(a_block_mtx.NRow() == b_block_mtx.NRow(),
                           "wrong! k dimension not consistent!");
@@ -75,7 +75,7 @@ struct Blockwise1dStridedBatchedGemmBlockABlockBThreadC
             constexpr unsigned MPerBlock = a_block_mtx.NCol();
             constexpr unsigned NPerBlock = b_block_mtx.NCol();
 
-            const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
+            constexpr auto c_thread_mtx = ThreadMatrixC{};
 
             // divide thread work
             constexpr unsigned MPerThread = c_thread_mtx.NRow();
@@ -117,9 +117,9 @@ struct Blockwise1dStridedBatchedGemmBlockABlockBThreadC
     }
 
     template <class FloatA, class FloatB, class FloatC, class Accumulator>
-    __device__ void Run(FloatA* const p_a_block,
-                        FloatB* const p_b_block,
-                        FloatC* p_c_thread,
+    __device__ void Run(const FloatA* __restrict__ p_a_block,
+                        const FloatB* __restrict__ p_b_block,
+                        FloatC* __restrict__ p_c_thread,
                         Accumulator f_accum) const
     {
         if(TransA && (!TransB) && (!TransC))
@@ -243,8 +243,8 @@ struct BlockwiseGemmBlockABlockBThreadC
 
     __device__ BlockwiseGemmBlockABlockBThreadC()
     {
-        const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
-        const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
+        constexpr auto a_block_mtx = BlockMatrixA{};
+        constexpr auto b_block_mtx = BlockMatrixB{};
 
         const auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
 
@@ -278,8 +278,8 @@ struct BlockwiseGemmBlockABlockBThreadC
 
         if(TransA && (!TransB) && (!TransC))
         {
-            constexpr auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
-            constexpr auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
+            constexpr auto a_block_mtx = BlockMatrixA{};
+            constexpr auto b_block_mtx = BlockMatrixB{};
 
             static_assert(a_block_mtx.NRow() == b_block_mtx.NRow(),
                           "wrong! k dimension not consistent!");
@@ -287,7 +287,7 @@ struct BlockwiseGemmBlockABlockBThreadC
             constexpr unsigned MPerBlock = a_block_mtx.NCol();
             constexpr unsigned NPerBlock = b_block_mtx.NCol();
 
-            constexpr auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
+            constexpr auto c_thread_mtx = ThreadMatrixC{};
 
             // divide thread work
             constexpr unsigned MPerThread = c_thread_mtx.NRow();
@@ -367,9 +367,9 @@ struct BlockwiseGemmBlockABlockBThreadC
     }
 
     template <class FloatA, class FloatB, class FloatC, class Accumulator>
-    __device__ void Run(FloatA* const p_a_block,
-                        FloatB* const p_b_block,
-                        FloatC* p_c_thread,
+    __device__ void Run(const FloatA* __restrict__ p_a_block,
+                        const FloatB* __restrict__ p_b_block,
+                        FloatC* __restrict__ p_c_thread,
                         Accumulator f_accum) const
     {
         if(TransA && (!TransB) && (!TransC))
@@ -459,9 +459,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
 
         static_assert(BlockSize == ThreadPerLevel1Cluster, "wrong! wrong blocksize\n");
 
-        const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
-        const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
-        const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
+        constexpr auto a_block_mtx = BlockMatrixA{};
+        constexpr auto b_block_mtx = BlockMatrixB{};
+        constexpr auto c_thread_mtx = ThreadMatrixC{};
 
         static_assert(a_block_mtx.NRow() == b_block_mtx.NRow(),
                       "wrong! K dimension not consistent\n");
@@ -529,7 +529,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
     __device__ static MatrixIndex GetDistanceFromBeginOfThreadMatrixC(unsigned m_in_c,
                                                                       unsigned n_in_c)
     {
-        const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
+        constexpr auto c_thread_mtx = ThreadMatrixC{};
 
         constexpr unsigned MPerThread = c_thread_mtx.NRow();
         constexpr unsigned NPerThread = c_thread_mtx.NCol();
@@ -551,17 +551,17 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
     }
 
     template <class FloatA, class FloatB, class FloatC, class Accumulator>
-    __device__ void Run(FloatA* const p_a_block,
-                        FloatB* const p_b_block,
-                        FloatC* p_c_thread,
+    __device__ void Run(const FloatA* __restrict__ p_a_block,
+                        const FloatB* __restrict__ p_b_block,
+                        FloatC* __restrict__ p_c_thread,
                         Accumulator f_accum) const
     {
         constexpr auto True = integral_constant<bool, true>{};
         constexpr auto False = integral_constant<bool, false>{};
 
-        const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
-        const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
-        const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
+        constexpr auto a_block_mtx = BlockMatrixA{};
+        constexpr auto b_block_mtx = BlockMatrixB{};
+        constexpr auto c_thread_mtx = ThreadMatrixC{};
 
         constexpr unsigned M = a_block_mtx.NCol();
         constexpr unsigned N = b_block_mtx.NCol();
@@ -571,22 +571,18 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr unsigned NPerThread = c_thread_mtx.NCol();
 
         // thread A, B for GEMM
-        const auto a_thread_mtx = make_ConstantMatrixDescriptor(
-            Number<KPerThreadLoop>{}, Number<MPerThread>{}); // constexpr doesn't compile
+        constexpr auto a_thread_mtx =
+            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<MPerThread>{});
 
-        const auto b_thread_mtx = make_ConstantMatrixDescriptor(
-            Number<KPerThreadLoop>{}, Number<NPerThread>{}); // constexpr doesn't compile
+        constexpr auto b_thread_mtx =
+            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});
 
         // thread A-sub, B-sub for copy
-        const auto a_thread_sub_mtx =
-            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{},
-                                          Number<MPerThreadSubC>{},
-                                          Number<MPerThread>{}); // constexpr doesn't compile
+        constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor(
+            Number<KPerThreadLoop>{}, Number<MPerThreadSubC>{}, Number<MPerThread>{});
 
-        const auto b_thread_sub_mtx =
-            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{},
-                                          Number<NPerThreadSubC>{},
-                                          Number<NPerThread>{}); // constexpr doesn't compile
+        constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor(
+            Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}, Number<NPerThread>{});
 
         FloatA p_a_thread[a_thread_mtx.GetElementSpace()];
         FloatB p_b_thread[b_thread_mtx.GetElementSpace()];
@@ -606,26 +602,26 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
             // copy A-sub to form A
             for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
             {
-                threadwise_matrix_copy(a_block_mtx,
-                                       p_a_block + mMyThreadOffsetA +
-                                           k_begin * a_block_mtx.RowStride() +
-                                           m_repeat * MPerLevel1Cluster,
-                                       a_thread_sub_mtx,
-                                       p_a_thread + m_repeat * MPerThreadSubC,
-                                       a_thread_sub_mtx.GetLengths());
+                threadwise_matrix_copy(
+                    a_block_mtx,
+                    p_a_block + a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
+                        mMyThreadOffsetA,
+                    a_thread_mtx,
+                    p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
+                    a_thread_sub_mtx.GetLengths());
             }
 
 #pragma unroll
             // copy B-sub to form B
             for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
             {
-                threadwise_matrix_copy(b_block_mtx,
-                                       p_b_block + mMyThreadOffsetB +
-                                           k_begin * b_block_mtx.RowStride() +
-                                           n_repeat * NPerLevel1Cluster,
-                                       b_thread_sub_mtx,
-                                       p_b_thread + n_repeat * NPerThreadSubC,
-                                       b_thread_sub_mtx.GetLengths());
+                threadwise_matrix_copy(
+                    b_block_mtx,
+                    p_b_block + b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
+                        mMyThreadOffsetB,
+                    b_thread_mtx,
+                    p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
+                    b_thread_sub_mtx.GetLengths());
             }
 
             // C = A * B
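The copy rewrite above swaps hand-rolled stride arithmetic (k_begin * RowStride() + m_repeat * MPerLevel1Cluster) for Get1dIndex(row, col), which should compute the same offset for a row-major descriptor while keeping the (row, column) intent explicit; the destination likewise becomes a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC) instead of a raw offset. A small equivalence sketch, assuming Get1dIndex(row, col) == row * RowStride() + col:

    // Equivalence sketch; row_stride and get_1d_index are illustrative stand-ins.
    constexpr unsigned row_stride = 128; // e.g. the A block's row stride
    constexpr unsigned get_1d_index(unsigned row, unsigned col) { return row * row_stride + col; }

    static_assert(get_1d_index(2, 64) == 2 * row_stride + 64,
                  "Get1dIndex form equals the old manual offset arithmetic");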
@@ -778,4 +774,144 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                 f_accum);
         }
     }
+
+    template <class FloatA, class FloatB, class FloatC, class Accumulator>
+    __device__ void Run_v2(const FloatA* __restrict__ p_a_block,
+                           const FloatB* __restrict__ p_b_block,
+                           FloatC* __restrict__ p_c_thread,
+                           Accumulator f_accum) const
+    {
+        constexpr auto True = integral_constant<bool, true>{};
+        constexpr auto False = integral_constant<bool, false>{};
+
+        constexpr auto a_block_mtx = BlockMatrixA{};
+        constexpr auto b_block_mtx = BlockMatrixB{};
+        constexpr auto c_thread_mtx = ThreadMatrixC{};
+
+        constexpr unsigned M = a_block_mtx.NCol();
+        constexpr unsigned N = b_block_mtx.NCol();
+        constexpr unsigned K = a_block_mtx.NRow();
+
+        constexpr unsigned MPerThread = c_thread_mtx.NRow();
+        constexpr unsigned NPerThread = c_thread_mtx.NCol();
+
+        // thread A-sub, B-sub, C-sub
+        constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor(
+            Number<KPerThreadLoop>{}, Number<MPerThreadSubC>{}, Number<MPerThread>{});
+
+        constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor(
+            Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}, Number<NPerThread>{});
+
+        constexpr auto c_thread_sub_mtx = make_ConstantMatrixDescriptor(
+            Number<MPerThreadSubC>{}, Number<NPerThreadSubC>{}, Number<NPerThread>{});
+
+        // thread A, B
+        constexpr auto a_thread_mtx =
+            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<MPerThread>{});
+
+        constexpr auto b_thread_mtx =
+            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});
+
+        FloatA p_a_thread[a_thread_mtx.GetElementSpace()];
+        FloatB p_b_thread[b_thread_mtx.GetElementSpace()];
+
+        constexpr unsigned MPerLevel1Cluster = MPerThreadSubC * MLevel0Cluster * MLevel1Cluster;
+        constexpr unsigned NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster;
+
+        constexpr unsigned MRepeat = MPerThread / MPerThreadSubC;
+        constexpr unsigned NRepeat = NPerThread / NPerThreadSubC;
+
+#pragma unroll
+        // loop over k
+        for(unsigned k_begin = 0; k_begin < K; k_begin += KPerThreadLoop)
+        {
+            // C-sub(s) in first row-wise subblock of C
+            {
+                // copy first A-sub
+                threadwise_matrix_copy(a_block_mtx,
+                                       p_a_block + a_block_mtx.Get1dIndex(k_begin, 0) +
+                                           mMyThreadOffsetA,
+                                       a_thread_mtx,
+                                       p_a_thread,
+                                       a_thread_sub_mtx.GetLengths());
+
+                // copy first B-sub
+                threadwise_matrix_copy(b_block_mtx,
+                                       p_b_block + b_block_mtx.Get1dIndex(k_begin, 0) +
+                                           mMyThreadOffsetB,
+                                       b_thread_mtx,
+                                       p_b_thread,
+                                       b_thread_sub_mtx.GetLengths());
+
+                // do first sub GEMM
+                threadwise_gemm(a_thread_sub_mtx,
+                                True,
+                                p_a_thread,
+                                b_thread_sub_mtx,
+                                False,
+                                p_b_thread,
+                                c_thread_sub_mtx,
+                                False,
+                                p_c_thread,
+                                f_accum);
+
+#pragma unroll
+                // copy next B-sub, and do GEMM
+                for(unsigned n_repeat = 1; n_repeat < NRepeat; ++n_repeat)
+                {
+                    threadwise_matrix_copy(
+                        b_block_mtx,
+                        p_b_block + b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
+                            mMyThreadOffsetB,
+                        b_thread_mtx,
+                        p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
+                        b_thread_sub_mtx.GetLengths());
+
+                    threadwise_gemm(
+                        a_thread_sub_mtx,
+                        True,
+                        p_a_thread,
+                        b_thread_sub_mtx,
+                        False,
+                        p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
+                        c_thread_sub_mtx,
+                        False,
+                        p_c_thread + c_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
+                        f_accum);
+                }
+            }
+
+#pragma unroll
+            // loop over rest of row-wise subblock
+            // all B-sub(s) has been copied, so only A-sub(s) need to be copied
+            for(unsigned m_repeat = 1; m_repeat < MRepeat; ++m_repeat)
+            {
+                // copy a A-sub
+                threadwise_matrix_copy(
+                    a_block_mtx,
+                    p_a_block + a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
+                        mMyThreadOffsetA,
+                    a_thread_mtx,
+                    p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
+                    a_thread_sub_mtx.GetLengths());
+
+                // do some GEMMs
+                for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
+                {
+                    threadwise_gemm(
+                        a_thread_sub_mtx,
+                        True,
+                        p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
+                        b_thread_sub_mtx,
+                        False,
+                        p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
+                        c_thread_sub_mtx,
+                        False,
+                        p_c_thread + c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC,
+                                                             n_repeat * NPerThreadSubC),
+                        f_accum);
+                }
+            }
+        }
+    }
 };
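Run_v2's schedule differs from Run's copy-everything-then-multiply structure: per k-slice it copies only the first A-sub and B-sub, issues that sub-GEMM immediately, fetches each remaining B-sub just before its use, and in later row sub-blocks copies only the A-sub, since every B-sub is already resident in registers — presumably to give the compiler room to overlap LDS reads with math. The schedule in outline (copy_a_sub/copy_b_sub/gemm_sub are hypothetical stand-ins for the threadwise calls above):

    void copy_a_sub(unsigned), copy_b_sub(unsigned), gemm_sub(unsigned, unsigned);

    void run_v2_outline(unsigned K, unsigned KPerThreadLoop, unsigned MRepeat, unsigned NRepeat)
    {
        for(unsigned k = 0; k < K; k += KPerThreadLoop)
        {
            copy_a_sub(0);
            copy_b_sub(0);
            gemm_sub(0, 0); // first sub-GEMM issued as early as possible
            for(unsigned n = 1; n < NRepeat; ++n)
            {
                copy_b_sub(n); // fetch next B-sub right before its use
                gemm_sub(0, n);
            }
            for(unsigned m = 1; m < MRepeat; ++m)
            {
                copy_a_sub(m); // B-subs already resident
                for(unsigned n = 0; n < NRepeat; ++n)
                    gemm_sub(m, n);
            }
        }
    }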
@@ -201,11 +201,11 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(InGlobalDesc,
     // set threadwise output tensor to 0
     threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread);
 
-    Float* p_in_global_block_begin =
+    const Float* p_in_global_block_begin =
         p_in_global + in_chwn_global_desc.Get1dIndex(
                           0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);
 
-    Float* p_wei_global_block_begin =
+    const Float* p_wei_global_block_begin =
         p_wei_global + wei_csrk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
 
     for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
@@ -213,15 +213,11 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(InGlobalDesc,
                 p_wei_global_block_begin += CPerBlock * wei_csrk_global_desc.GetStride(I0),
                 __syncthreads())
     {
-#if 1
         // input: global mem to LDS,
         blockwise_in_copy.Run(p_in_global_block_begin, p_in_block);
-#endif
 
-#if 1
         // weight: global mem to LDS,
         blockwise_wei_copy.Run(p_wei_global_block_begin, p_wei_block);
-#endif
 
         __syncthreads();
 
@@ -36,11 +36,11 @@ template <unsigned GridSize,
           unsigned WeiBlockCopyDataPerRead>
 __global__ void
 gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
-                                                    Float* const __restrict__ p_in_global,
+                                                    const Float* const __restrict__ p_in_global,
                                                     WeiGlobalDesc,
-                                                    Float* const __restrict__ p_wei_global,
+                                                    const Float* const __restrict__ p_wei_global,
                                                     OutGlobalDesc,
-                                                    Float* __restrict__ p_out_global)
+                                                    Float* const __restrict__ p_out_global)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
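Two distinct consts are in play in this signature: the added leading const on the inputs protects the pointed-to data and is the part visible to callers, while the trailing const in Float* const __restrict__ p_out_global is top-level — it only keeps the kernel body from reseating the pointer and does not change the function type. In isolation:

    void kernel_like(const float* const p_in, // data read-only and pointer fixed
                     float* const p_out)      // data writable, pointer fixed
    {
        p_out[0] = 2.0f * p_in[0]; // fine
        // p_in[0] = 1.0f;         // error: pointee is const
        // p_out   = nullptr;      // error: top-level const forbids reseating
    }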
@@ -228,10 +228,10 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
     // set threadwise output tensor to 0
     threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread);
 
-    Float* p_in_global_block_offset =
+    const Float* p_in_global_block_offset =
         p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
 
-    Float* p_wei_global_block_offset =
+    const Float* p_wei_global_block_offset =
         p_wei_global + wei_csrk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
 
     for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
@@ -256,7 +256,9 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
 
 #if 1
             blockwise_gemm.Run
-#else
+#elif 0
+            blockwise_gemm.Run_v2
+#elif 0
             blockwise_gemm.Run_RegisterDoubleBuffer
 #endif
             (p_wei_block + wei_csrk_block_desc.Get1dIndex(0, s, r, 0),
@@ -123,7 +123,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
                                                      decltype(in_cb_global_desc),
                                                      decltype(in_cb_block_desc),
                                                      decltype(in_cb_block_desc.GetLengths())>{};
-#elif 1
+#elif 0
     const auto blockwise_in_copy = Blockwise2dTensorCopy2<BlockSize,
                                                           Float,
                                                           decltype(in_cb_global_desc),
@@ -149,7 +149,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
                                                      decltype(wei_ek_global_desc),
                                                      decltype(wei_ek_block_desc),
                                                      decltype(wei_ek_block_desc.GetLengths())>{};
-#elif 1
+#elif 0
     const auto blockwise_wei_copy = Blockwise2dTensorCopy2<BlockSize,
                                                            Float,
                                                            decltype(wei_ek_global_desc),
@@ -226,10 +226,10 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
     __shared__ Float p_in_block_1[max_align * ((in_block_size + max_align - 1) / max_align)];
     __shared__ Float p_wei_block_1[max_align * ((wei_block_size + max_align - 1) / max_align)];
 
-    Float* p_in_global_block_offset =
+    const Float* p_in_global_block_offset =
         p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
 
-    Float* p_wei_global_block_offset =
+    const Float* p_wei_global_block_offset =
         p_wei_global + wei_csrk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
 
     // preload data into LDS
@@ -272,15 +272,15 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
             for(unsigned r = 0; r < R; ++r)
             {
                 auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
-#if 0
+#if 1
                 blockwise_gemm.Run
 #else
                 blockwise_gemm.Run_RegisterDoubleBuffer
 #endif
-                (p_wei_block_now + wei_csrk_block_desc.Get1dIndex(0, s, r, 0),
-                 p_in_block_now + s * Wi + r,
-                 p_out_thread,
-                 f_accum);
+                    (p_wei_block_now + wei_csrk_block_desc.Get1dIndex(0, s, r, 0),
+                     p_in_block_now + s * Wi + r,
+                     p_out_thread,
+                     f_accum);
             }
         }
     }
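For context on the surrounding kernel: the _lds_double_buffer variant keeps two LDS tiles per operand (p_in_block_0/p_in_block_1, p_wei_block_0/p_wei_block_1), preloads the first C-slice, then alternates buffers so the copy of slice c+CPerBlock can proceed while blockwise_gemm consumes slice c. A stripped-down outline of that structure (helper names hypothetical; the real kernel also places __syncthreads() between producer and consumer):

    void load_to_lds(float* lds_buf, unsigned c_begin); // stand-in for the blockwise copies
    void consume(const float* lds_buf);                 // stand-in for blockwise_gemm.Run

    void lds_double_buffer_outline(float* buf0, float* buf1, unsigned C, unsigned CPerBlock)
    {
        float* now  = buf0;
        float* next = buf1;
        load_to_lds(now, 0); // preload first slice
        for(unsigned c = 0; c < C; c += CPerBlock)
        {
            if(c + CPerBlock < C)
                load_to_lds(next, c + CPerBlock); // prefetch while computing
            consume(now);
            float* tmp = now; now = next; next = tmp; // ping-pong
        }
    }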
@@ -1,11 +1,14 @@
 #pragma once
 
 template <class Float, class SrcMatrix, class DstMatrix, unsigned NRow, unsigned NCol>
-__device__ void
-threadwise_matrix_copy(SrcMatrix, Float* const p_src, DstMatrix, Float* p_dst, Sequence<NRow, NCol>)
+__device__ void threadwise_matrix_copy(SrcMatrix,
+                                       const Float* __restrict__ p_src,
+                                       DstMatrix,
+                                       Float* __restrict__ p_dst,
+                                       Sequence<NRow, NCol>)
 {
-    const auto src_mtx = SrcMatrix{}; // constexpr doesn't compile
-    const auto dst_mtx = DstMatrix{}; // constexpr doesn't compile
+    constexpr auto src_mtx = SrcMatrix{};
+    constexpr auto dst_mtx = DstMatrix{};
 
     for(unsigned i = 0; i < NRow; ++i)
     {
@@ -31,30 +34,30 @@ template <class MatrixA,
           class Accumulator>
 __device__ void threadwise_gemm(MatrixA,
                                 integral_constant<bool, TransA>,
-                                FloatA* const p_a_thread,
+                                const FloatA* __restrict__ p_a_thread,
                                 MatrixB,
                                 integral_constant<bool, TransB>,
-                                FloatB* const p_b_thread,
+                                const FloatB* __restrict__ p_b_thread,
                                 MatrixC,
                                 integral_constant<bool, TransC>,
-                                FloatC* p_c_thread,
+                                FloatC* __restrict__ p_c_thread,
                                 Accumulator f_accum)
 {
     if(TransA && (!TransB) && (!TransC))
     {
-        const auto a_mtx = MatrixA{}; // constexpr doesn't compile
-        const auto b_mtx = MatrixB{}; // constexpr doesn't compile
-        const auto c_mtx = MatrixC{}; // constexpr doesn't compile
+        constexpr auto a_mtx = MatrixA{};
+        constexpr auto b_mtx = MatrixB{};
+        constexpr auto c_mtx = MatrixC{};
 
         constexpr unsigned M = c_mtx.NRow();
         constexpr unsigned N = c_mtx.NCol();
         constexpr unsigned K = a_mtx.NRow(); // A is transposed
 
-        for(unsigned i = 0; i < M; ++i)
+        for(unsigned k = 0; k < K; ++k)
         {
-            for(unsigned j = 0; j < N; ++j)
+            for(unsigned i = 0; i < M; ++i)
             {
-                for(unsigned k = 0; k < K; ++k)
+                for(unsigned j = 0; j < N; ++j)
                 {
                     const unsigned aindex = a_mtx.Get1dIndex(k, i); // A is transposed
                     const unsigned bindex = b_mtx.Get1dIndex(k, j);
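The loop reorder at the end is the substantive change in threadwise_gemm: moving k outermost turns the inner (i, j) pair into a rank-1 update, so each a(k, i) element is loaded once and reused across the whole j loop instead of being re-read for every (i, j, k) iteration. The new ordering as standalone code (plain-pointer version for illustration; A is stored transposed as K x M, row-major):

    void threadwise_gemm_kij(const float* a, const float* b, float* c,
                             unsigned M, unsigned N, unsigned K)
    {
        for(unsigned k = 0; k < K; ++k)
            for(unsigned i = 0; i < M; ++i)
            {
                const float a_ki = a[k * M + i]; // loaded once, reused for all j
                for(unsigned j = 0; j < N; ++j)
                    c[i * N + j] += a_ki * b[k * N + j]; // rank-1 update of row i
            }
    }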