From fdb7d416618e4329d3a0dbe33d43cc8d078b5a41 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 2 Apr 2019 17:58:44 -0500 Subject: [PATCH 1/7] cleaning up dead code [ROCm/composable_kernel commit: bdbc0eaad175cd4054105cfb3fc812a8526e3b49] --- ...icit_gemm_convolution_2_chwn_cyxk_khwn.hpp | 2 +- driver/driver.hip.cpp | 2 +- {build => script}/cmake-cuda.sh | 0 {build => script}/cmake-hip.sh | 0 src/include/blockwise_batched_gemm.hip.hpp | 455 ------------------ src/include/blockwise_gemm.hip.hpp | 217 +-------- 6 files changed, 6 insertions(+), 670 deletions(-) rename {build => script}/cmake-cuda.sh (100%) rename {build => script}/cmake-hip.sh (100%) diff --git a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp index 497aa3e9c1..a3489bc8cc 100644 --- a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp +++ b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp @@ -221,7 +221,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, constexpr index_t BlockSize = 128; #elif 0 - // 1x1, 14x14, Vega 20, hack CPerBlock = 1 + // 1x1, 14x14, Vega 20, hack CPerBlock = 1 for debugging constexpr index_t BPerBlock = 64; constexpr index_t KPerBlock = 128; constexpr index_t CPerBlock = 1; diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index 0ea091e607..a83e4082c7 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -580,7 +580,7 @@ int main(int argc, char* argv[]) constexpr index_t HPad = 0; constexpr index_t WPad = 0; -#elif 1 +#elif 0 // 1x1 filter, 14x14 image, C = 2048 constexpr index_t N = 128; constexpr index_t C = 2048; diff --git a/build/cmake-cuda.sh b/script/cmake-cuda.sh similarity index 100% rename from build/cmake-cuda.sh rename to script/cmake-cuda.sh diff --git a/build/cmake-hip.sh b/script/cmake-hip.sh similarity index 100% rename from build/cmake-hip.sh rename to script/cmake-hip.sh diff --git 
a/src/include/blockwise_batched_gemm.hip.hpp b/src/include/blockwise_batched_gemm.hip.hpp index bf2777f140..deba68a261 100644 --- a/src/include/blockwise_batched_gemm.hip.hpp +++ b/src/include/blockwise_batched_gemm.hip.hpp @@ -1,231 +1,6 @@ #pragma once #include "threadwise_gemm.hip.hpp" -template -struct Blockwise1dStridedBatchedGemmBlockABlockBThreadC -{ - index_t mMyThreadOffsetA = 0; - index_t mMyThreadOffsetB = 0; - - struct MatrixIndex - { - index_t batch; - index_t row; - index_t col; - }; - - __device__ Blockwise1dStridedBatchedGemmBlockABlockBThreadC() - { - constexpr auto a_block_mtx = BlockMatrixA{}; - constexpr auto b_block_mtx = BlockMatrixB{}; - - const auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id()); - - mMyThreadOffsetA = c_thread_mtx_index.batch * BlockMatrixStrideA + - ((!TransA) ? a_block_mtx.Get1dIndex(c_thread_mtx_index.row, 0) - : a_block_mtx.Get1dIndex(0, c_thread_mtx_index.row)); - - mMyThreadOffsetB = c_thread_mtx_index.batch * BlockMatrixStrideB + - ((!TransB) ? b_block_mtx.Get1dIndex(0, c_thread_mtx_index.col) - : b_block_mtx.Get1dIndex(c_thread_mtx_index.col, 0)); - -#if 0 - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - print_ConstantMatrixDescriptor(BlockMatrixA{}, "a_block_mtx: "); - print_ConstantMatrixDescriptor(BlockMatrixB{}, "b_block_mtx: "); - print_ConstantMatrixDescriptor(ThreadMatrixC{}, "c_thread_mtx: "); - - printf("%u %u, %u %u %u, %u %u\n", - get_block_1d_id(), - get_thread_local_1d_id(), - c_thread_mtx_index.batch, - c_thread_mtx_index.row, - c_thread_mtx_index.col, - mMyThreadOffsetA, - mMyThreadOffsetB); - } -#endif - } - - __device__ MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id) const - { - - if(TransA && (!TransB) && (!TransC)) - { - constexpr auto a_block_mtx = BlockMatrixA{}; - constexpr auto b_block_mtx = BlockMatrixB{}; - - static_assert(a_block_mtx.NRow() == b_block_mtx.NRow(), - "wrong! 
k dimension not consistent!"); - - constexpr index_t MPerBlock = a_block_mtx.NCol(); - constexpr index_t NPerBlock = b_block_mtx.NCol(); - - constexpr auto c_thread_mtx = ThreadMatrixC{}; - - // divide thread work - constexpr index_t MPerThread = c_thread_mtx.NRow(); - constexpr index_t NPerThread = c_thread_mtx.NCol(); - - static_assert(BatchSize % BatchPerThread == 0, "BatchSize % BatchPerThread != 0"); - static_assert(MPerBlock % MPerThread == 0, "MPerBlock % MPerThread != 0"); - static_assert(NPerBlock % NPerThread == 0, "NPerBlock % NPerThread != 0"); - - constexpr index_t BatchThreadWork = (BatchSize + BatchPerThread - 1) / BatchPerThread; - constexpr index_t MThreadWork = (MPerBlock + MPerThread - 1) / MPerThread; - constexpr index_t NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread; - - static_assert(BlockSize == BatchThreadWork * MThreadWork * NThreadWork, - "wrong! wrong BlockSize"); - - if(DistributeThreadAlongColumnFirst) - { - // num of operations can be reduced - const index_t b_work_id = thread_id / (MThreadWork * NThreadWork); - index_t itmp = thread_id - b_work_id * (MThreadWork * NThreadWork); - const index_t m_work_id = itmp / NThreadWork; - const index_t n_work_id = itmp - m_work_id * NThreadWork; - - return MatrixIndex{ - b_work_id * BatchPerThread, m_work_id * MPerThread, n_work_id * NPerThread}; - } - else - { - // not implemented - assert(false); - } - } - else - { - // not implemented - assert(false); - } - } - - // this should be optimized away if input is known - __device__ static MatrixIndex - GetDistanceFromBeginOfThreadMatrixC(index_t batch_in_c, index_t m_in_c, index_t n_in_c) - { - return MatrixIndex{batch_in_c, m_in_c, n_in_c}; - } - - template - __device__ void Run(const FloatA* __restrict__ p_a_block, - const FloatB* __restrict__ p_b_block, - FloatC* __restrict__ p_c_thread, - Accumulator f_accum) const - { - if(TransA && (!TransB) && (!TransC)) - { - constexpr auto True = integral_constant{}; - constexpr auto False = 
integral_constant{}; - - constexpr auto a_block_mtx = BlockMatrixA{}; - constexpr auto b_block_mtx = BlockMatrixB{}; - constexpr auto c_thread_mtx = ThreadMatrixC{}; - - constexpr index_t KPerBlock = a_block_mtx.NRow(); // A is transposed - - constexpr index_t MPerThread = c_thread_mtx.NRow(); - constexpr index_t NPerThread = c_thread_mtx.NCol(); - - // a is transposed, b is not - constexpr auto a_thread_mtx = - make_ConstantMatrixDescriptor(Number{}, Number{}); - - constexpr auto b_thread_mtx = - make_ConstantMatrixDescriptor(Number{}, Number{}); - - FloatA p_a_thread[a_thread_mtx.GetElementSpace()]; - FloatB p_b_thread[b_thread_mtx.GetElementSpace()]; - - // loop over k - for(index_t k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop) - { - // read first batch of a, b - threadwise_matrix_copy(a_block_mtx, - p_a_block + mMyThreadOffsetA + - k_begin * a_block_mtx.RowStride(), - a_thread_mtx, - p_a_thread, - a_thread_mtx.GetLengths()); - - threadwise_matrix_copy(b_block_mtx, - p_b_block + mMyThreadOffsetB + - k_begin * b_block_mtx.RowStride(), - b_thread_mtx, - p_b_thread, - b_thread_mtx.GetLengths()); - - // loop over batch - for(index_t ib = 0; ib + 1 < BatchPerThread; ++ib) - { - // do current batch of gemm - threadwise_gemm(a_thread_mtx, - True, - p_a_thread, - b_thread_mtx, - False, - p_b_thread, - c_thread_mtx, - False, - p_c_thread + ib * ThreadMatrixStrideC, - f_accum); - - // read next batch of a, b - if(BlockMatrixStrideA != 0) - { - threadwise_matrix_copy(a_block_mtx, - p_a_block + mMyThreadOffsetA + - (ib + 1) * BlockMatrixStrideA + - +k_begin * a_block_mtx.RowStride(), - a_thread_mtx, - p_a_thread, - a_thread_mtx.GetLengths()); - } - - if(BlockMatrixStrideB != 0) - { - threadwise_matrix_copy(b_block_mtx, - p_b_block + mMyThreadOffsetB + - (ib + 1) * BlockMatrixStrideB + - k_begin * b_block_mtx.RowStride(), - b_thread_mtx, - p_b_thread, - b_thread_mtx.GetLengths()); - } - } - - // do last batch of gemm - threadwise_gemm(a_thread_mtx, - True, - 
p_a_thread, - b_thread_mtx, - False, - p_b_thread, - c_thread_mtx, - False, - p_c_thread + (BatchPerThread - 1) * ThreadMatrixStrideC, - f_accum); - } - } - } -}; - template - __device__ void Run_v3(const FloatA* __restrict__ p_a_block, - const FloatB* __restrict__ p_b_block, - FloatC* __restrict__ p_c_thread, - Accumulator f_accum) const - { - constexpr auto True = integral_constant{}; - constexpr auto False = integral_constant{}; - - constexpr auto a_block_mtx = BlockMatrixA{}; - constexpr auto b_block_mtx = BlockMatrixB{}; - constexpr auto c_thread_mtx = ThreadMatrixC{}; - - constexpr index_t KPerBlock = a_block_mtx.NRow(); // A is transposed - - constexpr index_t MPerThread = c_thread_mtx.NRow(); - constexpr index_t NPerThread = c_thread_mtx.NCol(); - - // thread A, B for GEMM - // A is transposed, b is not - constexpr auto a_thread_mtx = - make_ConstantMatrixDescriptor(Number{}, Number{}); - - constexpr auto b_thread_mtx = - make_ConstantMatrixDescriptor(Number{}, Number{}); - - // thread A-sub, B-sub for copy - constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor( - Number{}, Number{}, Number{}); - - constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor( - Number{}, Number{}, Number{}); - - FloatA p_a_thread[a_thread_mtx.GetElementSpace()]; - FloatB p_b_thread[b_thread_mtx.GetElementSpace()]; - - constexpr index_t MPerLevel1Cluster = MPerThreadSubC * MLevel0Cluster * MLevel1Cluster; - constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster; - - constexpr index_t MRepeat = MPerThread / MPerThreadSubC; - constexpr index_t NRepeat = NPerThread / NPerThreadSubC; - - // loop over k - //#pragma unroll - for(index_t k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop) - { - // read first batch of A, B - // copy A-sub to form A - //#pragma unroll - for(index_t m_repeat = 0; m_repeat < MRepeat; ++m_repeat) - { - for(index_t i = 0; i < a_thread_sub_mtx.NRow(); ++i) - { -#if 1 - for(index_t j = 0; j < 
a_thread_sub_mtx.NCol(); ++j) - { - p_a_thread[a_thread_mtx.Get1dIndex(i, m_repeat * MPerThreadSubC + j)] = - p_a_block[a_block_mtx.Get1dIndex(k_begin + i, - m_repeat * MPerLevel1Cluster + j) + - mMyThreadOffsetA]; - } -#else - static_assert(a_thread_sub_mtx.NCol() == 4, "asm only read 4xfp32"); - -#endif - } - } - - // copy B-sub to form B - //#pragma unroll - for(index_t n_repeat = 0; n_repeat < NRepeat; ++n_repeat) - { - for(index_t i = 0; i < b_thread_sub_mtx.NRow(); ++i) - { - for(index_t j = 0; j < b_thread_sub_mtx.NCol(); ++j) - { - p_b_thread[b_thread_mtx.Get1dIndex(i, n_repeat * NPerThreadSubC + j)] = - p_b_block[b_block_mtx.Get1dIndex(k_begin + i, - n_repeat * MPerLevel1Cluster + j) + - mMyThreadOffsetB]; - } - } - } - - // loop over batch - //#pragma unroll - for(index_t ib = 0; ib + 1 < BatchPerThread; ++ib) - { - // do current batch of gemm - for(index_t k = 0; k < a_thread_mtx.NRow(); ++k) - { -#if 0 - for(index_t i = 0; i < c_thread_mtx.NRow(); ++i) - { - for(index_t j = 0; j < c_thread_mtx.NCol(); ++j) - { - const index_t aindex = - a_thread_mtx.Get1dIndex(k, i); // A is transposed - const index_t bindex = b_thread_mtx.Get1dIndex(k, j); - const index_t cindex = - c_thread_mtx.Get1dIndex(i, j) + ib * ThreadMatrixStrideC; - - f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]); - } - } -#elif 1 - static_assert(c_thread_mtx.NRow() == 16 && c_thread_mtx.NCol() == 4, - "asm is only for 16x4"); - - const index_t bindex = b_thread_mtx.Get1dIndex(k, 0); - for(index_t i = 0; i < c_thread_mtx.NRow(); ++i) - { - const index_t aindex = a_thread_mtx.Get1dIndex(k, i); // A is transposed - const index_t cindex = c_thread_mtx.Get1dIndex(i, 0); - - asm volatile("\n \ - v_mac_f32 %0, %4, %5 \n \ - v_mac_f32 %1, %4, %6 \n \ - v_mac_f32 %2, %4, %7 \n \ - v_mac_f32 %3, %4, %8 \n \ - " - : "=v"(p_c_thread[cindex + 0]), - "=v"(p_c_thread[cindex + 1]), - "=v"(p_c_thread[cindex + 2]), - "=v"(p_c_thread[cindex + 3]) - : "v"(p_a_thread[aindex]), - 
"v"(p_b_thread[bindex + 0]), - "v"(p_b_thread[bindex + 1]), - "v"(p_b_thread[bindex + 2]), - "v"(p_b_thread[bindex + 3]), - "0"(p_c_thread[cindex + 0]), - "1"(p_c_thread[cindex + 1]), - "2"(p_c_thread[cindex + 2]), - "3"(p_c_thread[cindex + 3])); - } -#endif - } - - // read next batch of a, b - if(BlockMatrixStrideA != 0) - { - //#pragma unroll - for(index_t m_repeat = 0; m_repeat < MRepeat; ++m_repeat) - { - for(index_t i = 0; i < a_thread_sub_mtx.NRow(); ++i) - { - for(index_t j = 0; j < a_thread_sub_mtx.NCol(); ++j) - { - p_a_thread[a_thread_mtx.Get1dIndex(i, - m_repeat * MPerThreadSubC + j)] = - p_a_block[a_block_mtx.Get1dIndex( - k_begin + i, m_repeat * MPerLevel1Cluster + j) + - (ib + 1) * BlockMatrixStrideA + mMyThreadOffsetA]; - } - } - } - } - - if(BlockMatrixStrideB != 0) - { - //#pragma unroll - for(index_t n_repeat = 0; n_repeat < NRepeat; ++n_repeat) - { - for(index_t i = 0; i < b_thread_sub_mtx.NRow(); ++i) - { - for(index_t j = 0; j < b_thread_sub_mtx.NCol(); ++j) - { - p_b_thread[b_thread_mtx.Get1dIndex(i, - n_repeat * NPerThreadSubC + j)] = - p_b_block[b_block_mtx.Get1dIndex( - k_begin + i, n_repeat * MPerLevel1Cluster + j) + - (ib + 1) * BlockMatrixStrideB + mMyThreadOffsetB]; - } - } - } - } - } - - // do last batch of gemm - for(index_t k = 0; k < a_thread_mtx.NRow(); ++k) - { -#if 0 - for(index_t i = 0; i < c_thread_mtx.NRow(); ++i) - { - for(index_t j = 0; j < c_thread_mtx.NCol(); ++j) - { - const index_t aindex = a_thread_mtx.Get1dIndex(k, i); // A is transposed - const index_t bindex = b_thread_mtx.Get1dIndex(k, j); - const index_t cindex = c_thread_mtx.Get1dIndex(i, j) + - (BatchPerThread - 1) * ThreadMatrixStrideC; - - f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]); - } - } -#elif 1 - static_assert(c_thread_mtx.NRow() == 16 && c_thread_mtx.NCol() == 4, - "asm is only for 16x4"); - - const index_t bindex = b_thread_mtx.Get1dIndex(k, 0); - for(index_t i = 0; i < c_thread_mtx.NRow(); ++i) - { - const index_t aindex = 
a_thread_mtx.Get1dIndex(k, i); // A is transposed - const index_t cindex = - c_thread_mtx.Get1dIndex(i, 0) + (BatchPerThread - 1) * ThreadMatrixStrideC; - - asm volatile("\n \ - v_mac_f32 %0, %4, %5 \n \ - v_mac_f32 %1, %4, %6 \n \ - v_mac_f32 %2, %4, %7 \n \ - v_mac_f32 %3, %4, %8 \n \ - " - : "=v"(p_c_thread[cindex + 0]), - "=v"(p_c_thread[cindex + 1]), - "=v"(p_c_thread[cindex + 2]), - "=v"(p_c_thread[cindex + 3]) - : "v"(p_a_thread[aindex]), - "v"(p_b_thread[bindex + 0]), - "v"(p_b_thread[bindex + 1]), - "v"(p_b_thread[bindex + 2]), - "v"(p_b_thread[bindex + 3]), - "0"(p_c_thread[cindex + 0]), - "1"(p_c_thread[cindex + 1]), - "2"(p_c_thread[cindex + 2]), - "3"(p_c_thread[cindex + 3])); - } -#endif - } - } - } - template __device__ void CopyThreadMatrixCToBlockMatrixC(const FloatC* __restrict__ p_c_thread, FloatC* __restrict__ p_c_block) const diff --git a/src/include/blockwise_gemm.hip.hpp b/src/include/blockwise_gemm.hip.hpp index 3e9c57d15f..fee5b704f3 100644 --- a/src/include/blockwise_gemm.hip.hpp +++ b/src/include/blockwise_gemm.hip.hpp @@ -3,215 +3,6 @@ extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]]; -template -struct BlockwiseGemmBlockABlockBThreadC -{ - index_t mMyThreadOffsetA = 0; - index_t mMyThreadOffsetB = 0; - - struct MatrixIndex - { - index_t row; - index_t col; - }; - - __device__ BlockwiseGemmBlockABlockBThreadC() - { - constexpr auto a_block_mtx = BlockMatrixA{}; - constexpr auto b_block_mtx = BlockMatrixB{}; - - const auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id()); - - mMyThreadOffsetA = (!TransA) ? a_block_mtx.Get1dIndex(c_thread_mtx_index.row, 0) - : a_block_mtx.Get1dIndex(0, c_thread_mtx_index.row); - - mMyThreadOffsetB = (!TransB) ? 
b_block_mtx.Get1dIndex(0, c_thread_mtx_index.col) - : b_block_mtx.Get1dIndex(c_thread_mtx_index.col, 0); - -#if 0 - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - print_ConstantMatrixDescriptor(BlockMatrixA{}, "a_block_mtx: "); - print_ConstantMatrixDescriptor(BlockMatrixB{}, "b_block_mtx: "); - print_ConstantMatrixDescriptor(ThreadMatrixC{}, "c_thread_mtx: "); - - printf("%u %u, %u %u %u, %u %u\n", - get_block_1d_id(), - get_thread_local_1d_id(), - c_thread_mtx_index.batch, - c_thread_mtx_index.row, - c_thread_mtx_index.col, - mMyThreadOffsetA, - mMyThreadOffsetB); - } -#endif - } - - __device__ MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id) const - { - - if(TransA && (!TransB) && (!TransC)) - { - constexpr auto a_block_mtx = BlockMatrixA{}; - constexpr auto b_block_mtx = BlockMatrixB{}; - - static_assert(a_block_mtx.NRow() == b_block_mtx.NRow(), - "wrong! k dimension not consistent!"); - - constexpr index_t MPerBlock = a_block_mtx.NCol(); - constexpr index_t NPerBlock = b_block_mtx.NCol(); - - constexpr auto c_thread_mtx = ThreadMatrixC{}; - - // divide thread work - constexpr index_t MPerThread = c_thread_mtx.NRow(); - constexpr index_t NPerThread = c_thread_mtx.NCol(); - - static_assert(MPerBlock % (MPerThread * MThreadPerCluster) == 0, - "MPerBlock % (MPerThread * MThreadPerCluster) != 0"); - - static_assert(NPerBlock % (NPerThread * NThreadPerCluster) == 0, - "NPerBlock % (NPerThread * NThreadPerCluster) != 0"); - - constexpr index_t MClusterWork = - (MPerBlock + MPerThread * MThreadPerCluster - 1) / (MPerThread * MThreadPerCluster); - - constexpr index_t NClusterWork = - (NPerBlock + NPerThread * NThreadPerCluster - 1) / (NPerThread * NThreadPerCluster); - - static_assert(BlockSize == - (MClusterWork * MThreadPerCluster) * - (NClusterWork * NThreadPerCluster), - "wrong! 
wrong BlockSize"); - - if(DistributeThreadAlongColumnFirst) - { - const index_t cluster_work_block_id = - thread_id / (MThreadPerCluster * NThreadPerCluster); - - const index_t thread_work_cluster_id = - thread_id - cluster_work_block_id * (MThreadPerCluster * NThreadPerCluster); - - const index_t m_cluster_work_block_id = cluster_work_block_id / NClusterWork; - const index_t n_cluster_work_block_id = - cluster_work_block_id - m_cluster_work_block_id * NClusterWork; - - const index_t m_thread_work_cluster_id = thread_work_cluster_id / NThreadPerCluster; - const index_t n_thread_work_cluster_id = - thread_work_cluster_id - m_thread_work_cluster_id * NThreadPerCluster; - -#if 0 - if(get_block_1d_id() == 0) - { - printf("%u %u, \t" - "MClusterWork %u MThreadPerCluster %u NClusterWork %u NThreadPerCluster %u \t" - "m_cluster_work_block_id %u n_cluster_work_block_id %u \t" - "m_thread_work_cluster_id %u n_thread_work_cluster_id %u \t" - "\n", - get_block_1d_id(), get_thread_local_1d_id(), - MClusterWork, MThreadPerCluster, NClusterWork, NThreadPerCluster, - m_cluster_work_block_id, n_cluster_work_block_id, - m_thread_work_cluster_id, n_thread_work_cluster_id); - } -#endif - - return MatrixIndex{m_cluster_work_block_id * (MThreadPerCluster * MPerThread) + - m_thread_work_cluster_id * MPerThread, - n_cluster_work_block_id * (NThreadPerCluster * NPerThread) + - n_thread_work_cluster_id * NPerThread}; - } - else - { - // not implemented - assert(false); - } - } - else - { - // not implemented - assert(false); - } - } - - // this should be optimized away if input is known - __device__ static MatrixIndex GetDistanceFromBeginOfThreadMatrixC(index_t m_in_c, - index_t n_in_c) - { - return MatrixIndex{m_in_c, n_in_c}; - } - - template - __device__ void Run(const FloatA* __restrict__ p_a_block, - const FloatB* __restrict__ p_b_block, - FloatC* __restrict__ p_c_thread, - Accumulator f_accum) const - { - if(TransA && (!TransB) && (!TransC)) - { - constexpr auto True = 
integral_constant{}; - constexpr auto False = integral_constant{}; - - constexpr auto a_block_mtx = BlockMatrixA{}; - constexpr auto b_block_mtx = BlockMatrixB{}; - constexpr auto c_thread_mtx = ThreadMatrixC{}; - - constexpr index_t KPerBlock = a_block_mtx.NRow(); // A is transposed - - constexpr index_t MPerThread = c_thread_mtx.NRow(); - constexpr index_t NPerThread = c_thread_mtx.NCol(); - - // a is transposed, b is not - constexpr auto a_thread_mtx = - make_ConstantMatrixDescriptor(Number{}, Number{}); - - constexpr auto b_thread_mtx = - make_ConstantMatrixDescriptor(Number{}, Number{}); - - FloatA p_a_thread[a_thread_mtx.GetElementSpace()]; - FloatB p_b_thread[b_thread_mtx.GetElementSpace()]; - - // loop over k - for(index_t k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop) - { - threadwise_matrix_copy(a_block_mtx, - p_a_block + mMyThreadOffsetA + - k_begin * a_block_mtx.RowStride(), - a_thread_mtx, - p_a_thread, - a_thread_mtx.GetLengths()); - - threadwise_matrix_copy(b_block_mtx, - p_b_block + mMyThreadOffsetB + - k_begin * b_block_mtx.RowStride(), - b_thread_mtx, - p_b_thread, - b_thread_mtx.GetLengths()); - - threadwise_gemm(a_thread_mtx, - True, - p_a_thread, - b_thread_mtx, - False, - p_b_thread, - c_thread_mtx, - False, - p_c_thread, - f_accum); - } - } - } -}; - // if following number are power of 2, index calculation shall be greatly reduced: // MPerThreadSubC, NPerThreadSubC, MLevel0Cluster, NLevel0Cluster, MLevel1Cluster, NLevel1Cluster template - __device__ void Run_v2(const FloatA* __restrict__ p_a_block, - const FloatB* __restrict__ p_b_block, - FloatC* __restrict__ p_c_thread, - Accumulator f_accum) const + __device__ void Run_PipelineReadAndCompute(const FloatA* __restrict__ p_a_block, + const FloatB* __restrict__ p_b_block, + FloatC* __restrict__ p_c_thread, + Accumulator f_accum) const { constexpr auto True = integral_constant{}; constexpr auto False = integral_constant{}; From fb723b967f2adc8270b5920732c87227c7993ac7 Mon Sep 17 
00:00:00 2001 From: Chao Liu Date: Tue, 2 Apr 2019 19:37:02 -0500 Subject: [PATCH 2/7] puting gridwise convolution into its own class [ROCm/composable_kernel commit: 0b41ca2d9e20ad41dc522942aaffb2ebc8ef9b92] --- ...licit_gemm_convolution_2_chwn_cyxk_khwn.hpp | 12 +++++++----- ...on_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp} | 18 +++++++----------- .../gridwise_convolution_wrapper.hip.hpp | 10 ++++++++++ 3 files changed, 24 insertions(+), 16 deletions(-) rename src/include/{gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp => gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp} (95%) create mode 100644 src/include/gridwise_convolution_wrapper.hip.hpp diff --git a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp index a3489bc8cc..3aae266e4c 100644 --- a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp +++ b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp @@ -1,8 +1,9 @@ #pragma once #include #include "device.hpp" -#include "gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp" -#include "gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp" +#include "gridwise_convolution_wrapper.hip.hpp" +#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp" +//#include "gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp" template void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, @@ -272,7 +273,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, { constexpr auto gridwise_conv = #if 1 - gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn + GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn #else gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer #endif @@ -301,11 +302,12 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, WeiBlockCopyThreadPerDim0, WeiBlockCopyThreadPerDim1, InBlockCopyDataPerRead, - WeiBlockCopyDataPerRead>(); + 
WeiBlockCopyDataPerRead>{}; - float time = launch_kernel(gridwise_conv.Run, + float time = launch_kernel(run_gridwise_convolution, dim3(GridSize), dim3(BlockSize), + gridwise_conv, static_cast(in_chwn_device_buf.GetDeviceBuffer()), static_cast(wei_cyxk_device_buf.GetDeviceBuffer()), static_cast(out_khwn_device_buf.GetDeviceBuffer())); diff --git a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp similarity index 95% rename from src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp rename to src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp index da689bc6b9..5f0d353465 100644 --- a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp @@ -34,10 +34,11 @@ template -class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn +struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn { - public: - __host__ __device__ static index_t GetSharedMemorySize() + __host__ __device__ constexpr GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn() {} + + __host__ __device__ constexpr index_t GetSharedMemoryUsage() const { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -46,7 +47,6 @@ class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn constexpr auto in_chwn_global_desc = InGlobalDesc{}; constexpr auto wei_cyxk_global_desc = WeiGlobalDesc{}; - constexpr auto out_khwn_global_desc = OutGlobalDesc{}; constexpr index_t Hi = in_chwn_global_desc.GetLength(I1); constexpr index_t Wi = in_chwn_global_desc.GetLength(I2); @@ -64,10 +64,6 @@ class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn constexpr auto wei_cyxk_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); - // tensor view of threadwise output in register - constexpr auto out_kb_thread_desc = - make_ConstantTensorDescriptor(Sequence{}); - constexpr 
index_t max_align = mod_conv::max(InBlockCopyDataPerRead, WeiBlockCopyDataPerRead); @@ -81,9 +77,9 @@ class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn return (in_block_element_space + wei_block_element_space) * sizeof(Float); } - __global__ static void Run(const Float* const __restrict__ p_in_global, - const Float* const __restrict__ p_wei_global, - Float* const __restrict__ p_out_global) + __device__ void Run(const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) const { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; diff --git a/src/include/gridwise_convolution_wrapper.hip.hpp b/src/include/gridwise_convolution_wrapper.hip.hpp new file mode 100644 index 0000000000..e0abfda3b6 --- /dev/null +++ b/src/include/gridwise_convolution_wrapper.hip.hpp @@ -0,0 +1,10 @@ +#pragma once + +template +__global__ void run_gridwise_convolution(GridwiseConvolution, + const T* const __restrict__ p_in_global, + const T* const __restrict__ p_wei_global, + T* const __restrict__ p_out_global) +{ + GridwiseConvolution{}.Run(p_in_global, p_wei_global, p_out_global); +} From e423954e6e6c14271f7250a7c908c8b0e9c12442 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 2 Apr 2019 20:18:01 -0500 Subject: [PATCH 3/7] puting gridwise convolution into its own class [ROCm/composable_kernel commit: 6290e0b080ce54eca53c871f252ee312f8435a62] --- ...icit_gemm_convolution_2_chwn_cyxk_khwn.hpp | 4 +- src/include/blockwise_gemm.hip.hpp | 7 +- src/include/common.hip.hpp | 10 +- ...on_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp | 4 +- ...2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp | 381 ------------------ src/include/threadwise_gemm.hip.hpp | 8 +- 6 files changed, 21 insertions(+), 393 deletions(-) delete mode 100644 src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp diff --git a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp 
b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp index 3aae266e4c..9cebb0200a 100644 --- a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp +++ b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp @@ -3,7 +3,7 @@ #include "device.hpp" #include "gridwise_convolution_wrapper.hip.hpp" #include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp" -//#include "gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp" +#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp" template void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, @@ -275,7 +275,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, #if 1 GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn #else - gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer + GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer #endif {}; constexpr auto False = integral_constant{}; @@ -711,6 +710,10 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 "67"(p_c_thread[63])); #endif } +#else + printf("asm only support on HIP backend\n"); + assert(false); +#endif } template diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp index ca083ee640..fc0d7d8bc3 100644 --- a/src/include/common.hip.hpp +++ b/src/include/common.hip.hpp @@ -26,7 +26,7 @@ __host__ __device__ constexpr index_t integer_divide_ceil(index_t a, index_t b) return (a + b - 1) / b; } -namespace mod_conv { +namespace mod_conv { // namespace mod_conv template __host__ __device__ constexpr T max(T x, T y) { @@ -62,4 +62,10 @@ __host__ __device__ constexpr T min(T x, Ts... xs) return x < y ? 
x : y; } -} +}// namespace mod_conv + +#if DEVICE_BACKEND_HIP +// cast a pointer of LDS to its address +extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]]; +#endif + diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp index 5f0d353465..a95363fa60 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp @@ -280,9 +280,9 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn for(index_t x = 0; x < X; ++x) { auto f_accum = [](auto& acc, const auto&& v) { acc += v; }; -#if 0 +#if 1 blockwise_gemm.Run -#elif 1 +#elif 0 blockwise_gemm.Run_RegisterDoubleBuffer #elif 0 blockwise_gemm.Run_asm diff --git a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp deleted file mode 100644 index 488b0a0da7..0000000000 --- a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp +++ /dev/null @@ -1,381 +0,0 @@ -#pragma once -#include "common.hip.hpp" -#include "ConstantTensorDescriptor.hip.hpp" -#include "ConstantMatrixDescriptor.hip.hpp" -#include "blockwise_4d_tensor_op.hip.hpp" -#include "blockwise_2d_tensor_op.hip.hpp" -#include "threadwise_2d_tensor_op.hip.hpp" -#include "blockwise_gemm.hip.hpp" - -// define B = flatten(N, Hi, Wi) -template -class gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer -{ - public: - __global__ static void Run(const Float* const __restrict__ p_in_global, - const Float* const __restrict__ p_wei_global, - Float* const __restrict__ p_out_global) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - constexpr auto in_chwn_global_desc = 
InGlobalDesc{}; - constexpr auto wei_cyxk_global_desc = WeiGlobalDesc{}; - constexpr auto out_khwn_global_desc = OutGlobalDesc{}; - - constexpr index_t C = in_chwn_global_desc.GetLength(I0); - constexpr index_t Hi = in_chwn_global_desc.GetLength(I1); - constexpr index_t Wi = in_chwn_global_desc.GetLength(I2); - constexpr index_t N = in_chwn_global_desc.GetLength(I3); - - constexpr index_t K = out_khwn_global_desc.GetLength(I0); - constexpr index_t Ho = out_khwn_global_desc.GetLength(I1); - constexpr index_t Wo = out_khwn_global_desc.GetLength(I2); - - constexpr index_t Y = wei_cyxk_global_desc.GetLength(I1); - constexpr index_t X = wei_cyxk_global_desc.GetLength(I2); - - constexpr index_t B = N * Hi * Wi; - constexpr index_t BGhostRead = (Y - 1) * Wi + (X - 1); - - // divide block work by 2d: [K, B] - constexpr index_t KBlockWork = (K + KPerBlock - 1) / KPerBlock; - constexpr index_t BBlockWork = (B + BPerBlock - 1) / BPerBlock; - - const index_t k_block_work_id = get_block_1d_id() / BBlockWork; - const index_t b_block_work_id = get_block_1d_id() - k_block_work_id * BBlockWork; - - const index_t k_block_data_begin = k_block_work_id * KPerBlock; - const index_t b_block_data_begin = b_block_work_id * BPerBlock; - - // flattend (2d) tensor view of gridwise input - constexpr auto in_cb_global_desc = make_ConstantTensorDescriptor(Sequence{}); - constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence{}); - - // tensor view of blockwise input and weight - // be careful of alignment - constexpr auto in_cb_block_desc = make_ConstantTensorDescriptor_aligned( - Sequence{}, Number{}); - - constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor_aligned( - Sequence{}, Number{}); - - constexpr auto wei_cyxk_block_desc = make_ConstantTensorDescriptor_aligned( - Sequence{}, Number{}); - - // tensor view of threadwise output in register - constexpr auto out_kb_thread_desc = - make_ConstantTensorDescriptor(Sequence{}); - -#if 0 - 
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - print_ConstantTensorDescriptor(in_chwn_global_desc, "in_chwn_global_desc"); - print_ConstantTensorDescriptor(wei_cyxk_global_desc, "wei_cyxk_global_desc"); - print_ConstantTensorDescriptor(out_khwn_global_desc, "out_khwn_global_desc"); - - print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc"); - print_ConstantTensorDescriptor(wei_ek_global_desc, "wei_ek_global_desc"); - - print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc"); - print_ConstantTensorDescriptor(wei_cyxk_block_desc, "wei_cyxk_block_desc"); - print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc"); - print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc"); - - printf("KPerBlock %u\n", KPerBlock); - } -#endif - -// blockwise in copy -// formmat is [CPerBlock,BPerBlock + BGhostRead] -#if 0 - const auto blockwise_in_copy = - Blockwise2dTensorCopy1{}; -#elif 0 - const auto blockwise_in_copy = - Blockwise2dTensorCopy2{}; -#elif 1 - const auto blockwise_in_copy = - Blockwise2dTensorCopy3{}; -#endif - -// blockwise wei copy -// format is [CPerBlock*Y*X,KPerBlock] -#if 0 - const auto blockwise_wei_copy = - Blockwise2dTensorCopy1{}; -#elif 0 - const auto blockwise_wei_copy = - Blockwise2dTensorCopy2{}; -#elif 1 - const auto blockwise_wei_copy = - Blockwise2dTensorCopy3{}; -#endif - - // a series of blockwise GEMM - // c_mtx += transpose(a_mtx) * b_mtx - // a_mtx and b_mtx saved in LDS, c_mtx saved in register - // a_mtx[C,K] is a sub-matrix of wei_block[C,Y,X,K] - // b_mtx[C,B] is a subset of in_block[C,B + BGhostRead] - // c_mtx[K,B] is out_block[K,B] - constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor( - Number{}, Number{}, Number{}); - - constexpr auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor( - Number{}, Number{}, Number{}); - - constexpr auto c_kxb_thread_mtx_desc = - make_ConstantMatrixDescriptor(Number{}, Number{}); - -#if 0 - const auto 
blockwise_gemm = BlockwiseGemmBlockABlockBThreadC{}; -#else - const auto blockwise_gemm = - BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2{}; -#endif - - // LDS: be careful of alignment - constexpr index_t in_block_element_size = - in_cb_block_desc.GetElementSpace(Number{}); - - constexpr index_t wei_block_element_size = - wei_cyxk_block_desc.GetElementSpace(Number{}); - - constexpr index_t max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead - ? InBlockCopyDataPerRead - : WeiBlockCopyDataPerRead; - - // LDS double buffer - __shared__ Float - p_in_block_0[max_align * ((in_block_element_size + max_align - 1) / max_align)]; - __shared__ Float - p_wei_block_0[max_align * ((wei_block_element_size + max_align - 1) / max_align)]; - - __shared__ Float - p_in_block_1[max_align * ((in_block_element_size + max_align - 1) / max_align)]; - __shared__ Float - p_wei_block_1[max_align * ((wei_block_element_size + max_align - 1) / max_align)]; - - const Float* p_in_global_block_offset = - p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin); - - const Float* p_wei_global_block_offset = - p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); - - // preload data into LDS - blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_0); - blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_0); - - p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0); - p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0); - - // register - Float p_out_thread[out_kb_thread_desc.GetElementSpace()]; - - // set threadwise output tensor to 0 - threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread); - - bool even_loop = true; - - for(index_t c_block_data_begin = 0; c_block_data_begin + CPerBlock < C; - c_block_data_begin += CPerBlock, - p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0), - p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0), - 
even_loop = !even_loop) - { - Float* p_in_block_now = even_loop ? p_in_block_0 : p_in_block_1; - Float* p_wei_block_now = even_loop ? p_wei_block_0 : p_wei_block_1; - - Float* p_in_block_next = even_loop ? p_in_block_1 : p_in_block_0; - Float* p_wei_block_next = even_loop ? p_wei_block_1 : p_wei_block_0; - - __syncthreads(); - -// load next data -#if 0 - blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_next); - blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_next); -#elif 1 - Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; - Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; - - blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, - p_in_register_clipboard); - - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, - p_wei_register_clipboard); -#endif - - // compute on current data - // a series of GEMM - for(index_t y = 0; y < Y; ++y) - { - for(index_t x = 0; x < X; ++x) - { - auto f_accum = [](auto& acc, const auto&& v) { acc += v; }; -#if 1 - blockwise_gemm.Run -#else - blockwise_gemm.Run_RegisterDoubleBuffer -#endif - (p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), - p_in_block_now + y * Wi + x, - p_out_thread, - f_accum); - } - } - -#if 1 - blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next); - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, - p_wei_block_next); -#endif - } - - // last computation - { - Float* p_in_block_now = even_loop ? p_in_block_0 : p_in_block_1; - Float* p_wei_block_now = even_loop ? 
p_wei_block_0 : p_wei_block_1; - - __syncthreads(); - - for(index_t y = 0; y < Y; ++y) - { - for(index_t x = 0; x < X; ++x) - { - auto f_accum = [](auto& acc, const auto&& v) { acc += v; }; -#if 1 - blockwise_gemm.Run -#else - blockwise_gemm.Run_RegisterDoubleBuffer -#endif - (p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), - p_in_block_now + y * Wi + x, - p_out_thread, - f_accum); - } - } - } - - // output: register to global mem, - const auto c_thread_mtx_begin = - blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); - - const index_t k_thread_data_begin = k_block_data_begin + c_thread_mtx_begin.row; - const index_t b_thread_data_begin = b_block_data_begin + c_thread_mtx_begin.col; - -#if 0 - if(get_block_1d_id() == 0) - { - printf("%u %u, row %u col %u, k_data_begin %u b_data_begin %u, %f %f %f %f\n", - get_block_1d_id(), - get_thread_local_1d_id(), - matrix_c_index.row, - matrix_c_index.col, - k_data_begin, - b_data_begin, - p_out_thread[0], p_out_thread[1], p_out_thread[2], p_out_thread[3]); - } -#endif - - for(index_t k = 0; k < out_kb_thread_desc.GetLength(I0); ++k) - { - for(index_t b = 0; b < out_kb_thread_desc.GetLength(I1); ++b) - { - const auto c_thread_mtx_distance = - blockwise_gemm.GetDistanceFromBeginOfThreadMatrixC(k, b); - - index_t k_data = k_thread_data_begin + c_thread_mtx_distance.row; - index_t b_data = b_thread_data_begin + c_thread_mtx_distance.col; - - index_t h_data = b_data / (Wi * N); - index_t itmp = b_data - h_data * (Wi * N); - index_t w_data = itmp / N; - index_t n_data = itmp - w_data * N; - - if(n_data < N && h_data < Ho && w_data < Wo) - { - p_out_global[out_khwn_global_desc.Get1dIndex(k_data, h_data, w_data, n_data)] = - p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)]; - } - } - } - } -}; diff --git a/src/include/threadwise_gemm.hip.hpp b/src/include/threadwise_gemm.hip.hpp index d1c7e830d0..c81fd82cd3 100644 --- a/src/include/threadwise_gemm.hip.hpp +++ b/src/include/threadwise_gemm.hip.hpp @@ -10,7 
+10,7 @@ __device__ void threadwise_matrix_copy(SrcMatrix, constexpr auto src_mtx = SrcMatrix{}; constexpr auto dst_mtx = DstMatrix{}; -#if 0 +#if 1 for(index_t i = 0; i < NRow; ++i) { for(index_t j = 0; j < NCol; ++j) @@ -78,9 +78,7 @@ __device__ void threadwise_gemm(MatrixA, const index_t bindex = b_mtx.Get1dIndex(k, j); const index_t cindex = c_mtx.Get1dIndex(i, j); -#if 0 - f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]); -#elif 1 +#if DEVICE_BACKEND_HIP // this only does c += a * b asm volatile("\n \ v_mac_f32 %0, %1, %2 \n \ " @@ -88,6 +86,8 @@ __device__ void threadwise_gemm(MatrixA, : "v"(p_a_thread[aindex]), "v"(p_b_thread[bindex]), "0"(p_c_thread[cindex])); +#else // this does general accumulation defined by f_accum + f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]); #endif } } From 0b2a76553db5b256c24dae21cd4c25dc172b4eac Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 2 Apr 2019 20:19:24 -0500 Subject: [PATCH 4/7] add a missing file [ROCm/composable_kernel commit: c23474104508d5b08b2548365751993701a2ec73] --- ...2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp | 382 ++++++++++++++++++ 1 file changed, 382 insertions(+) create mode 100644 src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp new file mode 100644 index 0000000000..a731964bc6 --- /dev/null +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp @@ -0,0 +1,382 @@ +#pragma once +#include "common.hip.hpp" +#include "ConstantTensorDescriptor.hip.hpp" +#include "ConstantMatrixDescriptor.hip.hpp" +#include "blockwise_4d_tensor_op.hip.hpp" +#include "blockwise_2d_tensor_op.hip.hpp" +#include "threadwise_2d_tensor_op.hip.hpp" +#include "blockwise_gemm.hip.hpp" + +// define B = flatten(N, Hi, 
Wi) +template +struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer +{ + __host__ __device__ constexpr GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer() {} + + __device__ void Run(const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) const + { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto in_chwn_global_desc = InGlobalDesc{}; + constexpr auto wei_cyxk_global_desc = WeiGlobalDesc{}; + constexpr auto out_khwn_global_desc = OutGlobalDesc{}; + + constexpr index_t C = in_chwn_global_desc.GetLength(I0); + constexpr index_t Hi = in_chwn_global_desc.GetLength(I1); + constexpr index_t Wi = in_chwn_global_desc.GetLength(I2); + constexpr index_t N = in_chwn_global_desc.GetLength(I3); + + constexpr index_t K = out_khwn_global_desc.GetLength(I0); + constexpr index_t Ho = out_khwn_global_desc.GetLength(I1); + constexpr index_t Wo = out_khwn_global_desc.GetLength(I2); + + constexpr index_t Y = wei_cyxk_global_desc.GetLength(I1); + constexpr index_t X = wei_cyxk_global_desc.GetLength(I2); + + constexpr index_t B = N * Hi * Wi; + constexpr index_t BGhostRead = (Y - 1) * Wi + (X - 1); + + // divide block work by 2d: [K, B] + constexpr index_t KBlockWork = (K + KPerBlock - 1) / KPerBlock; + constexpr index_t BBlockWork = (B + BPerBlock - 1) / BPerBlock; + + const index_t k_block_work_id = get_block_1d_id() / BBlockWork; + const index_t b_block_work_id = get_block_1d_id() - k_block_work_id * BBlockWork; + + const index_t k_block_data_begin = k_block_work_id * KPerBlock; + const index_t b_block_data_begin = b_block_work_id * BPerBlock; + + // flattend (2d) tensor view of gridwise input + constexpr auto in_cb_global_desc = make_ConstantTensorDescriptor(Sequence{}); + constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence{}); + + // 
tensor view of blockwise input and weight + // be careful of alignment + constexpr auto in_cb_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); + + constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); + + constexpr auto wei_cyxk_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); + + // tensor view of threadwise output in register + constexpr auto out_kb_thread_desc = + make_ConstantTensorDescriptor(Sequence{}); + +#if 0 + if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) + { + print_ConstantTensorDescriptor(in_chwn_global_desc, "in_chwn_global_desc"); + print_ConstantTensorDescriptor(wei_cyxk_global_desc, "wei_cyxk_global_desc"); + print_ConstantTensorDescriptor(out_khwn_global_desc, "out_khwn_global_desc"); + + print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc"); + print_ConstantTensorDescriptor(wei_ek_global_desc, "wei_ek_global_desc"); + + print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc"); + print_ConstantTensorDescriptor(wei_cyxk_block_desc, "wei_cyxk_block_desc"); + print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc"); + print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc"); + + printf("KPerBlock %u\n", KPerBlock); + } +#endif + +// blockwise in copy +// formmat is [CPerBlock,BPerBlock + BGhostRead] +#if 0 + const auto blockwise_in_copy = + Blockwise2dTensorCopy1{}; +#elif 0 + const auto blockwise_in_copy = + Blockwise2dTensorCopy2{}; +#elif 1 + const auto blockwise_in_copy = + Blockwise2dTensorCopy3{}; +#endif + +// blockwise wei copy +// format is [CPerBlock*Y*X,KPerBlock] +#if 0 + const auto blockwise_wei_copy = + Blockwise2dTensorCopy1{}; +#elif 0 + const auto blockwise_wei_copy = + Blockwise2dTensorCopy2{}; +#elif 1 + const auto blockwise_wei_copy = + Blockwise2dTensorCopy3{}; +#endif + + // a series of blockwise GEMM + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx and 
b_mtx saved in LDS, c_mtx saved in register + // a_mtx[C,K] is a sub-matrix of wei_block[C,Y,X,K] + // b_mtx[C,B] is a subset of in_block[C,B + BGhostRead] + // c_mtx[K,B] is out_block[K,B] + constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor( + Number{}, Number{}, Number{}); + + constexpr auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor( + Number{}, Number{}, Number{}); + + constexpr auto c_kxb_thread_mtx_desc = + make_ConstantMatrixDescriptor(Number{}, Number{}); + +#if 0 + const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC{}; +#else + const auto blockwise_gemm = + BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2{}; +#endif + + // LDS: be careful of alignment + constexpr index_t in_block_element_size = + in_cb_block_desc.GetElementSpace(Number{}); + + constexpr index_t wei_block_element_size = + wei_cyxk_block_desc.GetElementSpace(Number{}); + + constexpr index_t max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead + ? InBlockCopyDataPerRead + : WeiBlockCopyDataPerRead; + + // LDS double buffer + __shared__ Float + p_in_block_0[max_align * ((in_block_element_size + max_align - 1) / max_align)]; + __shared__ Float + p_wei_block_0[max_align * ((wei_block_element_size + max_align - 1) / max_align)]; + + __shared__ Float + p_in_block_1[max_align * ((in_block_element_size + max_align - 1) / max_align)]; + __shared__ Float + p_wei_block_1[max_align * ((wei_block_element_size + max_align - 1) / max_align)]; + + const Float* p_in_global_block_offset = + p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin); + + const Float* p_wei_global_block_offset = + p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); + + // preload data into LDS + blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_0); + blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_0); + + p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0); + p_wei_global_block_offset += CPerBlock * 
wei_cyxk_global_desc.GetStride(I0); + + // register + Float p_out_thread[out_kb_thread_desc.GetElementSpace()]; + + // set threadwise output tensor to 0 + threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread); + + bool even_loop = true; + + for(index_t c_block_data_begin = 0; c_block_data_begin + CPerBlock < C; + c_block_data_begin += CPerBlock, + p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0), + p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0), + even_loop = !even_loop) + { + Float* p_in_block_now = even_loop ? p_in_block_0 : p_in_block_1; + Float* p_wei_block_now = even_loop ? p_wei_block_0 : p_wei_block_1; + + Float* p_in_block_next = even_loop ? p_in_block_1 : p_in_block_0; + Float* p_wei_block_next = even_loop ? p_wei_block_1 : p_wei_block_0; + + __syncthreads(); + +// load next data +#if 0 + blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_next); + blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_next); +#elif 1 + Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; + Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; + + blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, + p_in_register_clipboard); + + blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, + p_wei_register_clipboard); +#endif + + // compute on current data + // a series of GEMM + for(index_t y = 0; y < Y; ++y) + { + for(index_t x = 0; x < X; ++x) + { + auto f_accum = [](auto& acc, const auto&& v) { acc += v; }; +#if 1 + blockwise_gemm.Run +#else + blockwise_gemm.Run_RegisterDoubleBuffer +#endif + (p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + p_in_block_now + y * Wi + x, + p_out_thread, + f_accum); + } + } + +#if 1 + blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next); + blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, + p_wei_block_next); +#endif + } 
+ + // last computation + { + Float* p_in_block_now = even_loop ? p_in_block_0 : p_in_block_1; + Float* p_wei_block_now = even_loop ? p_wei_block_0 : p_wei_block_1; + + __syncthreads(); + + for(index_t y = 0; y < Y; ++y) + { + for(index_t x = 0; x < X; ++x) + { + auto f_accum = [](auto& acc, const auto&& v) { acc += v; }; +#if 1 + blockwise_gemm.Run +#else + blockwise_gemm.Run_RegisterDoubleBuffer +#endif + (p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + p_in_block_now + y * Wi + x, + p_out_thread, + f_accum); + } + } + } + + // output: register to global mem, + const auto c_thread_mtx_begin = + blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); + + const index_t k_thread_data_begin = k_block_data_begin + c_thread_mtx_begin.row; + const index_t b_thread_data_begin = b_block_data_begin + c_thread_mtx_begin.col; + +#if 0 + if(get_block_1d_id() == 0) + { + printf("%u %u, row %u col %u, k_data_begin %u b_data_begin %u, %f %f %f %f\n", + get_block_1d_id(), + get_thread_local_1d_id(), + matrix_c_index.row, + matrix_c_index.col, + k_data_begin, + b_data_begin, + p_out_thread[0], p_out_thread[1], p_out_thread[2], p_out_thread[3]); + } +#endif + + for(index_t k = 0; k < out_kb_thread_desc.GetLength(I0); ++k) + { + for(index_t b = 0; b < out_kb_thread_desc.GetLength(I1); ++b) + { + const auto c_thread_mtx_distance = + blockwise_gemm.GetDistanceFromBeginOfThreadMatrixC(k, b); + + index_t k_data = k_thread_data_begin + c_thread_mtx_distance.row; + index_t b_data = b_thread_data_begin + c_thread_mtx_distance.col; + + index_t h_data = b_data / (Wi * N); + index_t itmp = b_data - h_data * (Wi * N); + index_t w_data = itmp / N; + index_t n_data = itmp - w_data * N; + + if(n_data < N && h_data < Ho && w_data < Wo) + { + p_out_global[out_khwn_global_desc.Get1dIndex(k_data, h_data, w_data, n_data)] = + p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)]; + } + } + } + } +}; From 9d61f2597a855a4f94a3752bd309ca6929a5004c Mon Sep 17 00:00:00 2001 From: Chao 
Liu Date: Tue, 2 Apr 2019 20:26:58 -0500 Subject: [PATCH 5/7] add cuda extract_asm script [ROCm/composable_kernel commit: e6c86f81b54a1aef7c80108549496281a6d56a8d] --- script/cmake-cuda.sh | 9 +++++---- script/extract_asm-cuda.sh | 1 + 2 files changed, 6 insertions(+), 4 deletions(-) create mode 100755 script/extract_asm-cuda.sh diff --git a/script/cmake-cuda.sh b/script/cmake-cuda.sh index 0110075c0d..0e7d00c469 100755 --- a/script/cmake-cuda.sh +++ b/script/cmake-cuda.sh @@ -15,11 +15,12 @@ cmake -D DEVICE_BACKEND=CUDA \ -D BOOST_ROOT="/package/install/boost_1.67.0" \ -D CUDA_COMMON_INCLUDE_DIR="/home/chao/code/test_feature/cuda_common/cuda_10.0_common/inc" \ --D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_61" \ +-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -Xptxas -v -maxrregcount=128" \ ${MY_PROJECT_SOURCE} #-D CMAKE_CUDA_COMPILER="/package/install/cuda_10.0/bin/nvcc" \ -#-D CMAKE_CUDA_FLAGS="-G -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_61" \ -#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_61" \ -#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -arch=sm_61 -Xptxas -v -maxrregcount=128" \ +#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61" \ +#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -Xptxas -v -maxrregcount=128" \ +#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -gencode=arch=compute_70,code=sm_70" \ +#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp 
-lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -gencode=arch=compute_70,code=sm_70 -Xptxas -v -maxrregcount=128" \ diff --git a/script/extract_asm-cuda.sh b/script/extract_asm-cuda.sh new file mode 100755 index 0000000000..84a0af76dd --- /dev/null +++ b/script/extract_asm-cuda.sh @@ -0,0 +1 @@ +cuobjdump -xelf all ./driver/driver && nvdisasm --print-code -g driver.sm_61.cubin > driver.sm_61.asm && nvdisasm --print-code -g driver.sm_70.cubin > driver.sm_70.asm From e277457dceec95938d394eb3b0a571210c238ef5 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 2 Apr 2019 20:30:00 -0500 Subject: [PATCH 6/7] tidy up [ROCm/composable_kernel commit: e2313c9eca16d1ef503ea4e3ae1ebc437dca2e9f] --- src/include/common.hip.hpp | 3 +-- ...e_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp | 8 ++++---- ...icit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp | 9 ++++++--- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp index fc0d7d8bc3..6b8c450021 100644 --- a/src/include/common.hip.hpp +++ b/src/include/common.hip.hpp @@ -62,10 +62,9 @@ __host__ __device__ constexpr T min(T x, Ts... xs) return x < y ? 
x : y; } -}// namespace mod_conv +} // namespace mod_conv #if DEVICE_BACKEND_HIP // cast a pointer of LDS to its address extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]]; #endif - diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp index a95363fa60..e04b1752ea 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp @@ -287,10 +287,10 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn #elif 0 blockwise_gemm.Run_asm #endif - (p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), - p_in_block + y * Wi + x, - p_out_thread, - f_accum); + (p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + p_in_block + y * Wi + x, + p_out_thread, + f_accum); } } } diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp index a731964bc6..b68b73a6ad 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp @@ -36,11 +36,14 @@ template struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer { - __host__ __device__ constexpr GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer() {} + __host__ + __device__ constexpr GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer() + { + } __device__ void Run(const Float* const __restrict__ p_in_global, - const Float* const __restrict__ p_wei_global, - Float* const __restrict__ p_out_global) const + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) const { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; From 
770c141f35ee9f10f827058503c58d6aed3067f0 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 3 Apr 2019 10:36:18 -0500 Subject: [PATCH 7/7] add script to extract asm on hip [ROCm/composable_kernel commit: 6166233e05b821e55bcca2bba2d9adad16347f82] --- script/compile-hip.sh | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100755 script/compile-hip.sh diff --git a/script/compile-hip.sh b/script/compile-hip.sh new file mode 100755 index 0000000000..4c91e8a293 --- /dev/null +++ b/script/compile-hip.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export KMDUMPISA=1 +export KMDUMPLLVM=1 + +make -j driver +/opt/rocm/hcc/bin/llvm-objdump -mcpu=gfx906 -source -line-numbers driver/dump-gfx906.isabin > driver/dump-gfx906.isabin.isa