From dabfa77fc68bdbcfc2d10cdd515605c053cf1ed0 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 5 Apr 2019 02:13:29 -0500 Subject: [PATCH] clipboard float4 copy and paste C++ code --- ...icit_gemm_convolution_2_chwn_cyxk_khwn.hpp | 7 +- src/include/blockwise_2d_tensor_op.hip.hpp | 131 ++++++++++++- src/include/data_type.hip.hpp | 12 +- ...2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp | 180 +++++++----------- src/include/inline_asm.hpp | 46 +++-- src/include/threadwise_gemm.hip.hpp | 25 ++- 6 files changed, 246 insertions(+), 155 deletions(-) diff --git a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp index 1492d22a16..0612052e00 100644 --- a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp +++ b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp @@ -238,9 +238,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, constexpr index_t GemmNLevel1Cluster = 4; constexpr index_t GemmKPerThreadLoop = 1; - constexpr index_t GemmThreadPerColumnPerCluster = 8; - constexpr index_t GemmThreadPerRowPerCluster = 8; - constexpr index_t InBlockCopyThreadPerDim0 = 4; constexpr index_t InBlockCopyThreadPerDim1 = 16; @@ -272,7 +269,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, for(index_t i = 0; i < nrepeat; ++i) { constexpr auto gridwise_conv = -#if 1 +#if 0 GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn #else GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer @@ -288,8 +285,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, CPerBlock, BPerThread, KPerThread, - GemmThreadPerColumnPerCluster, - GemmThreadPerRowPerCluster, GemmMPerThreadSubC, GemmNPerThreadSubC, GemmMLevel0Cluster, diff --git a/src/include/blockwise_2d_tensor_op.hip.hpp b/src/include/blockwise_2d_tensor_op.hip.hpp index 5a29f94712..2bbf0edd30 100644 --- a/src/include/blockwise_2d_tensor_op.hip.hpp +++ b/src/include/blockwise_2d_tensor_op.hip.hpp @@ 
-1,5 +1,6 @@ #pragma once #include "ConstantTensorDescriptor.hip.hpp" +#include "inline_asm.hpp" template __device__ void @@ -517,9 +518,9 @@ struct Blockwise2dTensorCopy3 constexpr index_t dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0; auto f_copy = [&](index_t iloop) { - *(reinterpret_cast(p_clipboard + iloop * 4)) = - *(reinterpret_cast(p_src + mSrcMyThreadOffset + - iloop * src_loop_stride)); + *(reinterpret_cast(&p_clipboard[iloop * DataPerRead])) = + *(reinterpret_cast( + &p_src[mSrcMyThreadOffset + iloop * src_loop_stride])); }; for(index_t iloop = 0; iloop < nloop_d0; ++iloop) @@ -568,8 +569,8 @@ struct Blockwise2dTensorCopy3 constexpr index_t dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0; auto f_copy = [&](index_t iloop) { - *(reinterpret_cast(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) = - *(reinterpret_cast(p_clipboard + iloop * 4)); + *(reinterpret_cast(&p_dst[mDstMyThreadOffset + iloop * dst_loop_stride])) = + *(reinterpret_cast(&p_clipboard[iloop * DataPerRead])); }; for(index_t iloop = 0; iloop < nloop_d0; ++iloop) @@ -589,4 +590,124 @@ struct Blockwise2dTensorCopy3 } } } + +#if DEVICE_BACKEND_HIP + __device__ void RunLoadRegisterClipboard_asm(const Float* __restrict__ p_src, + Float* p_clipboard) const + { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + constexpr index_t L0 = CopyLengths{}.Get(I0); + constexpr index_t L1 = CopyLengths{}.Get(I1); + + constexpr index_t thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead; + constexpr index_t thread_per_d0 = BlockSize / thread_per_d1; + + constexpr index_t num_active_thread = thread_per_d0 * thread_per_d1; + + if(BlockSize > num_active_thread) + { + if(get_thread_local_1d_id() >= num_active_thread) + { + return; + } + } + + constexpr index_t nloop_d0 = L0 / thread_per_d0; + + constexpr index_t src_loop_stride = SrcDesc{}.GetStride(I0) * thread_per_d0; + constexpr index_t dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0; + + auto 
f_copy = [&](index_t iloop) {
+#if 0
+            *(reinterpret_cast<Float4*>(&p_clipboard[iloop * DataPerRead])) =
+                *(reinterpret_cast<const Float4*>(&p_src[mSrcMyThreadOffset +
+                                                         iloop * src_loop_stride]));
+#else
+            static_assert(is_same<Float, float>::value && DataPerRead == 4,
+                          "global_load is only for float4");
+
+            global_load(*(reinterpret_cast<Float4*>(&p_clipboard[iloop * DataPerRead])),
+                        reinterpret_cast<const Float4*>(
+                            &p_src[mSrcMyThreadOffset + iloop * src_loop_stride]));
+#endif
+        };
+
+        for(index_t iloop = 0; iloop < nloop_d0; ++iloop)
+        {
+            f_copy(iloop);
+        }
+
+        constexpr bool has_tail_d0 = (L0 > nloop_d0 * thread_per_d0);
+
+        if(has_tail_d0)
+        {
+            constexpr index_t tail_d0 = L0 - nloop_d0 * thread_per_d0;
+
+            if(get_thread_local_1d_id() < tail_d0 * thread_per_d1)
+            {
+                f_copy(nloop_d0);
+            }
+        }
+    }
+
+    __device__ void RunStoreRegisterClipboard_asm(const Float* __restrict__ p_clipboard,
+                                                  Float* __restrict__ p_dst) const
+    {
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+
+        constexpr index_t L0 = CopyLengths{}.Get(I0);
+        constexpr index_t L1 = CopyLengths{}.Get(I1);
+
+        constexpr index_t thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead;
+        constexpr index_t thread_per_d0 = BlockSize / thread_per_d1;
+
+        constexpr index_t num_active_thread = thread_per_d0 * thread_per_d1;
+
+        if(BlockSize > num_active_thread)
+        {
+            if(get_thread_local_1d_id() >= num_active_thread)
+            {
+                return;
+            }
+        }
+
+        constexpr index_t nloop_d0 = L0 / thread_per_d0;
+
+        constexpr index_t src_loop_stride = SrcDesc{}.GetStride(I0) * thread_per_d0;
+        constexpr index_t dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;
+
+        auto f_copy = [&](index_t iloop) {
+#if 0
+            *(reinterpret_cast<Float4*>(&p_dst[mDstMyThreadOffset + iloop * dst_loop_stride])) =
+                *(reinterpret_cast<const Float4*>(&p_clipboard[iloop * DataPerRead]));
+#else
+            static_assert(is_same<Float, float>::value && DataPerRead == 4,
+                          "ds_write_b128 is only for float4");
+
+            ds_write_b128(*(reinterpret_cast<const Float4*>(&p_clipboard[iloop * DataPerRead])),
+                          &p_dst[mDstMyThreadOffset + iloop * dst_loop_stride]);
+#endif + }; + + for(index_t iloop = 0; iloop < nloop_d0; ++iloop) + { + f_copy(iloop); + } + + constexpr bool has_tail_d0 = (L0 > nloop_d0 * thread_per_d0); + + if(has_tail_d0) + { + constexpr index_t tail_d0 = L0 - nloop_d0 * thread_per_d0; + + if(get_thread_local_1d_id() < tail_d0 * thread_per_d1) + { + f_copy(nloop_d0); + } + } + } +#endif }; diff --git a/src/include/data_type.hip.hpp b/src/include/data_type.hip.hpp index 1261e19989..20d9d91fa5 100644 --- a/src/include/data_type.hip.hpp +++ b/src/include/data_type.hip.hpp @@ -9,13 +9,13 @@ struct vector_type template <> struct vector_type { - using MemoryType = float; + typedef float MemoryType; }; template <> struct vector_type { - using MemoryType = float2; + typedef float MemoryType __attribute__((ext_vector_type(2))); __host__ __device__ static MemoryType Pack(float s0, float s1) { @@ -34,13 +34,7 @@ struct vector_type template <> struct vector_type { - using MemoryType = float4; -}; - -template <> -struct vector_type -{ - using MemoryType = float4; + typedef float MemoryType __attribute__((ext_vector_type(4))); }; #if 0 diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp index b9de731b23..be4185a6ab 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp @@ -19,8 +19,6 @@ template {}); -#if 0 - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - print_ConstantTensorDescriptor(in_chwn_global_desc, "in_chwn_global_desc"); - print_ConstantTensorDescriptor(wei_cyxk_global_desc, "wei_cyxk_global_desc"); - print_ConstantTensorDescriptor(out_khwn_global_desc, "out_khwn_global_desc"); - - print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc"); - print_ConstantTensorDescriptor(wei_ek_global_desc, 
"wei_ek_global_desc"); - - print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc"); - print_ConstantTensorDescriptor(wei_cyxk_block_desc, "wei_cyxk_block_desc"); - print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc"); - print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc"); - - printf("KPerBlock %u\n", KPerBlock); - } -#endif - // blockwise in copy // formmat is [CPerBlock,BPerBlock + BGhostRead] #if 0 - const auto blockwise_in_copy = - Blockwise2dTensorCopy1{}; + const auto blockwise_in_copy = + Blockwise2dTensorCopy1{}; #elif 0 const auto blockwise_in_copy = Blockwise2dTensorCopy2{}; + const auto blockwise_wei_copy = + Blockwise2dTensorCopy1{}; #elif 0 const auto blockwise_wei_copy = Blockwise2dTensorCopy2{}, Number{}); -#if 0 - const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC{}; -#else const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2{}; -#endif // LDS: be careful of alignment constexpr index_t max_align = @@ -235,27 +200,9 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer const Float* p_wei_global_block_offset = p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); -// preload data into LDS -#if 0 - blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_0); - blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_0); -#else - Float4 tmp_in, tmp_wei; - Float4* glb_in_p = - (Float4*)(p_in_global_block_offset + blockwise_in_copy.mSrcMyThreadOffset); - Float4* glb_wei_p = - (Float4*)(p_wei_global_block_offset + blockwise_wei_copy.mSrcMyThreadOffset); - - global_load(tmp_in, glb_in_p); - global_load(tmp_wei, glb_wei_p); - - Float4* loc_in_p = (Float4*)(p_in_block_double + blockwise_in_copy.mDstMyThreadOffset); - Float4* loc_wei_p = (Float4*)(p_wei_block_double + blockwise_wei_copy.mDstMyThreadOffset); - - vmcnt(0); - ds_write_b128(tmp_in, loc_in_p); - ds_write_b128(tmp_wei, loc_wei_p); -#endif + // preload data 
into LDS + blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_double); + blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_double); // register Float p_out_thread[out_kb_thread_desc.GetElementSpace()]; @@ -285,18 +232,18 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0); // load next data -#if 0 - Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; - Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; +#if 1 + Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; + Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; - __syncthreads(); + __syncthreads(); - blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, - p_in_register_clipboard); + blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, + p_in_register_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, - p_wei_register_clipboard); -#elif 1 + blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, + p_wei_register_clipboard); +#elif 0 Float4 tmp_in, tmp_wei; Float4* glb_in_p = (Float4*)(p_in_global_block_offset + blockwise_in_copy.mSrcMyThreadOffset); @@ -316,7 +263,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer for(index_t x = 0; x < X; ++x) { #if 0 - blockwise_gemm.Run + blockwise_gemm.Run #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer #elif 1 @@ -328,11 +275,20 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer } } -#if 0 - blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next); - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, - p_wei_block_next); -#elif 1 +#if 1 + blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, + p_in_block_next); + 
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, + p_wei_block_next); +#elif 0 + // if work with RunLoadRegisterClipboard_asm, need to wait + vmcnt(0); + + blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard, + p_in_block_next); + blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard, + p_wei_block_next); +#elif 0 Float4* loc_in_p = (Float4*)(p_in_block_next + blockwise_in_copy.mDstMyThreadOffset); Float4* loc_wei_p = @@ -352,16 +308,16 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0); p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0); - Float4 tmp_in, tmp_wei; - Float4* glb_in_p = - (Float4*)(p_in_global_block_offset + blockwise_in_copy.mSrcMyThreadOffset); - Float4* glb_wei_p = - (Float4*)(p_wei_global_block_offset + blockwise_wei_copy.mSrcMyThreadOffset); - __syncthreads(); - global_load(tmp_in, glb_in_p); - global_load(tmp_wei, glb_wei_p); + Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; + Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; + + blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, + p_in_register_clipboard); + + blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, + p_wei_register_clipboard); for(index_t y = 0; y < Y; ++y) { @@ -369,10 +325,10 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer { #if 0 blockwise_gemm.Run -#elif 1 - blockwise_gemm.Run_asm #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer +#elif 1 + blockwise_gemm.Run_asm #endif (p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), p_in_block_double + y * Wi + x, @@ -380,14 +336,22 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer } } - Float4* loc_in_p = (Float4*)(p_in_block_double + in_block_element_space + - 
blockwise_in_copy.mDstMyThreadOffset); - Float4* loc_wei_p = (Float4*)(p_wei_block_double + wei_block_element_space + - blockwise_wei_copy.mDstMyThreadOffset); +#if 1 + blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, + p_in_block_double + in_block_element_space); + blockwise_wei_copy.RunStoreRegisterClipboard( + p_wei_register_clipboard, p_wei_block_double + wei_block_element_space); +#else + // if work with RunLoadRegisterClipboard_asm, need to wait vmcnt(0); - ds_write_b128(tmp_in, loc_in_p); - ds_write_b128(tmp_wei, loc_wei_p); + + blockwise_in_copy.RunStoreRegisterClipboard_asm( + p_in_register_clipboard, p_in_block_double + in_block_element_space); + + blockwise_wei_copy.RunStoreRegisterClipboard_asm( + p_wei_register_clipboard, p_wei_block_double + wei_block_element_space); +#endif // odd __syncthreads(); @@ -398,10 +362,10 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer { #if 0 blockwise_gemm.Run -#elif 1 - blockwise_gemm.Run_asm #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer +#elif 1 + blockwise_gemm.Run_asm #endif (p_wei_block_double + in_block_element_space + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), @@ -423,20 +387,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer const index_t k_thread_data_begin = k_block_data_begin + c_thread_mtx_begin.row; const index_t b_thread_data_begin = b_block_data_begin + c_thread_mtx_begin.col; -#if 0 - if(get_block_1d_id() == 0) - { - printf("%u %u, row %u col %u, k_data_begin %u b_data_begin %u, %f %f %f %f\n", - get_block_1d_id(), - get_thread_local_1d_id(), - matrix_c_index.row, - matrix_c_index.col, - k_data_begin, - b_data_begin, - p_out_thread[0], p_out_thread[1], p_out_thread[2], p_out_thread[3]); - } -#endif - for(index_t k = 0; k < out_kb_thread_desc.GetLength(I0); ++k) { for(index_t b = 0; b < out_kb_thread_desc.GetLength(I1); ++b) diff --git a/src/include/inline_asm.hpp b/src/include/inline_asm.hpp index b53d6ed316..f7b1f217cb 
100644 --- a/src/include/inline_asm.hpp +++ b/src/include/inline_asm.hpp @@ -10,7 +10,7 @@ extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]]; #define NO_DS_WRITE 0 #define NO_GLB_READ 0 -inline __device__ void vmcnt(int cnt) +inline __device__ void vmcnt(index_t cnt) { #if !NO_VM_WAIT if(cnt == 0) @@ -39,12 +39,12 @@ inline __device__ void vmcnt(int cnt) } else { - assert(0); + assert(false); } #endif } -inline __device__ void lgkmcnt(int cnt) +inline __device__ void lgkmcnt(index_t cnt) { #if !NO_LGKM_WAIT if(cnt == 0) @@ -79,7 +79,7 @@ inline __device__ void lgkmcnt(int cnt) } else { - assert(0); + assert(false); } #endif } @@ -187,7 +187,7 @@ inline __device__ void outerProduct8x8(const Float4* a, const Float4* b, Float4* outerProduct4x4(a[1], b[1], c[9], c[11], c[13], c[15]); } -inline __device__ void ds_read_b128(Float4& r, void* lds, int offset = 0) +inline __device__ void ds_read_b128(Float4& r, void* lds, index_t offset = 0) { #if !NO_DS_READ if(offset == 0) @@ -408,29 +408,43 @@ inline __device__ void ds_read_b128(Float4& r, void* lds, int offset = 0) } else { - assert(0); + assert(false); } #endif } -inline __device__ void global_load(Float4& r, Float4* ptr) +inline __device__ void global_load(Float4& r, const Float4* ptr, index_t offset = 0) { #if !NO_GLB_READ - asm volatile("\n \ - global_load_dwordx4 %0, %1, off \n \ - " - : "=v"(r) - : "v"(ptr)); + if(offset == 0) + { + asm volatile("\n \ + global_load_dwordx4 %0, %1, off \n \ + " + : "=v"(r) + : "v"(ptr)); + } + else + { + assert(false); + } #endif } -inline __device__ void ds_write_b128(Float4& r, void* lds, int offset = 0) +inline __device__ void ds_write_b128(const Float4& r, void* lds, index_t offset = 0) { #if !NO_DS_WRITE - asm volatile("\n \ + if(offset == 0) + { + asm volatile("\n \ ds_write_b128 %0, %1 \n \ " - : - : "v"(__to_local(lds)), "v"(r)); + : + : "v"(__to_local(lds)), "v"(r)); + } + else + { + assert(false); + } #endif } diff --git 
a/src/include/threadwise_gemm.hip.hpp b/src/include/threadwise_gemm.hip.hpp
index 79fc1bf699..0797fe0b8c 100644
--- a/src/include/threadwise_gemm.hip.hpp
+++ b/src/include/threadwise_gemm.hip.hpp
@@ -14,12 +14,29 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
     for(index_t i = 0; i < NRow; ++i)
     {
-        for(index_t j = 0; j < NCol; ++j)
+        // optimize for vector-4 load
+        if(NCol % 4 == 0)
         {
-            const index_t src_index = src_mtx.Get1dIndex(i, j);
-            const index_t dst_index = dst_mtx.Get1dIndex(i, j);
+            using vector_t = typename vector_type<Float, 4>::MemoryType;
 
-            p_dst[dst_index] = p_src[src_index];
+            for(index_t j = 0; j < NCol / 4; ++j)
+            {
+                const index_t src_index = src_mtx.Get1dIndex(i, 4 * j);
+                const index_t dst_index = dst_mtx.Get1dIndex(i, 4 * j);
+
+                *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
+                    *reinterpret_cast<const vector_t*>(&p_src[src_index]);
+            }
+        }
+        else
+        {
+            for(index_t j = 0; j < NCol; ++j)
+            {
+                const index_t src_index = src_mtx.Get1dIndex(i, j);
+                const index_t dst_index = dst_mtx.Get1dIndex(i, j);
+
+                p_dst[dst_index] = p_src[src_index];
+            }
         }
     }
 }