From fd8de384170d6100a837b19e37139665c89e2054 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sat, 16 Mar 2019 10:50:46 -0500 Subject: [PATCH 01/13] refactor --- ...e_direct_convolution_2_nchw_kcyx_nkhw.hpp} | 48 +-- ...icit_gemm_convolution_2_chwn_cyxk_khwn.hpp | 71 +++-- driver/driver.hip.cpp | 19 +- ...rect_convolution_2_nchw_kcyx_nkhw.hip.hpp} | 7 +- ..._gemm_convolution_2_chwn_cyxk_khwn.hip.hpp | 281 ++++++++++++++++++ ...2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp | 4 +- 6 files changed, 359 insertions(+), 71 deletions(-) rename driver/{device_direct_convolution_2.hpp => device_direct_convolution_2_nchw_kcyx_nkhw.hpp} (77%) rename src/include/{gridwise_direct_convolution_2.hip.hpp => gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp} (96%) create mode 100644 src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp diff --git a/driver/device_direct_convolution_2.hpp b/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp similarity index 77% rename from driver/device_direct_convolution_2.hpp rename to driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp index 1baedafc46..602702949e 100644 --- a/driver/device_direct_convolution_2.hpp +++ b/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp @@ -1,16 +1,16 @@ #pragma once #include #include "device.hpp" -#include "gridwise_direct_convolution_2.hip.hpp" +#include "gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp" template -void device_direct_convolution_2(InDesc, - const Tensor& in, - WeiDesc, - const Tensor& wei, - OutDesc, - Tensor& out, - unsigned nrepeat) +void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc, + const Tensor& in, + WeiDesc, + const Tensor& wei, + OutDesc, + Tensor& out, + unsigned nrepeat) { std::size_t data_sz = sizeof(T); DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace()); @@ -57,22 +57,22 @@ void device_direct_convolution_2(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { - float time = launch_kernel(gridwise_direct_convolution_2, + float time = 
launch_kernel(gridwise_direct_convolution_2_nchw_kcyx_nkhw, dim3(GridSize), dim3(BlockSize), static_cast(in_device_buf.GetDeviceBuffer()), diff --git a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp index a657949f35..c885894165 100644 --- a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp +++ b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp @@ -1,6 +1,7 @@ #pragma once #include #include "device.hpp" +#include "gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp" #include "gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp" template @@ -209,39 +210,43 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { - float time = - launch_kernel(gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer< - GridSize, - BlockSize, - T, - decltype(in_chwn_desc), - decltype(wei_cyxk_desc), - decltype(out_khwn_desc), - BPerBlock, - KPerBlock, - CPerBlock, - BPerThread, - KPerThread, - GemmThreadPerColumnPerCluster, - GemmThreadPerRowPerCluster, - GemmMPerThreadSubC, - GemmNPerThreadSubC, - GemmMLevel0Cluster, - GemmNLevel0Cluster, - GemmMLevel1Cluster, - GemmNLevel1Cluster, - GemmKPerThreadLoop, - InBlockCopyThreadPerDim0, - InBlockCopyThreadPerDim1, - WeiBlockCopyThreadPerDim0, - WeiBlockCopyThreadPerDim1, - InBlockCopyDataPerRead, - WeiBlockCopyDataPerRead>, - dim3(GridSize), - dim3(BlockSize), - static_cast(in_chwn_device_buf.GetDeviceBuffer()), - static_cast(wei_cyxk_device_buf.GetDeviceBuffer()), - static_cast(out_khwn_device_buf.GetDeviceBuffer())); + float time = launch_kernel( +#if 1 + gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn +#else + gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer +#endif + , + dim3(GridSize), + dim3(BlockSize), + static_cast(in_chwn_device_buf.GetDeviceBuffer()), + static_cast(wei_cyxk_device_buf.GetDeviceBuffer()), + 
static_cast(out_khwn_device_buf.GetDeviceBuffer())); printf("Elapsed time : %f ms\n", time); usleep(std::min(time * 1000, float(10000))); diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index aca345acfd..6cd75afd79 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -8,7 +8,7 @@ #include "ConstantTensorDescriptor.hip.hpp" #include "conv_common.hip.hpp" #include "device_direct_convolution_1.hpp" -#include "device_direct_convolution_2.hpp" +#include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp" #include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp" #include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp" #include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp" @@ -503,7 +503,7 @@ int main(int argc, char* argv[]) constexpr unsigned HPad = 1; constexpr unsigned WPad = 1; -#elif 0 +#elif 1 // 1x1 filter, 28x28 image constexpr unsigned N = 16; constexpr unsigned C = 256; @@ -577,10 +577,11 @@ int main(int argc, char* argv[]) ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: "); ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: "); - Tensor in_nchw(make_TensorDescriptor(in_nchw_desc)); - Tensor wei_kcyx(make_TensorDescriptor(wei_kcyx_desc)); - Tensor out_nkhw_host(make_TensorDescriptor(out_nkhw_desc)); - Tensor out_nkhw_device(make_TensorDescriptor(out_nkhw_desc)); + using Float = float; + Tensor in_nchw(make_TensorDescriptor(in_nchw_desc)); + Tensor wei_kcyx(make_TensorDescriptor(wei_kcyx_desc)); + Tensor out_nkhw_host(make_TensorDescriptor(out_nkhw_desc)); + Tensor out_nkhw_device(make_TensorDescriptor(out_nkhw_desc)); std::size_t num_thread = std::thread::hardware_concurrency(); @@ -610,9 +611,9 @@ int main(int argc, char* argv[]) #if 1 #if 0 device_direct_convolution_1 -#elif 0 - device_direct_convolution_2 #elif 1 + device_direct_convolution_2_nchw_kcyx_nkhw +#elif 0 device_implicit_gemm_convolution_1_chwn_cyxk_khwn #elif 0 
device_implicit_gemm_convolution_2_chwn_cyxk_khwn @@ -633,7 +634,7 @@ int main(int argc, char* argv[]) if(do_verification) { -#if 0 +#if 1 if(Y == 3 && X == 3) { host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads); diff --git a/src/include/gridwise_direct_convolution_2.hip.hpp b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp similarity index 96% rename from src/include/gridwise_direct_convolution_2.hip.hpp rename to src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp index 13f9e6cf1d..322d5fd9c2 100644 --- a/src/include/gridwise_direct_convolution_2.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp @@ -22,9 +22,10 @@ template -__global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_in_global, - const Float* const __restrict__ p_wei_global, - Float* const __restrict__ p_out_global) +__global__ void +gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; diff --git a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp new file mode 100644 index 0000000000..afa3d3ee90 --- /dev/null +++ b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp @@ -0,0 +1,281 @@ +#pragma once +#include "common.hip.hpp" +#include "ConstantTensorDescriptor.hip.hpp" +#include "ConstantMatrixDescriptor.hip.hpp" +#include "blockwise_4d_tensor_op.hip.hpp" +#include "blockwise_2d_tensor_op.hip.hpp" +#include "threadwise_2d_tensor_op.hip.hpp" +#include "blockwise_gemm.hip.hpp" + +// define B = flatten(N, Hi, Wi) +template +__global__ void +gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restrict__ p_in_global, + const Float* const __restrict__ 
p_wei_global, + Float* const __restrict__ p_out_global) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto in_chwn_global_desc = InGlobalDesc{}; + constexpr auto wei_cyxk_global_desc = WeiGlobalDesc{}; + constexpr auto out_khwn_global_desc = OutGlobalDesc{}; + + constexpr unsigned C = in_chwn_global_desc.GetLength(I0); + constexpr unsigned Hi = in_chwn_global_desc.GetLength(I1); + constexpr unsigned Wi = in_chwn_global_desc.GetLength(I2); + constexpr unsigned N = in_chwn_global_desc.GetLength(I3); + + constexpr unsigned K = out_khwn_global_desc.GetLength(I0); + constexpr unsigned Ho = out_khwn_global_desc.GetLength(I1); + constexpr unsigned Wo = out_khwn_global_desc.GetLength(I2); + + constexpr unsigned Y = wei_cyxk_global_desc.GetLength(I1); + constexpr unsigned X = wei_cyxk_global_desc.GetLength(I2); + + constexpr unsigned B = N * Hi * Wi; + constexpr unsigned BGhostRead = (Y - 1) * Wi + (X - 1); + + // divide block work by 2d: [K, B] + constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock; + constexpr unsigned BBlockWork = (B + BPerBlock - 1) / BPerBlock; + + const unsigned k_block_work_id = get_block_1d_id() / BBlockWork; + const unsigned b_block_work_id = get_block_1d_id() - k_block_work_id * BBlockWork; + + const unsigned k_block_data_begin = k_block_work_id * KPerBlock; + const unsigned b_block_data_begin = b_block_work_id * BPerBlock; + + // flattend (2d) tensor view of gridwise input + constexpr auto in_cb_global_desc = make_ConstantTensorDescriptor(Sequence{}); + constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence{}); + + // tensor view of blockwise input and weight + // be careful of alignment + constexpr auto in_cb_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); + + constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); + + constexpr auto 
wei_cyxk_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); + + // tensor view of threadwise output in register + constexpr auto out_kb_thread_desc = + make_ConstantTensorDescriptor(Sequence{}); + +#if 0 + if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) + { + print_ConstantTensorDescriptor(in_chwn_global_desc, "in_chwn_global_desc"); + print_ConstantTensorDescriptor(wei_cyxk_global_desc, "wei_cyxk_global_desc"); + print_ConstantTensorDescriptor(out_khwn_global_desc, "out_khwn_global_desc"); + + print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc"); + print_ConstantTensorDescriptor(wei_ek_global_desc, "wei_ek_global_desc"); + + print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc"); + print_ConstantTensorDescriptor(wei_cyxk_block_desc, "wei_cyxk_block_desc"); + print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc"); + print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc"); + + printf("KPerBlock %u\n", KPerBlock); + } +#endif + +// blockwise in copy +// formmat is [CPerBlock,BPerBlock + BGhostRead] +#if 0 + const auto blockwise_in_copy = + Blockwise2dTensorCopy1{}; +#elif 0 + const auto blockwise_in_copy = Blockwise2dTensorCopy2{}; +#elif 1 + const auto blockwise_in_copy = Blockwise2dTensorCopy3{}; +#endif + +// blockwise wei copy +// format is [CPerBlock*Y*X,KPerBlock] +#if 0 + const auto blockwise_wei_copy = + Blockwise2dTensorCopy1{}; +#elif 0 + const auto blockwise_wei_copy = Blockwise2dTensorCopy2{}; +#elif 1 + const auto blockwise_wei_copy = Blockwise2dTensorCopy3{}; +#endif + + // a series of blockwise GEMM + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx and b_mtx saved in LDS, c_mtx saved in register + // a_mtx[C,K] is a sub-matrix of wei_block[C,Y,X,K] + // b_mtx[C,B] is a subset of in_block[C,B + BGhostRead] + // c_mtx[K,B] is out_block[K,B] + constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor( + Number{}, Number{}, Number{}); + + 
constexpr auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor( + Number{}, Number{}, Number{}); + + constexpr auto c_kxb_thread_mtx_desc = + make_ConstantMatrixDescriptor(Number{}, Number{}); + + const auto blockwise_gemm = + BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2{}; + + // LDS: be careful of alignment + constexpr unsigned in_block_size = + in_cb_block_desc.GetElementSpace(Number{}); + + constexpr unsigned wei_block_size = + wei_cyxk_block_desc.GetElementSpace(Number{}); + + constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead + ? InBlockCopyDataPerRead + : WeiBlockCopyDataPerRead; + + // LDS + __shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)]; + __shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)]; + + const Float* p_in_global_block_offset = + p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin); + + const Float* p_wei_global_block_offset = + p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); + + // register + Float p_out_thread[out_kb_thread_desc.GetElementSpace()]; + + // set threadwise output tensor to 0 + threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread); + + for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, + p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0), + p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0), + __syncthreads()) + { + // load data + blockwise_in_copy.Run(p_in_global_block_offset, p_in_block); + blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block); + + __syncthreads(); + + // compute on current data + // a series of GEMM + for(unsigned y = 0; y < Y; ++y) + { + for(unsigned x = 0; x < X; ++x) + { + auto f_accum = [](auto& acc, const auto&& v) { acc += v; }; +#if 1 + blockwise_gemm.Run +#else + blockwise_gemm.Run_RegisterDoubleBuffer +#endif + (p_wei_block + 
wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + p_in_block + y * Wi + x, + p_out_thread, + f_accum); + } + } + } + + // output: register to global mem, + const auto c_thread_mtx_begin = + blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); + + const unsigned k_thread_data_begin = k_block_data_begin + c_thread_mtx_begin.row; + const unsigned b_thread_data_begin = b_block_data_begin + c_thread_mtx_begin.col; + + for(unsigned k = 0; k < out_kb_thread_desc.GetLength(I0); ++k) + { + for(unsigned b = 0; b < out_kb_thread_desc.GetLength(I1); ++b) + { + const auto c_thread_mtx_distance = + blockwise_gemm.GetDistanceFromBeginOfThreadMatrixC(k, b); + + unsigned k_data = k_thread_data_begin + c_thread_mtx_distance.row; + unsigned b_data = b_thread_data_begin + c_thread_mtx_distance.col; + + unsigned h_data = b_data / (Wi * N); + unsigned itmp = b_data - h_data * (Wi * N); + unsigned w_data = itmp / N; + unsigned n_data = itmp - w_data * N; + + if(n_data < N && h_data < Ho && w_data < Wo) + { + p_out_global[out_khwn_global_desc.Get1dIndex(k_data, h_data, w_data, n_data)] = + p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)]; + } + } + } +} diff --git a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp index 7c802266d8..60d827293b 100644 --- a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp +++ b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp @@ -259,7 +259,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_b __syncthreads(); // load next data -#if 1 +#if 0 blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_next); blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_next); #elif 1 @@ -292,7 +292,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_b } } -#if 0 +#if 1 
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next); blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block_next); #endif From a0584426ff5b6b8b448c971b97c9b1a4d86ba010 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sun, 17 Mar 2019 03:22:41 -0500 Subject: [PATCH 02/13] refactoring ConstantTensorDescriptor --- ...ce_direct_convolution_2_nchw_kcyx_nkhw.hpp | 49 +-- ...icit_gemm_convolution_2_chwn_cyxk_khwn.hpp | 2 +- src/include/Array.hip.hpp | 18 + src/include/ConstantTensorDescriptor.hip.hpp | 323 +++--------------- src/include/Sequence.hip.hpp | 92 +++++ src/include/common.hip.hpp | 52 +-- src/include/constant_integral.hip.hpp | 12 + src/include/functional.hip.hpp | 49 +++ ...lution_2_vectorized_nchw_kcyx_nkhw.hip.hpp | 195 +++++++++++ 9 files changed, 452 insertions(+), 340 deletions(-) create mode 100644 src/include/Array.hip.hpp create mode 100644 src/include/Sequence.hip.hpp create mode 100644 src/include/constant_integral.hip.hpp create mode 100644 src/include/functional.hip.hpp create mode 100644 src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp diff --git a/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp index 602702949e..d91757dc8f 100644 --- a/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp +++ b/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp @@ -2,6 +2,7 @@ #include #include "device.hpp" #include "gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp" +#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp" template void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc, @@ -57,27 +58,33 @@ void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { - float time = launch_kernel(gridwise_direct_convolution_2_nchw_kcyx_nkhw, - dim3(GridSize), - dim3(BlockSize), - static_cast(in_device_buf.GetDeviceBuffer()), - 
static_cast(wei_device_buf.GetDeviceBuffer()), - static_cast(out_device_buf.GetDeviceBuffer())); + float time = launch_kernel( +#if 0 + gridwise_direct_convolution_2_nchw_kcyx_nkhw +#else + gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw +#endif + , + dim3(GridSize), + dim3(BlockSize), + static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer())); printf("Elapsed time : %f ms\n", time); usleep(std::min(time * 1000, float(10000))); diff --git a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp index c885894165..3edd8253dd 100644 --- a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp +++ b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp @@ -211,7 +211,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { float time = launch_kernel( -#if 1 +#if 0 gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn #else gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_buffer diff --git a/src/include/Array.hip.hpp b/src/include/Array.hip.hpp new file mode 100644 index 0000000000..1caab6a4c9 --- /dev/null +++ b/src/include/Array.hip.hpp @@ -0,0 +1,18 @@ +#pragma once + +template +struct Array +{ + using Type = Array; + + static constexpr unsigned nSize = NSize; + + unsigned mData[nSize]; + + template + __host__ __device__ Array(Xs... 
xs) : mData({static_cast(xs)...}) + { + } + + __host__ __device__ TData operator[](unsigned i) const { return mData[i]; } +}; diff --git a/src/include/ConstantTensorDescriptor.hip.hpp b/src/include/ConstantTensorDescriptor.hip.hpp index 2352b0f50c..2e5d237e81 100644 --- a/src/include/ConstantTensorDescriptor.hip.hpp +++ b/src/include/ConstantTensorDescriptor.hip.hpp @@ -65,8 +65,8 @@ __host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence struct ConstantTensorDescriptor { + using Type = ConstantTensorDescriptor; static constexpr unsigned nDim = Lengths::nDim; - using NDimConstant = Number; __host__ __device__ constexpr ConstantTensorDescriptor() { @@ -91,293 +91,70 @@ struct ConstantTensorDescriptor return Strides{}.Get(Number{}); } + // c++14 doesn't support constexpr lambdas, has to use this trick instead + struct GetElementSize_f + { + template + __host__ __device__ constexpr unsigned operator()(IDim idim) const + { + return Type{}.GetLength(idim); + } + }; + __host__ __device__ constexpr unsigned GetElementSize() const { - static_assert(nDim >= 2 && nDim <= 8, "nDim"); - - if(nDim == 2) + // c++14 doesn't support constexpr lambdas, has to use this trick instead + struct multiply { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; + __host__ __device__ constexpr unsigned operator()(unsigned a, unsigned b) const + { + return a * b; + } + }; - return GetLength(I0) * GetLength(I1); - } - else if(nDim == 3) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - - return GetLength(I0) * GetLength(I1) * GetLength(I2); - } - else if(nDim == 4) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3); - } - else if(nDim == 5) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto 
I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - - return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3) * GetLength(I4); - } - else if(nDim == 6) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - - return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3) * GetLength(I4) * - GetLength(I5); - } - else if(nDim == 7) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - - return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3) * GetLength(I4) * - GetLength(I5) * GetLength(I6); - } - else if(nDim == 8) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - constexpr auto I7 = Number<7>{}; - - return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3) * GetLength(I4) * - GetLength(I5) * GetLength(I6) * GetLength(I7); - } - else - { - assert(false); - } + return static_const_reduce_n{}(GetElementSize_f{}, multiply{}); } + // c++14 doesn't support constexpr lambdas, has to use this trick instead + struct GetElementSpace_f + { + template + __host__ __device__ constexpr unsigned operator()(IDim idim) const + { + return (Type{}.GetLength(idim) - 1) * Type{}.GetStride(idim); + } + }; + template > __host__ __device__ constexpr unsigned GetElementSpace(Align align = Align{}) const { - static_assert(nDim >= 2 && nDim <= 8, "nDim"); - - constexpr unsigned align_size = align.Get(); - - if(nDim == 2) + // c++14 doesn't support constexpr 
lambdas, has to use this trick instead + struct add { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; + __host__ __device__ constexpr unsigned operator()(unsigned a, unsigned b) const + { + return a + b; + } + }; - return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + - align_size; - } - else if(nDim == 3) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - - return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + - (GetLength(I2) - 1) * GetStride(I2) + align_size; - } - else if(nDim == 4) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + - (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) + - align_size; - } - else if(nDim == 5) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - - return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + - (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) + - (GetLength(I4) - 1) * GetStride(I4) + align_size; - } - else if(nDim == 6) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - - return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + - (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) + - (GetLength(I4) - 1) * GetStride(I4) + (GetLength(I5) - 1) * GetStride(I5) + - align_size; - } - else if(nDim == 7) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = 
Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - - return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + - (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) + - (GetLength(I4) - 1) * GetStride(I4) + (GetLength(I5) - 1) * GetStride(I5) + - (GetLength(I6) - 1) * GetStride(I6) + align_size; - } - else if(nDim == 8) - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - constexpr auto I7 = Number<7>{}; - - return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + - (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) + - (GetLength(I4) - 1) * GetStride(I4) + (GetLength(I5) - 1) * GetStride(I5) + - (GetLength(I6) - 1) * GetStride(I6) + (GetLength(I7) - 1) * GetStride(I7) + - align_size; - } + return static_const_reduce_n{}(GetElementSpace_f{}, add{}) + align.Get(); } - // this is ugly, only for 2d - __host__ __device__ unsigned Get1dIndex(unsigned i0, unsigned i1) const + template + __host__ __device__ unsigned Get1dIndex(Is... 
is) const { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; + static_assert(sizeof...(Is) == nDim, "number of multi-index is wrong"); - static_assert(nDim == 2, "nDim is not 2"); - return i0 * GetStride(I0) + i1 * GetStride(I1); - } + const auto multi_id = Array(is...); - // this is ugly, only for 3d - __host__ __device__ unsigned Get1dIndex(unsigned i0, unsigned i1, unsigned i2) const - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; + unsigned id = 0; - static_assert(nDim == 3, "nDim is not 3"); - return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2); - } + static_loop_n{}([&](auto IDim) { + constexpr unsigned idim = IDim.Get(); + id += multi_id[idim] * GetStride(IDim); + }); - // this is ugly, only for 4d - __host__ __device__ unsigned - Get1dIndex(unsigned i0, unsigned i1, unsigned i2, unsigned i3) const - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - static_assert(nDim == 4, "nDim is not 4"); - return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3); - } - - // this is ugly, only for 5d - __host__ __device__ unsigned - Get1dIndex(unsigned i0, unsigned i1, unsigned i2, unsigned i3, unsigned i4) const - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - - static_assert(nDim == 5, "nDim is not 5"); - return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3) + - i4 * GetStride(I4); - } - - // this is ugly, only for 6d - __host__ __device__ unsigned - Get1dIndex(unsigned i0, unsigned i1, unsigned i2, unsigned i3, unsigned i4, unsigned i5) const - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; 
- constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - - static_assert(nDim == 6, "nDim is not 6"); - return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3) + - i4 * GetStride(I4) + i5 * GetStride(I5); - } - - // this is ugly, only for 7d - __host__ __device__ unsigned Get1dIndex(unsigned i0, - unsigned i1, - unsigned i2, - unsigned i3, - unsigned i4, - unsigned i5, - unsigned i6) const - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - - static_assert(nDim == 7, "nDim is not 7"); - return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3) + - i4 * GetStride(I4) + i5 * GetStride(I5) + i6 * GetStride(I6); - } - - // this is ugly, only for 8d - __host__ __device__ unsigned Get1dIndex(unsigned i0, - unsigned i1, - unsigned i2, - unsigned i3, - unsigned i4, - unsigned i5, - unsigned i6, - unsigned i7) const - { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - constexpr auto I7 = Number<7>{}; - - static_assert(nDim == 8, "nDim is not 8"); - return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3) + - i4 * GetStride(I4) + i5 * GetStride(I5) + i6 * GetStride(I6) + i7 * GetStride(I7); + return id; } __host__ __device__ constexpr auto Condense() const @@ -385,6 +162,12 @@ struct ConstantTensorDescriptor constexpr auto default_strides = calculate_default_strides(Lengths{}); return ConstantTensorDescriptor{}; } + + template + __host__ __device__ constexpr auto Vectorize(Number, Number) const + { + assert(false); // not implemented + } }; template diff --git 
a/src/include/Sequence.hip.hpp b/src/include/Sequence.hip.hpp new file mode 100644 index 0000000000..c8ca7a0f24 --- /dev/null +++ b/src/include/Sequence.hip.hpp @@ -0,0 +1,92 @@ +#pragma once +#include "constant_integral.hip.hpp" +#include "functional.hip.hpp" + +template +struct Sequence +{ + using Type = Sequence; + + static constexpr unsigned nDim = sizeof...(Is); + + const unsigned mData[nDim] = {Is...}; + + template + __host__ __device__ constexpr unsigned Get(Number) const + { + return mData[I]; + } + + // this is ugly, only for nDIm = 4 + template + __host__ __device__ constexpr auto ReorderByGetNewFromOld(Sequence) const + { + static_assert(nDim == 4, "nDim != 4"); + + constexpr auto old_sequence = Type{}; + + constexpr unsigned NR0 = old_sequence.mData[I0]; + constexpr unsigned NR1 = old_sequence.mData[I1]; + constexpr unsigned NR2 = old_sequence.mData[I2]; + constexpr unsigned NR3 = old_sequence.mData[I3]; + + return Sequence{}; + } + + template + __host__ __device__ constexpr auto ReorderByPutOldToNew(Sequence) const + { + // don't know how to implement this + printf("Sequence::ReorderByPutOldToNew not implemented"); + assert(false); + } + + template + __host__ __device__ constexpr auto PushBack(Number) const + { + return Sequence{}; + } + + __host__ __device__ constexpr auto PopBack() const; + + template + __host__ __device__ constexpr auto Transform(F f) const + { + return Sequence{}; + } +}; + +template +__host__ __device__ constexpr auto sequence_pop_back(Sequence) +{ + static_assert(sizeof...(Is) >= 1, "empty Sequence!"); + return Sequence{}; +} + +template +__host__ __device__ constexpr auto sequence_sequence_op(Sequence, Sequence, F f) +{ + static_assert(Sequence::nDim == Sequence::nDim, "Dim not the same"); + + return Sequence{}; +} + +template +__host__ __device__ constexpr auto sequence_sequence_add(Sequence, Sequence) +{ + struct add + { + __host__ __device__ constexpr unsigned operator()(unsigned x, unsigned y) const + { + return x + y; + } + 
}; + + return sequence_sequence_op(Sequence{}, Sequence{}, add{}); +} + +template +__host__ __device__ constexpr auto Sequence::PopBack() const +{ + return sequence_pop_back(Type{}); +} diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp index 2df008fcad..f447fce784 100644 --- a/src/include/common.hip.hpp +++ b/src/include/common.hip.hpp @@ -1,4 +1,8 @@ #pragma once +#include "constant_integral.hip.hpp" +#include "Sequence.hip.hpp" +#include "Array.hip.hpp" +#include "functional.hip.hpp" __device__ unsigned get_thread_local_1d_id() { return threadIdx.x; } @@ -91,54 +95,6 @@ struct vector_type }; #endif -template -struct integral_constant -{ - static const T value = N; - - __host__ __device__ constexpr T Get() const { return value; } -}; - -template -using Number = integral_constant; - -template -struct Sequence -{ - using Type = Sequence; - - static constexpr unsigned nDim = sizeof...(Is); - - const unsigned mData[nDim] = {Is...}; - - template - __host__ __device__ constexpr unsigned Get(Number) const - { - return mData[I]; - } - - template - __host__ __device__ constexpr auto ReorderByGetNewFromOld(Sequence) const - { - constexpr auto old_sequence = Type{}; - - constexpr unsigned NR0 = old_sequence.mData[I0]; - constexpr unsigned NR1 = old_sequence.mData[I1]; - constexpr unsigned NR2 = old_sequence.mData[I2]; - constexpr unsigned NR3 = old_sequence.mData[I3]; - - return Sequence{}; - } - - template - __host__ __device__ constexpr auto ReorderByPutOldToNew(Sequence) const - { - // don't know how to implement this - printf("Sequence::ReorderByPutOldToNew not implemented"); - assert(false); - } -}; - template __host__ __device__ constexpr T max(T a, T b) { diff --git a/src/include/constant_integral.hip.hpp b/src/include/constant_integral.hip.hpp new file mode 100644 index 0000000000..70dc69d181 --- /dev/null +++ b/src/include/constant_integral.hip.hpp @@ -0,0 +1,12 @@ +#pragma once + +template +struct integral_constant +{ + static const T value = N; 
+ + __host__ __device__ constexpr T Get() const { return value; } +}; + +template +using Number = integral_constant; diff --git a/src/include/functional.hip.hpp b/src/include/functional.hip.hpp new file mode 100644 index 0000000000..598d5c3c71 --- /dev/null +++ b/src/include/functional.hip.hpp @@ -0,0 +1,49 @@ +#pragma once +#include "constant_integral.hip.hpp" + +template +struct static_loop_n +{ + template + __host__ __device__ void operator()(F f) const + { + static_assert(NLoop > 1, "out-of-range"); + + f(Number{}); + static_loop_n{}(f); + } +}; + +template <> +struct static_loop_n<1> +{ + template + __host__ __device__ void operator()(F f) const + { + f(Number<0>{}); + } +}; + +template +struct static_const_reduce_n +{ + template + __host__ __device__ constexpr auto operator()(F f, Reduce r) const + { + static_assert(NLoop > 1, "out-of-range"); + + constexpr auto a = f(Number{}); + auto b = static_const_reduce_n{}(f, r); // cannot use constexpr here, weird + return r(a, b); + } +}; + +template <> +struct static_const_reduce_n<1> +{ + template + __host__ __device__ constexpr auto operator()(F f, Reduce) const + { + return f(Number<0>{}); + } +}; diff --git a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp new file mode 100644 index 0000000000..cb2a8a5087 --- /dev/null +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp @@ -0,0 +1,195 @@ +#pragma once +#include "common.hip.hpp" +#include "ConstantTensorDescriptor.hip.hpp" +#include "blockwise_4d_tensor_op.hip.hpp" +#include "blockwise_direct_convolution.hip.hpp" +#include "threadwise_4d_tensor_op.hip.hpp" +#include "threadwise_direct_convolution.hip.hpp" + +template +__global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( + const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) +{ + 
constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto in_global_desc = InGlobalDesc{}; + constexpr auto wei_global_desc = WeiGlobalDesc{}; + constexpr auto out_global_desc = OutGlobalDesc{}; + + constexpr unsigned Y = wei_global_desc.GetLength(I2); + constexpr unsigned X = wei_global_desc.GetLength(I3); + + constexpr unsigned HiPerBlock = HoPerBlock + Y - 1; + constexpr unsigned WiPerBlock = WoPerBlock + X - 1; + + constexpr auto in_block_desc = + make_ConstantTensorDescriptor(Sequence{}); + + constexpr auto wei_block_desc = + make_ConstantTensorDescriptor(Sequence{}); + + // shared mem + constexpr unsigned in_block_size = in_block_desc.GetElementSpace(); + constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace(); + + __shared__ Float p_in_block[in_block_size]; + __shared__ Float p_wei_block[wei_block_size]; + + // threadwise tensors + constexpr unsigned HiPerThread = HoPerThread + Y - 1; + constexpr unsigned WiPerThread = WoPerThread + X - 1; + + constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor( + Sequence{}, in_block_desc.GetStrides()); + + constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor( + Sequence{}, wei_block_desc.GetStrides()); + + constexpr auto out_thread_desc = get_convolution_output_default_4d_tensor_descriptor( + in_thread_block_desc, wei_thread_block_desc); + + // register + Float p_out_thread[out_thread_desc.GetElementSpace()]; + + // divide block work + constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock; + constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock; + constexpr unsigned HBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock; + constexpr unsigned WBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock; + + const unsigned block_id = blockIdx.x; + + unsigned itmp = block_id; 
+ const unsigned n_block_work_id = itmp / (KBlockWork * HBlockWork * WBlockWork); + itmp -= n_block_work_id * (KBlockWork * HBlockWork * WBlockWork); + const unsigned k_block_work_id = itmp / (HBlockWork * WBlockWork); + itmp -= k_block_work_id * (HBlockWork * WBlockWork); + const unsigned h_block_work_id = itmp / WBlockWork; + const unsigned w_block_work_id = itmp - h_block_work_id * WBlockWork; + + const unsigned n_block_data_begin = n_block_work_id * NPerBlock; + const unsigned k_block_data_begin = k_block_work_id * KPerBlock; + const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock; + const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock; + + const unsigned hi_block_data_begin = ho_block_data_begin; // minus padding + const unsigned wi_block_data_begin = wo_block_data_begin; // minus padding + + // divide thread work + constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread; + constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread; + constexpr unsigned HThreadWork = (HoPerBlock + HoPerThread - 1) / HoPerThread; + constexpr unsigned WThreadWork = (WoPerBlock + WoPerThread - 1) / WoPerThread; + + const unsigned thread_id = threadIdx.x; + + itmp = thread_id; + const unsigned n_thread_work_id = itmp / (KThreadWork * HThreadWork * WThreadWork); + itmp -= n_thread_work_id * (KThreadWork * HThreadWork * WThreadWork); + const unsigned k_thread_work_id = itmp / (HThreadWork * WThreadWork); + itmp -= k_thread_work_id * (HThreadWork * WThreadWork); + const unsigned h_thread_work_id = itmp / WThreadWork; + const unsigned w_thread_work_id = itmp - h_thread_work_id * WThreadWork; + + const unsigned n_thread_data_begin = n_thread_work_id * NPerThread; + const unsigned k_thread_data_begin = k_thread_work_id * KPerThread; + const unsigned ho_thread_data_begin = h_thread_work_id * HoPerThread; + const unsigned wo_thread_data_begin = w_thread_work_id * WoPerThread; + + const unsigned hi_thread_data_begin = 
ho_thread_data_begin; + const unsigned wi_thread_data_begin = wo_thread_data_begin; + + constexpr auto blockwise_in_copy = + Blockwise4dTensorCopy1{}; + + constexpr auto blockwise_wei_copy = + Blockwise4dTensorCopy1{}; + + // set threadwise output tensor to 0 + threadwise_4d_tensor_set_zero(out_thread_desc, p_out_thread); + + for(unsigned c_block_data_begin = 0; c_block_data_begin < in_global_desc.GetLength(I1); + c_block_data_begin += CPerBlock, __syncthreads()) + { + // copy input tensor to LDS + blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), + p_in_block); + + // copy weight tensor to LDS + blockwise_wei_copy.Run( + p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), + p_wei_block); + + __syncthreads(); + + for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread) + { +// threadwise convolution +#if 1 + threadwise_direct_convolution_2( + in_thread_block_desc, + p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), + wei_thread_block_desc, + p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + out_thread_desc, + p_out_thread); +#elif 0 + threadwise_direct_convolution_3( + in_thread_block_desc, + p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), + wei_thread_block_desc, + p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + out_thread_desc, + p_out_thread); +#endif + } + } + + // copy output tensor from register to global mem + threadwise_4d_tensor_copy( + out_thread_desc, + p_out_thread, + out_global_desc, + p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + 
wo_block_data_begin + wo_thread_data_begin), + out_thread_desc.GetLengths()); +} From 03eef73c5be07a1e02c090eacd24f0a9f6aa850e Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sun, 17 Mar 2019 15:36:38 -0500 Subject: [PATCH 03/13] refactoring block copy --- ...ce_direct_convolution_2_nchw_kcyx_nkhw.hpp | 9 +- driver/driver.hip.cpp | 8 +- src/include/blockwise_2d_tensor_op.hip.hpp | 5 +- src/include/blockwise_4d_tensor_op.hip.hpp | 124 ++++++++++++-- ...lution_2_vectorized_nchw_kcyx_nkhw.hip.hpp | 161 +++++++++++------- 5 files changed, 224 insertions(+), 83 deletions(-) diff --git a/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp index d91757dc8f..2fc2264f6d 100644 --- a/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp +++ b/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp @@ -1,7 +1,7 @@ #pragma once #include #include "device.hpp" -#include "gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp" +//#include "gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp" #include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp" template @@ -47,6 +47,9 @@ void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc, constexpr unsigned HoPerThread = 2; constexpr unsigned WoPerThread = 2; + constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned WeiBlockCopyDataPerRead = 4; + constexpr unsigned BlockSize = 128; #endif @@ -59,7 +62,7 @@ void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { float time = launch_kernel( -#if 0 +#if 0 gridwise_direct_convolution_2_nchw_kcyx_nkhw #else gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw @@ -78,6 +81,8 @@ void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc, CPerThread, HoPerThread, WoPerThread, + InBlockCopyDataPerRead, + WeiBlockCopyDataPerRead, BlockSize, GridSize>, dim3(GridSize), diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index 6cd75afd79..a952b95380 100644 --- 
a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -7,11 +7,11 @@ #include "tensor.hpp" #include "ConstantTensorDescriptor.hip.hpp" #include "conv_common.hip.hpp" -#include "device_direct_convolution_1.hpp" +//#include "device_direct_convolution_1.hpp" #include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp" -#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp" -#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp" -#include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp" +//#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp" +//#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp" +//#include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp" struct GeneratorTensor_1 { diff --git a/src/include/blockwise_2d_tensor_op.hip.hpp b/src/include/blockwise_2d_tensor_op.hip.hpp index b54c4d0c5f..761c32a370 100644 --- a/src/include/blockwise_2d_tensor_op.hip.hpp +++ b/src/include/blockwise_2d_tensor_op.hip.hpp @@ -383,8 +383,9 @@ struct Blockwise2dTensorCopy3 constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; - static_assert(SrcDesc{}.GetStride(I1) == 1 && DstDesc{}.GetStride(I1) == 1, - "wrong! only support stride1 == 1!\n"); + static_assert(DataPerRead == 1 || + (SrcDesc{}.GetStride(I1) == 1 && DstDesc{}.GetStride(I1) == 1), + "wrong! only support stride1 == 1 if DataPerRead > 1!\n"); static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4, "wrong! 
only support DataPerRead == 1, 2 or 4!\n"); diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp index cc50d9eecd..693b6fe9d5 100644 --- a/src/include/blockwise_4d_tensor_op.hip.hpp +++ b/src/include/blockwise_4d_tensor_op.hip.hpp @@ -131,11 +131,11 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds did[3] = is / ref_desc.GetStride(I3); - const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]); + const unsigned src_index = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]); - const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]); + const unsigned dst_index = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]); - f(p_src[aindex], p_dst[bindex]); + f(p_src[src_index], p_dst[dst_index]); } constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize); @@ -162,11 +162,11 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds did[3] = is / ref_desc.GetStride(I3); - const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]); + const unsigned src_index = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]); - const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]); + const unsigned dst_index = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]); - f(p_src[aindex], p_dst[bindex]); + f(p_src[src_index], p_dst[dst_index]); } } } @@ -199,15 +199,112 @@ blockwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc, SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, DstFromSrcReorder{}, f_copy); } -template +template struct Blockwise4dTensorCopy1 { + using vector_t = typename vector_type::type; + + __device__ void SanityCheck() const + { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + static_assert(DataPerRead == 1 || + (SrcDesc{}.GetStride(I3) == 1 && 
DstDesc{}.GetStride(I3) == 1), + "wrong! only support stride3 == 1 if DataPerRead > 1!\n"); + + static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4, + "wrong! only support DataPerRead == 1, 2 or 4!\n"); + + static_assert(SrcDesc{}.GetStride(I2) % DataPerRead == 0 && + DstDesc{}.GetStride(I2) % DataPerRead == 0, + "src and dst stride2 should be multiple of DataPerRead to keep alignment"); + + // we allow out-of-bound read from src in D3 dimension, + // but we need to make sure dst stride2 is big enough, + // so that the out-of-bound write won't contaminate next line in dst + constexpr unsigned L3 = CopyLengths{}.Get(I3); + constexpr unsigned read_per_d3 = integer_divide_ceil(L3, DataPerRead); + + static_assert(read_per_d3 * DataPerRead <= DstDesc{}.GetStride(I2), + "wrong! out-of-bound write will contaminate next line!\n"); + } + __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const { - constexpr auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{}; + SanityCheck(); - blockwise_4d_tensor_copy_reorder_by_get_dst_from_src( - SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, dst_from_src_reorder); + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto src_desc = SrcDesc{}; + constexpr auto dst_desc = DstDesc{}; + + constexpr unsigned L0 = CopyLengths{}.Get(I0); + constexpr unsigned L1 = CopyLengths{}.Get(I1); + constexpr unsigned L2 = CopyLengths{}.Get(I2); + constexpr unsigned L3 = CopyLengths{}.Get(I3); + + constexpr unsigned read_per_d3 = integer_divide_ceil(L3, DataPerRead); + + constexpr auto ref_desc = + make_ConstantTensorDescriptor(Sequence{}); + + constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize; + + auto f_copy = [&](unsigned is) { + unsigned did[4]; + + did[0] = is / ref_desc.GetStride(I0); + + is -= did[0] * ref_desc.GetStride(I0); + + did[1] = is / ref_desc.GetStride(I1); + + is -= did[1] 
* ref_desc.GetStride(I1); + + did[2] = is / ref_desc.GetStride(I2); + + is -= did[2] * ref_desc.GetStride(I2); + + did[3] = is / ref_desc.GetStride(I3); + + const unsigned src_index = + src_desc.Get1dIndex(did[0], did[1], did[2], did[3] * DataPerRead); + const unsigned dst_index = + dst_desc.Get1dIndex(did[0], did[1], did[2], did[3] * DataPerRead); + + *(reinterpret_cast(p_dst + dst_index)) = + *(reinterpret_cast(p_src + src_index)); + }; + + for(unsigned iloop = 0; iloop < NLoop; ++iloop) + { + unsigned is = threadIdx.x + iloop * BlockSize; + + f_copy(is); + } + + constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize); + + if(has_tail) + { + unsigned is = threadIdx.x + NLoop * BlockSize; + + if(is < ref_desc.GetElementSize()) + { + f_copy(is); + } + } } }; @@ -361,8 +458,9 @@ struct Blockwise4dTensorCopy3 constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - static_assert(SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1, - "wrong! only support stride3 == 1!\n"); + static_assert(DataPerRead == 1 || + (SrcDesc{}.GetStride(I3) == 1 && DstDesc{}.GetStride(I3) == 1), + "wrong! only support stride3 == 1 if DataPerRead > 1!\n"); static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4, "wrong! only support DataPerRead == 1, 2 or 4!\n"); @@ -370,7 +468,7 @@ struct Blockwise4dTensorCopy3 static_assert( SrcDesc{}.GetStride(I2) % DataPerRead == 0 && DstDesc{}.GetStride(I2) % DataPerRead == 0, - "wrong! src and dst stride should be multiple of DataPerRead to keep alignment"); + "wrong! 
src and dst stride2 should be multiple of DataPerRead to keep alignment"); constexpr unsigned L0 = CopyLengths{}.Get(I0); constexpr unsigned L1 = CopyLengths{}.Get(I1); diff --git a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp index cb2a8a5087..8ce097bdc5 100644 --- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp @@ -1,6 +1,7 @@ #pragma once #include "common.hip.hpp" #include "ConstantTensorDescriptor.hip.hpp" +#include "blockwise_2d_tensor_op.hip.hpp" #include "blockwise_4d_tensor_op.hip.hpp" #include "blockwise_direct_convolution.hip.hpp" #include "threadwise_4d_tensor_op.hip.hpp" @@ -20,6 +21,8 @@ template __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( @@ -32,50 +35,72 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto in_global_desc = InGlobalDesc{}; - constexpr auto wei_global_desc = WeiGlobalDesc{}; - constexpr auto out_global_desc = OutGlobalDesc{}; + constexpr auto in_nchw_global_desc = InGlobalDesc{}; + constexpr auto wei_kcyx_global_desc = WeiGlobalDesc{}; + constexpr auto out_nkhw_global_desc = OutGlobalDesc{}; - constexpr unsigned Y = wei_global_desc.GetLength(I2); - constexpr unsigned X = wei_global_desc.GetLength(I3); + constexpr unsigned N = in_nchw_global_desc.GetLength(I0); + constexpr unsigned K = wei_kcyx_global_desc.GetLength(I0); + constexpr unsigned C = wei_kcyx_global_desc.GetLength(I1); + constexpr unsigned Y = wei_kcyx_global_desc.GetLength(I2); + constexpr unsigned X = wei_kcyx_global_desc.GetLength(I3); + + constexpr auto wei_ke_global_desc = make_ConstantTensorDescriptor( + Sequence{}); // 2d view of wei for blockwise copy constexpr unsigned HiPerBlock = HoPerBlock + Y - 1; constexpr 
unsigned WiPerBlock = WoPerBlock + X - 1; - constexpr auto in_block_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto in_nchw_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); - constexpr auto wei_block_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto wei_ke_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, + Number{}); // 2d view of wei for blockwise copy + + constexpr auto wei_kcyx_block_desc = + make_ConstantTensorDescriptor(Sequence{}, + Sequence{}); // shared mem - constexpr unsigned in_block_size = in_block_desc.GetElementSpace(); - constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace(); + constexpr unsigned in_block_size = + in_nchw_block_desc.GetElementSpace(Number{}); + constexpr unsigned wei_block_size = + wei_kcyx_block_desc.GetElementSpace(Number{}); - __shared__ Float p_in_block[in_block_size]; - __shared__ Float p_wei_block[wei_block_size]; + constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead + ? 
InBlockCopyDataPerRead + : WeiBlockCopyDataPerRead; + + __shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)]; + __shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)]; // threadwise tensors constexpr unsigned HiPerThread = HoPerThread + Y - 1; constexpr unsigned WiPerThread = WoPerThread + X - 1; - constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor( - Sequence{}, in_block_desc.GetStrides()); + constexpr auto in_nchw_thread_block_desc = + make_ConstantTensorDescriptor(Sequence{}, + in_nchw_block_desc.GetStrides()); - constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor( - Sequence{}, wei_block_desc.GetStrides()); + constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor( + Sequence{}, wei_kcyx_block_desc.GetStrides()); - constexpr auto out_thread_desc = get_convolution_output_default_4d_tensor_descriptor( - in_thread_block_desc, wei_thread_block_desc); + constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor( + in_nchw_thread_block_desc, wei_kcyx_thread_block_desc); // register - Float p_out_thread[out_thread_desc.GetElementSpace()]; + Float p_out_thread[out_nkhw_thread_desc.GetElementSpace()]; // divide block work - constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock; - constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock; - constexpr unsigned HBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock; - constexpr unsigned WBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock; + constexpr unsigned NBlockWork = + (out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock; + constexpr unsigned KBlockWork = + (out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock; + constexpr unsigned HBlockWork = + (out_nkhw_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock; + constexpr unsigned 
WBlockWork = + (out_nkhw_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock; const unsigned block_id = blockIdx.x; @@ -122,34 +147,44 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( constexpr auto blockwise_in_copy = Blockwise4dTensorCopy1{}; + decltype(in_nchw_global_desc), + decltype(in_nchw_block_desc), + decltype(in_nchw_block_desc.GetLengths()), + InBlockCopyDataPerRead>{}; +#if 0 constexpr auto blockwise_wei_copy = Blockwise4dTensorCopy1{}; + decltype(wei_kcyx_global_desc), + decltype(wei_kcyx_block_desc), + decltype(wei_kcyx_block_desc.GetLengths())>{}; +#elif 1 + const auto blockwise_wei_copy = Blockwise2dTensorCopy3{}; +#endif // set threadwise output tensor to 0 - threadwise_4d_tensor_set_zero(out_thread_desc, p_out_thread); + threadwise_4d_tensor_set_zero(out_nkhw_thread_desc, p_out_thread); - for(unsigned c_block_data_begin = 0; c_block_data_begin < in_global_desc.GetLength(I1); + for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, __syncthreads()) { // copy input tensor to LDS - blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_data_begin, - c_block_data_begin, - hi_block_data_begin, - wi_block_data_begin), + blockwise_in_copy.Run(p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), p_in_block); // copy weight tensor to LDS - blockwise_wei_copy.Run( - p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), - p_wei_block); + blockwise_wei_copy.Run(p_wei_global + wei_kcyx_global_desc.Get1dIndex( + k_block_data_begin, c_block_data_begin, 0, 0), + p_wei_block); __syncthreads(); @@ -158,25 +193,27 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( // threadwise convolution #if 1 threadwise_direct_convolution_2( - in_thread_block_desc, - p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - 
hi_thread_data_begin, - wi_thread_data_begin), - wei_thread_block_desc, - p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), - out_thread_desc, + in_nchw_thread_block_desc, + p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), + wei_kcyx_thread_block_desc, + p_wei_block + + wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + out_nkhw_thread_desc, p_out_thread); #elif 0 threadwise_direct_convolution_3( - in_thread_block_desc, - p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), - wei_thread_block_desc, - p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), - out_thread_desc, + in_nchw_thread_block_desc, + p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), + wei_kcyx_thread_block_desc, + p_wei_block + + wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + out_nkhw_thread_desc, p_out_thread); #endif } @@ -184,12 +221,12 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( // copy output tensor from register to global mem threadwise_4d_tensor_copy( - out_thread_desc, + out_nkhw_thread_desc, p_out_thread, - out_global_desc, - p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), - out_thread_desc.GetLengths()); + out_nkhw_global_desc, + p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), + out_nkhw_thread_desc.GetLengths()); } From 7faf269c995e5594935a16dfdae75a49f62f4991 Mon Sep 
17 00:00:00 2001 From: Chao Liu Date: Sun, 17 Mar 2019 21:48:46 -0500 Subject: [PATCH 04/13] refactor --- ...irect_convolution_2_nchw_kcyx_nkhw.hip.hpp | 170 +++++++++++------- ...lution_2_vectorized_nchw_kcyx_nkhw.hip.hpp | 3 +- 2 files changed, 106 insertions(+), 67 deletions(-) diff --git a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp index 322d5fd9c2..da4542d9cb 100644 --- a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp @@ -1,6 +1,7 @@ #pragma once #include "common.hip.hpp" #include "ConstantTensorDescriptor.hip.hpp" +#include "blockwise_2d_tensor_op.hip.hpp" #include "blockwise_4d_tensor_op.hip.hpp" #include "blockwise_direct_convolution.hip.hpp" #include "threadwise_4d_tensor_op.hip.hpp" @@ -20,62 +21,86 @@ template -__global__ void -gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_in_global, - const Float* const __restrict__ p_wei_global, - Float* const __restrict__ p_out_global) +__global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( + const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto in_global_desc = InGlobalDesc{}; - constexpr auto wei_global_desc = WeiGlobalDesc{}; - constexpr auto out_global_desc = OutGlobalDesc{}; + constexpr auto in_nchw_global_desc = InGlobalDesc{}; + constexpr auto wei_kcyx_global_desc = WeiGlobalDesc{}; + constexpr auto out_nkhw_global_desc = OutGlobalDesc{}; - constexpr unsigned Y = wei_global_desc.GetLength(I2); - constexpr unsigned X = wei_global_desc.GetLength(I3); + constexpr unsigned N = in_nchw_global_desc.GetLength(I0); + constexpr unsigned K = 
wei_kcyx_global_desc.GetLength(I0); + constexpr unsigned C = wei_kcyx_global_desc.GetLength(I1); + constexpr unsigned Y = wei_kcyx_global_desc.GetLength(I2); + constexpr unsigned X = wei_kcyx_global_desc.GetLength(I3); + + constexpr auto wei_ke_global_desc = make_ConstantTensorDescriptor( + Sequence{}); // 2d view of wei for blockwise copy constexpr unsigned HiPerBlock = HoPerBlock + Y - 1; constexpr unsigned WiPerBlock = WoPerBlock + X - 1; - constexpr auto in_block_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto in_nchw_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); - constexpr auto wei_block_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto wei_ke_block_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, + Number{}); // 2d view of wei for blockwise copy + + constexpr auto wei_kcyx_block_desc = + make_ConstantTensorDescriptor(Sequence{}, + Sequence{}); // shared mem - constexpr unsigned in_block_size = in_block_desc.GetElementSpace(); - constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace(); + constexpr unsigned in_block_size = + in_nchw_block_desc.GetElementSpace(Number{}); + constexpr unsigned wei_block_size = + wei_kcyx_block_desc.GetElementSpace(Number{}); - __shared__ Float p_in_block[in_block_size]; - __shared__ Float p_wei_block[wei_block_size]; + constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead + ? 
InBlockCopyDataPerRead + : WeiBlockCopyDataPerRead; + + __shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)]; + __shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)]; // threadwise tensors constexpr unsigned HiPerThread = HoPerThread + Y - 1; constexpr unsigned WiPerThread = WoPerThread + X - 1; - constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor( - Sequence{}, in_block_desc.GetStrides()); + constexpr auto in_nchw_thread_block_desc = + make_ConstantTensorDescriptor(Sequence{}, + in_nchw_block_desc.GetStrides()); - constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor( - Sequence{}, wei_block_desc.GetStrides()); + constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor( + Sequence{}, wei_kcyx_block_desc.GetStrides()); - constexpr auto out_thread_desc = get_convolution_output_default_4d_tensor_descriptor( - in_thread_block_desc, wei_thread_block_desc); + constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor( + in_nchw_thread_block_desc, wei_kcyx_thread_block_desc); // register - Float p_out_thread[out_thread_desc.GetElementSpace()]; + Float p_out_thread[out_nkhw_thread_desc.GetElementSpace()]; // divide block work - constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock; - constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock; - constexpr unsigned HBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock; - constexpr unsigned WBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock; + constexpr unsigned NBlockWork = + (out_nkhw_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock; + constexpr unsigned KBlockWork = + (out_nkhw_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock; + constexpr unsigned HBlockWork = + (out_nkhw_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock; + constexpr unsigned 
WBlockWork = + (out_nkhw_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock; const unsigned block_id = blockIdx.x; @@ -122,34 +147,45 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i constexpr auto blockwise_in_copy = Blockwise4dTensorCopy1{}; + decltype(in_nchw_global_desc), + decltype(in_nchw_block_desc), + decltype(in_nchw_block_desc.GetLengths()), + InBlockCopyDataPerRead>{}; +#if 0 constexpr auto blockwise_wei_copy = Blockwise4dTensorCopy1{}; + decltype(wei_kcyx_global_desc), + decltype(wei_kcyx_block_desc), + decltype(wei_kcyx_block_desc.GetLengths()), + 1>{}; +#elif 1 + const auto blockwise_wei_copy = Blockwise2dTensorCopy3{}; +#endif // set threadwise output tensor to 0 - threadwise_4d_tensor_set_zero(out_thread_desc, p_out_thread); + threadwise_4d_tensor_set_zero(out_nkhw_thread_desc, p_out_thread); - for(unsigned c_block_data_begin = 0; c_block_data_begin < in_global_desc.GetLength(I1); + for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, __syncthreads()) { // copy input tensor to LDS - blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_data_begin, - c_block_data_begin, - hi_block_data_begin, - wi_block_data_begin), + blockwise_in_copy.Run(p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), p_in_block); // copy weight tensor to LDS - blockwise_wei_copy.Run( - p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), - p_wei_block); + blockwise_wei_copy.Run(p_wei_global + wei_kcyx_global_desc.Get1dIndex( + k_block_data_begin, c_block_data_begin, 0, 0), + p_wei_block); __syncthreads(); @@ -158,25 +194,27 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i // threadwise convolution #if 1 threadwise_direct_convolution_2( - in_thread_block_desc, - p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, - 
c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), - wei_thread_block_desc, - p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), - out_thread_desc, + in_nchw_thread_block_desc, + p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), + wei_kcyx_thread_block_desc, + p_wei_block + + wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + out_nkhw_thread_desc, p_out_thread); #elif 0 threadwise_direct_convolution_3( - in_thread_block_desc, - p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), - wei_thread_block_desc, - p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), - out_thread_desc, + in_nchw_thread_block_desc, + p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), + wei_kcyx_thread_block_desc, + p_wei_block + + wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + out_nkhw_thread_desc, p_out_thread); #endif } @@ -184,12 +222,12 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i // copy output tensor from register to global mem threadwise_4d_tensor_copy( - out_thread_desc, + out_nkhw_thread_desc, p_out_thread, - out_global_desc, - p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), - out_thread_desc.GetLengths()); + out_nkhw_global_desc, + p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), + out_nkhw_thread_desc.GetLengths()); } diff --git 
a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp index 8ce097bdc5..da4542d9cb 100644 --- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp @@ -158,7 +158,8 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( Float, decltype(wei_kcyx_global_desc), decltype(wei_kcyx_block_desc), - decltype(wei_kcyx_block_desc.GetLengths())>{}; + decltype(wei_kcyx_block_desc.GetLengths()), + 1>{}; #elif 1 const auto blockwise_wei_copy = Blockwise2dTensorCopy3 Date: Mon, 18 Mar 2019 15:03:17 -0500 Subject: [PATCH 05/13] adding fp16 direct that reads pre-vectorized data --- ...ce_direct_convolution_2_nchw_kcyx_nkhw.hpp | 74 ++++---- ...onvolution_2_vectorized_nchw_kcyx_nkhw.hpp | 160 ++++++++++++++++++ driver/driver.hip.cpp | 157 ++++++++--------- src/include/blockwise_2d_tensor_op.hip.hpp | 2 +- src/include/blockwise_4d_tensor_op.hip.hpp | 8 +- src/include/common.hip.hpp | 38 +++-- ...irect_convolution_2_nchw_kcyx_nkhw.hip.hpp | 8 +- ...lution_2_vectorized_nchw_kcyx_nkhw.hip.hpp | 97 ++++++----- 8 files changed, 369 insertions(+), 175 deletions(-) create mode 100644 driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp diff --git a/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp index 2fc2264f6d..4a49ff9fc8 100644 --- a/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp +++ b/driver/device_direct_convolution_2_nchw_kcyx_nkhw.hpp @@ -1,8 +1,7 @@ #pragma once #include #include "device.hpp" -//#include "gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp" -#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp" +#include "gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp" template void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc, @@ -50,6 +49,24 @@ 
void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc, constexpr unsigned InBlockCopyDataPerRead = 2; constexpr unsigned WeiBlockCopyDataPerRead = 4; + constexpr unsigned BlockSize = 128; +#elif 1 + // 3x3, 34x34, 128 thread, fp16 + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 4; + constexpr unsigned HoPerBlock = 2; + constexpr unsigned WoPerBlock = 32; + + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 2; + constexpr unsigned HoPerThread = 2; + constexpr unsigned WoPerThread = 2; + + constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned WeiBlockCopyDataPerRead = 4; + constexpr unsigned BlockSize = 128; #endif @@ -61,35 +78,30 @@ void device_direct_convolution_2_nchw_kcyx_nkhw(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { - float time = launch_kernel( -#if 0 - gridwise_direct_convolution_2_nchw_kcyx_nkhw -#else - gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw -#endif - , - dim3(GridSize), - dim3(BlockSize), - static_cast(in_device_buf.GetDeviceBuffer()), - static_cast(wei_device_buf.GetDeviceBuffer()), - static_cast(out_device_buf.GetDeviceBuffer())); + float time = + launch_kernel(gridwise_direct_convolution_2_nchw_kcyx_nkhw, + dim3(GridSize), + dim3(BlockSize), + static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer())); printf("Elapsed time : %f ms\n", time); usleep(std::min(time * 1000, float(10000))); diff --git a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp new file mode 100644 index 0000000000..19633f0462 --- /dev/null +++ b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp @@ -0,0 +1,160 @@ +#pragma once +#include +#include "device.hpp" +#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp" + +template +void 
device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, + const Tensor& in_nchw, + WeiDesc, + const Tensor& wei_kcyx, + OutDesc, + Tensor& out_nkhw, + unsigned nrepeat) +{ + constexpr unsigned NVector = 1; + using vector_type_t = vector_type; + using vector_t = typename vector_type_t::VectorType; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto in_nchw_desc = InDesc{}; + constexpr auto wei_kcyx_desc = WeiDesc{}; + constexpr auto out_nkhw_desc = OutDesc{}; + + constexpr unsigned Hi = in_nchw_desc.GetLength(I2); + constexpr unsigned Wi = in_nchw_desc.GetLength(I3); + + constexpr unsigned N = out_nkhw_desc.GetLength(I0); + constexpr unsigned Ho = out_nkhw_desc.GetLength(I2); + constexpr unsigned Wo = out_nkhw_desc.GetLength(I3); + + constexpr unsigned K = wei_kcyx_desc.GetLength(I0); + constexpr unsigned C = wei_kcyx_desc.GetLength(I1); + constexpr unsigned Y = wei_kcyx_desc.GetLength(I2); + constexpr unsigned X = wei_kcyx_desc.GetLength(I3); + + // vectorized input + auto in_nchw_vec_desc = make_ConstantTensorDescriptor(Sequence{}); + ostream_ConstantTensorDescriptor(in_nchw_vec_desc, std::cout << "in_nchw_vec_desc: "); + + Tensor in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc)); + + auto f_vectorized_nchw = [&](auto n, auto c, auto h, auto w) { +#if 1 + in_nchw_vec(n, c, h, w) = in_nchw(n, c, h, w); +#else + in_nchw_vec(n, c, h, w) = + vector_type_t::pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w)); +#endif + }; + + make_ParallelTensorFunctor(f_vectorized_nchw, N, C, Hi, Wi)( + std::thread::hardware_concurrency()); + + // vectorize weight + auto wei_kcyx_vec_desc = make_ConstantTensorDescriptor(Sequence{}); + ostream_ConstantTensorDescriptor(wei_kcyx_vec_desc, std::cout << "wei_kcyx_vec_desc: "); + + Tensor wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc)); + + auto f_vectorized_kcyx = [&](auto k, auto c, auto y, auto x) { +#if 1 + 
wei_kcyx_vec(k, c, y, x) = wei_kcyx(k, c, y, x); +#else + wei_kcyx_vec(k, c, y, x) = + vector_type_t::pack(wei_kcyx(k, 2 * c, y, x), wei_kcyx(k, 2 * c + 1, y, x)); +#endif + }; + + make_ParallelTensorFunctor(f_vectorized_kcyx, K, C, Y, X)(std::thread::hardware_concurrency()); + + // + DeviceMem in_nchw_vec_device_buf(sizeof(vector_t) * in_nchw_vec.mDesc.GetElementSpace()); + DeviceMem wei_kcyx_vec_device_buf(sizeof(vector_t) * wei_kcyx_vec.mDesc.GetElementSpace()); + DeviceMem out_nkhw_device_buf(sizeof(T) * out_nkhw.mDesc.GetElementSpace()); + + in_nchw_vec_device_buf.ToDevice(in_nchw_vec.mData.data()); + wei_kcyx_vec_device_buf.ToDevice(wei_kcyx_vec.mData.data()); + out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); + +#if 1 + // 3x3, 34x34, 128 thread + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 4; + constexpr unsigned HoPerBlock = 2; + constexpr unsigned WoPerBlock = 32; + + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 2; + constexpr unsigned HoPerThread = 2; + constexpr unsigned WoPerThread = 2; + + constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned WeiBlockCopyDataPerRead = 4; + + constexpr unsigned BlockSize = 128; +#elif 1 + // 3x3, 34x34, 128 thread, fp16 + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 4; + constexpr unsigned HoPerBlock = 2; + constexpr unsigned WoPerBlock = 32; + + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 2; + constexpr unsigned HoPerThread = 2; + constexpr unsigned WoPerThread = 2; + + constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned WeiBlockCopyDataPerRead = 4; + + constexpr unsigned BlockSize = 128; +#endif + + constexpr unsigned GridSize = + (N / NPerBlock) * (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock); + + printf("%s: BlockSize %u, GridSize %u 
\n", __func__, BlockSize, GridSize); + + for(unsigned i = 0; i < nrepeat; ++i) + { + float time = launch_kernel( + gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw, + dim3(GridSize), + dim3(BlockSize), + static_cast(in_nchw_vec_device_buf.GetDeviceBuffer()), + static_cast(wei_kcyx_vec_device_buf.GetDeviceBuffer()), + static_cast(out_nkhw_device_buf.GetDeviceBuffer())); + + printf("Elapsed time : %f ms\n", time); + usleep(std::min(time * 1000, float(10000))); + } + + out_nkhw_device_buf.FromDevice(out_nkhw.mData.data()); +} diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index a952b95380..47ff1b6882 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -9,6 +9,7 @@ #include "conv_common.hip.hpp" //#include "device_direct_convolution_1.hpp" #include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp" +#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp" //#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp" //#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp" //#include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp" @@ -34,25 +35,6 @@ struct GeneratorTensor_2 } }; -struct GeneratorTensor_3 -{ - template - double operator()(Is... 
is) - { -#if 0 - std::initializer_list ls = {static_cast(is)...}; - return std::accumulate(ls.begin(), ls.end(), std::size_t(0)); -#elif 1 - assert(sizeof...(Is) > 0); - std::initializer_list ids = {static_cast(is)...}; - std::vector lens(sizeof...(Is), 100); - std::vector strides(sizeof...(Is), 1); - std::partial_sum(lens.rbegin(), lens.rbegin() + (sizeof...(Is) - 1), strides.rbegin() + 1); - return std::inner_product(ids.begin(), ids.end(), strides.begin(), std::size_t(0)) + 1; -#endif - } -}; - struct GeneratorTensor_Checkboard { template @@ -129,7 +111,7 @@ void host_direct_convolution( if(hi >= 0 && hi < in_nchw.mDesc.GetLengths()[2] && wi >= 0 && wi < in_nchw.mDesc.GetLengths()[3]) { - v += in_nchw(n, c, hi, wi) * wei_kcyx(k, c, y, x); + v += double(in_nchw(n, c, hi, wi)) * double(wei_kcyx(k, c, y, x)); } } } @@ -177,11 +159,11 @@ void host_winograd_3x3_convolution( std::size_t HTile = (HO + HoPerTile - 1) / HoPerTile; std::size_t WTile = (WO + WoPerTile - 1) / WoPerTile; - Tensor in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile}); - Tensor in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile}); - Tensor wei_transform({K, C, HiPerTile, WiPerTile}); - Tensor out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile}); - Tensor out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile}); + Tensor in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile}); + Tensor in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile}); + Tensor wei_transform({K, C, HiPerTile, WiPerTile}); + Tensor out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile}); + Tensor out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile}); auto f_in_hold = [&](auto n, auto c, auto htile, auto wtile) { for(int j = 0; j < HiPerTile; ++j) @@ -259,49 +241,61 @@ void host_winograd_3x3_convolution( }; auto f_wei_transform = [&](auto k, auto c) { - wei_transform(k, c, 0, 0) = wei_kcyx(k, c, 0, 0); - wei_transform(k, c, 0, 1) = - 0.5 * wei_kcyx(k, c, 0, 0) + 0.5 * wei_kcyx(k, c, 0, 1) + 0.5 * wei_kcyx(k, c, 0, 2); - 
wei_transform(k, c, 0, 2) = - 0.5 * wei_kcyx(k, c, 0, 0) - 0.5 * wei_kcyx(k, c, 0, 1) + 0.5 * wei_kcyx(k, c, 0, 2); - wei_transform(k, c, 0, 3) = wei_kcyx(k, c, 0, 2); + wei_transform(k, c, 0, 0) = double(wei_kcyx(k, c, 0, 0)); + wei_transform(k, c, 0, 1) = 0.5 * double(wei_kcyx(k, c, 0, 0)) + + 0.5 * double(wei_kcyx(k, c, 0, 1)) + + 0.5 * double(wei_kcyx(k, c, 0, 2)); + wei_transform(k, c, 0, 2) = 0.5 * double(wei_kcyx(k, c, 0, 0)) - + 0.5 * double(wei_kcyx(k, c, 0, 1)) + + 0.5 * double(wei_kcyx(k, c, 0, 2)); + wei_transform(k, c, 0, 3) = double(wei_kcyx(k, c, 0, 2)); - wei_transform(k, c, 1, 0) = - 0.5 * wei_kcyx(k, c, 0, 0) + 0.5 * wei_kcyx(k, c, 1, 0) + 0.5 * wei_kcyx(k, c, 2, 0); - wei_transform(k, c, 1, 1) = 0.25 * wei_kcyx(k, c, 0, 0) + 0.25 * wei_kcyx(k, c, 0, 1) + - 0.25 * wei_kcyx(k, c, 0, 2) + 0.25 * wei_kcyx(k, c, 1, 0) + - 0.25 * wei_kcyx(k, c, 1, 1) + 0.25 * wei_kcyx(k, c, 1, 2) + - 0.25 * wei_kcyx(k, c, 2, 0) + 0.25 * wei_kcyx(k, c, 2, 1) + - 0.25 * wei_kcyx(k, c, 2, 2); - wei_transform(k, c, 1, 2) = 0.25 * wei_kcyx(k, c, 0, 0) - 0.25 * wei_kcyx(k, c, 0, 1) + - 0.25 * wei_kcyx(k, c, 0, 2) + 0.25 * wei_kcyx(k, c, 1, 0) - - 0.25 * wei_kcyx(k, c, 1, 1) + 0.25 * wei_kcyx(k, c, 1, 2) + - 0.25 * wei_kcyx(k, c, 2, 0) - 0.25 * wei_kcyx(k, c, 2, 1) + - 0.25 * wei_kcyx(k, c, 2, 2); - wei_transform(k, c, 1, 3) = - 0.5 * wei_kcyx(k, c, 0, 2) + 0.5 * wei_kcyx(k, c, 1, 2) + 0.5 * wei_kcyx(k, c, 2, 2); + wei_transform(k, c, 1, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) + + 0.5 * double(wei_kcyx(k, c, 1, 0)) + + 0.5 * double(wei_kcyx(k, c, 2, 0)); + wei_transform(k, c, 1, 1) = + 0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) + + 0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) + + 0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) + + 0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) + + 0.25 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 1, 2) = + 0.25 * 
double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) + + 0.25 * double(wei_kcyx(k, c, 0, 2)) + 0.25 * double(wei_kcyx(k, c, 1, 0)) - + 0.25 * double(wei_kcyx(k, c, 1, 1)) + 0.25 * double(wei_kcyx(k, c, 1, 2)) + + 0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) + + 0.25 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 1, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) + + 0.5 * double(wei_kcyx(k, c, 1, 2)) + + 0.5 * double(wei_kcyx(k, c, 2, 2)); - wei_transform(k, c, 2, 0) = - 0.5 * wei_kcyx(k, c, 0, 0) - 0.5 * wei_kcyx(k, c, 1, 0) + 0.5 * wei_kcyx(k, c, 2, 0); - wei_transform(k, c, 2, 1) = 0.25 * wei_kcyx(k, c, 0, 0) + 0.25 * wei_kcyx(k, c, 0, 1) + - 0.25 * wei_kcyx(k, c, 0, 2) - 0.25 * wei_kcyx(k, c, 1, 0) - - 0.25 * wei_kcyx(k, c, 1, 1) - 0.25 * wei_kcyx(k, c, 1, 2) + - 0.25 * wei_kcyx(k, c, 2, 0) + 0.25 * wei_kcyx(k, c, 2, 1) + - 0.25 * wei_kcyx(k, c, 2, 2); - wei_transform(k, c, 2, 2) = 0.25 * wei_kcyx(k, c, 0, 0) - 0.25 * wei_kcyx(k, c, 0, 1) + - 0.25 * wei_kcyx(k, c, 0, 2) - 0.25 * wei_kcyx(k, c, 1, 0) + - 0.25 * wei_kcyx(k, c, 1, 1) - 0.25 * wei_kcyx(k, c, 1, 2) + - 0.25 * wei_kcyx(k, c, 2, 0) - 0.25 * wei_kcyx(k, c, 2, 1) + - 0.25 * wei_kcyx(k, c, 2, 2); - wei_transform(k, c, 2, 3) = - 0.5 * wei_kcyx(k, c, 0, 2) - 0.5 * wei_kcyx(k, c, 1, 2) + 0.5 * wei_kcyx(k, c, 2, 2); + wei_transform(k, c, 2, 0) = 0.5 * double(wei_kcyx(k, c, 0, 0)) - + 0.5 * double(wei_kcyx(k, c, 1, 0)) + + 0.5 * double(wei_kcyx(k, c, 2, 0)); + wei_transform(k, c, 2, 1) = + 0.25 * double(wei_kcyx(k, c, 0, 0)) + 0.25 * double(wei_kcyx(k, c, 0, 1)) + + 0.25 * double(wei_kcyx(k, c, 0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) - + 0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) + + 0.25 * double(wei_kcyx(k, c, 2, 0)) + 0.25 * double(wei_kcyx(k, c, 2, 1)) + + 0.25 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 2, 2) = + 0.25 * double(wei_kcyx(k, c, 0, 0)) - 0.25 * double(wei_kcyx(k, c, 0, 1)) + + 0.25 * double(wei_kcyx(k, c, 
0, 2)) - 0.25 * double(wei_kcyx(k, c, 1, 0)) + + 0.25 * double(wei_kcyx(k, c, 1, 1)) - 0.25 * double(wei_kcyx(k, c, 1, 2)) + + 0.25 * double(wei_kcyx(k, c, 2, 0)) - 0.25 * double(wei_kcyx(k, c, 2, 1)) + + 0.25 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 2, 3) = 0.5 * double(wei_kcyx(k, c, 0, 2)) - + 0.5 * double(wei_kcyx(k, c, 1, 2)) + + 0.5 * double(wei_kcyx(k, c, 2, 2)); - wei_transform(k, c, 3, 0) = wei_kcyx(k, c, 2, 0); - wei_transform(k, c, 3, 1) = - 0.5 * wei_kcyx(k, c, 2, 0) + 0.5 * wei_kcyx(k, c, 2, 1) + 0.5 * wei_kcyx(k, c, 2, 2); - wei_transform(k, c, 3, 2) = - 0.5 * wei_kcyx(k, c, 2, 0) - 0.5 * wei_kcyx(k, c, 2, 1) + 0.5 * wei_kcyx(k, c, 2, 2); - wei_transform(k, c, 3, 3) = wei_kcyx(k, c, 2, 2); + wei_transform(k, c, 3, 0) = double(wei_kcyx(k, c, 2, 0)); + wei_transform(k, c, 3, 1) = 0.5 * double(wei_kcyx(k, c, 2, 0)) + + 0.5 * double(wei_kcyx(k, c, 2, 1)) + + 0.5 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 3, 2) = 0.5 * double(wei_kcyx(k, c, 2, 0)) - + 0.5 * double(wei_kcyx(k, c, 2, 1)) + + 0.5 * double(wei_kcyx(k, c, 2, 2)); + wei_transform(k, c, 3, 3) = double(wei_kcyx(k, c, 2, 2)); }; auto f_out_transform = [&](auto n, auto k, auto htile, auto wtile) { @@ -372,20 +366,25 @@ void host_winograd_3x3_convolution( template void check_error(const Tensor& ref, const Tensor& result) { + // printf("\n"); + float error = 0; float max_diff = -1; float ref_value = 0, result_value = 0; for(int i = 0; i < ref.mData.size(); ++i) { - error += std::abs(ref.mData[i] - result.mData[i]); - float diff = std::abs(ref.mData[i] - result.mData[i]); + error += std::abs(double(ref.mData[i]) - double(result.mData[i])); + float diff = std::abs(double(ref.mData[i]) - double(result.mData[i])); if(max_diff < diff) { max_diff = diff; ref_value = ref.mData[i]; result_value = result.mData[i]; } + + // printf("{%f, %f}", double(ref.mData[i]), double(result.mData[i])); } + // printf("\n"); std::cout << "error: " << error << std::endl; std::cout << "max_diff: " << 
max_diff << ", " << ref_value << ", " << result_value << std::endl; @@ -406,13 +405,13 @@ int main(int argc, char* argv[]) constexpr unsigned WPad = 0; #elif 1 // 3x3, 34x34 - constexpr unsigned N = 64; - constexpr unsigned C = 256; + constexpr unsigned N = 64; + constexpr unsigned C = 256; constexpr unsigned HI = 34; constexpr unsigned WI = 34; - constexpr unsigned K = 64; - constexpr unsigned Y = 3; - constexpr unsigned X = 3; + constexpr unsigned K = 64; + constexpr unsigned Y = 3; + constexpr unsigned X = 3; constexpr unsigned HPad = 0; constexpr unsigned WPad = 0; @@ -603,16 +602,22 @@ int main(int argc, char* argv[]) in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); #elif 1 - in_nchw.GenerateTensorValue(GeneratorTensor_2{-2, 2}, num_thread); - wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread); + in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); + + auto gen_wei = [](auto... is) { + return GeneratorTensor_2{1, 5}(is...) 
* GeneratorTensor_Checkboard{}(is...); + }; + wei_kcyx.GenerateTensorValue(gen_wei, num_thread); #endif } #if 1 #if 0 device_direct_convolution_1 -#elif 1 +#elif 0 device_direct_convolution_2_nchw_kcyx_nkhw +#elif 1 + device_direct_convolution_2_vectorized_nchw_kcyx_nkhw #elif 0 device_implicit_gemm_convolution_1_chwn_cyxk_khwn #elif 0 @@ -634,7 +639,6 @@ int main(int argc, char* argv[]) if(do_verification) { -#if 1 if(Y == 3 && X == 3) { host_winograd_3x3_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads); @@ -644,7 +648,6 @@ int main(int argc, char* argv[]) host_direct_convolution(in_nchw, wei_kcyx, out_nkhw_host, lower_pads, upper_pads); } check_error(out_nkhw_host, out_nkhw_device); -#endif #if 0 LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl; diff --git a/src/include/blockwise_2d_tensor_op.hip.hpp b/src/include/blockwise_2d_tensor_op.hip.hpp index 761c32a370..969f18c4e8 100644 --- a/src/include/blockwise_2d_tensor_op.hip.hpp +++ b/src/include/blockwise_2d_tensor_op.hip.hpp @@ -373,7 +373,7 @@ template struct Blockwise2dTensorCopy3 { - using vector_t = typename vector_type::type; + using vector_t = typename vector_type::VectorType; unsigned mSrcMyThreadOffset; unsigned mDstMyThreadOffset; diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp index 693b6fe9d5..e8829c0cbf 100644 --- a/src/include/blockwise_4d_tensor_op.hip.hpp +++ b/src/include/blockwise_4d_tensor_op.hip.hpp @@ -207,9 +207,9 @@ template struct Blockwise4dTensorCopy1 { - using vector_t = typename vector_type::type; + using vector_t = typename vector_type::VectorType; - __device__ void SanityCheck() const + __device__ constexpr Blockwise4dTensorCopy1() { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -239,8 +239,6 @@ struct Blockwise4dTensorCopy1 __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const { - SanityCheck(); - constexpr auto I0 = Number<0>{}; 
constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; @@ -446,7 +444,7 @@ template struct Blockwise4dTensorCopy3 { - using vector_t = typename vector_type::type; + using vector_t = typename vector_type::VectorType; unsigned mSrcMyThreadOffset; unsigned mDstMyThreadOffset; diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp index f447fce784..aa7e2269f6 100644 --- a/src/include/common.hip.hpp +++ b/src/include/common.hip.hpp @@ -28,44 +28,44 @@ struct vector_type template <> struct vector_type { - using type = float; + using VectorType = float; }; template <> struct vector_type { - using type = float2; + using VectorType = float2; }; template <> struct vector_type { - using type = float4; + using VectorType = float4; }; #if 0 template <> struct vector_type { - using type = half_float::half; + using VectorType = half_float::half; }; template <> struct vector_type { - using type = float; + using VectorType = float; }; template <> struct vector_type { - using type = float2; + using VectorType = float2; }; template <> struct vector_type { - using type = float4; + using VectorType = float4; }; #endif @@ -73,25 +73,41 @@ struct vector_type template <> struct vector_type { - using type = half; + using VectorType = half; + + __host__ __device__ static VectorType pack(half s) { return s; } }; template <> struct vector_type { - using type = half2; + using VectorType = half2; + + union Data + { + VectorType vector; + half scalar[2]; + }; + + __host__ __device__ static VectorType pack(half s0, half s1) + { + Data data; + data.scalar[0] = s0; + data.scalar[1] = s1; + return data.vector; + } }; template <> struct vector_type { - using type = float2; + using VectorType = float2; }; template <> struct vector_type { - using type = float4; + using VectorType = float4; }; #endif diff --git a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp index da4542d9cb..5761a22c16 100644 
--- a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp @@ -25,10 +25,10 @@ template -__global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( - const Float* const __restrict__ p_in_global, - const Float* const __restrict__ p_wei_global, - Float* const __restrict__ p_out_global) +__global__ void +gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; diff --git a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp index da4542d9cb..2b3cb03b78 100644 --- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp @@ -11,6 +11,7 @@ template __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( - const Float* const __restrict__ p_in_global, - const Float* const __restrict__ p_wei_global, + const typename vector_type::VectorType* const __restrict__ p_in_global, + const typename vector_type::VectorType* const __restrict__ p_wei_global, Float* const __restrict__ p_out_global) { + using scalar_t = Float; + using vector_t = typename vector_type::VectorType; + constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto in_nchw_global_desc = InGlobalDesc{}; - constexpr auto wei_kcyx_global_desc = WeiGlobalDesc{}; - constexpr auto out_nkhw_global_desc = OutGlobalDesc{}; + constexpr auto in_nchw_vec_global_desc = InGlobalDesc{}; + constexpr auto wei_kcyx_vec_global_desc = WeiGlobalDesc{}; + constexpr auto out_nkhw_global_desc = OutGlobalDesc{}; - constexpr unsigned N = 
in_nchw_global_desc.GetLength(I0); - constexpr unsigned K = wei_kcyx_global_desc.GetLength(I0); - constexpr unsigned C = wei_kcyx_global_desc.GetLength(I1); - constexpr unsigned Y = wei_kcyx_global_desc.GetLength(I2); - constexpr unsigned X = wei_kcyx_global_desc.GetLength(I3); + constexpr unsigned N = in_nchw_vec_global_desc.GetLength(I0); + constexpr unsigned K = wei_kcyx_vec_global_desc.GetLength(I0); + constexpr unsigned C = wei_kcyx_vec_global_desc.GetLength(I1); + constexpr unsigned Y = wei_kcyx_vec_global_desc.GetLength(I2); + constexpr unsigned X = wei_kcyx_vec_global_desc.GetLength(I3); - constexpr auto wei_ke_global_desc = make_ConstantTensorDescriptor( + constexpr auto wei_ke_vec_global_desc = make_ConstantTensorDescriptor( Sequence{}); // 2d view of wei for blockwise copy constexpr unsigned HiPerBlock = HoPerBlock + Y - 1; constexpr unsigned WiPerBlock = WoPerBlock + X - 1; - constexpr auto in_nchw_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto in_nchw_vec_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); - constexpr auto wei_ke_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto wei_ke_vec_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); // 2d view of wei for blockwise copy - constexpr auto wei_kcyx_block_desc = + constexpr auto wei_kcyx_vec_block_desc = make_ConstantTensorDescriptor(Sequence{}, - Sequence{}); + Sequence{}); // shared mem constexpr unsigned in_block_size = - in_nchw_block_desc.GetElementSpace(Number{}); + in_nchw_vec_block_desc.GetElementSpace(Number{}); constexpr unsigned wei_block_size = - wei_kcyx_block_desc.GetElementSpace(Number{}); + wei_kcyx_vec_block_desc.GetElementSpace(Number{}); constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead ? 
InBlockCopyDataPerRead @@ -81,10 +85,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( constexpr auto in_nchw_thread_block_desc = make_ConstantTensorDescriptor(Sequence{}, - in_nchw_block_desc.GetStrides()); + in_nchw_vec_block_desc.GetStrides()); constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor( - Sequence{}, wei_kcyx_block_desc.GetStrides()); + Sequence{}, wei_kcyx_vec_block_desc.GetStrides()); constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor( in_nchw_thread_block_desc, wei_kcyx_thread_block_desc); @@ -147,26 +151,27 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( constexpr auto blockwise_in_copy = Blockwise4dTensorCopy1{}; #if 0 constexpr auto blockwise_wei_copy = Blockwise4dTensorCopy1{}; #elif 1 - const auto blockwise_wei_copy = Blockwise2dTensorCopy3{}; + const auto blockwise_wei_copy = + Blockwise2dTensorCopy3{}; #endif // set threadwise output tensor to 0 @@ -176,14 +181,14 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( c_block_data_begin += CPerBlock, __syncthreads()) { // copy input tensor to LDS - blockwise_in_copy.Run(p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin, - c_block_data_begin, - hi_block_data_begin, - wi_block_data_begin), + blockwise_in_copy.Run(p_in_global + in_nchw_vec_global_desc.Get1dIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), p_in_block); // copy weight tensor to LDS - blockwise_wei_copy.Run(p_wei_global + wei_kcyx_global_desc.Get1dIndex( + blockwise_wei_copy.Run(p_wei_global + wei_kcyx_vec_global_desc.Get1dIndex( k_block_data_begin, c_block_data_begin, 0, 0), p_wei_block); @@ -195,25 +200,25 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( #if 1 threadwise_direct_convolution_2( in_nchw_thread_block_desc, - p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - 
hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_thread_block_desc, p_wei_block + - wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), out_nkhw_thread_desc, p_out_thread); #elif 0 threadwise_direct_convolution_3( in_nchw_thread_block_desc, - p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_thread_block_desc, p_wei_block + - wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), out_nkhw_thread_desc, p_out_thread); #endif From 2832520418fe81185b1a44f8421ec32f2efc1714 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 18 Mar 2019 15:09:52 -0500 Subject: [PATCH 06/13] adding fp16 direct that reads pre-vectorized data --- ...lution_2_vectorized_nchw_kcyx_nkhw.hip.hpp | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp index 2b3cb03b78..5901c42e55 100644 --- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp @@ -27,8 +27,8 @@ template __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( - const typename vector_type::VectorType* const __restrict__ p_in_global, - const typename vector_type::VectorType* const __restrict__ p_wei_global, + const typename vector_type::VectorType* const __restrict__ 
p_in_vec_global, + const typename vector_type::VectorType* const __restrict__ p_wei_vec_global, Float* const __restrict__ p_out_global) { using scalar_t = Float; @@ -76,25 +76,25 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( ? InBlockCopyDataPerRead : WeiBlockCopyDataPerRead; - __shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)]; - __shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)]; + __shared__ vector_t p_in_vec_block[max_align * ((in_block_size + max_align - 1) / max_align)]; + __shared__ vector_t p_wei_vec_block[max_align * ((wei_block_size + max_align - 1) / max_align)]; // threadwise tensors constexpr unsigned HiPerThread = HoPerThread + Y - 1; constexpr unsigned WiPerThread = WoPerThread + X - 1; - constexpr auto in_nchw_thread_block_desc = + constexpr auto in_nchw_vec_thread_block_desc = make_ConstantTensorDescriptor(Sequence{}, in_nchw_vec_block_desc.GetStrides()); - constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor( + constexpr auto wei_kcyx_vec_thread_block_desc = make_ConstantTensorDescriptor( Sequence{}, wei_kcyx_vec_block_desc.GetStrides()); constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor( - in_nchw_thread_block_desc, wei_kcyx_thread_block_desc); + in_nchw_vec_thread_block_desc, wei_kcyx_vec_thread_block_desc); // register - Float p_out_thread[out_nkhw_thread_desc.GetElementSpace()]; + scalar_t p_out_thread[out_nkhw_thread_desc.GetElementSpace()]; // divide block work constexpr unsigned NBlockWork = @@ -150,7 +150,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( constexpr auto blockwise_in_copy = Blockwise4dTensorCopy1 Date: Mon, 18 Mar 2019 18:16:02 -0500 Subject: [PATCH 07/13] adding fp16 direct that reads pre-vectorized data --- ...onvolution_2_vectorized_nchw_kcyx_nkhw.hpp | 49 +++++++--- src/include/blockwise_2d_tensor_op.hip.hpp | 2 +- 
src/include/blockwise_4d_tensor_op.hip.hpp | 4 +- src/include/common.hip.hpp | 92 +------------------ src/include/config.h.in | 2 - src/include/functional.hip.hpp | 8 ++ ...lution_2_vectorized_nchw_kcyx_nkhw.hip.hpp | 46 ++++++---- .../threadwise_direct_convolution.hip.hpp | 55 ++++------- 8 files changed, 92 insertions(+), 166 deletions(-) diff --git a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp index 19633f0462..d16d05d978 100644 --- a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp +++ b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp @@ -13,8 +13,8 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, unsigned nrepeat) { constexpr unsigned NVector = 1; - using vector_type_t = vector_type; - using vector_t = typename vector_type_t::VectorType; + using vector_t = vector_type; + using vector_mem_t = typename vector_t::MemoryType; constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -41,40 +41,41 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, auto in_nchw_vec_desc = make_ConstantTensorDescriptor(Sequence{}); ostream_ConstantTensorDescriptor(in_nchw_vec_desc, std::cout << "in_nchw_vec_desc: "); - Tensor in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc)); + Tensor in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc)); auto f_vectorized_nchw = [&](auto n, auto c, auto h, auto w) { #if 1 in_nchw_vec(n, c, h, w) = in_nchw(n, c, h, w); #else in_nchw_vec(n, c, h, w) = - vector_type_t::pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w)); + vector_t::Pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w)); #endif }; - make_ParallelTensorFunctor(f_vectorized_nchw, N, C, Hi, Wi)( + make_ParallelTensorFunctor(f_vectorized_nchw, N, C / NVector, Hi, Wi)( std::thread::hardware_concurrency()); // vectorize weight auto wei_kcyx_vec_desc = make_ConstantTensorDescriptor(Sequence{}); 
ostream_ConstantTensorDescriptor(wei_kcyx_vec_desc, std::cout << "wei_kcyx_vec_desc: "); - Tensor wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc)); + Tensor wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc)); auto f_vectorized_kcyx = [&](auto k, auto c, auto y, auto x) { #if 1 wei_kcyx_vec(k, c, y, x) = wei_kcyx(k, c, y, x); #else wei_kcyx_vec(k, c, y, x) = - vector_type_t::pack(wei_kcyx(k, 2 * c, y, x), wei_kcyx(k, 2 * c + 1, y, x)); + vector_t::Pack(wei_kcyx(k, 2 * c, y, x), wei_kcyx(k, 2 * c + 1, y, x)); #endif }; - make_ParallelTensorFunctor(f_vectorized_kcyx, K, C, Y, X)(std::thread::hardware_concurrency()); + make_ParallelTensorFunctor(f_vectorized_kcyx, K, C / NVector, Y, X)( + std::thread::hardware_concurrency()); // - DeviceMem in_nchw_vec_device_buf(sizeof(vector_t) * in_nchw_vec.mDesc.GetElementSpace()); - DeviceMem wei_kcyx_vec_device_buf(sizeof(vector_t) * wei_kcyx_vec.mDesc.GetElementSpace()); + DeviceMem in_nchw_vec_device_buf(sizeof(vector_mem_t) * in_nchw_vec.mDesc.GetElementSpace()); + DeviceMem wei_kcyx_vec_device_buf(sizeof(vector_mem_t) * wei_kcyx_vec.mDesc.GetElementSpace()); DeviceMem out_nkhw_device_buf(sizeof(T) * out_nkhw.mDesc.GetElementSpace()); in_nchw_vec_device_buf.ToDevice(in_nchw_vec.mData.data()); @@ -82,7 +83,7 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); #if 1 - // 3x3, 34x34, 128 thread + // 3x3, 34x34, 128 thread, fp32, vector = 1 constexpr unsigned NPerBlock = 2; constexpr unsigned KPerBlock = 32; constexpr unsigned CPerBlock = 4; @@ -96,24 +97,42 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, constexpr unsigned WoPerThread = 2; constexpr unsigned InBlockCopyDataPerRead = 2; - constexpr unsigned WeiBlockCopyDataPerRead = 4; + constexpr unsigned WeiBlockCopyDataPerRead = 2; constexpr unsigned BlockSize = 128; #elif 1 - // 3x3, 34x34, 128 thread, fp16 + // 3x3, 34x34, 128 thread, fp32, vector = 2 constexpr unsigned 
NPerBlock = 2; constexpr unsigned KPerBlock = 32; - constexpr unsigned CPerBlock = 4; + constexpr unsigned CPerBlock = 2; constexpr unsigned HoPerBlock = 2; constexpr unsigned WoPerBlock = 32; constexpr unsigned NPerThread = 2; constexpr unsigned KPerThread = 4; - constexpr unsigned CPerThread = 2; + constexpr unsigned CPerThread = 1; constexpr unsigned HoPerThread = 2; constexpr unsigned WoPerThread = 2; constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned WeiBlockCopyDataPerRead = 2; + + constexpr unsigned BlockSize = 128; +#elif 1 + // 3x3, 34x34, 128 thread, fp16 + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 4; + constexpr unsigned HoPerBlock = 2; + constexpr unsigned WoPerBlock = 32; + + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 2; + constexpr unsigned HoPerThread = 2; + constexpr unsigned WoPerThread = 2; + + constexpr unsigned InBlockCopyDataPerRead = 2; constexpr unsigned WeiBlockCopyDataPerRead = 4; constexpr unsigned BlockSize = 128; diff --git a/src/include/blockwise_2d_tensor_op.hip.hpp b/src/include/blockwise_2d_tensor_op.hip.hpp index 969f18c4e8..ce3a7a37b9 100644 --- a/src/include/blockwise_2d_tensor_op.hip.hpp +++ b/src/include/blockwise_2d_tensor_op.hip.hpp @@ -373,7 +373,7 @@ template struct Blockwise2dTensorCopy3 { - using vector_t = typename vector_type::VectorType; + using vector_t = typename vector_type::MemoryType; unsigned mSrcMyThreadOffset; unsigned mDstMyThreadOffset; diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp index e8829c0cbf..fa5f36be51 100644 --- a/src/include/blockwise_4d_tensor_op.hip.hpp +++ b/src/include/blockwise_4d_tensor_op.hip.hpp @@ -207,7 +207,7 @@ template struct Blockwise4dTensorCopy1 { - using vector_t = typename vector_type::VectorType; + using vector_t = typename vector_type::MemoryType; __device__ constexpr 
Blockwise4dTensorCopy1() { @@ -444,7 +444,7 @@ template struct Blockwise4dTensorCopy3 { - using vector_t = typename vector_type::VectorType; + using vector_t = typename vector_type::MemoryType; unsigned mSrcMyThreadOffset; unsigned mDstMyThreadOffset; diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp index aa7e2269f6..d5832dde9d 100644 --- a/src/include/common.hip.hpp +++ b/src/include/common.hip.hpp @@ -1,4 +1,5 @@ #pragma once +#include "data_type.hip.hpp" #include "constant_integral.hip.hpp" #include "Sequence.hip.hpp" #include "Array.hip.hpp" @@ -20,97 +21,6 @@ struct is_same static const bool value = true; }; -template -struct vector_type -{ -}; - -template <> -struct vector_type -{ - using VectorType = float; -}; - -template <> -struct vector_type -{ - using VectorType = float2; -}; - -template <> -struct vector_type -{ - using VectorType = float4; -}; - -#if 0 -template <> -struct vector_type -{ - using VectorType = half_float::half; -}; - -template <> -struct vector_type -{ - using VectorType = float; -}; - -template <> -struct vector_type -{ - using VectorType = float2; -}; - -template <> -struct vector_type -{ - using VectorType = float4; -}; -#endif - -#if 1 -template <> -struct vector_type -{ - using VectorType = half; - - __host__ __device__ static VectorType pack(half s) { return s; } -}; - -template <> -struct vector_type -{ - using VectorType = half2; - - union Data - { - VectorType vector; - half scalar[2]; - }; - - __host__ __device__ static VectorType pack(half s0, half s1) - { - Data data; - data.scalar[0] = s0; - data.scalar[1] = s1; - return data.vector; - } -}; - -template <> -struct vector_type -{ - using VectorType = float2; -}; - -template <> -struct vector_type -{ - using VectorType = float4; -}; -#endif - template __host__ __device__ constexpr T max(T a, T b) { diff --git a/src/include/config.h.in b/src/include/config.h.in index 9ee0c41f80..7b888c6951 100644 --- a/src/include/config.h.in +++ b/src/include/config.h.in 
@@ -4,10 +4,8 @@ #if DEVICE_BACKEND_HIP #include "hip/hip_runtime.h" -#include "half.hpp" #elif DEVICE_BACKEND_CUDA #include "cuda_runtime.h" #include "nvToolsExt.h" #include "helper_cuda.h" -#include "cuda_fp16.h" #endif diff --git a/src/include/functional.hip.hpp b/src/include/functional.hip.hpp index 598d5c3c71..d3f645eaae 100644 --- a/src/include/functional.hip.hpp +++ b/src/include/functional.hip.hpp @@ -47,3 +47,11 @@ struct static_const_reduce_n<1> return f(Number<0>{}); } }; + +#if 0 +template +__host__ __device__ constexpr auto unpacker(F f) +{ + return [=](auto xs_array){ f(xs...); }; +} +#endif \ No newline at end of file diff --git a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp index 5901c42e55..825977ab54 100644 --- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp @@ -27,12 +27,14 @@ template __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( - const typename vector_type::VectorType* const __restrict__ p_in_vec_global, - const typename vector_type::VectorType* const __restrict__ p_wei_vec_global, + const typename vector_type::MemoryType* const __restrict__ p_in_vec_global, + const typename vector_type::MemoryType* const __restrict__ p_wei_vec_global, Float* const __restrict__ p_out_global) { - using scalar_t = Float; - using vector_t = typename vector_type::VectorType; + using scalar_t = Float; + using vector_mem_t = typename vector_type::MemoryType; constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -69,6 +71,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( // shared mem constexpr unsigned in_block_size = in_nchw_vec_block_desc.GetElementSpace(Number{}); + constexpr unsigned wei_block_size = wei_kcyx_vec_block_desc.GetElementSpace(Number{}); @@ -76,8 +79,10 @@ 
__global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( ? InBlockCopyDataPerRead : WeiBlockCopyDataPerRead; - __shared__ vector_t p_in_vec_block[max_align * ((in_block_size + max_align - 1) / max_align)]; - __shared__ vector_t p_wei_vec_block[max_align * ((wei_block_size + max_align - 1) / max_align)]; + __shared__ vector_mem_t + p_in_vec_block[max_align * ((in_block_size + max_align - 1) / max_align)]; + __shared__ vector_mem_t + p_wei_vec_block[max_align * ((wei_block_size + max_align - 1) / max_align)]; // threadwise tensors constexpr unsigned HiPerThread = HoPerThread + Y - 1; @@ -150,7 +155,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( constexpr auto blockwise_in_copy = Blockwise4dTensorCopy1 +template __device__ void threadwise_direct_convolution_1(InDesc, - Float* const __restrict__ p_in, + TInWei* const __restrict__ p_in, WeiDesc, - Float* const __restrict__ p_wei, + TInWei* const __restrict__ p_wei, OutDesc, - Float* __restrict__ p_out) + TOut* __restrict__ p_out) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -51,25 +51,10 @@ __device__ void threadwise_direct_convolution_1(InDesc, const unsigned out_index = out_desc.Get1dIndex(n, k, ho, wo); - p_out[out_index] += p_wei[wei_index] * p_in[in_index]; - -#if 0 - // if(threadIdx.x == 0) - { - printf("threadwise_direct_convolution: \t" - "threadIdx.x %u\t" - "out_index %u, p_out[out_index] %f, \t" - "wei_index %u, p_wei[wei_index] %f, \t" - "in_index %u, p_in[in_index] %f\n", - threadIdx.x, - out_index, - p_out[out_index], - wei_index, - p_wei[wei_index], - in_index, - p_in[in_index]); - } -#endif + fused_multiply_add(p_out[out_index], + p_wei[wei_index], + p_in[in_index], + p_out[out_index]); } } } @@ -81,13 +66,13 @@ __device__ void threadwise_direct_convolution_1(InDesc, // Optimized for scenario if p_in and p_wei are in LDS, p_out are in register // Copy in and wei into register before doing convolution -template +template __device__ 
void threadwise_direct_convolution_2(InDesc, - Float* const __restrict__ p_in, + TInWei* const __restrict__ p_in, WeiDesc, - Float* const __restrict__ p_wei, + TInWei* const __restrict__ p_wei, OutDesc, - Float* __restrict__ p_out) + TOut* __restrict__ p_out) { constexpr auto in_desc = InDesc{}; constexpr auto wei_desc = WeiDesc{}; @@ -97,8 +82,8 @@ __device__ void threadwise_direct_convolution_2(InDesc, constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(wei_desc.GetLengths()); // register - Float p_in_reg[in_reg_desc.GetElementSpace()]; - Float p_wei_reg[wei_reg_desc.GetElementSpace()]; + TInWei p_in_reg[in_reg_desc.GetElementSpace()]; + TInWei p_wei_reg[wei_reg_desc.GetElementSpace()]; // copy input tensor into register threadwise_4d_tensor_copy(in_desc, p_in, in_reg_desc, p_in_reg, in_reg_desc.GetLengths()); @@ -114,13 +99,13 @@ __device__ void threadwise_direct_convolution_2(InDesc, // optimized for scenario where p_in and p_wei are in LDS, p_out is in register // break down a non-1x1 convolution into a sequence of 1x1 convolutions, // load 1x1 weight into register, and do 1x1 convolution in register. 
-template +template __device__ void threadwise_direct_convolution_3(InDesc, - Float* const __restrict__ p_in, + Data* const __restrict__ p_in, WeiDesc, - Float* const __restrict__ p_wei, + Data* const __restrict__ p_wei, OutDesc, - Float* __restrict__ p_out) + Data* __restrict__ p_out) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -139,8 +124,8 @@ __device__ void threadwise_direct_convolution_3(InDesc, constexpr auto wei_reg_desc = make_ConstantTensorDescriptor( Sequence{}); - Float p_in_reg[in_reg_desc.GetElementSpace()]; - Float p_wei_reg[wei_reg_desc.GetElementSpace()]; + Data p_in_reg[in_reg_desc.GetElementSpace()]; + Data p_wei_reg[wei_reg_desc.GetElementSpace()]; constexpr unsigned in_w_new_read = 1; From 18ffbd680273c5970ff1d105c1ee6fca99e0df88 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 18 Mar 2019 18:16:16 -0500 Subject: [PATCH 08/13] adding fp16 direct that reads pre-vectorized data --- src/include/data_type.hip.hpp | 157 ++++++++++++++++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 src/include/data_type.hip.hpp diff --git a/src/include/data_type.hip.hpp b/src/include/data_type.hip.hpp new file mode 100644 index 0000000000..fb0bdabee3 --- /dev/null +++ b/src/include/data_type.hip.hpp @@ -0,0 +1,157 @@ +#pragma once +#include "config.h" + +#if DEVICE_BACKEND_CUDA +namespace CUDA { +#include "cuda_fp16.h" +} +#endif + +using half = CUDA::half; +using half2 = CUDA::half2; + +struct half4 +{ + half data[4]; +}; + +struct half8 +{ + half data[8]; +}; + +template +struct vector_type +{ +}; + +template <> +struct vector_type +{ + using MemoryType = float; +}; + +template <> +struct vector_type +{ + using MemoryType = float2; + + __host__ __device__ static MemoryType Pack(float s0, float s1) + { + union + { + MemoryType vector; + float scalar[2]; + } data; + + data.scalar[0] = s0; + data.scalar[1] = s1; + return data.vector; + } +}; + +template <> +struct vector_type +{ + using MemoryType = float4; +}; + 
+template <> +struct vector_type +{ + using MemoryType = float4; +}; + +template <> +struct vector_type +{ + using MemoryType = half; + + __host__ __device__ static MemoryType Pack(half s) { return s; } +}; + +template <> +struct vector_type +{ + using MemoryType = half2; + + __host__ __device__ static MemoryType Pack(half s0, half s1) + { + union + { + MemoryType vector; + half scalar[2]; + } data; + + data.scalar[0] = s0; + data.scalar[1] = s1; + return data.vector; + } +}; + +template <> +struct vector_type +{ + using MemoryType = float2; +}; + +template <> +struct vector_type +{ + using MemoryType = float4; +}; + +template <> +struct vector_type +{ + using MemoryType = half2; +}; + +template <> +struct vector_type +{ + using MemoryType = float2; +}; + +template <> +struct vector_type +{ + using MemoryType = float4; +}; + +template +__device__ void fused_multiply_add(TDst& d, TSrc0 s0, TSrc1 s1, TSrc2 s2) +{ + printf("should not call into base"); + assert(false); +} + +template <> +__device__ void fused_multiply_add(float& d, float s0, float s1, float s2) +{ + d = s0 * s1 + s2; +} + +template <> +__device__ void fused_multiply_add(float& d, float2 s0, float2 s1, float s2) +{ + d = s0.x * s1.x + s0.y * s1.y + s2; +} + +template <> +__device__ void fused_multiply_add(float& d, float4 s0, float4 s1, float s2) +{ + d = s0.x * s1.x + s0.y * s1.y + s0.z * s1.z + s0.w * s1.w + s2; +} + +template <> +__device__ void fused_multiply_add(half& d, half s0, half s1, half s2) +{ + d = s0 * s1 + s2; +} + +template <> +__device__ void fused_multiply_add(half& d, half2 s0, half2 s1, half s2) +{ + d = s0.x * s1.x + s0.y * s1.y + s2; +} \ No newline at end of file From 050a1a6890007d577f9c83cef6e576a76788004a Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 19 Mar 2019 00:05:41 -0500 Subject: [PATCH 09/13] adding int8 direct that reads pre-vectorized data --- ...onvolution_2_vectorized_nchw_kcyx_nkhw.hpp | 71 ++++++--- driver/driver.hip.cpp | 65 ++++++--- 
src/include/data_type.hip.hpp | 138 +++++++++++++++--- ...lution_2_vectorized_nchw_kcyx_nkhw.hip.hpp | 30 ++-- src/include/threadwise_4d_tensor_op.hip.hpp | 32 ++-- .../threadwise_direct_convolution.hip.hpp | 6 +- 6 files changed, 247 insertions(+), 95 deletions(-) diff --git a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp index d16d05d978..5c687f4206 100644 --- a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp +++ b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp @@ -3,17 +3,18 @@ #include "device.hpp" #include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp" -template +template void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, - const Tensor& in_nchw, + const Tensor& in_nchw, WeiDesc, - const Tensor& wei_kcyx, + const Tensor& wei_kcyx, OutDesc, - Tensor& out_nkhw, + Tensor& out_nkhw, unsigned nrepeat) { - constexpr unsigned NVector = 1; - using vector_t = vector_type; + constexpr unsigned NVector = 4; + using accum_t = int32_t; + using vector_t = vector_type; using vector_mem_t = typename vector_t::MemoryType; constexpr auto I0 = Number<0>{}; @@ -44,11 +45,16 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, Tensor in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc)); auto f_vectorized_nchw = [&](auto n, auto c, auto h, auto w) { -#if 1 +#if 0 in_nchw_vec(n, c, h, w) = in_nchw(n, c, h, w); -#else +#elif 0 in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w)); +#elif 1 + in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 4 * c, h, w), + in_nchw(n, 4 * c + 1, h, w), + in_nchw(n, 4 * c + 2, h, w), + in_nchw(n, 4 * c + 3, h, w)); #endif }; @@ -62,11 +68,16 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, Tensor wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc)); auto f_vectorized_kcyx = [&](auto k, auto c, auto y, auto x) { -#if 1 
+#if 0 wei_kcyx_vec(k, c, y, x) = wei_kcyx(k, c, y, x); -#else +#elif 0 wei_kcyx_vec(k, c, y, x) = vector_t::Pack(wei_kcyx(k, 2 * c, y, x), wei_kcyx(k, 2 * c + 1, y, x)); +#elif 1 + wei_kcyx_vec(k, c, y, x) = vector_t::Pack(wei_kcyx(k, 4 * c, y, x), + wei_kcyx(k, 4 * c + 1, y, x), + wei_kcyx(k, 4 * c + 2, y, x), + wei_kcyx(k, 4 * c + 3, y, x)); #endif }; @@ -76,13 +87,13 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, // DeviceMem in_nchw_vec_device_buf(sizeof(vector_mem_t) * in_nchw_vec.mDesc.GetElementSpace()); DeviceMem wei_kcyx_vec_device_buf(sizeof(vector_mem_t) * wei_kcyx_vec.mDesc.GetElementSpace()); - DeviceMem out_nkhw_device_buf(sizeof(T) * out_nkhw.mDesc.GetElementSpace()); + DeviceMem out_nkhw_device_buf(sizeof(TOut) * out_nkhw.mDesc.GetElementSpace()); in_nchw_vec_device_buf.ToDevice(in_nchw_vec.mData.data()); wei_kcyx_vec_device_buf.ToDevice(wei_kcyx_vec.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); -#if 1 +#if 0 // 3x3, 34x34, 128 thread, fp32, vector = 1 constexpr unsigned NPerBlock = 2; constexpr unsigned KPerBlock = 32; @@ -100,7 +111,7 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, constexpr unsigned WeiBlockCopyDataPerRead = 2; constexpr unsigned BlockSize = 128; -#elif 1 +#elif 0 // 3x3, 34x34, 128 thread, fp32, vector = 2 constexpr unsigned NPerBlock = 2; constexpr unsigned KPerBlock = 32; @@ -117,9 +128,27 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, constexpr unsigned InBlockCopyDataPerRead = 2; constexpr unsigned WeiBlockCopyDataPerRead = 2; + constexpr unsigned BlockSize = 128; +#elif 0 + // 3x3, 34x34, 128 thread, int8, vector = 4 + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 4; + constexpr unsigned HoPerBlock = 2; + constexpr unsigned WoPerBlock = 32; + + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 1; + constexpr unsigned 
HoPerThread = 2; + constexpr unsigned WoPerThread = 2; + + constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned WeiBlockCopyDataPerRead = 2; + constexpr unsigned BlockSize = 128; #elif 1 - // 3x3, 34x34, 128 thread, fp16 + // 1x1, 32x32, 128 thread, int8, vector = 4 constexpr unsigned NPerBlock = 2; constexpr unsigned KPerBlock = 32; constexpr unsigned CPerBlock = 4; @@ -128,12 +157,12 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, constexpr unsigned NPerThread = 2; constexpr unsigned KPerThread = 4; - constexpr unsigned CPerThread = 2; + constexpr unsigned CPerThread = 1; constexpr unsigned HoPerThread = 2; constexpr unsigned WoPerThread = 2; constexpr unsigned InBlockCopyDataPerRead = 2; - constexpr unsigned WeiBlockCopyDataPerRead = 4; + constexpr unsigned WeiBlockCopyDataPerRead = 2; constexpr unsigned BlockSize = 128; #endif @@ -146,7 +175,9 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, for(unsigned i = 0; i < nrepeat; ++i) { float time = launch_kernel( - gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw, dim3(GridSize), dim3(BlockSize), - static_cast(in_nchw_vec_device_buf.GetDeviceBuffer()), - static_cast(wei_kcyx_vec_device_buf.GetDeviceBuffer()), - static_cast(out_nkhw_device_buf.GetDeviceBuffer())); + static_cast(in_nchw_vec_device_buf.GetDeviceBuffer()), + static_cast(wei_kcyx_vec_device_buf.GetDeviceBuffer()), + static_cast(out_nkhw_device_buf.GetDeviceBuffer())); printf("Elapsed time : %f ms\n", time); usleep(std::min(time * 1000, float(10000))); diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index 47ff1b6882..17b333c69a 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -88,9 +88,12 @@ auto make_TensorDescriptor(TConstTensorDesc) return TensorDescriptor(lengths, strides); } -template -void host_direct_convolution( - const Tensor& in_nchw, const Tensor& wei_kcyx, Tensor& out, LowerPads, UpperPads) +template +void host_direct_convolution(const Tensor& in_nchw, 
+ const Tensor& wei_kcyx, + Tensor& out_nkhw, + LowerPads, + UpperPads) { unsigned h_pad_low = LowerPads{}.Get(Number<0>{}); unsigned w_pad_low = LowerPads{}.Get(Number<1>{}); @@ -116,21 +119,24 @@ void host_direct_convolution( } } } - out(n, k, ho, wo) = v; + out_nkhw(n, k, ho, wo) = v; }; auto f_par = make_ParallelTensorFunctor(f, - out.mDesc.GetLengths()[0], - out.mDesc.GetLengths()[1], - out.mDesc.GetLengths()[2], - out.mDesc.GetLengths()[3]); + out_nkhw.mDesc.GetLengths()[0], + out_nkhw.mDesc.GetLengths()[1], + out_nkhw.mDesc.GetLengths()[2], + out_nkhw.mDesc.GetLengths()[3]); f_par(std::thread::hardware_concurrency()); } -template -void host_winograd_3x3_convolution( - const Tensor& in_nchw, const Tensor& wei_kcyx, Tensor& out, LowerPads, UpperPads) +template +void host_winograd_3x3_convolution(const Tensor& in_nchw, + const Tensor& wei_kcyx, + Tensor& out_nkhw, + LowerPads, + UpperPads) { constexpr std::size_t HoPerTile = 2; constexpr std::size_t WoPerTile = 2; @@ -144,8 +150,8 @@ void host_winograd_3x3_convolution( std::size_t Y = wei_kcyx.mDesc.GetLengths()[2]; std::size_t X = wei_kcyx.mDesc.GetLengths()[3]; - std::size_t HO = out.mDesc.GetLengths()[2]; - std::size_t WO = out.mDesc.GetLengths()[3]; + std::size_t HO = out_nkhw.mDesc.GetLengths()[2]; + std::size_t WO = out_nkhw.mDesc.GetLengths()[3]; unsigned h_pad_low = LowerPads{}.Get(Number<0>{}); unsigned w_pad_low = LowerPads{}.Get(Number<1>{}); @@ -180,7 +186,7 @@ void host_winograd_3x3_convolution( } else { - in_hold(n, c, htile, wtile, j, i) = T(0); + in_hold(n, c, htile, wtile, j, i) = TIn(0); } } } @@ -347,8 +353,8 @@ void host_winograd_3x3_convolution( std::size_t ho = HoPerTile * htile + j; for(int i = 0; i < WoPerTile; ++i) { - std::size_t wo = WoPerTile * wtile + i; - out(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i); + std::size_t wo = WoPerTile * wtile + i; + out_nkhw(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i); } } }; @@ -403,7 +409,7 @@ int main(int argc, char* argv[]) constexpr 
unsigned HPad = 0; constexpr unsigned WPad = 0; -#elif 1 +#elif 0 // 3x3, 34x34 constexpr unsigned N = 64; constexpr unsigned C = 256; @@ -502,7 +508,7 @@ int main(int argc, char* argv[]) constexpr unsigned HPad = 1; constexpr unsigned WPad = 1; -#elif 1 +#elif 0 // 1x1 filter, 28x28 image constexpr unsigned N = 16; constexpr unsigned C = 256; @@ -562,6 +568,18 @@ int main(int argc, char* argv[]) constexpr unsigned HPad = 2; constexpr unsigned WPad = 2; +#elif 1 + // 1x1 filter, 32x32 image + constexpr unsigned N = 64; + constexpr unsigned C = 256; + constexpr unsigned HI = 32; + constexpr unsigned WI = 32; + constexpr unsigned K = 512; + constexpr unsigned Y = 1; + constexpr unsigned X = 1; + + constexpr unsigned HPad = 0; + constexpr unsigned WPad = 0; #endif auto lower_pads = Sequence{}; @@ -576,11 +594,12 @@ int main(int argc, char* argv[]) ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: "); ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: "); - using Float = float; - Tensor in_nchw(make_TensorDescriptor(in_nchw_desc)); - Tensor wei_kcyx(make_TensorDescriptor(wei_kcyx_desc)); - Tensor out_nkhw_host(make_TensorDescriptor(out_nkhw_desc)); - Tensor out_nkhw_device(make_TensorDescriptor(out_nkhw_desc)); + using in_data_t = char; + using out_data_t = int32_t; + Tensor in_nchw(make_TensorDescriptor(in_nchw_desc)); + Tensor wei_kcyx(make_TensorDescriptor(wei_kcyx_desc)); + Tensor out_nkhw_host(make_TensorDescriptor(out_nkhw_desc)); + Tensor out_nkhw_device(make_TensorDescriptor(out_nkhw_desc)); std::size_t num_thread = std::thread::hardware_concurrency(); diff --git a/src/include/data_type.hip.hpp b/src/include/data_type.hip.hpp index fb0bdabee3..bb60adf154 100644 --- a/src/include/data_type.hip.hpp +++ b/src/include/data_type.hip.hpp @@ -10,16 +10,6 @@ namespace CUDA { using half = CUDA::half; using half2 = CUDA::half2; -struct half4 -{ - half data[4]; -}; - -struct half8 -{ - half data[8]; -}; - template 
struct vector_type { @@ -119,39 +109,141 @@ struct vector_type using MemoryType = float4; }; -template -__device__ void fused_multiply_add(TDst& d, TSrc0 s0, TSrc1 s1, TSrc2 s2) +template <> +struct vector_type { + using MemoryType = char; + + __host__ __device__ static MemoryType Pack(char s) { return s; } +}; + +template <> +struct vector_type +{ + using MemoryType = char2; + + __host__ __device__ static MemoryType Pack(char s0, char s1) + { + union + { + MemoryType vector; + char scalar[2]; + } data; + + data.scalar[0] = s0; + data.scalar[1] = s1; + return data.vector; + } +}; + +template <> +struct vector_type +{ + using MemoryType = char4; + + __host__ __device__ static MemoryType Pack(char s0, char s1, char s2, char s3) + { + union + { + MemoryType vector; + char scalar[4]; + } data; + + data.scalar[0] = s0; + data.scalar[1] = s1; + data.scalar[2] = s2; + data.scalar[3] = s3; + return data.vector; + } +}; + +template <> +struct vector_type +{ + using MemoryType = int64_t; +}; + +template <> +struct vector_type +{ + using MemoryType = char4; +}; + +template <> +struct vector_type +{ + using MemoryType = int64_t; +}; + +template <> +struct vector_type +{ + using MemoryType = int64_t; +}; + +template +__device__ void fused_multiply_accumulate(TDst& d, const TSrc0& s0, const TSrc1& s1) +{ + // static_assert(false, "should not call into base"); printf("should not call into base"); assert(false); } template <> -__device__ void fused_multiply_add(float& d, float s0, float s1, float s2) +__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1) { - d = s0 * s1 + s2; + d += s0 * s1; } template <> -__device__ void fused_multiply_add(float& d, float2 s0, float2 s1, float s2) +__device__ void fused_multiply_accumulate(float& d, const float2& s0, const float2& s1) { - d = s0.x * s1.x + s0.y * s1.y + s2; + d += s0.x * s1.x; + d += s0.y * s1.y; } template <> -__device__ void fused_multiply_add(float& d, float4 s0, float4 s1, float s2) +__device__ 
void fused_multiply_accumulate(float& d, const float4& s0, const float4& s1) { - d = s0.x * s1.x + s0.y * s1.y + s0.z * s1.z + s0.w * s1.w + s2; + d += s0.x * s1.x; + d += s0.y * s1.y; + d += s0.z * s1.z; + d += s0.w * s1.w; } template <> -__device__ void fused_multiply_add(half& d, half s0, half s1, half s2) +__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { - d = s0 * s1 + s2; + d += s0 * s1; } template <> -__device__ void fused_multiply_add(half& d, half2 s0, half2 s1, half s2) +__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1) { - d = s0.x * s1.x + s0.y * s1.y + s2; -} \ No newline at end of file + d += s0.x * s1.x; + d += s0.y * s1.y; +} + +#if 0 +template <> +__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1) +{ + d += s0.x * s1.x + s0.y * s1.y; +} +#endif + +template <> +__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) +{ + d += s0 * s1; +} + +template <> +__device__ void fused_multiply_accumulate(int32_t& d, const char4& s0, const char4& s1) +{ +#if DEVICE_BACKEND_CUDA + d = __dp4a(s0, s1, d); +#else + d += s0.x * s1.x + s0.y * s1.y + s0.z * s1.z + s0.w * s1.w; +#endif +} diff --git a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp index 825977ab54..7ae594788b 100644 --- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp @@ -7,7 +7,9 @@ #include "threadwise_4d_tensor_op.hip.hpp" #include "threadwise_direct_convolution.hip.hpp" -template __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( - const typename vector_type::MemoryType* const __restrict__ p_in_vec_global, - const typename vector_type::MemoryType* const __restrict__ p_wei_vec_global, - Float* const __restrict__ p_out_global) + 
TOut* const __restrict__ p_out_global) { - using scalar_t = Float; - using vector_mem_t = typename vector_type::MemoryType; + using in_scalar_t = TInWei; + using in_vector_mem_t = typename vector_type::MemoryType; + using out_scalar_t = TOut; + using accum_t = TAccum; constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -79,9 +83,9 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( ? InBlockCopyDataPerRead : WeiBlockCopyDataPerRead; - __shared__ vector_mem_t + __shared__ in_vector_mem_t p_in_vec_block[max_align * ((in_block_size + max_align - 1) / max_align)]; - __shared__ vector_mem_t + __shared__ in_vector_mem_t p_wei_vec_block[max_align * ((wei_block_size + max_align - 1) / max_align)]; // threadwise tensors @@ -99,7 +103,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( in_nchw_vec_thread_block_desc, wei_kcyx_vec_thread_block_desc); // register - scalar_t p_out_thread[out_nkhw_thread_desc.GetElementSpace()]; + out_scalar_t p_out_thread[out_nkhw_thread_desc.GetElementSpace()]; // divide block work constexpr unsigned NBlockWork = @@ -155,7 +159,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( constexpr auto blockwise_in_copy = Blockwise4dTensorCopy1{}; #endif +#if 1 // debug // set threadwise output tensor to 0 threadwise_4d_tensor_set_zero(out_nkhw_thread_desc, p_out_thread); +#endif for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, __syncthreads()) diff --git a/src/include/threadwise_4d_tensor_op.hip.hpp b/src/include/threadwise_4d_tensor_op.hip.hpp index 3d13ae2aa6..5b908d3ac6 100644 --- a/src/include/threadwise_4d_tensor_op.hip.hpp +++ b/src/include/threadwise_4d_tensor_op.hip.hpp @@ -37,7 +37,8 @@ __device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, Float* __re // TODO: in order to optimize mem access for different mem type, // need to write specialized version -template __device__ void 
threadwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src( SrcDesc, - const Float* __restrict__ p_src, + const SrcData* __restrict__ p_src, DstDesc, - Float* __restrict__ p_dst, + DstData* __restrict__ p_dst, SrcOpLengths, DstFromSrcReorder, F f) @@ -88,33 +89,38 @@ __device__ void threadwise_4d_tensor_pointwise_operation_binary_reorder_by_get_d } } -template -__device__ void threadwise_4d_tensor_set_zero(Desc, Float* __restrict__ p) +template +__device__ void threadwise_4d_tensor_set_zero(Desc, Data* __restrict__ p) { - auto f_set_zero = [](Float& v) { v = Float(0); }; + auto f_set_zero = [](Data& v) { v = Data(0); }; - threadwise_4d_tensor_pointwise_operation_unary( + threadwise_4d_tensor_pointwise_operation_unary( Desc{}, p, f_set_zero); } -template +template __device__ void threadwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc, - const Float* __restrict__ p_src, + const SrcData* __restrict__ p_src, DstDesc, - Float* __restrict__ p_dst, + DstData* __restrict__ p_dst, SrcOpLengths, DstFromSrcReorder) { - auto f_copy = [](const Float& src, Float& dst) { dst = src; }; + auto f_copy = [](const SrcData& src, DstData& dst) { dst = static_cast(src); }; threadwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src( SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, DstFromSrcReorder{}, f_copy); } -template +template __device__ void threadwise_4d_tensor_copy( - SrcDesc, const Float* __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, SrcOpLengths) + SrcDesc, const SrcData* __restrict__ p_src, DstDesc, DstData* __restrict__ p_dst, SrcOpLengths) { auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{}; diff --git a/src/include/threadwise_direct_convolution.hip.hpp b/src/include/threadwise_direct_convolution.hip.hpp index 79677b0f9b..b9a509d6a0 100644 --- a/src/include/threadwise_direct_convolution.hip.hpp +++ b/src/include/threadwise_direct_convolution.hip.hpp @@ -51,10 +51,8 @@ __device__ void 
threadwise_direct_convolution_1(InDesc, const unsigned out_index = out_desc.Get1dIndex(n, k, ho, wo); - fused_multiply_add(p_out[out_index], - p_wei[wei_index], - p_in[in_index], - p_out[out_index]); + fused_multiply_accumulate( + p_out[out_index], p_wei[wei_index], p_in[in_index]); } } } From 02d72160dc22e364d1ceb65ca4f59892c097946a Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 19 Mar 2019 01:30:28 -0500 Subject: [PATCH 10/13] adding int8 direct that reads pre-vectorized data --- ...onvolution_2_vectorized_nchw_kcyx_nkhw.hpp | 28 +++++----- src/include/data_type.hip.hpp | 56 +++++++++---------- 2 files changed, 41 insertions(+), 43 deletions(-) diff --git a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp index 5c687f4206..b25459c30d 100644 --- a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp +++ b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp @@ -133,14 +133,14 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, // 3x3, 34x34, 128 thread, int8, vector = 4 constexpr unsigned NPerBlock = 2; constexpr unsigned KPerBlock = 32; - constexpr unsigned CPerBlock = 4; - constexpr unsigned HoPerBlock = 2; + constexpr unsigned CPerBlock = 8; + constexpr unsigned HoPerBlock = 4; constexpr unsigned WoPerBlock = 32; - constexpr unsigned NPerThread = 2; - constexpr unsigned KPerThread = 4; - constexpr unsigned CPerThread = 1; - constexpr unsigned HoPerThread = 2; + constexpr unsigned NPerThread = 1; + constexpr unsigned KPerThread = 8; + constexpr unsigned CPerThread = 2; + constexpr unsigned HoPerThread = 4; constexpr unsigned WoPerThread = 2; constexpr unsigned InBlockCopyDataPerRead = 2; @@ -149,16 +149,16 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, constexpr unsigned BlockSize = 128; #elif 1 // 1x1, 32x32, 128 thread, int8, vector = 4 - constexpr unsigned NPerBlock = 2; - constexpr unsigned KPerBlock = 32; - 
constexpr unsigned CPerBlock = 4; - constexpr unsigned HoPerBlock = 2; + constexpr unsigned NPerBlock = 1; + constexpr unsigned KPerBlock = 64; + constexpr unsigned CPerBlock = 16; + constexpr unsigned HoPerBlock = 4; constexpr unsigned WoPerBlock = 32; - constexpr unsigned NPerThread = 2; - constexpr unsigned KPerThread = 4; - constexpr unsigned CPerThread = 1; - constexpr unsigned HoPerThread = 2; + constexpr unsigned NPerThread = 1; + constexpr unsigned KPerThread = 8; + constexpr unsigned CPerThread = 2; + constexpr unsigned HoPerThread = 4; constexpr unsigned WoPerThread = 2; constexpr unsigned InBlockCopyDataPerRead = 2; diff --git a/src/include/data_type.hip.hpp b/src/include/data_type.hip.hpp index bb60adf154..c32e93b6ef 100644 --- a/src/include/data_type.hip.hpp +++ b/src/include/data_type.hip.hpp @@ -120,7 +120,7 @@ struct vector_type template <> struct vector_type { - using MemoryType = char2; + using MemoryType = int16_t; __host__ __device__ static MemoryType Pack(char s0, char s1) { @@ -139,7 +139,7 @@ struct vector_type template <> struct vector_type { - using MemoryType = char4; + using MemoryType = int32_t; __host__ __device__ static MemoryType Pack(char s0, char s1, char s2, char s3) { @@ -163,6 +163,13 @@ struct vector_type using MemoryType = int64_t; }; +template <> +struct vector_type +{ + using MemoryType = int64_t; +}; + +#if 0 template <> struct vector_type { @@ -175,34 +182,30 @@ struct vector_type using MemoryType = int64_t; }; +template <> +struct vector_type +{ + using MemoryType = int; +}; + template <> struct vector_type { using MemoryType = int64_t; }; +#endif -template -__device__ void fused_multiply_accumulate(TDst& d, const TSrc0& s0, const TSrc1& s1) -{ - // static_assert(false, "should not call into base"); - printf("should not call into base"); - assert(false); -} - -template <> __device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1) { d += s0 * s1; } -template <> __device__ void 
fused_multiply_accumulate(float& d, const float2& s0, const float2& s1) { d += s0.x * s1.x; d += s0.y * s1.y; } -template <> __device__ void fused_multiply_accumulate(float& d, const float4& s0, const float4& s1) { d += s0.x * s1.x; @@ -211,13 +214,8 @@ __device__ void fused_multiply_accumulate(float& d, const float4& s0, const floa d += s0.w * s1.w; } -template <> -__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) -{ - d += s0 * s1; -} +__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; } -template <> __device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1) { d += s0.x * s1.x; @@ -225,25 +223,25 @@ __device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& } #if 0 -template <> __device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1) { d += s0.x * s1.x + s0.y * s1.y; } #endif -template <> -__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) -{ - d += s0 * s1; -} +__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) { d += s0 * s1; } -template <> -__device__ void fused_multiply_accumulate(int32_t& d, const char4& s0, const char4& s1) +// TODO:: this interface is misleading, int32 is actually int8x4 +// need to make a better interface +__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1) { #if DEVICE_BACKEND_CUDA +#if 1 // debug d = __dp4a(s0, s1, d); -#else - d += s0.x * s1.x + s0.y * s1.y + s0.z * s1.z + s0.w * s1.w; +#elif 1 + asm volatile("dp4a.s32.s32 %0, %1, %2, %3;" : "=r"(d) : "r"(s0), "r"(s1), "r"(d)); +#elif 0 // this is wrong! 
just for debugging + d += (*reinterpret_cast(&s0)) * (*reinterpret_cast(&s1)); +#endif #endif } From e72eece8fcf79d1d3a958089fca1f02bfb71b777 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 21 Mar 2019 09:59:40 -0500 Subject: [PATCH 11/13] added int8x4 --- driver/driver.hip.cpp | 2 +- src/include/data_type.hip.hpp | 8 +------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index 17b333c69a..b1df58265e 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -617,7 +617,7 @@ int main(int argc, char* argv[]) #if 0 in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread); wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread); -#elif 1 +#elif 0 in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); #elif 1 diff --git a/src/include/data_type.hip.hpp b/src/include/data_type.hip.hpp index c32e93b6ef..ca1f4dcbae 100644 --- a/src/include/data_type.hip.hpp +++ b/src/include/data_type.hip.hpp @@ -231,17 +231,11 @@ __device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2 __device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) { d += s0 * s1; } -// TODO:: this interface is misleading, int32 is actually int8x4 +// TODO:: this interface is misleading, s0, s1 are actually int8x4 // need to make a better interface __device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1) { #if DEVICE_BACKEND_CUDA -#if 1 // debug d = __dp4a(s0, s1, d); -#elif 1 - asm volatile("dp4a.s32.s32 %0, %1, %2, %3;" : "=r"(d) : "r"(s0), "r"(s1), "r"(d)); -#elif 0 // this is wrong! 
just for debugging - d += (*reinterpret_cast(&s0)) * (*reinterpret_cast(&s1)); -#endif #endif } From 8c923db423ab4ca0a7ac10310cff3528b38bb520 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 22 Mar 2019 14:22:58 -0500 Subject: [PATCH 12/13] hip build --- ...onvolution_2_vectorized_nchw_kcyx_nkhw.hpp | 30 ++++++------- ...icit_gemm_convolution_1_chwn_cyxk_khwn.hpp | 37 +++++++++++++++- driver/driver.hip.cpp | 32 +++++++++----- src/include/Array.hip.hpp | 2 +- src/include/blockwise_4d_tensor_op.hip.hpp | 9 ++-- .../blockwise_direct_convolution.hip.hpp | 27 ++++++------ src/include/blockwise_gemm.hip.hpp | 29 +++++++------ src/include/common.hip.hpp | 2 + src/include/config.h.in | 2 + src/include/data_type.hip.hpp | 33 ++------------ .../gridwise_direct_convolution_1.hip.hpp | 19 ++++---- ...irect_convolution_2_nchw_kcyx_nkhw.hip.hpp | 43 +++++++++++-------- ...lution_2_vectorized_nchw_kcyx_nkhw.hip.hpp | 34 ++++++++------- ..._gemm_convolution_1_chwn_cyxk_khwn.hip.hpp | 28 ++++++------ ...onvolution_1_chwn_cyxk_khwn_padded.hip.hpp | 9 ++-- ..._gemm_convolution_2_chwn_cyxk_khwn.hip.hpp | 4 +- ...2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp | 4 +- src/include/tensor.hpp | 3 +- src/include/threadwise_nd_tensor_op.hip.hpp | 4 +- 19 files changed, 196 insertions(+), 155 deletions(-) diff --git a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp index b25459c30d..d8bb3b768e 100644 --- a/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp +++ b/driver/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp @@ -51,7 +51,7 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w)); #elif 1 - in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 4 * c, h, w), + in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 4 * c, h, w), in_nchw(n, 4 * c + 1, h, w), in_nchw(n, 4 * c + 2, h, 
w), in_nchw(n, 4 * c + 3, h, w)); @@ -113,37 +113,37 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, constexpr unsigned BlockSize = 128; #elif 0 // 3x3, 34x34, 128 thread, fp32, vector = 2 - constexpr unsigned NPerBlock = 2; - constexpr unsigned KPerBlock = 32; - constexpr unsigned CPerBlock = 2; + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 2; constexpr unsigned HoPerBlock = 2; constexpr unsigned WoPerBlock = 32; - constexpr unsigned NPerThread = 2; - constexpr unsigned KPerThread = 4; - constexpr unsigned CPerThread = 1; + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 1; constexpr unsigned HoPerThread = 2; constexpr unsigned WoPerThread = 2; - constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned InBlockCopyDataPerRead = 2; constexpr unsigned WeiBlockCopyDataPerRead = 2; constexpr unsigned BlockSize = 128; #elif 0 // 3x3, 34x34, 128 thread, int8, vector = 4 - constexpr unsigned NPerBlock = 2; - constexpr unsigned KPerBlock = 32; - constexpr unsigned CPerBlock = 8; + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 8; constexpr unsigned HoPerBlock = 4; constexpr unsigned WoPerBlock = 32; - constexpr unsigned NPerThread = 1; - constexpr unsigned KPerThread = 8; - constexpr unsigned CPerThread = 2; + constexpr unsigned NPerThread = 1; + constexpr unsigned KPerThread = 8; + constexpr unsigned CPerThread = 2; constexpr unsigned HoPerThread = 4; constexpr unsigned WoPerThread = 2; - constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned InBlockCopyDataPerRead = 2; constexpr unsigned WeiBlockCopyDataPerRead = 2; constexpr unsigned BlockSize = 128; diff --git a/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp b/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp index 2c27080670..99a6eb45c1 100644 --- 
a/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp +++ b/driver/device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp @@ -74,7 +74,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc, wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data()); out_khwn_device_buf.ToDevice(out_khwn.mData.data()); -#if 1 +#if 0 // for 3x3, 34x34 constexpr unsigned NPerBlock = 16; constexpr unsigned KPerBlock = 64; @@ -213,7 +213,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc, constexpr unsigned WoPerThread = 1; constexpr unsigned BlockSize = 128; -#elif 1 +#elif 0 // for 1x1, 28x28 constexpr unsigned NPerBlock = 16; constexpr unsigned KPerBlock = 128; @@ -245,6 +245,39 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc, constexpr unsigned OutThreadCopyDataPerWrite = 2; + constexpr unsigned BlockSize = 128; +#elif 1 + // for 1x1, 14x14 + constexpr unsigned NPerBlock = 16; + constexpr unsigned KPerBlock = 128; + constexpr unsigned CPerBlock = 8; + constexpr unsigned HoPerBlock = 2; + constexpr unsigned WoPerBlock = 2; + + constexpr unsigned NPerThread = 4; + constexpr unsigned KPerThread = 16; + constexpr unsigned CPerThread = 1; + constexpr unsigned HoPerThread = 1; + constexpr unsigned WoPerThread = 1; + + constexpr unsigned InBlockCopy_ThreadPerDimC = 8; + constexpr unsigned InBlockCopy_ThreadPerDimH = 2; + constexpr unsigned InBlockCopy_ThreadPerDimW = 2; + constexpr unsigned InBlockCopy_ThreadPerDimN = 4; + constexpr unsigned InBlockCopyDataPerRead = 4; + + constexpr unsigned WeiBlockCopyDataPerRead = 4; + + constexpr unsigned GemmMPerThreadSubC = 4; + constexpr unsigned GemmNPerThreadSubC = 4; + constexpr unsigned GemmMLevel0Cluster = 4; + constexpr unsigned GemmNLevel0Cluster = 2; + constexpr unsigned GemmMLevel1Cluster = 2; + constexpr unsigned GemmNLevel1Cluster = 4; + constexpr unsigned GemmKPerThreadLoop = 1; + + constexpr unsigned OutThreadCopyDataPerWrite = 2; + constexpr unsigned BlockSize = 128; #endif diff --git 
a/driver/driver.hip.cpp b/driver/driver.hip.cpp index b1df58265e..d391ec5b5f 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -8,11 +8,11 @@ #include "ConstantTensorDescriptor.hip.hpp" #include "conv_common.hip.hpp" //#include "device_direct_convolution_1.hpp" -#include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp" -#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp" -//#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp" +//#include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp" +//#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp" +#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp" //#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp" -//#include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp" +#include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp" struct GeneratorTensor_1 { @@ -353,7 +353,7 @@ void host_winograd_3x3_convolution(const Tensor& in_nchw, std::size_t ho = HoPerTile * htile + j; for(int i = 0; i < WoPerTile; ++i) { - std::size_t wo = WoPerTile * wtile + i; + std::size_t wo = WoPerTile * wtile + i; out_nkhw(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i); } } @@ -568,7 +568,7 @@ int main(int argc, char* argv[]) constexpr unsigned HPad = 2; constexpr unsigned WPad = 2; -#elif 1 +#elif 0 // 1x1 filter, 32x32 image constexpr unsigned N = 64; constexpr unsigned C = 256; @@ -578,6 +578,18 @@ int main(int argc, char* argv[]) constexpr unsigned Y = 1; constexpr unsigned X = 1; + constexpr unsigned HPad = 0; + constexpr unsigned WPad = 0; +#elif 1 + // 1x1 filter, 14x14 image + constexpr unsigned N = 128; + constexpr unsigned C = 2048; + constexpr unsigned HI = 14; + constexpr unsigned WI = 14; + constexpr unsigned K = 512; + constexpr unsigned Y = 1; + constexpr unsigned X = 1; + constexpr unsigned HPad = 0; constexpr unsigned WPad = 0; #endif @@ -594,8 +606,8 @@ int main(int argc, char* argv[]) 
ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: "); ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: "); - using in_data_t = char; - using out_data_t = int32_t; + using in_data_t = float; + using out_data_t = float; Tensor in_nchw(make_TensorDescriptor(in_nchw_desc)); Tensor wei_kcyx(make_TensorDescriptor(wei_kcyx_desc)); Tensor out_nkhw_host(make_TensorDescriptor(out_nkhw_desc)); @@ -635,9 +647,9 @@ int main(int argc, char* argv[]) device_direct_convolution_1 #elif 0 device_direct_convolution_2_nchw_kcyx_nkhw -#elif 1 - device_direct_convolution_2_vectorized_nchw_kcyx_nkhw #elif 0 + device_direct_convolution_2_vectorized_nchw_kcyx_nkhw +#elif 1 device_implicit_gemm_convolution_1_chwn_cyxk_khwn #elif 0 device_implicit_gemm_convolution_2_chwn_cyxk_khwn diff --git a/src/include/Array.hip.hpp b/src/include/Array.hip.hpp index 1caab6a4c9..89654cbc2b 100644 --- a/src/include/Array.hip.hpp +++ b/src/include/Array.hip.hpp @@ -10,7 +10,7 @@ struct Array unsigned mData[nSize]; template - __host__ __device__ Array(Xs... xs) : mData({static_cast(xs)...}) + __host__ __device__ Array(Xs... 
xs) : mData{static_cast(xs)...} { } diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp index fa5f36be51..0660c34ebb 100644 --- a/src/include/blockwise_4d_tensor_op.hip.hpp +++ b/src/include/blockwise_4d_tensor_op.hip.hpp @@ -340,10 +340,11 @@ struct BlockwiseChwnTensorCopyPadded constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize; const Float* p_src_tmp = - p_src + src_desc.Get1dIndex(c_block_data_begin, - (ho_block_data_begin + h_block_pad_low) - h_global_pad_low, - (wo_block_data_begin + w_block_pad_low) - w_global_pad_low, - n_block_data_begin); + p_src + + src_desc.Get1dIndex(c_block_data_begin, + (ho_block_data_begin + h_block_pad_low) - h_global_pad_low, + (wo_block_data_begin + w_block_pad_low) - w_global_pad_low, + n_block_data_begin); #if 0 if(get_thread_local_1d_id() == 0) diff --git a/src/include/blockwise_direct_convolution.hip.hpp b/src/include/blockwise_direct_convolution.hip.hpp index 247ff219f6..7666607c9c 100644 --- a/src/include/blockwise_direct_convolution.hip.hpp +++ b/src/include/blockwise_direct_convolution.hip.hpp @@ -93,10 +93,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc, Float p_out_thread[out_thread_desc.GetElementSpace()]; threadwise_4d_tensor_copy(out_block_desc, - p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin, - k_thread_data_begin, - ho_thread_data_begin, - wo_thread_data_begin), + p_out_block + + out_block_desc.Get1dIndex(n_thread_data_begin, + k_thread_data_begin, + ho_thread_data_begin, + wo_thread_data_begin), out_thread_desc, p_out_thread, out_thread_desc.GetLengths()); @@ -107,10 +108,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc, // threadwise convolution threadwise_direct_convolution_2( in_thread_block_desc, - p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data_begin, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + + in_block_desc.Get1dIndex(n_thread_data_begin, + 
c_thread_data_begin, + hi_thread_data_begin, + wi_thread_data_begin), wei_thread_block_desc, p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data_begin, 0, 0), @@ -122,10 +124,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc, threadwise_4d_tensor_copy(out_thread_desc, p_out_thread, out_block_desc, - p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin, - k_thread_data_begin, - ho_thread_data_begin, - wo_thread_data_begin), + p_out_block + + out_block_desc.Get1dIndex(n_thread_data_begin, + k_thread_data_begin, + ho_thread_data_begin, + wo_thread_data_begin), out_thread_desc.GetLengths()); } } diff --git a/src/include/blockwise_gemm.hip.hpp b/src/include/blockwise_gemm.hip.hpp index 33556dde25..221a7153a2 100644 --- a/src/include/blockwise_gemm.hip.hpp +++ b/src/include/blockwise_gemm.hip.hpp @@ -431,12 +431,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 constexpr unsigned MRepeat = MPerThread / MPerThreadSubC; constexpr unsigned NRepeat = NPerThread / NPerThreadSubC; - // loop over k +// loop over k #pragma unroll for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop) { - // read first batch of A, B - // copy A-sub to form A +// read first batch of A, B +// copy A-sub to form A #pragma unroll for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat) { @@ -449,7 +449,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 a_thread_sub_mtx.GetLengths()); } - // copy B-sub to form B +// copy B-sub to form B #pragma unroll for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat) { @@ -462,7 +462,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 b_thread_sub_mtx.GetLengths()); } - // loop over batch +// loop over batch #pragma unroll for(unsigned ib = 0; ib + 1 < BatchPerThread; ++ib) { @@ -551,14 +551,15 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 c_thread_mtx_begin.batch * BlockMatrixStrideC + 
c_block_mtx.Get1dIndex(c_thread_mtx_begin.row, c_thread_mtx_begin.col); - for(unsigned m_repeat = 0; m_repeat, MRepeat; ++m_repeat) + for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat) { - for(unsigned n_repeat = 0; n_repeat, NRepeat; ++n_repeat) + for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat) { threadwise_matrix_copy( c_thread_sub_mtx, - p_c_thread + c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster, - n_repeat * NPerLevel1Cluster), + p_c_thread + + c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster, + n_repeat * NPerLevel1Cluster), c_block_mtx, p_c_block + c_block_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster, @@ -656,8 +657,9 @@ struct BlockwiseGemmBlockABlockBThreadC constexpr unsigned NClusterWork = (NPerBlock + NPerThread * NThreadPerCluster - 1) / (NPerThread * NThreadPerCluster); - static_assert(BlockSize == (MClusterWork * MThreadPerCluster) * - (NClusterWork * NThreadPerCluster), + static_assert(BlockSize == + (MClusterWork * MThreadPerCluster) * + (NClusterWork * NThreadPerCluster), "wrong! wrong BlockSize"); if(DistributeThreadAlongColumnFirst) @@ -1256,8 +1258,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC), c_thread_sub_mtx, False, - p_c_thread + c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC, - n_repeat * NPerThreadSubC), + p_c_thread + + c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC, + n_repeat * NPerThreadSubC), f_accum); } } diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp index d5832dde9d..ba0a521fb3 100644 --- a/src/include/common.hip.hpp +++ b/src/include/common.hip.hpp @@ -21,6 +21,7 @@ struct is_same static const bool value = true; }; +#if 0 template __host__ __device__ constexpr T max(T a, T b) { @@ -32,6 +33,7 @@ __host__ __device__ constexpr T min(T a, T b) { return a < b ? 
a : b; } +#endif __host__ __device__ constexpr unsigned integer_divide_ceil(unsigned a, unsigned b) { diff --git a/src/include/config.h.in b/src/include/config.h.in index 7b888c6951..bb4f6cb51d 100644 --- a/src/include/config.h.in +++ b/src/include/config.h.in @@ -4,8 +4,10 @@ #if DEVICE_BACKEND_HIP #include "hip/hip_runtime.h" +#include "hip/hip_fp16.h" #elif DEVICE_BACKEND_CUDA #include "cuda_runtime.h" +#include "cuda_fp16.h" #include "nvToolsExt.h" #include "helper_cuda.h" #endif diff --git a/src/include/data_type.hip.hpp b/src/include/data_type.hip.hpp index ca1f4dcbae..95d5b0b33f 100644 --- a/src/include/data_type.hip.hpp +++ b/src/include/data_type.hip.hpp @@ -1,15 +1,6 @@ #pragma once #include "config.h" -#if DEVICE_BACKEND_CUDA -namespace CUDA { -#include "cuda_fp16.h" -} -#endif - -using half = CUDA::half; -using half2 = CUDA::half2; - template struct vector_type { @@ -52,6 +43,7 @@ struct vector_type using MemoryType = float4; }; +#if 0 template <> struct vector_type { @@ -91,24 +83,6 @@ struct vector_type using MemoryType = float4; }; -template <> -struct vector_type -{ - using MemoryType = half2; -}; - -template <> -struct vector_type -{ - using MemoryType = float2; -}; - -template <> -struct vector_type -{ - using MemoryType = float4; -}; - template <> struct vector_type { @@ -169,7 +143,6 @@ struct vector_type using MemoryType = int64_t; }; -#if 0 template <> struct vector_type { @@ -214,6 +187,7 @@ __device__ void fused_multiply_accumulate(float& d, const float4& s0, const floa d += s0.w * s1.w; } +#if 0 __device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; } __device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1) @@ -222,12 +196,10 @@ __device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& d += s0.y * s1.y; } -#if 0 __device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1) { d += s0.x * s1.x + s0.y * s1.y; } -#endif 
__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) { d += s0 * s1; } @@ -239,3 +211,4 @@ __device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const i d = __dp4a(s0, s1, d); #endif } +#endif diff --git a/src/include/gridwise_direct_convolution_1.hip.hpp b/src/include/gridwise_direct_convolution_1.hip.hpp index f4fe1809fc..edcfd6d38e 100644 --- a/src/include/gridwise_direct_convolution_1.hip.hpp +++ b/src/include/gridwise_direct_convolution_1.hip.hpp @@ -113,10 +113,11 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_ c_block_work_begin += CPerBlock) { // copy input tensor to LDS - blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_work_begin, - c_block_work_begin, - hi_block_work_begin, - wi_block_work_begin), + blockwise_in_copy.Run(p_in_global + + in_global_desc.Get1dIndex(n_block_work_begin, + c_block_work_begin, + hi_block_work_begin, + wi_block_work_begin), p_in_block); // copy weight tensor to LDS @@ -143,9 +144,9 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_ } // copy output tensor from LDS to device mem - blockwise_out_copy.Run(p_out_block, - p_out_global + out_global_desc.Get1dIndex(n_block_work_begin, - k_block_work_begin, - ho_block_work_begin, - wo_block_work_begin)); + blockwise_out_copy.Run( + p_out_block, + p_out_global + + out_global_desc.Get1dIndex( + n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin)); } diff --git a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp index 5761a22c16..1e6d3d24bd 100644 --- a/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_nchw_kcyx_nkhw.hip.hpp @@ -176,16 +176,18 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i c_block_data_begin += CPerBlock, __syncthreads()) { // 
copy input tensor to LDS - blockwise_in_copy.Run(p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin, - c_block_data_begin, - hi_block_data_begin, - wi_block_data_begin), + blockwise_in_copy.Run(p_in_global + + in_nchw_global_desc.Get1dIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), p_in_block); // copy weight tensor to LDS - blockwise_wei_copy.Run(p_wei_global + wei_kcyx_global_desc.Get1dIndex( - k_block_data_begin, c_block_data_begin, 0, 0), - p_wei_block); + blockwise_wei_copy.Run( + p_wei_global + + wei_kcyx_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), + p_wei_block); __syncthreads(); @@ -195,10 +197,11 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i #if 1 threadwise_direct_convolution_2( in_nchw_thread_block_desc, - p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_thread_block_desc, p_wei_block + wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), @@ -207,10 +210,11 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i #elif 0 threadwise_direct_convolution_3( in_nchw_thread_block_desc, - p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_block + + in_nchw_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_thread_block_desc, p_wei_block + wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), @@ -225,9 +229,10 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i out_nkhw_thread_desc, p_out_thread, out_nkhw_global_desc, - p_out_global + 
out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), + p_out_global + + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), out_nkhw_thread_desc.GetLengths()); } diff --git a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp index 7ae594788b..4d72368b29 100644 --- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp @@ -200,9 +200,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( p_in_vec_block); // copy weight tensor to LDS - blockwise_wei_copy.Run(p_wei_vec_global + wei_kcyx_vec_global_desc.Get1dIndex( - k_block_data_begin, c_block_data_begin, 0, 0), - p_wei_vec_block); + blockwise_wei_copy.Run( + p_wei_vec_global + + wei_kcyx_vec_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), + p_wei_vec_block); __syncthreads(); @@ -212,10 +213,11 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( #if 1 threadwise_direct_convolution_2( in_nchw_vec_thread_block_desc, - p_in_vec_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_vec_block + + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_vec_thread_block_desc, p_wei_vec_block + wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), @@ -224,10 +226,11 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( #elif 0 
threadwise_direct_convolution_3( in_nchw_vec_thread_block_desc, - p_in_vec_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + p_in_vec_block + + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_vec_thread_block_desc, p_wei_vec_block + wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), @@ -242,9 +245,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( out_nkhw_thread_desc, p_out_thread, out_nkhw_global_desc, - p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), + p_out_global + + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), out_nkhw_thread_desc.GetLengths()); } diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp index 1caef669e9..99342d3ca1 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn.hip.hpp @@ -184,8 +184,9 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric threadwise_4d_tensor_set_zero(out_khwn_thread_desc, p_out_thread); const Float* p_in_global_block_begin = - p_in_global + in_chwn_global_desc.Get1dIndex( - 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); + p_in_global + + in_chwn_global_desc.Get1dIndex( + 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); const Float* p_wei_global_block_begin = p_wei_global + 
wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); @@ -216,7 +217,7 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric } } - // output: register to global mem, +// output: register to global mem, #if 0 const auto c_thread_mtx_begin = blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); @@ -286,16 +287,17 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric } #endif - threadwise_8d_tensor_copy(out_8d_thread_desc, - p_out_thread, - out_8d_global_desc, - p_out_global + out_khwn_global_desc.Get1dIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_8d_thread_desc.GetLengths(), - Number{}); + threadwise_8d_tensor_copy( + out_8d_thread_desc, + p_out_thread, + out_8d_global_desc, + p_out_global + + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_8d_thread_desc.GetLengths(), + Number{}); } else if(NPerThread == NPerBlock) { diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp index a4904cdf58..790a006023 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp +++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp @@ -283,10 +283,11 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded( out_hkwn_thread_desc, p_out_thread, out_khwn_global_desc, - p_out_global + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), + 
p_out_global + + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), out_hkwn_thread_desc.GetLengths(), reorder_khwn_from_hkwn); } diff --git a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp index afa3d3ee90..f68b57b6a0 100644 --- a/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn.hip.hpp @@ -121,7 +121,7 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric decltype(in_cb_block_desc), decltype(in_cb_block_desc.GetLengths())>{}; #elif 0 - const auto blockwise_in_copy = Blockwise2dTensorCopy2{}; #elif 1 - const auto blockwise_in_copy = Blockwise2dTensorCopy3{}; #elif 0 - const auto blockwise_in_copy = Blockwise2dTensorCopy2{}; #elif 1 - const auto blockwise_in_copy = Blockwise2dTensorCopy3) { - using vector_t = typename vector_type::type; + using vector_t = typename vector_type::MemoryType; static_assert(SrcDesc{}.GetDimension() == 6 && DstDesc{}.GetDimension() == 6 && SrcOpLengths::nDim == 6, @@ -80,7 +80,7 @@ __device__ void threadwise_8d_tensor_copy(SrcDesc, SrcOpLengths, Number) { - using vector_t = typename vector_type::type; + using vector_t = typename vector_type::MemoryType; static_assert(SrcDesc{}.GetDimension() == 8 && DstDesc{}.GetDimension() == 8 && SrcOpLengths::nDim == 8, From 18a81e356eef305b244db0e7c46772925a540a44 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 22 Mar 2019 16:33:04 -0500 Subject: [PATCH 13/13] adding assembly --- src/include/blockwise_gemm.hip.hpp | 113 ++++++++++++++++++++++++++--- 1 file changed, 102 insertions(+), 11 deletions(-) diff --git a/src/include/blockwise_gemm.hip.hpp b/src/include/blockwise_gemm.hip.hpp index 221a7153a2..f80c49a029 100644 --- 
a/src/include/blockwise_gemm.hip.hpp +++ b/src/include/blockwise_gemm.hip.hpp @@ -435,11 +435,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 #pragma unroll for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop) { -// read first batch of A, B -// copy A-sub to form A -#pragma unroll + // read first batch of A, B + // copy A-sub to form A + //#pragma unroll for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat) { +#if 0 threadwise_matrix_copy( a_block_mtx, p_a_block + a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) + @@ -447,12 +448,25 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 a_thread_mtx, p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC), a_thread_sub_mtx.GetLengths()); +#else + for(unsigned i = 0; i < a_thread_mtx.NRow(); ++i) + { + for(unsigned j = 0; j < a_thread_mtx.NCol(); ++j) + { + p_a_thread[a_thread_mtx.Get1dIndex(i, m_repeat * MPerThreadSubC + j)] = + p_a_block[a_block_mtx.Get1dIndex(k_begin + i, + m_repeat * MPerLevel1Cluster + j) + + mMyThreadOffsetA]; + } + } +#endif } -// copy B-sub to form B -#pragma unroll + // copy B-sub to form B + //#pragma unroll for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat) { +#if 0 threadwise_matrix_copy( b_block_mtx, p_b_block + b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) + @@ -460,13 +474,26 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 b_thread_mtx, p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC), b_thread_sub_mtx.GetLengths()); +#else + for(unsigned i = 0; i < b_thread_mtx.NRow(); ++i) + { + for(unsigned j = 0; j < b_thread_mtx.NCol(); ++j) + { + p_b_thread[b_thread_mtx.Get1dIndex(i, n_repeat * NPerThreadSubC + j)] = + p_b_block[b_block_mtx.Get1dIndex(k_begin + i, + n_repeat * NPerLevel1Cluster + j) + + mMyThreadOffsetB]; + } + } +#endif } -// loop over batch -#pragma unroll + // loop over batch + //#pragma unroll for(unsigned ib = 0; ib + 1 
< BatchPerThread; ++ib) { - // do current batch of gemm +// do current batch of gemm +#if 0 threadwise_gemm(a_thread_mtx, True, p_a_thread, @@ -477,13 +504,32 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 False, p_c_thread + ib * ThreadMatrixStrideC, f_accum); +#else + for(unsigned k = 0; k < a_thread_mtx.NRow(); ++k) + { + for(unsigned i = 0; i < c_thread_mtx.NRow(); ++i) + { + for(unsigned j = 0; j < c_thread_mtx.NCol(); ++j) + { + const unsigned aindex = + a_thread_mtx.Get1dIndex(k, i); // A is transposed + const unsigned bindex = b_thread_mtx.Get1dIndex(k, j); + const unsigned cindex = + c_thread_mtx.Get1dIndex(i, j) + ib * ThreadMatrixStrideC; + + f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]); + } + } + } +#endif // read next batch of a, b if(BlockMatrixStrideA != 0) { -#pragma unroll + //#pragma unroll for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat) { +#if 0 threadwise_matrix_copy( a_block_mtx, p_a_block + @@ -492,14 +538,28 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 a_thread_mtx, p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC), a_thread_sub_mtx.GetLengths()); +#else + for(unsigned i = 0; i < a_thread_mtx.NRow(); ++i) + { + for(unsigned j = 0; j < a_thread_mtx.NCol(); ++j) + { + p_a_thread[a_thread_mtx.Get1dIndex(i, + m_repeat * MPerThreadSubC + j)] = + p_a_block[a_block_mtx.Get1dIndex( + k_begin + i, m_repeat * MPerLevel1Cluster + j) + + (ib + 1) * BlockMatrixStrideA + mMyThreadOffsetA]; + } + } +#endif } } if(BlockMatrixStrideB != 0) { -#pragma unroll + //#pragma unroll for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat) { +#if 0 threadwise_matrix_copy( b_block_mtx, p_b_block + @@ -508,11 +568,25 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 b_thread_mtx, p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC), b_thread_sub_mtx.GetLengths()); +#else + for(unsigned i = 0; i < b_thread_mtx.NRow(); ++i) + { + 
for(unsigned j = 0; j < b_thread_mtx.NCol(); ++j) + { + p_b_thread[b_thread_mtx.Get1dIndex(i, + n_repeat * NPerThreadSubC + j)] = + p_b_block[b_block_mtx.Get1dIndex( + k_begin + i, n_repeat * NPerLevel1Cluster + j) + + (ib + 1) * BlockMatrixStrideB + mMyThreadOffsetB]; + } + } +#endif } } } - // do last batch of gemm +// do last batch of gemm +#if 0 threadwise_gemm(a_thread_mtx, True, p_a_thread, @@ -523,6 +597,23 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 False, p_c_thread + (BatchPerThread - 1) * ThreadMatrixStrideC, f_accum); +#else + for(unsigned k = 0; k < a_thread_mtx.NRow(); ++k) + { + for(unsigned i = 0; i < c_thread_mtx.NRow(); ++i) + { + for(unsigned j = 0; j < c_thread_mtx.NCol(); ++j) + { + const unsigned aindex = a_thread_mtx.Get1dIndex(k, i); // A is transposed + const unsigned bindex = b_thread_mtx.Get1dIndex(k, j); + const unsigned cindex = + c_thread_mtx.Get1dIndex(i, j) + (BatchPerThread - 1) * ThreadMatrixStrideC; + + f_accum(p_c_thread[cindex], p_a_thread[aindex] * p_b_thread[bindex]); + } + } + } +#endif } }