reorginzed files

[ROCm/composable_kernel commit: 1566b31736]
2026-05-16 02:54:21 +00:00 · 2019-06-13 15:12:12 -05:00
parent 11c6b2ab9a
commit 5f217ebda5
64 changed files with 254 additions and 218 deletions
--- a/driver/include/conv_common.hpp
+++ b/driver/include/conv_common.hpp
@@ -0,0 +1,125 @@
+#ifndef CK_CONV_COMMON_HPP
+#define CK_CONV_COMMON_HPP
+
+#include "ConstantTensorDescriptor.hpp"
+
+using namespace ck;
+
+// Builds the packed output descriptor of a 4-D convolution without padding.
+//
+// InDesc  : input descriptor; dimension 0 is read as N, 2 as Hi, 3 as Wi.
+// WeiDesc : weight descriptor; dimension 0 is read as K, 2 as Y, 3 as X.
+//           Dimension 1 of both descriptors must match (checked at compile time).
+// Returns : make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{}) with
+//           Ho = Hi + 1 - Y and Wo = Wi + 1 - X.
+//
+// Only 4-D descriptors are supported; this is enforced with static_assert.
+template <class InDesc, class WeiDesc>
+constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDesc)
+{
+    constexpr auto in_desc  = InDesc{};
+    constexpr auto wei_desc = WeiDesc{};
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    static_assert(in_desc.GetNumOfDimension() == 4, "input nDim is not 4");
+    static_assert(wei_desc.GetNumOfDimension() == 4, "weight nDim is not 4");
+    static_assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1),
+                  "input & weight dimension not consistent");
+
+    constexpr auto N  = in_desc.GetLength(I0);
+    constexpr auto HI = in_desc.GetLength(I2);
+    constexpr auto WI = in_desc.GetLength(I3);
+
+    constexpr auto K = wei_desc.GetLength(I0);
+    constexpr auto Y = wei_desc.GetLength(I2);
+    constexpr auto X = wei_desc.GetLength(I3);
+
+    // A filter larger than the input has no valid output position. Reject it here rather than
+    // letting Hi + 1 - Y go out of range and silently produce a nonsensical descriptor.
+    static_assert(Y <= HI && X <= WI, "filter is larger than input");
+
+    constexpr auto HO = HI + 1 - Y;
+    constexpr auto WO = WI + 1 - X;
+
+    return make_ConstantTensorDescriptor_packed(Sequence<N, K, HO, WO>{});
+}
+
+// Builds the packed output descriptor of a 4-D convolution with padding.
+//
+// InDesc    : input descriptor; dimension 0 is read as N, 2 as Hi, 3 as Wi.
+// WeiDesc   : weight descriptor; dimension 0 is read as K, 2 as Y, 3 as X.
+//             Dimension 1 of both descriptors must match (checked at compile time).
+// LowerPads : type whose Get(Number<0>) / Get(Number<1>) give the H / W padding on the low side.
+// UpperPads : same, for the high side.
+// Returns   : make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{}) with
+//             Ho = Hi + HPadLow + HPadUp + 1 - Y and Wo = Wi + WPadLow + WPadUp + 1 - X.
+//
+// Only 4-D descriptors are supported; this is enforced with static_assert.
+template <class InDesc, class WeiDesc, class LowerPads, class UpperPads>
+constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor(InDesc,
+                                                                                WeiDesc,
+                                                                                LowerPads,
+                                                                                UpperPads)
+{
+    constexpr auto in_desc  = InDesc{};
+    constexpr auto wei_desc = WeiDesc{};
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    static_assert(in_desc.GetNumOfDimension() == 4, "input nDim is not 4");
+    static_assert(wei_desc.GetNumOfDimension() == 4, "weight nDim is not 4");
+    static_assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1),
+                  "input & weight dimension not consistent");
+
+    constexpr auto N  = in_desc.GetLength(I0);
+    constexpr auto HI = in_desc.GetLength(I2);
+    constexpr auto WI = in_desc.GetLength(I3);
+
+    constexpr auto K = wei_desc.GetLength(I0);
+    constexpr auto Y = wei_desc.GetLength(I2);
+    constexpr auto X = wei_desc.GetLength(I3);
+
+    constexpr auto HPadLow = LowerPads{}.Get(I0);
+    constexpr auto WPadLow = LowerPads{}.Get(I1);
+
+    constexpr auto HPadUp = UpperPads{}.Get(I0);
+    constexpr auto WPadUp = UpperPads{}.Get(I1);
+
+    // The filter must fit inside the padded input, otherwise the output extents computed below
+    // go out of range and silently produce a nonsensical descriptor.
+    static_assert(Y <= HI + HPadLow + HPadUp && X <= WI + WPadLow + WPadUp,
+                  "filter is larger than padded input");
+
+    constexpr auto HO = HI + HPadLow + HPadUp + 1 - Y;
+    constexpr auto WO = WI + WPadLow + WPadUp + 1 - X;
+
+    return make_ConstantTensorDescriptor_packed(Sequence<N, K, HO, WO>{});
+}
+
+// Operation count of a convolution: 2 * N * K * Ho * Wo * C * Y * X.
+//
+// N, K, Ho, Wo are dimensions 0..3 of OutDesc; C, Y, X are dimensions 1..3 of WeiDesc.
+// InDesc is accepted for a uniform call signature but is not read.
+// The product is accumulated in std::size_t.
+template <class InDesc, class WeiDesc, class OutDesc>
+constexpr std::size_t calculate_convolution_flops(InDesc, WeiDesc, OutDesc)
+{
+    constexpr auto filter = WeiDesc{};
+    constexpr auto output = OutDesc{};
+
+    // extents of the output tensor
+    constexpr index_t batch        = output.GetLength(Number<0>{});
+    constexpr index_t out_channels = output.GetLength(Number<1>{});
+    constexpr index_t out_height   = output.GetLength(Number<2>{});
+    constexpr index_t out_width    = output.GetLength(Number<3>{});
+
+    // extents of the reduction performed for every output element
+    constexpr index_t in_channels   = filter.GetLength(Number<1>{});
+    constexpr index_t filter_height = filter.GetLength(Number<2>{});
+    constexpr index_t filter_width  = filter.GetLength(Number<3>{});
+
+    constexpr std::size_t output_elements =
+        std::size_t(batch) * out_channels * out_height * out_width;
+    constexpr std::size_t reduction_length =
+        std::size_t(in_channels) * filter_height * filter_width;
+
+    // the factor of two is the multiplier applied by the original formula
+    return std::size_t(2) * output_elements * reduction_length;
+}
+
+// Total size in bytes of the three convolution tensors:
+// sizeof(Float) * (input element space + weight element space + output element space).
+//
+// Float   : element type; only its size is used.
+// InDesc, WeiDesc, OutDesc : descriptor types providing a static GetElementSpace().
+//
+// The result depends only on the element spaces, so no individual dimension is read here.
+template <class Float, class InDesc, class WeiDesc, class OutDesc>
+constexpr std::size_t calculate_convolution_memory_size(Float, InDesc, WeiDesc, OutDesc)
+{
+    return sizeof(Float) *
+           (InDesc::GetElementSpace() + WeiDesc::GetElementSpace() + OutDesc::GetElementSpace());
+}
+
+#endif
--- a/driver/include/device.hpp
+++ b/driver/include/device.hpp
@@ -0,0 +1,64 @@
+#ifndef CK_DEVICE_HPP
+#define CK_DEVICE_HPP
+
+#include <cstdio>
+#include <memory>
+#include "config.hpp"
+
+using namespace ck;
+
+// Handle to a device buffer of a fixed byte size. Only declarations live here; the member
+// functions are defined elsewhere.
+//
+// NOTE(review): the struct stores a raw pointer and declares a destructor but does not restrict
+// copying. If the destructor releases mpDeviceBuf, copying a DeviceMem would release it twice;
+// confirm against the implementation.
+struct DeviceMem
+{
+    // construction requires a size (in bytes at the call sites in this driver)
+    DeviceMem(std::size_t mem_size);
+    DeviceMem() = delete;
+    ~DeviceMem();
+
+    // presumably host -> device and device -> host copies of the whole buffer; verify in the
+    // implementation
+    void ToDevice(const void* p);
+    void FromDevice(void* p);
+
+    // pointer handed to kernels at launch
+    void* GetDeviceBuffer();
+
+    void* mpDeviceBuf;
+    std::size_t mMemSize;
+};
+
+// Backend-specific timer state; defined elsewhere so this header stays backend-neutral.
+struct KernelTimerImpl;
+
+// Measures the interval between a Start() and an End() call.
+// launch_kernel() returns GetElapsedTime() directly and its callers print the value as
+// milliseconds.
+struct KernelTimer
+{
+    KernelTimer();
+
+    void Start();
+    void End();
+    float GetElapsedTime() const;
+
+    // declared here and defined elsewhere: KernelTimerImpl is incomplete in this header
+    ~KernelTimer();
+
+    std::unique_ptr<KernelTimerImpl> impl;
+};
+
+// Launches `kernel` once and returns the time reported by KernelTimer for the launch.
+//
+// kernel    : device kernel entry point.
+// grid_dim  : grid dimensions of the launch.
+// block_dim : block dimensions of the launch.
+// lds_byte  : dynamic shared-memory (LDS) size in bytes passed to the launch call.
+// args      : kernel arguments, forwarded by value.
+// Returns   : timer.GetElapsedTime(); callers in this driver print it as milliseconds.
+//
+// The backend is chosen at compile time through CK_DEVICE_BACKEND_AMD / CK_DEVICE_BACKEND_NVIDIA.
+// If neither macro is set nothing is launched and the timer is never started.
+template <typename... Args, typename F>
+float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
+{
+    KernelTimer timer;
+
+#if CK_DEVICE_BACKEND_AMD
+    timer.Start();
+
+    hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, 0, args...);
+
+    timer.End();
+
+    // Report a failed launch instead of fetching the error string and discarding it.
+    // Only a diagnostic is printed, so a failure still does not abort the run.
+    const hipError_t error = hipGetLastError();
+    if(error != hipSuccess)
+    {
+        std::fprintf(stderr, "%s: kernel launch failed: %s\n", __func__, hipGetErrorString(error));
+    }
+#elif CK_DEVICE_BACKEND_NVIDIA
+    const void* f  = reinterpret_cast<const void*>(kernel);
+    void* p_args[] = {&args...};
+
+    timer.Start();
+
+    cudaError_t error = cudaLaunchKernel(f, grid_dim, block_dim, p_args, lds_byte, 0);
+
+    timer.End();
+
+    checkCudaErrors(error);
+#endif
+
+    return timer.GetElapsedTime();
+}
+
+#endif
--- a/driver/include/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp
@@ -0,0 +1,98 @@
+#pragma once
+#include <unistd.h>
+#include "device.hpp"
+#include "tensor.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
+#include "gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
+
+using namespace ck;
+
+// Host-side driver for GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw.
+//
+// Uploads `in`, `wei` and the current contents of `out` to the device, launches the kernel
+// `nrepeat` times (printing the elapsed time of every launch), then copies the device output
+// back into `out`. The tensors are passed to the kernel in their given layout.
+//
+// InDesc, WeiDesc, OutDesc : compile-time tensor descriptors forwarded to the kernel.
+// in, wei                  : host tensors uploaded before the first launch.
+// out                      : host tensor that receives the result of the last launch.
+// nrepeat                  : number of kernel launches.
+template <class T, class InDesc, class WeiDesc, class OutDesc>
+void device_convolution_direct_v2_nchw_kcyx_nkhw(InDesc,
+                                                 const Tensor<T>& in,
+                                                 WeiDesc,
+                                                 const Tensor<T>& wei,
+                                                 OutDesc,
+                                                 Tensor<T>& out,
+                                                 index_t nrepeat)
+{
+    std::size_t data_sz = sizeof(T);
+    DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
+    DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace());
+    DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace());
+
+    in_device_buf.ToDevice(in.mData.data());
+    wei_device_buf.ToDevice(wei.mData.data());
+    out_device_buf.ToDevice(out.mData.data());
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    constexpr auto out_desc = OutDesc{};
+
+#if 1
+    // 3x3, 34x34, 128 thread
+    constexpr index_t NPerBlock  = 2;
+    constexpr index_t KPerBlock  = 32;
+    constexpr index_t CPerBlock  = 4;
+    constexpr index_t HoPerBlock = 2;
+    constexpr index_t WoPerBlock = 32;
+
+    constexpr index_t NPerThread  = 2;
+    constexpr index_t KPerThread  = 4;
+    constexpr index_t CPerThread  = 2;
+    constexpr index_t HoPerThread = 2;
+    constexpr index_t WoPerThread = 2;
+
+    constexpr index_t InBlockCopyDataPerRead  = 1;
+    constexpr index_t WeiBlockCopyDataPerRead = 1;
+
+    constexpr index_t BlockSize = 128;
+#endif
+
+    // One block per (N, K, Ho, Wo) tile.
+    // NOTE(review): these divisions truncate, so an output length that is not a multiple of its
+    // per-block size is not fully covered by the grid; confirm callers only use divisible shapes.
+    constexpr index_t GridSize =
+        (out_desc.GetLength(I0) / NPerBlock) * (out_desc.GetLength(I1) / KPerBlock) *
+        (out_desc.GetLength(I2) / HoPerBlock) * (out_desc.GetLength(I3) / WoPerBlock);
+
+    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
+
+    // The kernel type does not depend on the iteration, so it is named once outside the loop.
+    using gridwise_conv = GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw<GridSize,
+                                                                      BlockSize,
+                                                                      T,
+                                                                      InDesc,
+                                                                      WeiDesc,
+                                                                      OutDesc,
+                                                                      NPerBlock,
+                                                                      KPerBlock,
+                                                                      CPerBlock,
+                                                                      HoPerBlock,
+                                                                      WoPerBlock,
+                                                                      NPerThread,
+                                                                      KPerThread,
+                                                                      CPerThread,
+                                                                      HoPerThread,
+                                                                      WoPerThread,
+                                                                      InBlockCopyDataPerRead,
+                                                                      WeiBlockCopyDataPerRead>;
+
+    for(index_t i = 0; i < nrepeat; ++i)
+    {
+        float time = launch_kernel(run_gridwise_convolution_kernel<gridwise_conv, T>,
+                                   dim3(GridSize),
+                                   dim3(BlockSize),
+                                   0,
+                                   static_cast<T*>(in_device_buf.GetDeviceBuffer()),
+                                   static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
+                                   static_cast<T*>(out_device_buf.GetDeviceBuffer()));
+
+        printf("Elapsed time : %f ms\n", time);
+        // pause between launches: time * 1000 microseconds, capped at 10000
+        usleep(std::min(time * 1000, float(10000)));
+    }
+
+    out_device_buf.FromDevice(out.mData.data());
+}
--- a/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
@@ -0,0 +1,539 @@
+#pragma once
+#include <unistd.h>
+#include "device.hpp"
+#include "tensor.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
+#include "gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp"
+#include "gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp"
+#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp"
+#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp"
+
+using namespace ck;
+
+template <class T, class InDesc, class WeiDesc, class OutDesc> // host driver: repacks the host tensors, runs the selected implicit-GEMM kernel nrepeat times, stores the result in out_nkhw
+void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, // descriptor supplying Hi, Wi (dimensions 2, 3)
+                                                        const Tensor<T>& in_nchw, // host input, read as (n, c, hi, wi)
+                                                        WeiDesc, // descriptor supplying K, C, Y, X (dimensions 0..3)
+                                                        const Tensor<T>& wei_kcyx, // host weight, read as (k, c, y, x)
+                                                        OutDesc, // descriptor supplying N, Ho, Wo (dimensions 0, 2, 3)
+                                                        Tensor<T>& out_nkhw, // host output, written as (n, k, ho, wo) after the last launch
+                                                        index_t nrepeat) // number of kernel launches
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    constexpr auto in_nchw_desc  = InDesc{};
+    constexpr auto wei_kcyx_desc = WeiDesc{};
+    constexpr auto out_nkhw_desc = OutDesc{};
+
+    constexpr index_t Hi = in_nchw_desc.GetLength(I2);
+    constexpr index_t Wi = in_nchw_desc.GetLength(I3);
+
+    constexpr index_t N  = out_nkhw_desc.GetLength(I0);
+    constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
+    constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
+
+    constexpr index_t K = wei_kcyx_desc.GetLength(I0);
+    constexpr index_t C = wei_kcyx_desc.GetLength(I1);
+    constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
+    constexpr index_t X = wei_kcyx_desc.GetLength(I3);
+
+    // reorder weight on the host: KCYX -> CYXK
+    auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
+    ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
+
+    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
+
+    auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
+        wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
+    };
+
+    make_ParallelTensorFunctor(f_reorder_kcyx2cyxk, K, C, Y, X)(
+        std::thread::hardware_concurrency());
+
+    // reorder input on the host: NCHW -> CHWN
+    auto in_chwn_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Hi, Wi, N>{});
+    ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: ");
+
+    Tensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));
+
+    auto f_reorder_nchw2chwn = [&](auto n, auto c, auto hi, auto wi) {
+        in_chwn(c, hi, wi, n) = in_nchw(n, c, hi, wi);
+    };
+
+    make_ParallelTensorFunctor(f_reorder_nchw2chwn, N, C, Hi, Wi)(
+        std::thread::hardware_concurrency());
+
+    // output is produced in KHWN and reordered to NKHW after the launches
+    auto out_khwn_desc = make_ConstantTensorDescriptor_packed(Sequence<K, Ho, Wo, N>{});
+    ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
+
+    Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
+
+    std::size_t data_sz = sizeof(T);
+    DeviceMem in_chwn_device_buf(data_sz * in_chwn.mDesc.GetElementSpace());
+    DeviceMem wei_cyxk_device_buf(data_sz * wei_cyxk.mDesc.GetElementSpace());
+    DeviceMem out_khwn_device_buf(data_sz * out_khwn.mDesc.GetElementSpace());
+
+    in_chwn_device_buf.ToDevice(in_chwn.mData.data());
+    wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
+    out_khwn_device_buf.ToDevice(out_khwn.mData.data()); // uploads the newly constructed out_khwn, not the caller's out_nkhw
+
+#if 0 // tuning-parameter sets: the first enabled branch of this #if/#elif chain is used
+    // for 3x3, 34x34, v1r1, Pascal
+    constexpr index_t BlockSize = 128;
+
+    constexpr index_t NPerBlock  = 16;
+    constexpr index_t KPerBlock  = 64;
+    constexpr index_t CPerBlock  = 4;
+    constexpr index_t HoPerBlock = 2;
+    constexpr index_t WoPerBlock = 4;
+
+    constexpr index_t NPerThread  = 4;
+    constexpr index_t KPerThread  = 8;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 2;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmMLevel1Cluster = 2;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockCopyClusterLengths_CHWN       = Sequence<4, 4, 2, 4>;
+    constexpr index_t InBlockCopyDataPerRead_N = 4;
+
+    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
+
+    constexpr index_t OutThreadCopyDataPerWrite_N = 2;
+#elif 0
+    // for 3x3, 34x34, v1r2, Pascal, in-block-copy1
+    constexpr index_t BlockSize = 128;
+
+    constexpr index_t NPerBlock  = 4;
+    constexpr index_t KPerBlock  = 64;
+    constexpr index_t CPerBlock  = 8;
+    constexpr index_t HoPerBlock = 4;
+    constexpr index_t WoPerBlock = 8;
+
+    constexpr index_t NPerThread  = 4;
+    constexpr index_t KPerThread  = 8;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 2;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmMLevel1Cluster = 2;
+    constexpr index_t GemmNLevel1Cluster = 2;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockCopyClusterLengths_CHWN       = Sequence<0, 0, 0, 0>; // not used
+    constexpr index_t InBlockCopyDataPerRead_N = 4;
+
+    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
+
+    constexpr index_t OutThreadCopyDataPerWrite_N = 2;
+#elif 0
+    // for 3x3, 34x34, v1r3, Pascal
+    // for 3x3, 28x28, v1r3, Pascal
+    // for 3x3, 14x14, v1r3, Pascal
+    constexpr index_t BlockSize = 128;
+
+    constexpr index_t NPerBlock  = 16;
+    constexpr index_t KPerBlock  = 128;
+    constexpr index_t CPerBlock  = 8;
+    constexpr index_t HoPerBlock = 2;
+    constexpr index_t WoPerBlock = 2;
+
+    constexpr index_t NPerThread  = 4;
+    constexpr index_t KPerThread  = 8;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 2;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 2;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockCopyClusterLengths_CHWN       = Sequence<8, 2, 2, 4>;
+    constexpr index_t InBlockCopyDataPerRead_N = 4;
+
+    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
+
+    constexpr index_t OutThreadCopyDataPerWrite_N = 2;
+#elif 0
+    // for 3x3, 34x34, v1r3, Pascal, bad
+    constexpr index_t BlockSize = 128;
+
+    constexpr index_t NPerBlock  = 1;
+    constexpr index_t KPerBlock  = 128;
+    constexpr index_t CPerBlock  = 8;
+    constexpr index_t HoPerBlock = 2;
+    constexpr index_t WoPerBlock = 32;
+
+    constexpr index_t NPerThread  = 1;
+    constexpr index_t KPerThread  = 8;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 8;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 2;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockCopyClusterLengths_CHWN       = Sequence<2, 2, 32, 1>;
+    constexpr index_t InBlockCopyDataPerRead_N = 1;
+
+    constexpr index_t WeiBlockCopyDataPerRead_K = 2;
+
+    constexpr index_t OutThreadCopyDataPerWrite_N = 1;
+#elif 0
+    // for 3x3, 34x34, v1r1, Vega 20
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t NPerBlock  = 16;
+    constexpr index_t KPerBlock  = 128;
+    constexpr index_t CPerBlock  = 4;
+    constexpr index_t HoPerBlock = 2;
+    constexpr index_t WoPerBlock = 4;
+
+    constexpr index_t NPerThread  = 4;
+    constexpr index_t KPerThread  = 8;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 2;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 2;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockCopyClusterLengths_CHWN       = Sequence<4, 4, 2, 8>;
+    constexpr index_t InBlockCopyDataPerRead_N = 2;
+
+    constexpr index_t WeiBlockCopyDataPerRead_K = 2;
+
+    constexpr index_t OutThreadCopyDataPerWrite_N = 4;
+#elif 1 // currently enabled parameter set
+    // for 3x3, 34x34, v1r3, Vega 20
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t NPerBlock  = 16;
+    constexpr index_t KPerBlock  = 128;
+    constexpr index_t CPerBlock  = 8;
+    constexpr index_t HoPerBlock = 2;
+    constexpr index_t WoPerBlock = 4;
+
+    constexpr index_t NPerThread  = 4;
+    constexpr index_t KPerThread  = 8;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 2;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 2;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockCopyClusterLengths_CHWN       = Sequence<8, 2, 4, 4>;
+    constexpr index_t InBlockCopyDataPerRead_N = 4;
+
+    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
+
+    constexpr index_t OutThreadCopyDataPerWrite_N = 4;
+#elif 0 // NOTE(review): this set defines neither GemmDataPerReadA/B nor InBlockCopyClusterLengths_CHWN, both used by the kernel instantiation below
+    // for 3x3, 56x56, v1r1, Pascal
+    constexpr index_t NPerBlock  = 32;
+    constexpr index_t KPerBlock  = 64;
+    constexpr index_t CPerBlock  = 4;
+    constexpr index_t HoPerBlock = 2;
+    constexpr index_t WoPerBlock = 2;
+
+    constexpr index_t NPerThread  = 4;
+    constexpr index_t KPerThread  = 8;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 2;
+
+    constexpr index_t InBlockCopy_ThreadPerDimC = 1;
+    constexpr index_t InBlockCopy_ThreadPerDimH = 4;
+    constexpr index_t InBlockCopy_ThreadPerDimW = 4;
+    constexpr index_t InBlockCopy_ThreadPerDimN = 8;
+    constexpr index_t InBlockCopyDataPerRead_N  = 4;
+
+    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmMLevel1Cluster = 2;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+
+    constexpr index_t OutThreadCopyDataPerWrite_N = 2;
+
+    constexpr index_t BlockSize = 128;
+#elif 0 // NOTE(review): this set does not define InBlockCopyClusterLengths_CHWN, which the kernel instantiation below uses
+    // for 3x3, 56x56, v1r2, Pascal
+    constexpr index_t NPerBlock  = 16;
+    constexpr index_t KPerBlock  = 128;
+    constexpr index_t CPerBlock  = 8;
+    constexpr index_t HoPerBlock = 2;
+    constexpr index_t WoPerBlock = 2;
+
+    constexpr index_t NPerThread  = 4;
+    constexpr index_t KPerThread  = 8;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 2;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 2;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 1;
+    constexpr index_t GemmDataPerReadB   = 1;
+
+    constexpr index_t InBlockCopy_ThreadPerDimC = 1;
+    constexpr index_t InBlockCopy_ThreadPerDimH = 2;
+    constexpr index_t InBlockCopy_ThreadPerDimW = 4;
+    constexpr index_t InBlockCopy_ThreadPerDimN = 4;
+    constexpr index_t InBlockCopyDataPerRead_N  = 4;
+
+    constexpr index_t WeiBlockCopyDataPerRead_K   = 4;
+    constexpr index_t OutThreadCopyDataPerWrite_N = 4;
+
+    constexpr index_t BlockSize = 128;
+#elif 0 // NOTE(review): this set does not define InBlockCopyClusterLengths_CHWN, which the kernel instantiation below uses
+    // for 3x3, 28x28, v1r1, Pascal
+    constexpr index_t NPerBlock  = 32;
+    constexpr index_t KPerBlock  = 64;
+    constexpr index_t CPerBlock  = 4;
+    constexpr index_t HoPerBlock = 2;
+    constexpr index_t WoPerBlock = 2;
+
+    constexpr index_t NPerThread  = 4;
+    constexpr index_t KPerThread  = 8;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 2;
+
+    constexpr index_t InBlockCopy_ThreadPerDimC = 1;
+    constexpr index_t InBlockCopy_ThreadPerDimH = 4;
+    constexpr index_t InBlockCopy_ThreadPerDimW = 4;
+    constexpr index_t InBlockCopy_ThreadPerDimN = 8;
+    constexpr index_t InBlockCopyDataPerRead_N  = 4;
+
+    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmMLevel1Cluster = 2;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    constexpr index_t OutThreadCopyDataPerWrite_N = 2;
+
+    constexpr index_t BlockSize = 128;
+#elif 0
+    // for 3x3, 28x28, v1r2, Pascal
+    constexpr index_t BlockSize = 128;
+
+    constexpr index_t NPerBlock  = 16;
+    constexpr index_t KPerBlock  = 128;
+    constexpr index_t CPerBlock  = 8;
+    constexpr index_t HoPerBlock = 2;
+    constexpr index_t WoPerBlock = 2;
+
+    constexpr index_t NPerThread  = 4;
+    constexpr index_t KPerThread  = 8;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 2;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 2;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockCopyClusterLengths_CHWN       = Sequence<4, 2, 4, 4>;
+    constexpr index_t InBlockCopyDataPerRead_N = 4;
+
+    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
+
+    constexpr index_t OutThreadCopyDataPerWrite_N = 2;
+#elif 0 // NOTE(review): this set defines neither GemmDataPerReadA/B nor InBlockCopyClusterLengths_CHWN, both used by the kernel instantiation below
+    // for 1x1, 28x28, v1r1, Pascal
+    constexpr index_t NPerBlock  = 16;
+    constexpr index_t KPerBlock  = 128;
+    constexpr index_t CPerBlock  = 8;
+    constexpr index_t HoPerBlock = 2;
+    constexpr index_t WoPerBlock = 2;
+
+    constexpr index_t NPerThread  = 4;
+    constexpr index_t KPerThread  = 16;
+    constexpr index_t CPerThread  = 1;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 1;
+
+    constexpr index_t InBlockCopy_ThreadPerDimC = 8;
+    constexpr index_t InBlockCopy_ThreadPerDimH = 2;
+    constexpr index_t InBlockCopy_ThreadPerDimW = 2;
+    constexpr index_t InBlockCopy_ThreadPerDimN = 4;
+    constexpr index_t InBlockCopyDataPerRead_N  = 4;
+
+    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmMLevel1Cluster = 2;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+
+    constexpr index_t OutThreadCopyDataPerWrite_N = 2;
+
+    constexpr index_t BlockSize = 128;
+#elif 0 // NOTE(review): this set defines neither GemmDataPerReadA/B nor InBlockCopyClusterLengths_CHWN, both used by the kernel instantiation below
+    // for 1x1, 14x14, v1r1, Pascal
+    constexpr index_t NPerBlock  = 16;
+    constexpr index_t KPerBlock  = 128;
+    constexpr index_t CPerBlock  = 8;
+    constexpr index_t HoPerBlock = 2;
+    constexpr index_t WoPerBlock = 2;
+
+    constexpr index_t NPerThread  = 8;
+    constexpr index_t KPerThread  = 8;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 1;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 2;
+    constexpr index_t GemmKPerThreadLoop = 1;
+
+    constexpr index_t InBlockCopy_ThreadPerDimC = 8;
+    constexpr index_t InBlockCopy_ThreadPerDimH = 2;
+    constexpr index_t InBlockCopy_ThreadPerDimW = 2;
+    constexpr index_t InBlockCopy_ThreadPerDimN = 4;
+    constexpr index_t InBlockCopyDataPerRead_N  = 4;
+
+    constexpr index_t WeiBlockCopyDataPerRead_K   = 4;
+    constexpr index_t OutThreadCopyDataPerWrite_N = 2;
+
+    constexpr index_t BlockSize = 128;
+#endif
+
+    constexpr index_t GridSize = // product of the per-dimension tile counts, each rounded up
+        ((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
+        ((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);
+
+    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
+
+    for(index_t i = 0; i < nrepeat; ++i)
+    {
+        constexpr auto gridwise_conv = // kernel template picked by the #if chain below; the lds_double_buffer variant is enabled
+#if 0
+            GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
+#elif 0
+            GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
+#elif 0
+            GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
+#elif 1
+            GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
+#endif
+            <GridSize,
+             BlockSize,
+             T,
+             decltype(in_chwn_desc),
+             decltype(wei_cyxk_desc),
+             decltype(out_khwn_desc),
+             NPerBlock,
+             KPerBlock,
+             CPerBlock,
+             HoPerBlock,
+             WoPerBlock,
+             NPerThread,
+             KPerThread,
+             HoPerThread,
+             WoPerThread,
+             GemmMPerThreadSubC,
+             GemmNPerThreadSubC,
+             GemmMLevel0Cluster,
+             GemmNLevel0Cluster,
+             GemmMLevel1Cluster,
+             GemmNLevel1Cluster,
+             GemmKPerThreadLoop,
+             GemmDataPerReadA,
+             GemmDataPerReadB,
+             InBlockCopyClusterLengths_CHWN,
+             InBlockCopyDataPerRead_N,
+             WeiBlockCopyDataPerRead_K,
+             OutThreadCopyDataPerWrite_N>{};
+
+        float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
+                                   dim3(GridSize),
+                                   dim3(BlockSize),
+                                   0,
+                                   static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
+                                   static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
+                                   static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
+
+        printf("Elapsed time : %f ms, %f TFlop/s\n",
+               time,
+               (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) / // NOTE(review): defined in conv_common.hpp, which is not among this header's direct includes; confirm it is reachable
+                   (std::size_t(1000) * 1000 * 1000) / time); // flops / 1e9 / (time in ms) == TFlop/s
+        usleep(std::min(time * 1000, float(10000))); // pause of time * 1000 microseconds between launches, capped at 10000
+    }
+
+    out_khwn_device_buf.FromDevice(out_khwn.mData.data());
+
+    // reorder output on the host: KHWN -> NKHW
+    auto f_reorder_khwn2nkhw = [&](auto k, auto ho, auto wo, auto n) {
+        out_nkhw(n, k, ho, wo) = out_khwn(k, ho, wo, n);
+    };
+
+    make_ParallelTensorFunctor(f_reorder_khwn2nkhw, K, Ho, Wo, N)(
+        std::thread::hardware_concurrency());
+}
--- a/driver/include/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
@@ -0,0 +1,373 @@
+#pragma once
+#include <unistd.h>
+#include "device.hpp"
+#include "tensor.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
+#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp"
+#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
+
+using namespace ck;
+
+template <class T, class InDesc, class WeiDesc, class OutDesc>
+void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
+                                                        const Tensor<T>& in_nchw,
+                                                        WeiDesc,
+                                                        const Tensor<T>& wei_kcyx,
+                                                        OutDesc,
+                                                        Tensor<T>& out_nkhw,
+                                                        index_t nrepeat)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    constexpr auto in_nchw_desc  = InDesc{};
+    constexpr auto wei_kcyx_desc = WeiDesc{};
+    constexpr auto out_nkhw_desc = OutDesc{};
+
+    constexpr index_t Hi = in_nchw_desc.GetLength(I2);
+    constexpr index_t Wi = in_nchw_desc.GetLength(I3);
+
+    constexpr index_t N  = out_nkhw_desc.GetLength(I0);
+    constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
+    constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
+
+    constexpr index_t K = wei_kcyx_desc.GetLength(I0);
+    constexpr index_t C = wei_kcyx_desc.GetLength(I1);
+    constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
+    constexpr index_t X = wei_kcyx_desc.GetLength(I3);
+
+    // reorder weight from KCYX to CYXK (packed descriptor); the CYXK copy is what gets uploaded
+    auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
+    ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
+
+    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
+
+    auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
+        wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
+    };
+
+    make_ParallelTensorFunctor(f_reorder_kcyx2cyxk, K, C, Y, X)(
+        std::thread::hardware_concurrency());
+
+    std::size_t data_sz = sizeof(T);
+    DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
+    DeviceMem wei_cyxk_device_buf(data_sz * wei_cyxk.mDesc.GetElementSpace());
+    DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
+
+    in_nchw_device_buf.ToDevice(in_nchw.mData.data());
+    wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
+    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
+
+#if 0
+    // for 3x3, 34x34, v1r3, Pascal
+    constexpr index_t BlockSize = 128;
+
+    constexpr index_t NPerBlock  = 2;
+    constexpr index_t KPerBlock  = 128;
+    constexpr index_t CPerBlock  = 8;
+    constexpr index_t HoPerBlock = 2;
+    constexpr index_t WoPerBlock = 16;
+
+    constexpr index_t NPerThread  = 2;
+    constexpr index_t KPerThread  = 8;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 4;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 2;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockReorderSrcSubLengths_NCHW                    = Sequence<2, 1, 2, 1>;
+    using InBlockReorderSrcClusterLengths_NCHW                = Sequence<1, 8, 1, 16>;
+    using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
+    constexpr index_t InBlockReorderDataPerRead_W  = 1; // v1r3 cannot do vector load input for NCHW
+    constexpr index_t InBlockReorderDataPerWrite_N = 1;
+
+    using WeiBlockCopyClusterLengths            = void;
+    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
+
+    constexpr index_t OutThreadCopyDataPerWrite_W = 2;
+#elif 0
+    // for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 32
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t NPerBlock  = 1;
+    constexpr index_t KPerBlock  = 128;
+    constexpr index_t CPerBlock  = 8;
+    constexpr index_t HoPerBlock = 4;
+    constexpr index_t WoPerBlock = 32;
+
+    constexpr index_t NPerThread  = 1;
+    constexpr index_t KPerThread  = 8;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 8;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 2;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockReorderSrcSubLengths_NCHW                    = Sequence<1, 2, 2, 1>;
+    using InBlockReorderSrcClusterLengths_NCHW                = Sequence<1, 4, 2, 32>;
+    using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
+    constexpr index_t InBlockReorderDataPerRead_W  = 1; // v1r3 cannot do vector load NCHW
+    constexpr index_t InBlockReorderDataPerWrite_N = 1;
+
+    using WeiBlockCopyClusterLengths            = void;
+    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
+
+    constexpr index_t OutThreadCopyDataPerWrite_W = 4;
+#elif 1
+    // for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 16 (active: first "#elif 1" of this chain)
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t NPerBlock  = 2;
+    constexpr index_t KPerBlock  = 128;
+    constexpr index_t CPerBlock  = 8;
+    constexpr index_t HoPerBlock = 4;
+    constexpr index_t WoPerBlock = 16;
+
+    constexpr index_t NPerThread  = 2;
+    constexpr index_t KPerThread  = 8;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 4;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 2;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockReorderSrcSubLengths_NCHW                    = Sequence<2, 1, 2, 1>;
+    using InBlockReorderSrcClusterLengths_NCHW                = Sequence<1, 8, 2, 16>;
+    using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
+    constexpr index_t InBlockReorderDataPerRead_W  = 1; // v1r3 cannot do vector load NCHW
+    constexpr index_t InBlockReorderDataPerWrite_N = 2;
+
+    using WeiBlockCopyClusterLengths            = void;
+    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
+
+    constexpr index_t OutThreadCopyDataPerWrite_W = 2;
+#elif 0
+    // for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 8
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t NPerBlock  = 4;
+    constexpr index_t KPerBlock  = 128;
+    constexpr index_t CPerBlock  = 8;
+    constexpr index_t HoPerBlock = 4;
+    constexpr index_t WoPerBlock = 8;
+
+    constexpr index_t NPerThread  = 4;
+    constexpr index_t KPerThread  = 8;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 2;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 2;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockReorderSrcSubLengths_NCHW                    = Sequence<4, 1, 1, 1>;
+    using InBlockReorderSrcClusterLengths_NCHW                = Sequence<1, 8, 4, 8>;
+    using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
+    constexpr index_t InBlockReorderDataPerRead_W  = 1; // v1r3 cannot do vector load NCHW
+    constexpr index_t InBlockReorderDataPerWrite_N = 4;
+
+    using WeiBlockCopyClusterLengths            = void;
+    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
+
+    constexpr index_t OutThreadCopyDataPerWrite_W = 1;
+#elif 0
+    // for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 4
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t NPerBlock  = 8;
+    constexpr index_t KPerBlock  = 128;
+    constexpr index_t CPerBlock  = 8;
+    constexpr index_t HoPerBlock = 4;
+    constexpr index_t WoPerBlock = 4;
+
+    constexpr index_t NPerThread  = 4;
+    constexpr index_t KPerThread  = 8;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 2;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 2;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockReorderSrcSubLengths_NCHW                    = Sequence<4, 1, 1, 1>;
+    using InBlockReorderSrcClusterLengths_NCHW                = Sequence<2, 8, 4, 4>;
+    using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
+    constexpr index_t InBlockReorderDataPerRead_W  = 1; // v1r3 cannot do vector load NCHW
+    constexpr index_t InBlockReorderDataPerWrite_N = 4;
+
+    using WeiBlockCopyClusterLengths            = void;
+    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
+
+    constexpr index_t OutThreadCopyDataPerWrite_W = 1;
+#elif 0
+    // for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 2
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t NPerBlock  = 32;
+    constexpr index_t KPerBlock  = 128;
+    constexpr index_t CPerBlock  = 8;
+    constexpr index_t HoPerBlock = 2;
+    constexpr index_t WoPerBlock = 2;
+
+    constexpr index_t NPerThread  = 4;
+    constexpr index_t KPerThread  = 8;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 2;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 2;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockReorderSrcSubLengths_NCHW                    = Sequence<4, 1, 1, 1>;
+    using InBlockReorderSrcClusterLengths_NCHW                = Sequence<8, 8, 2, 2>;
+    using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
+    constexpr index_t InBlockReorderDataPerRead_W  = 1; // v1r3 cannot do vector load NCHW
+    constexpr index_t InBlockReorderDataPerWrite_N = 4;
+
+    using WeiBlockCopyClusterLengths            = void;
+    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
+
+    constexpr index_t OutThreadCopyDataPerWrite_W = 1;
+#elif 1
+    // for 3x3, 28x28, v1r3, Pascal (never selected: an earlier "#elif 1" branch already matched)
+    constexpr index_t BlockSize = 128;
+
+    constexpr index_t NPerBlock  = 16;
+    constexpr index_t KPerBlock  = 128;
+    constexpr index_t CPerBlock  = 8;
+    constexpr index_t HoPerBlock = 2;
+    constexpr index_t WoPerBlock = 2;
+
+    constexpr index_t NPerThread  = 4;
+    constexpr index_t KPerThread  = 8;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 2;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 2;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockReorderSrcSubLengths_NCHW                    = Sequence<4, 1, 1, 1>;
+    using InBlockReorderSrcClusterLengths_NCHW                = Sequence<4, 8, 2, 2>;
+    using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
+    constexpr index_t InBlockReorderDataPerRead_W  = 1; // v1r3 cannot do vector load NCHW
+    constexpr index_t InBlockReorderDataPerWrite_N = 4;
+
+    using WeiBlockCopyClusterLengths            = void;
+    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
+
+    constexpr index_t OutThreadCopyDataPerWrite_W = 2;
+#endif
+
+    constexpr index_t GridSize =
+        ((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
+        ((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);
+
+    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
+
+    for(index_t i = 0; i < nrepeat; ++i)
+    {
+        constexpr auto gridwise_conv =
+#if 0
+            GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
+#else
+            GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer
+#endif
+            <GridSize,
+             BlockSize,
+             T,
+             decltype(in_nchw_desc),
+             decltype(wei_cyxk_desc),
+             decltype(out_nkhw_desc),
+             NPerBlock,
+             KPerBlock,
+             CPerBlock,
+             HoPerBlock,
+             WoPerBlock,
+             NPerThread,
+             KPerThread,
+             HoPerThread,
+             WoPerThread,
+             GemmMPerThreadSubC,
+             GemmNPerThreadSubC,
+             GemmMLevel0Cluster,
+             GemmNLevel0Cluster,
+             GemmMLevel1Cluster,
+             GemmNLevel1Cluster,
+             GemmKPerThreadLoop,
+             GemmDataPerReadA,
+             GemmDataPerReadB,
+             InBlockReorderSrcSubLengths_NCHW,
+             InBlockReorderSrcClusterLengths_NCHW,
+             InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
+             InBlockReorderDataPerRead_W,
+             InBlockReorderDataPerWrite_N,
+             WeiBlockCopyClusterLengths,
+             WeiBlockCopyDataPerRead_K,
+             OutThreadCopyDataPerWrite_W>{};
+
+        float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
+                                   dim3(GridSize),
+                                   dim3(BlockSize),
+                                   0,
+                                   static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
+                                   static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
+                                   static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
+
+        printf("Elapsed time : %f ms, %f TFlop/s\n",
+               time,
+               (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
+                   (std::size_t(1000) * 1000 * 1000) / time);
+        usleep(std::min(time * 1000, float(10000)));
+    }
+
+    out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
+}
--- a/driver/include/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp
@@ -0,0 +1,333 @@
+#pragma once
+#include <unistd.h>
+#include "device.hpp"
+#include "tensor.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
+#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
+#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp"
+
+using namespace ck;
+
+template <class T, class InDesc, class WeiDesc, class OutDesc>
+void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
+                                                        const Tensor<T>& in_nchw,
+                                                        WeiDesc,
+                                                        const Tensor<T>& wei_kcyx,
+                                                        OutDesc,
+                                                        Tensor<T>& out_nkhw,
+                                                        index_t nrepeat)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    constexpr auto in_nchw_desc  = InDesc{};
+    constexpr auto wei_kcyx_desc = WeiDesc{};
+    constexpr auto out_nkhw_desc = OutDesc{};
+
+    constexpr index_t N  = in_nchw_desc.GetLength(I0);
+    constexpr index_t Hi = in_nchw_desc.GetLength(I2);
+    constexpr index_t Wi = in_nchw_desc.GetLength(I3);
+
+    constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
+    constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
+
+    constexpr index_t K = wei_kcyx_desc.GetLength(I0);
+    constexpr index_t C = wei_kcyx_desc.GetLength(I1);
+    constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
+    constexpr index_t X = wei_kcyx_desc.GetLength(I3);
+
+    constexpr index_t BGhostRead = (Y - 1) * Wi + (X - 1);
+
+    // convert in_nchw to in_chwn
+    auto in_chwn_desc = make_ConstantTensorDescriptor(Sequence<C, Hi, Wi, N>{});
+    ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: ");
+
+    Tensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));
+
+    make_ParallelTensorFunctor(
+        [&](auto n, auto c, auto hi, auto wi) { in_chwn(c, hi, wi, n) = in_nchw(n, c, hi, wi); },
+        N,
+        C,
+        Hi,
+        Wi)(std::thread::hardware_concurrency());
+
+    // convert wei_kcyx to wei_cyxk
+    auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
+    ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
+
+    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
+
+    make_ParallelTensorFunctor(
+        [&](auto k, auto c, auto y, auto x) { wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x); },
+        K,
+        C,
+        Y,
+        X)(std::thread::hardware_concurrency());
+
+    // create out_khwn (K, Ho, Wo, N) to receive the kernel output; no data is copied from out_nkhw here
+    auto out_khwn_desc = make_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
+    ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
+
+    Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
+
+#if 0
+    // 3x3, 34x34
+    // need to use register double buffer for GEMM (disabled; does not define GemmDataPerReadA/B used below)
+    constexpr index_t BPerBlock = 128;
+    constexpr index_t KPerBlock = 64;
+    constexpr index_t CPerBlock = 4;
+
+    constexpr index_t BPerThread = 8;
+    constexpr index_t KPerThread = 8;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmMLevel1Cluster = 2;
+    constexpr index_t GemmNLevel1Cluster = 8;
+    constexpr index_t GemmKPerThreadLoop = 1;
+
+    constexpr index_t InBlockCopyThreadPerDim0 = 4;
+    constexpr index_t InBlockCopyThreadPerDim1 = 16;
+
+    constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
+    constexpr index_t WeiBlockCopyThreadPerDim1 = 16;
+
+    constexpr index_t InBlockCopyDataPerRead  = 4;
+    constexpr index_t WeiBlockCopyDataPerRead = 4;
+    constexpr index_t OutThreadCopyDataPerWrite = 4;
+
+    constexpr index_t BlockSize = 128;
+#elif 0
+    // 1x1, 28x28, 64 threads (disabled; lacks GemmDataPerReadA/B and OutThreadCopyDataPerWrite used below)
+    constexpr index_t BPerBlock = 64;
+    constexpr index_t KPerBlock = 64;
+    constexpr index_t CPerBlock = 8;
+
+    constexpr index_t BPerThread = 8;
+    constexpr index_t KPerThread = 8;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmMLevel1Cluster = 2;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+
+    constexpr index_t GemmThreadPerColumnPerCluster = 8;
+    constexpr index_t GemmThreadPerRowPerCluster    = 8;
+
+    constexpr index_t InBlockCopyThreadPerDim0 = 4;
+    constexpr index_t InBlockCopyThreadPerDim1 = 16;
+
+    constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
+    constexpr index_t WeiBlockCopyThreadPerDim1 = 16;
+
+    constexpr index_t InBlockCopyDataPerRead  = 4;
+    constexpr index_t WeiBlockCopyDataPerRead = 4;
+
+    constexpr index_t BlockSize = 64;
+#elif 0
+    // 1x1, 28x28, 128 threads, no lds-double-buffer
+    // 1x1, 28x28, 128 threads, with lds-double-buffer, max_register = 128 (disabled; lacks GemmDataPerReadA/B and OutThreadCopyDataPerWrite)
+    constexpr index_t BPerBlock = 64;
+    constexpr index_t KPerBlock = 128;
+    constexpr index_t CPerBlock = 8;
+
+    constexpr index_t BPerThread = 8;
+    constexpr index_t KPerThread = 8;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+
+    constexpr index_t GemmThreadPerColumnPerCluster = 8;
+    constexpr index_t GemmThreadPerRowPerCluster    = 8;
+
+    constexpr index_t InBlockCopyThreadPerDim0 = 4;
+    constexpr index_t InBlockCopyThreadPerDim1 = 16;
+
+    constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
+    constexpr index_t WeiBlockCopyThreadPerDim1 = 16;
+
+    constexpr index_t InBlockCopyDataPerRead  = 4;
+    constexpr index_t WeiBlockCopyDataPerRead = 4;
+
+    constexpr index_t BlockSize = 128;
+#elif 0
+    // 1x1, 28x28, 256 threads (disabled; lacks GemmDataPerReadA/B and OutThreadCopyDataPerWrite used below)
+    constexpr index_t BPerBlock = 128;
+    constexpr index_t KPerBlock = 128;
+    constexpr index_t CPerBlock = 8;
+
+    constexpr index_t BPerThread = 8;
+    constexpr index_t KPerThread = 8;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+
+    constexpr index_t GemmThreadPerColumnPerCluster = 8;
+    constexpr index_t GemmThreadPerRowPerCluster    = 8;
+
+    constexpr index_t InBlockCopyThreadPerDim0 = 4;
+    constexpr index_t InBlockCopyThreadPerDim1 = 16;
+
+    constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
+    constexpr index_t WeiBlockCopyThreadPerDim1 = 16;
+
+    constexpr index_t InBlockCopyDataPerRead  = 4;
+    constexpr index_t WeiBlockCopyDataPerRead = 4;
+
+    constexpr index_t BlockSize = 256;
+#elif 0
+    // 1x1, 14x14, Pascal, enable lds_double_buffer, disable register double buffer
+    constexpr index_t BPerBlock = 64;
+    constexpr index_t KPerBlock = 128;
+    constexpr index_t CPerBlock = 8;
+
+    constexpr index_t BPerThread = 8;
+    constexpr index_t KPerThread = 8;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    constexpr index_t InBlockCopyThreadPerDim0 = 4;
+    constexpr index_t InBlockCopyThreadPerDim1 = 16;
+
+    constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
+    constexpr index_t WeiBlockCopyThreadPerDim1 = 16;
+
+    constexpr index_t InBlockCopyDataPerRead    = 4;
+    constexpr index_t WeiBlockCopyDataPerRead   = 4;
+    constexpr index_t OutThreadCopyDataPerWrite = 4;
+
+    constexpr index_t BlockSize = 128;
+#elif 1
+    // 1x1, 14x14, Vega 20, enable lds_double_buffer, disable register_double_buffer (active configuration)
+    constexpr index_t BPerBlock = 128;
+    constexpr index_t KPerBlock = 128;
+    constexpr index_t CPerBlock = 8;
+
+    constexpr index_t BPerThread = 8;
+    constexpr index_t KPerThread = 8;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    constexpr index_t InBlockCopyThreadPerDim0 = 4;
+    constexpr index_t InBlockCopyThreadPerDim1 = 16;
+
+    constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
+    constexpr index_t WeiBlockCopyThreadPerDim1 = 16;
+
+    constexpr index_t InBlockCopyDataPerRead    = 4;
+    constexpr index_t WeiBlockCopyDataPerRead   = 4;
+    constexpr index_t OutThreadCopyDataPerWrite = 4;
+
+    constexpr index_t BlockSize = 256;
+#endif
+
+    constexpr index_t GridSize =
+        ((N * Hi * Wi + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);
+
+    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
+
+    // device memory: allocate the three buffers, then upload the host tensors
+    std::size_t data_sz = sizeof(T);
+    DeviceMem in_chwn_device_buf(data_sz * (in_chwn.mDesc.GetElementSpace() + BGhostRead +
+                                            BPerBlock)); // reserve extra space for BGhostRead
+    DeviceMem wei_cyxk_device_buf(data_sz * wei_cyxk.mDesc.GetElementSpace());
+    DeviceMem out_khwn_device_buf(data_sz * out_khwn.mDesc.GetElementSpace());
+
+    in_chwn_device_buf.ToDevice(in_chwn.mData.data());
+    wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
+    out_khwn_device_buf.ToDevice(out_khwn.mData.data());
+
+    for(index_t i = 0; i < nrepeat; ++i)
+    {
+        constexpr auto gridwise_conv =
+#if 0
+            GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
+#else
+            GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
+#endif
+            <GridSize,
+             BlockSize,
+             T,
+             decltype(in_chwn_desc),
+             decltype(wei_cyxk_desc),
+             decltype(out_khwn_desc),
+             BPerBlock,
+             KPerBlock,
+             CPerBlock,
+             BPerThread,
+             KPerThread,
+             GemmMPerThreadSubC,
+             GemmNPerThreadSubC,
+             GemmMLevel0Cluster,
+             GemmNLevel0Cluster,
+             GemmMLevel1Cluster,
+             GemmNLevel1Cluster,
+             GemmKPerThreadLoop,
+             GemmDataPerReadA,
+             GemmDataPerReadB,
+             InBlockCopyThreadPerDim0,
+             InBlockCopyThreadPerDim1,
+             WeiBlockCopyThreadPerDim0,
+             WeiBlockCopyThreadPerDim1,
+             InBlockCopyDataPerRead,
+             WeiBlockCopyDataPerRead,
+             OutThreadCopyDataPerWrite>{};
+
+        float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
+                                   dim3(GridSize),
+                                   dim3(BlockSize),
+                                   0,
+                                   static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
+                                   static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
+                                   static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
+
+        printf("Elapsed time : %f ms, %f TFlop/s\n",
+               time,
+               (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
+                   (std::size_t(1000) * 1000 * 1000) / time);
+        usleep(std::min(time * 1000, float(10000)));
+    }
+
+    out_khwn_device_buf.FromDevice(out_khwn.mData.data());
+
+    // convert out_khwn to out_nkhw
+    make_ParallelTensorFunctor(
+        [&](auto n, auto k, auto ho, auto wo) { out_nkhw(n, k, ho, wo) = out_khwn(k, ho, wo, n); },
+        N,
+        K,
+        Ho,
+        Wo)(std::thread::hardware_concurrency());
+}
--- a/driver/include/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
@@ -0,0 +1,156 @@
+#pragma once
+#include <unistd.h>
+#include "device.hpp"
+#include "tensor.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
+#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
+#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
+
+using namespace ck;
+
+template <class T, class InDesc, class WeiDesc, class OutDesc>
+void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
+                                                        const Tensor<T>& in_nchw,
+                                                        WeiDesc,
+                                                        const Tensor<T>& wei_kcyx,
+                                                        OutDesc,
+                                                        Tensor<T>& out_nkhw,
+                                                        index_t nrepeat)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    constexpr auto in_nchw_desc  = InDesc{};
+    constexpr auto wei_kcyx_desc = WeiDesc{};
+    constexpr auto out_nkhw_desc = OutDesc{};
+
+    constexpr index_t Hi = in_nchw_desc.GetLength(I2);
+    constexpr index_t Wi = in_nchw_desc.GetLength(I3);
+
+    constexpr index_t N  = out_nkhw_desc.GetLength(I0);
+    constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
+    constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
+
+    constexpr index_t K = wei_kcyx_desc.GetLength(I0);
+    constexpr index_t C = wei_kcyx_desc.GetLength(I1);
+    constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
+    constexpr index_t X = wei_kcyx_desc.GetLength(I3);
+
+    // reorder weight from KCYX to CYXK (packed descriptor); the CYXK copy is what gets uploaded
+    auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
+    ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
+
+    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
+
+    auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
+        wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
+    };
+
+    make_ParallelTensorFunctor(f_reorder_kcyx2cyxk, K, C, Y, X)(
+        std::thread::hardware_concurrency());
+
+    std::size_t data_sz = sizeof(T);
+    DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
+    DeviceMem wei_cyxk_device_buf(data_sz * wei_cyxk.mDesc.GetElementSpace());
+    DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
+
+    in_nchw_device_buf.ToDevice(in_nchw.mData.data());
+    wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
+    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
+
+    constexpr index_t N1 = 2;
+    constexpr index_t N2 = 4;
+
+    constexpr index_t B = (N * Ho * Wo) / (N1 * N2);
+
+#if 1
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t BPerBlock = 16;
+    constexpr index_t KPerBlock = 128;
+    constexpr index_t CPerBlock = 8;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockCopySubLengths_C_N1_B_N2     = Sequence<1, 1, 1, 4>;
+    using InBlockCopyClusterLengths_C_N1_B_N2 = Sequence<8, 2, 16, 1>;
+
+    constexpr index_t InBlockCopySrcDataPerRead_B   = 1;
+    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;
+
+    using WeiBlockCopySubLengths_C_K     = Sequence<1, 4>;
+    using WeiBlockCopyClusterLengths_C_K = Sequence<8, 32>;
+
+    constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
+#endif
+
+    constexpr index_t GridSize =
+        ((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);
+
+    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
+
+    for(index_t i = 0; i < nrepeat; ++i)
+    {
+        constexpr auto gridwise_conv =
+#if 0
+            GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
+#else
+            GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer
+#endif
+            <GridSize,
+             BlockSize,
+             T,
+             decltype(in_nchw_desc),
+             decltype(wei_cyxk_desc),
+             decltype(out_nkhw_desc),
+             BPerBlock,
+             KPerBlock,
+             CPerBlock,
+             N1,
+             N2,
+             GemmMPerThreadSubC,
+             GemmNPerThreadSubC,
+             GemmMLevel0Cluster,
+             GemmNLevel0Cluster,
+             GemmMLevel1Cluster,
+             GemmNLevel1Cluster,
+             GemmKPerThreadLoop,
+             GemmDataPerReadA,
+             GemmDataPerReadB,
+             InBlockCopySubLengths_C_N1_B_N2,
+             InBlockCopyClusterLengths_C_N1_B_N2,
+             InBlockCopySrcDataPerRead_B,
+             InBlockCopyDstDataPerWrite_N2,
+             WeiBlockCopySubLengths_C_K,
+             WeiBlockCopyClusterLengths_C_K,
+             WeiBlockCopyDataPerAccess_K>{};
+
+#if 1
+        float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
+                                   dim3(GridSize),
+                                   dim3(BlockSize),
+                                   0,
+                                   static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
+                                   static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
+                                   static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
+
+        printf("Elapsed time : %f ms, %f TFlop/s\n",
+               time,
+               (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
+                   (std::size_t(1000) * 1000 * 1000) / time);
+        usleep(std::min(time * 1000, float(10000)));
+#endif
+    }
+
+    out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
+}
--- a/driver/include/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp
@@ -0,0 +1,155 @@
+#pragma once
+#include <unistd.h>
+#include "device.hpp"
+#include "tensor.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
+#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
+#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
+
+using namespace ck;
+
+// Host driver for the v4 implicit-GEMM convolution kernel (NCHW input, KCYX weight,
+// NKHW output).
+//
+// Uploads the host tensors to device buffers, launches the gridwise kernel `nrepeat`
+// times (printing the time returned by launch_kernel and the derived TFlop/s after every
+// launch), then downloads the device output buffer into `out_nkhw`.
+//
+// The InDesc / WeiDesc / OutDesc arguments are unnamed: only their types are used, to
+// default-construct the compile-time descriptors.
+template <class T, class InDesc, class WeiDesc, class OutDesc>
+void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
+                                                        const Tensor<T>& in_nchw,
+                                                        WeiDesc,
+                                                        const Tensor<T>& wei_kcyx,
+                                                        OutDesc,
+                                                        Tensor<T>& out_nkhw,
+                                                        index_t nrepeat)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    constexpr auto in_nchw_desc  = InDesc{};
+    constexpr auto wei_kcyx_desc = WeiDesc{};
+    constexpr auto out_nkhw_desc = OutDesc{};
+
+    // only the lengths that feed B and GridSize are extracted on the host side
+    constexpr index_t N  = out_nkhw_desc.GetLength(I0);
+    constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
+    constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
+
+    constexpr index_t K = wei_kcyx_desc.GetLength(I0);
+
+    // allocate device buffers sized from the host tensors and upload all three
+    // (the output tensor's current contents are uploaded as well)
+    std::size_t data_sz = sizeof(T);
+    DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
+    DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
+    DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
+
+    in_nchw_device_buf.ToDevice(in_nchw.mData.data());
+    wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
+    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
+
+    constexpr index_t N1 = 2;
+    constexpr index_t N2 = 4;
+
+    // integer division: any remainder of N * Ho * Wo over N1 * N2 is discarded
+    constexpr index_t B = (N * Ho * Wo) / (N1 * N2);
+
+#if 1
+    // tuning parameters forwarded verbatim to the gridwise kernel template below
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t BPerBlock = 16;
+    constexpr index_t KPerBlock = 128;
+    constexpr index_t CPerBlock = 8;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 4;
+    constexpr index_t GemmNLevel1Cluster = 4;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockCopySubLengths_E_N1_B_N2      = Sequence<1, 1, 1, 4>;
+    using InBlockCopyClusterLengths_E_N1_B_N2  = Sequence<8, 2, 16, 1>;
+    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
+    using InBlockCopySrcAccessOrder            = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
+    using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]
+
+    constexpr index_t InBlockCopySrcDataPerRead_B   = 1;
+    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;
+
+    using WeiBlockCopySubLengths_E_K            = Sequence<4, 1>;
+    using WeiBlockCopyClusterLengths_E_K        = Sequence<2, 128>;
+    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]
+
+    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
+    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+#endif
+
+    // ceil(B / BPerBlock) * ceil(K / KPerBlock)
+    constexpr index_t GridSize =
+        ((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);
+
+    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
+
+    for(index_t i = 0; i < nrepeat; ++i)
+    {
+        // compile-time switch between the plain kernel (#if 1) and the
+        // LDS-double-buffer kernel (currently selected)
+        constexpr auto gridwise_conv =
+#if 0
+            GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
+#else
+            GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
+#endif
+            <GridSize,
+             BlockSize,
+             T,
+             decltype(in_nchw_desc),
+             decltype(wei_kcyx_desc),
+             decltype(out_nkhw_desc),
+             BPerBlock,
+             KPerBlock,
+             CPerBlock,
+             N1,
+             N2,
+             GemmMPerThreadSubC,
+             GemmNPerThreadSubC,
+             GemmMLevel0Cluster,
+             GemmNLevel0Cluster,
+             GemmMLevel1Cluster,
+             GemmNLevel1Cluster,
+             GemmKPerThreadLoop,
+             GemmDataPerReadA,
+             GemmDataPerReadB,
+             InBlockCopySubLengths_E_N1_B_N2,
+             InBlockCopyClusterLengths_E_N1_B_N2,
+             InBlockCopyThreadClusterArrangeOrder,
+             InBlockCopySrcAccessOrder,
+             InBlockCopyDstAccessOrder,
+             InBlockCopySrcDataPerRead_B,
+             InBlockCopyDstDataPerWrite_N2,
+             WeiBlockCopySubLengths_E_K,
+             WeiBlockCopyClusterLengths_E_K,
+             WeiBlockCopyThreadClusterArrangeOrder,
+             WeiBlockCopySrcAccessOrder,
+             WeiBlockCopyDstAccessOrder,
+             WeiBlockCopySrcDataPerRead_E,
+             WeiBlockCopyDstDataPerWrite_K>{};
+
+        float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
+                                   dim3(GridSize),
+                                   dim3(BlockSize),
+                                   0,
+                                   static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
+                                   static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
+                                   static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
+
+        // flops / 1e9 / ms == TFlop/s
+        printf("Elapsed time : %f ms, %f TFlop/s\n",
+               time,
+               (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
+                   (std::size_t(1000) * 1000 * 1000) / time);
+        // pause between launches: time * 1000 microseconds, capped at 10000
+        usleep(std::min(time * 1000, float(10000)));
+    }
+
+    out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
+}
--- a/driver/include/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
@@ -0,0 +1,214 @@
+#pragma once
+#include <unistd.h>
+#include "device.hpp"
+#include "tensor.hpp"
+#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
+
+using namespace ck;
+
+// Host driver for the vectorized direct convolution kernel (NCHW input, KCYX weight,
+// NKHW output).
+//
+// Packs groups of NVector consecutive channels of the input and weight tensors into
+// vector elements on the host, uploads the packed tensors and the output tensor,
+// launches the gridwise kernel `nrepeat` times (printing the time returned by
+// launch_kernel after every launch), then downloads the device output buffer into
+// `out_nkhw`.
+//
+// The InDesc / WeiDesc / OutDesc arguments are unnamed: only their types are used, to
+// default-construct the compile-time descriptors.
+template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
+void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
+                                                           const Tensor<TInWei>& in_nchw,
+                                                           WeiDesc,
+                                                           const Tensor<TInWei>& wei_kcyx,
+                                                           OutDesc,
+                                                           Tensor<TOut>& out_nkhw,
+                                                           index_t nrepeat)
+{
+    // this assumes the in / wei data is vectorized as int8x4
+    constexpr index_t NVector = 4;
+    using accum_t             = int32_t;
+    using vector_t            = vector_type<TInWei, NVector>;
+    using vector_mem_t        = typename vector_t::MemoryType;
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    constexpr auto in_nchw_desc  = InDesc{};
+    constexpr auto wei_kcyx_desc = WeiDesc{};
+    constexpr auto out_nkhw_desc = OutDesc{};
+
+    constexpr index_t Hi = in_nchw_desc.GetLength(I2);
+    constexpr index_t Wi = in_nchw_desc.GetLength(I3);
+
+    constexpr index_t N  = out_nkhw_desc.GetLength(I0);
+    constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
+    constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
+
+    constexpr index_t K = wei_kcyx_desc.GetLength(I0);
+    constexpr index_t C = wei_kcyx_desc.GetLength(I1);
+    constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
+    constexpr index_t X = wei_kcyx_desc.GetLength(I3);
+
+    // C / NVector truncates, so a channel count that is not a multiple of NVector
+    // would silently drop the trailing channels from the packed tensors below
+    static_assert(C % NVector == 0, "C must be a multiple of NVector");
+
+    // vectorized input
+    auto in_nchw_vec_desc = make_ConstantTensorDescriptor(Sequence<N, C / NVector, Hi, Wi>{});
+    ostream_ConstantTensorDescriptor(in_nchw_vec_desc, std::cout << "in_nchw_vec_desc: ");
+
+    Tensor<vector_mem_t> in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc));
+
+    // packs channels 4*c .. 4*c+3 of the input into vector element c
+    // (the disabled branches are the 1-wide and 2-wide variants)
+    auto f_vectorized_nchw = [&](auto n, auto c, auto h, auto w) {
+#if 0
+        in_nchw_vec(n, c, h, w) = in_nchw(n, c, h, w);
+#elif 0
+        in_nchw_vec(n, c, h, w) =
+            vector_t::Pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w));
+#elif 1
+        in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 4 * c, h, w),
+                                                 in_nchw(n, 4 * c + 1, h, w),
+                                                 in_nchw(n, 4 * c + 2, h, w),
+                                                 in_nchw(n, 4 * c + 3, h, w));
+#endif
+    };
+
+    make_ParallelTensorFunctor(f_vectorized_nchw, N, C / NVector, Hi, Wi)(
+        std::thread::hardware_concurrency());
+
+    // vectorize weight
+    auto wei_kcyx_vec_desc = make_ConstantTensorDescriptor(Sequence<K, C / NVector, Y, X>{});
+    ostream_ConstantTensorDescriptor(wei_kcyx_vec_desc, std::cout << "wei_kcyx_vec_desc: ");
+
+    Tensor<vector_mem_t> wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc));
+
+    // packs channels 4*c .. 4*c+3 of the weight into vector element c
+    auto f_vectorized_kcyx = [&](auto k, auto c, auto y, auto x) {
+#if 0
+        wei_kcyx_vec(k, c, y, x) = wei_kcyx(k, c, y, x);
+#elif 0
+        wei_kcyx_vec(k, c, y, x) =
+            vector_t::Pack(wei_kcyx(k, 2 * c, y, x), wei_kcyx(k, 2 * c + 1, y, x));
+#elif 1
+        wei_kcyx_vec(k, c, y, x) = vector_t::Pack(wei_kcyx(k, 4 * c, y, x),
+                                                  wei_kcyx(k, 4 * c + 1, y, x),
+                                                  wei_kcyx(k, 4 * c + 2, y, x),
+                                                  wei_kcyx(k, 4 * c + 3, y, x));
+#endif
+    };
+
+    make_ParallelTensorFunctor(f_vectorized_kcyx, K, C / NVector, Y, X)(
+        std::thread::hardware_concurrency());
+
+    // device buffers: packed input / weight are sized in vector_mem_t elements,
+    // the output in TOut elements
+    DeviceMem in_nchw_vec_device_buf(sizeof(vector_mem_t) * in_nchw_vec.mDesc.GetElementSpace());
+    DeviceMem wei_kcyx_vec_device_buf(sizeof(vector_mem_t) * wei_kcyx_vec.mDesc.GetElementSpace());
+    DeviceMem out_nkhw_device_buf(sizeof(TOut) * out_nkhw.mDesc.GetElementSpace());
+
+    in_nchw_vec_device_buf.ToDevice(in_nchw_vec.mData.data());
+    wei_kcyx_vec_device_buf.ToDevice(wei_kcyx_vec.mData.data());
+    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
+
+    // exactly one of the tuning configurations below is enabled
+#if 0
+    // 3x3, 34x34, 128 thread, fp32, vector = 1
+    constexpr index_t NPerBlock  = 2;
+    constexpr index_t KPerBlock  = 32;
+    constexpr index_t CPerBlock  = 4;
+    constexpr index_t HoPerBlock = 2;
+    constexpr index_t WoPerBlock = 32;
+
+    constexpr index_t NPerThread  = 2;
+    constexpr index_t KPerThread  = 4;
+    constexpr index_t CPerThread  = 2;
+    constexpr index_t HoPerThread = 2;
+    constexpr index_t WoPerThread = 2;
+
+    constexpr index_t InBlockCopyDataPerRead  = 2;
+    constexpr index_t WeiBlockCopyDataPerRead = 2;
+
+    constexpr index_t BlockSize = 128;
+#elif 0
+    // 3x3, 34x34, 128 thread, fp32, vector = 2
+    constexpr index_t NPerBlock  = 2;
+    constexpr index_t KPerBlock  = 32;
+    constexpr index_t CPerBlock  = 2;
+    constexpr index_t HoPerBlock = 2;
+    constexpr index_t WoPerBlock = 32;
+
+    constexpr index_t NPerThread  = 2;
+    constexpr index_t KPerThread  = 4;
+    constexpr index_t CPerThread  = 1;
+    constexpr index_t HoPerThread = 2;
+    constexpr index_t WoPerThread = 2;
+
+    constexpr index_t InBlockCopyDataPerRead  = 2;
+    constexpr index_t WeiBlockCopyDataPerRead = 2;
+
+    constexpr index_t BlockSize = 128;
+#elif 0
+    // 3x3, 34x34, 128 thread, int8, vector = 4
+    constexpr index_t NPerBlock  = 2;
+    constexpr index_t KPerBlock  = 32;
+    constexpr index_t CPerBlock  = 8;
+    constexpr index_t HoPerBlock = 4;
+    constexpr index_t WoPerBlock = 32;
+
+    constexpr index_t NPerThread  = 1;
+    constexpr index_t KPerThread  = 8;
+    constexpr index_t CPerThread  = 2;
+    constexpr index_t HoPerThread = 4;
+    constexpr index_t WoPerThread = 2;
+
+    constexpr index_t InBlockCopyDataPerRead  = 2;
+    constexpr index_t WeiBlockCopyDataPerRead = 2;
+
+    constexpr index_t BlockSize = 128;
+#elif 1
+    // 1x1, 32x32, 128 thread, int8, vector = 4
+    constexpr index_t NPerBlock  = 1;
+    constexpr index_t KPerBlock  = 64;
+    constexpr index_t CPerBlock  = 16;
+    constexpr index_t HoPerBlock = 4;
+    constexpr index_t WoPerBlock = 32;
+
+    constexpr index_t NPerThread  = 1;
+    constexpr index_t KPerThread  = 8;
+    constexpr index_t CPerThread  = 2;
+    constexpr index_t HoPerThread = 4;
+    constexpr index_t WoPerThread = 2;
+
+    constexpr index_t InBlockCopyDataPerRead  = 2;
+    constexpr index_t WeiBlockCopyDataPerRead = 2;
+
+    constexpr index_t BlockSize = 128;
+#endif
+
+    // truncating divisions: each extent is expected to be a multiple of its per-block size
+    constexpr index_t GridSize =
+        (N / NPerBlock) * (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock);
+
+    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
+
+    for(index_t i = 0; i < nrepeat; ++i)
+    {
+        // NOTE(review): no LDS byte-count argument is passed between the block dim and
+        // the pointers here, and the packed in / wei buffers hold vector_mem_t elements
+        // but are handed over as TInWei* -- confirm both against the launch_kernel and
+        // kernel signatures.
+        float time = launch_kernel(
+            gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw<TInWei,
+                                                                    TOut,
+                                                                    accum_t,
+                                                                    decltype(in_nchw_vec_desc),
+                                                                    decltype(wei_kcyx_vec_desc),
+                                                                    decltype(out_nkhw_desc),
+                                                                    NVector,
+                                                                    NPerBlock,
+                                                                    KPerBlock,
+                                                                    CPerBlock,
+                                                                    HoPerBlock,
+                                                                    WoPerBlock,
+                                                                    NPerThread,
+                                                                    KPerThread,
+                                                                    CPerThread,
+                                                                    HoPerThread,
+                                                                    WoPerThread,
+                                                                    InBlockCopyDataPerRead,
+                                                                    WeiBlockCopyDataPerRead,
+                                                                    BlockSize,
+                                                                    GridSize>,
+            dim3(GridSize),
+            dim3(BlockSize),
+            static_cast<TInWei*>(in_nchw_vec_device_buf.GetDeviceBuffer()),
+            static_cast<TInWei*>(wei_kcyx_vec_device_buf.GetDeviceBuffer()),
+            // the output buffer is sized with sizeof(TOut) and read back into a
+            // Tensor<TOut>, so it is passed as TOut* rather than TInWei*
+            static_cast<TOut*>(out_nkhw_device_buf.GetDeviceBuffer()));
+
+        printf("Elapsed time : %f ms\n", time);
+        // pause between launches: time * 1000 microseconds, capped at 10000
+        usleep(std::min(time * 1000, float(10000)));
+    }
+
+    out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
+}
--- a/driver/include/device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp
+++ b/driver/include/device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp
@@ -0,0 +1,296 @@
+#pragma once
+#include <unistd.h>
+#include "device.hpp"
+#include "tensor.hpp"
+#include "gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
+
+using namespace ck;
+
+// Host driver for the padded implicit-GEMM convolution kernel that works on CHWN input,
+// CYXK weight and KHWN output.
+//
+// The caller supplies NCHW input, KCYX weight and an NKHW output tensor. This function
+// reorders weight and input on the host, uploads them together with a freshly created
+// KHWN output tensor, launches the gridwise kernel `nrepeat` times (printing the time
+// returned by launch_kernel after every launch), downloads the KHWN result and reorders
+// it into `out_nkhw`.
+//
+// InDesc / WeiDesc / OutDesc / LowerPads / UpperPads are unnamed arguments: only their
+// types are used.
+template <class T, class InDesc, class WeiDesc, class OutDesc, class LowerPads, class UpperPads>
+void device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(InDesc,
+                                                              const Tensor<T>& in_nchw,
+                                                              WeiDesc,
+                                                              const Tensor<T>& wei_kcyx,
+                                                              OutDesc,
+                                                              Tensor<T>& out_nkhw,
+                                                              LowerPads,
+                                                              UpperPads,
+                                                              index_t nrepeat)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    constexpr auto in_nchw_desc  = InDesc{};
+    constexpr auto wei_kcyx_desc = WeiDesc{};
+    constexpr auto out_nkhw_desc = OutDesc{};
+
+    constexpr index_t Hi = in_nchw_desc.GetLength(I2);
+    constexpr index_t Wi = in_nchw_desc.GetLength(I3);
+
+    constexpr index_t N  = out_nkhw_desc.GetLength(I0);
+    constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
+    constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
+
+    constexpr index_t K = wei_kcyx_desc.GetLength(I0);
+    constexpr index_t C = wei_kcyx_desc.GetLength(I1);
+    constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
+    constexpr index_t X = wei_kcyx_desc.GetLength(I3);
+
+    // reorder weight
+    auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
+    ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
+
+    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
+
+    auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
+        wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
+    };
+
+    make_ParallelTensorFunctor(f_reorder_kcyx2cyxk, K, C, Y, X)(
+        std::thread::hardware_concurrency());
+
+    // reorder input
+    auto in_chwn_desc = make_ConstantTensorDescriptor(Sequence<C, Hi, Wi, N>{});
+    ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: ");
+
+    Tensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));
+
+    auto f_reorder_nchw2chwn = [&](auto n, auto c, auto hi, auto wi) {
+        in_chwn(c, hi, wi, n) = in_nchw(n, c, hi, wi);
+    };
+
+    make_ParallelTensorFunctor(f_reorder_nchw2chwn, N, C, Hi, Wi)(
+        std::thread::hardware_concurrency());
+
+    // output
+    auto out_khwn_desc = make_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
+    ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
+
+    Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
+
+    std::size_t data_sz = sizeof(T);
+    DeviceMem in_chwn_device_buf(data_sz * in_chwn.mDesc.GetElementSpace());
+    DeviceMem wei_cyxk_device_buf(data_sz * wei_cyxk.mDesc.GetElementSpace());
+    DeviceMem out_khwn_device_buf(data_sz * out_khwn.mDesc.GetElementSpace());
+
+    in_chwn_device_buf.ToDevice(in_chwn.mData.data());
+    wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
+    out_khwn_device_buf.ToDevice(out_khwn.mData.data());
+
+    // Tuning configurations: the first branch whose condition is 1 is the one compiled.
+    // Several disabled branches do not define WeiBlockCopyThreadPerDim0 /
+    // WeiBlockCopyThreadPerDim1, which the kernel launch below requires, so they need
+    // those two constants added before they can be enabled.
+#if 0
+    constexpr index_t NPerBlock  = 1;
+    constexpr index_t KPerBlock  = 1;
+    constexpr index_t CPerBlock  = 1;
+    constexpr index_t HoPerBlock = 2;
+    constexpr index_t WoPerBlock = 4;
+
+    constexpr index_t NPerThread  = 1;
+    constexpr index_t KPerThread  = 1;
+    constexpr index_t CPerThread  = 1;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 1;
+
+    constexpr index_t WeiBlockCopyThreadPerDim0 = 1;
+    constexpr index_t WeiBlockCopyThreadPerDim1 = 1;
+
+    constexpr index_t BlockSize = 8;
+#elif 1
+    // for 3x3, 34x34 | 3x3 58x58, NKC = 64, 64, 256
+    constexpr index_t NPerBlock  = 16;
+    constexpr index_t KPerBlock  = 64;
+    constexpr index_t CPerBlock  = 4;
+    constexpr index_t HoPerBlock = 2;
+    constexpr index_t WoPerBlock = 4;
+
+    constexpr index_t NPerThread  = 4;
+    constexpr index_t KPerThread  = 16;
+    constexpr index_t CPerThread  = 1;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 1;
+
+    constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
+    constexpr index_t WeiBlockCopyThreadPerDim1 = 32;
+
+    constexpr index_t BlockSize = 128;
+#elif 0
+    // 3x3 58x58, NKC = 16,256,128
+    constexpr index_t NPerBlock  = 8;
+    constexpr index_t KPerBlock  = 64;
+    constexpr index_t CPerBlock  = 2;
+    constexpr index_t HoPerBlock = 4;
+    constexpr index_t WoPerBlock = 4;
+
+    constexpr index_t NPerThread  = 4;
+    constexpr index_t KPerThread  = 16;
+    constexpr index_t CPerThread  = 1;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 1;
+
+    constexpr index_t BlockSize = 128;
+#elif 0
+    // for 5x5, 36x36
+    constexpr index_t NPerBlock  = 16;
+    constexpr index_t KPerBlock  = 64;
+    constexpr index_t CPerBlock  = 2;
+    constexpr index_t HoPerBlock = 2;
+    constexpr index_t WoPerBlock = 4;
+
+    constexpr index_t NPerThread  = 4;
+    constexpr index_t KPerThread  = 16;
+    constexpr index_t CPerThread  = 1;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 1;
+
+    constexpr index_t BlockSize = 128;
+#elif 0
+    // for 7x7, 38x38
+    constexpr index_t NPerBlock  = 8;
+    constexpr index_t KPerBlock  = 64;
+    constexpr index_t CPerBlock  = 2;
+    constexpr index_t HoPerBlock = 4;
+    constexpr index_t WoPerBlock = 4;
+
+    constexpr index_t NPerThread  = 4;
+    constexpr index_t KPerThread  = 16;
+    constexpr index_t CPerThread  = 1;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 1;
+
+    constexpr index_t BlockSize = 128;
+#elif 0
+    // for 3x3, 56x56
+    constexpr index_t NPerBlock  = 32;
+    constexpr index_t KPerBlock  = 64;
+    constexpr index_t CPerBlock  = 4;
+    constexpr index_t HoPerBlock = 2;
+    constexpr index_t WoPerBlock = 2;
+
+    constexpr index_t NPerThread  = 4;
+    constexpr index_t KPerThread  = 16;
+    constexpr index_t CPerThread  = 1;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 1;
+
+    constexpr index_t BlockSize = 128;
+#elif 0
+    // Disabled explicitly: this branch used to read `#elif 1`, but it could never be
+    // selected because the `#elif 1` branch further up is taken first.
+    // 3x3 56x56, NKC = 16,256,128, with padding
+    // 3x3 28x28, NKC = 16,512,256, with padding
+    // 3x3 20x84, NKC = 16,256,256, with padding
+    constexpr index_t NPerBlock  = 16;
+    constexpr index_t KPerBlock  = 64;
+    constexpr index_t CPerBlock  = 2;
+    constexpr index_t HoPerBlock = 2;
+    constexpr index_t WoPerBlock = 4;
+
+    constexpr index_t NPerThread  = 4;
+    constexpr index_t KPerThread  = 16;
+    constexpr index_t CPerThread  = 1;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 1;
+
+    constexpr index_t WeiBlockCopyThreadPerDim0 = 2;
+    constexpr index_t WeiBlockCopyThreadPerDim1 = 64;
+
+    constexpr index_t BlockSize = 128;
+#elif 0
+    // for 5x5 filter, 20x84 image, 1x1 padding
+    constexpr index_t NPerBlock  = 16;
+    constexpr index_t KPerBlock  = 64;
+    constexpr index_t CPerBlock  = 1;
+    constexpr index_t HoPerBlock = 2;
+    constexpr index_t WoPerBlock = 4;
+
+    constexpr index_t NPerThread  = 4;
+    constexpr index_t KPerThread  = 16;
+    constexpr index_t CPerThread  = 1;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 1;
+
+    constexpr index_t BlockSize = 128;
+#elif 0
+    // 5x5 filter, 28x28 image, 2x2 padding
+    constexpr index_t NPerBlock  = 16;
+    constexpr index_t KPerBlock  = 32;
+    constexpr index_t CPerBlock  = 2;
+    constexpr index_t HoPerBlock = 4;
+    constexpr index_t WoPerBlock = 4;
+
+    constexpr index_t NPerThread  = 4;
+    constexpr index_t KPerThread  = 16;
+    constexpr index_t CPerThread  = 1;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 1;
+
+    constexpr index_t BlockSize = 128;
+#elif 0
+    // for 1x1, 28x28
+    constexpr index_t NPerBlock  = 16;
+    constexpr index_t KPerBlock  = 128;
+    constexpr index_t CPerBlock  = 8;
+    constexpr index_t HoPerBlock = 2;
+    constexpr index_t WoPerBlock = 2;
+
+    constexpr index_t NPerThread  = 4;
+    constexpr index_t KPerThread  = 16;
+    constexpr index_t CPerThread  = 2;
+    constexpr index_t HoPerThread = 1;
+    constexpr index_t WoPerThread = 1;
+
+    constexpr index_t WeiBlockCopyThreadPerDim0 = 4;
+    constexpr index_t WeiBlockCopyThreadPerDim1 = 32;
+
+    constexpr index_t BlockSize = 128;
+#endif
+
+    // product of the per-dimension ceil-divisions
+    constexpr index_t GridSize =
+        ((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
+        ((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);
+
+    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
+
+    for(index_t i = 0; i < nrepeat; ++i)
+    {
+        // NOTE(review): no LDS byte-count argument is passed between the block dim and
+        // the pointers here -- confirm against the launch_kernel signature.
+        float time = launch_kernel(
+            gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded<GridSize,
+                                                                       BlockSize,
+                                                                       T,
+                                                                       decltype(in_chwn_desc),
+                                                                       decltype(wei_cyxk_desc),
+                                                                       decltype(out_khwn_desc),
+                                                                       LowerPads,
+                                                                       UpperPads,
+                                                                       NPerBlock,
+                                                                       KPerBlock,
+                                                                       CPerBlock,
+                                                                       HoPerBlock,
+                                                                       WoPerBlock,
+                                                                       NPerThread,
+                                                                       KPerThread,
+                                                                       CPerThread,
+                                                                       HoPerThread,
+                                                                       WoPerThread,
+                                                                       WeiBlockCopyThreadPerDim0,
+                                                                       WeiBlockCopyThreadPerDim1>,
+            dim3(GridSize),
+            dim3(BlockSize),
+
+            static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
+            static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
+            static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
+
+        printf("Elapsed time : %f ms\n", time);
+        // pause between launches: time * 1000 microseconds, capped at 10000
+        usleep(std::min(time * 1000, float(10000)));
+    }
+
+    out_khwn_device_buf.FromDevice(out_khwn.mData.data());
+
+    // reorder output
+    auto f_reorder_khwn2nkhw = [&](auto k, auto ho, auto wo, auto n) {
+        out_nkhw(n, k, ho, wo) = out_khwn(k, ho, wo, n);
+    };
+
+    make_ParallelTensorFunctor(f_reorder_khwn2nkhw, K, Ho, Wo, N)(
+        std::thread::hardware_concurrency());
+}
--- a/driver/include/tensor.hpp
+++ b/driver/include/tensor.hpp
@@ -0,0 +1,272 @@
+#ifndef CK_TENSOR_HPP
+#define CK_TENSOR_HPP
+
+#include <thread>
+#include <vector>
+#include <numeric>
+#include <algorithm>
+#include <utility>
+#include <cassert>
+#include <iostream>
+
+// Streams every element of `range` to `os`, writing `delim` between
+// consecutive elements (nothing before the first or after the last one).
+// Returns `os` so that the call can be chained.
+template <class Range>
+std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
+{
+    std::size_t num_written = 0;
+    for(auto&& elem : range)
+    {
+        // every element except the first is preceded by the delimiter
+        if(num_written != 0)
+            os << delim;
+        os << elem;
+        ++num_written;
+    }
+    return os;
+}
+
+// Tag values identifying the element type of a tensor.
+enum DataType_t
+{
+    Half  = 0,
+    Float = 1
+};
+
+// Maps a C++ element type to its DataType_t tag. The primary template is
+// only declared, so using a type without a specialization fails to compile.
+template <class T>
+struct DataType;
+
+// float is tagged as Float.
+template <>
+struct DataType<float> : std::integral_constant<DataType_t, Float>
+{
+};
+
+// Helper: calls `f` with the elements of the tuple-like `args` selected by
+// the index pack `Is...`, each passed as a separate argument.
+template <class F, class T, std::size_t... Is>
+auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
+{
+    return f(std::get<Is>(args)...);
+}
+
+// Calls `f` with every element of the tuple-like `args` (anything supported
+// by std::tuple_size / std::get) expanded into individual arguments, and
+// returns whatever `f` returns.
+template <class F, class T>
+auto call_f_unpack_args(F f, T args)
+{
+    using AllIndices = std::make_index_sequence<std::tuple_size<T>::value>;
+
+    return call_f_unpack_args_impl(f, args, AllIndices{});
+}
+
+// Helper: constructs an `F` from the elements of the tuple-like `args`
+// selected by the index pack `Is...`.
+template <class F, class T, std::size_t... Is>
+auto construct_f_unpack_args_impl(T args, std::index_sequence<Is...>)
+{
+    return F(std::get<Is>(args)...);
+}
+
+// Constructs and returns an `F` whose constructor arguments are the elements
+// of the tuple-like `args`. The first parameter is used only to deduce `F`;
+// its value is ignored.
+template <class F, class T>
+auto construct_f_unpack_args(F, T args)
+{
+    using AllIndices = std::make_index_sequence<std::tuple_size<T>::value>;
+
+    return construct_f_unpack_args_impl<F>(args, AllIndices{});
+}
+
+// Host-side tensor layout: one length and one stride per dimension.
+// The non-template member functions are declared here without a body.
+struct TensorDescriptor
+{
+    // A descriptor always needs at least its lengths.
+    TensorDescriptor() = delete;
+
+    TensorDescriptor(std::initializer_list<std::size_t> lens);
+    TensorDescriptor(std::initializer_list<std::size_t> lens,
+                     std::initializer_list<std::size_t> strides);
+    TensorDescriptor(std::vector<std::size_t> lens, std::vector<std::size_t> strides);
+
+    void CalculateStrides();
+
+    // Lengths copied from any range exposing begin()/end(); the strides are
+    // then filled in by CalculateStrides().
+    template <class Range>
+    TensorDescriptor(const Range& lens) : mLens(lens.begin(), lens.end())
+    {
+        this->CalculateStrides();
+    }
+
+    // Lengths and strides both copied verbatim from the given ranges.
+    template <class Range1, class Range2>
+    TensorDescriptor(const Range1& lens, const Range2& strides)
+        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
+    {
+    }
+
+    std::size_t GetNumOfDimension() const;
+    std::size_t GetElementSize() const;
+    std::size_t GetElementSpace() const;
+
+    const std::vector<std::size_t>& GetLengths() const;
+    const std::vector<std::size_t>& GetStrides() const;
+
+    // Linear offset of a multi-index: the sum of index * stride over all
+    // dimensions. The number of indices is only checked by the assert.
+    template <class... Is>
+    std::size_t GetOffsetFromMultiIndex(Is... is) const
+    {
+        assert(sizeof...(Is) == this->GetNumOfDimension());
+        const std::initializer_list<std::size_t> indices{static_cast<std::size_t>(is)...};
+
+        std::size_t offset = 0;
+        auto stride        = mStrides.begin();
+        for(const std::size_t index : indices)
+        {
+            offset += index * (*stride);
+            ++stride;
+        }
+        return offset;
+    }
+
+    private:
+    std::vector<std::size_t> mLens;
+    std::vector<std::size_t> mStrides;
+};
+
+// std::thread that joins instead of terminating: the thread is joined when
+// the object is destroyed and also before it is overwritten by
+// move-assignment.
+struct joinable_thread : std::thread
+{
+    // Forwards all arguments to the std::thread constructor; with no
+    // arguments this creates an object that does not represent a thread.
+    template <class... Xs>
+    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
+    {
+    }
+
+    joinable_thread(joinable_thread&&) = default;
+
+    // std::thread::operator= calls std::terminate when the target is still
+    // joinable, so wait for the currently owned thread first. This matches
+    // what the destructor does.
+    joinable_thread& operator=(joinable_thread&& other)
+    {
+        if(this != &other)
+        {
+            if(this->joinable())
+                this->join();
+            std::thread::operator=(std::move(other));
+        }
+        return *this;
+    }
+
+    // Waits for the owned thread (if any) to finish.
+    ~joinable_thread()
+    {
+        if(this->joinable())
+            this->join();
+    }
+};
+
+// Calls a functor once for every multi-index of an N-dimensional index space
+// (N = number of length arguments), with the flattened index range split
+// into contiguous chunks that are processed by separate host threads.
+template <class F, class... Xs>
+struct ParallelTensorFunctor
+{
+    F mF;
+    static constexpr std::size_t NDIM = sizeof...(Xs);
+    std::array<std::size_t, NDIM> mLens;
+    std::array<std::size_t, NDIM> mStrides;
+    std::size_t mN1d; // total number of multi-indices
+
+    // f  : callable taking NDIM index arguments
+    // xs : length of each dimension
+    ParallelTensorFunctor(F f, Xs... xs) : mF(f), mLens({static_cast<std::size_t>(xs)...})
+    {
+        // mStrides[i] is the product of all lengths after dimension i, so the
+        // last dimension varies fastest in the flattened index.
+        mStrides.back() = 1;
+        std::partial_sum(mLens.rbegin(),
+                         mLens.rend() - 1,
+                         mStrides.rbegin() + 1,
+                         std::multiplies<std::size_t>());
+        mN1d = mStrides[0] * mLens[0];
+    }
+
+    // Converts a flattened index back into one index per dimension.
+    std::array<std::size_t, NDIM> GetNdIndices(std::size_t i) const
+    {
+        std::array<std::size_t, NDIM> indices;
+
+        for(std::size_t idim = 0; idim < NDIM; ++idim)
+        {
+            indices[idim] = i / mStrides[idim];
+            i -= indices[idim] * mStrides[idim];
+        }
+
+        return indices;
+    }
+
+    // Runs the functor over the whole index space using up to `num_thread`
+    // threads and returns once all of them have finished.
+    void operator()(std::size_t num_thread) const
+    {
+        // std::thread::hardware_concurrency() is allowed to return 0; fall
+        // back to a single thread instead of dividing by zero below.
+        if(num_thread == 0)
+            num_thread = 1;
+
+        const std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;
+
+        // joinable_thread joins in its destructor, so leaving this scope
+        // waits for every worker.
+        std::vector<joinable_thread> threads(num_thread);
+
+        for(std::size_t it = 0; it < num_thread; ++it)
+        {
+            const std::size_t iw_begin = it * work_per_thread;
+            const std::size_t iw_end   = std::min((it + 1) * work_per_thread, mN1d);
+
+            // chunks are handed out in increasing order, so once one is
+            // empty all remaining ones are empty too
+            if(iw_begin >= iw_end)
+                break;
+
+            auto f = [this, iw_begin, iw_end] {
+                for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
+                {
+                    call_f_unpack_args(mF, GetNdIndices(iw));
+                }
+            };
+            threads[it] = joinable_thread(f);
+        }
+    }
+};
+
+// Factory that deduces the template arguments of ParallelTensorFunctor from
+// the functor `f` and the per-dimension lengths `xs...`.
+template <class F, class... Xs>
+auto make_ParallelTensorFunctor(F f, Xs... xs)
+{
+    using Functor = ParallelTensorFunctor<F, Xs...>;
+
+    return Functor(f, xs...);
+}
+
+// Host tensor: a TensorDescriptor together with a std::vector<T> sized to
+// the descriptor's element space.
+template <class T>
+struct Tensor
+{
+    // Lengths given as a brace-enclosed list.
+    template <class X>
+    Tensor(std::initializer_list<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
+    {
+    }
+
+    // Lengths given as a vector.
+    template <class X>
+    Tensor(std::vector<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
+    {
+    }
+
+    // Lengths and explicit strides.
+    template <class X, class Y>
+    Tensor(std::vector<X> lens, std::vector<Y> strides)
+        : mDesc(lens, strides), mData(mDesc.GetElementSpace())
+    {
+    }
+
+    // Copies an existing descriptor.
+    Tensor(const TensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}
+
+    // Assigns g(indices...) to every element, using `num_thread` threads.
+    // Tensors with 1 to 4 dimensions are handled; any other dimension count
+    // throws std::runtime_error.
+    template <class G>
+    void GenerateTensorValue(G g, std::size_t num_thread = 1)
+    {
+        const std::size_t ndim               = mDesc.GetNumOfDimension();
+        const std::vector<std::size_t>& lens = mDesc.GetLengths();
+
+        switch(ndim)
+        {
+        case 1:
+        {
+            auto fill = [&](auto i) { (*this)(i) = g(i); };
+            make_ParallelTensorFunctor(fill, lens[0])(num_thread);
+            break;
+        }
+        case 2:
+        {
+            auto fill = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); };
+            make_ParallelTensorFunctor(fill, lens[0], lens[1])(num_thread);
+            break;
+        }
+        case 3:
+        {
+            auto fill = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); };
+            make_ParallelTensorFunctor(fill, lens[0], lens[1], lens[2])(num_thread);
+            break;
+        }
+        case 4:
+        {
+            auto fill = [&](auto i0, auto i1, auto i2, auto i3) {
+                (*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
+            };
+            make_ParallelTensorFunctor(fill, lens[0], lens[1], lens[2], lens[3])(num_thread);
+            break;
+        }
+        default: throw std::runtime_error("unspported dimension");
+        }
+    }
+
+    // Mutable access to the element at the given multi-index.
+    template <class... Is>
+    T& operator()(Is... is)
+    {
+        const std::size_t offset = mDesc.GetOffsetFromMultiIndex(is...);
+        return mData[offset];
+    }
+
+    // Read-only access to the element at the given multi-index.
+    template <class... Is>
+    const T& operator()(Is... is) const
+    {
+        const std::size_t offset = mDesc.GetOffsetFromMultiIndex(is...);
+        return mData[offset];
+    }
+
+    // Iteration over the underlying storage.
+    typename std::vector<T>::iterator begin() { return mData.begin(); }
+
+    typename std::vector<T>::iterator end() { return mData.end(); }
+
+    typename std::vector<T>::const_iterator begin() const { return mData.begin(); }
+
+    typename std::vector<T>::const_iterator end() const { return mData.end(); }
+
+    TensorDescriptor mDesc;
+    std::vector<T> mData;
+};
+
+#endif